diff options
Diffstat (limited to 'tools/testing/selftests')
565 files changed, 40080 insertions, 4152 deletions
diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 61df01cdf0b2..055a5019b13c 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -1,6 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only gpiogpio-event-mon gpiogpio-hammer gpioinclude/ gpiolsgpio tpm2/SpaceTest.log -tpm2/*.pyc + +# Python bytecode and cache +__pycache__/ +*.py[cod] diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b93fa645ee54..1195bd85af38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -36,9 +36,9 @@ TARGETS += net TARGETS += net/forwarding TARGETS += net/mptcp TARGETS += netfilter -TARGETS += networking/timestamping TARGETS += nsfs TARGETS += pidfd +TARGETS += pid_namespace TARGETS += powerpc TARGETS += proc TARGETS += pstore @@ -92,7 +92,7 @@ override LDFLAGS = override MAKEFLAGS = endif -# Append kselftest to KBUILD_OUTPUT to avoid cluttering +# Append kselftest to KBUILD_OUTPUT and O to avoid cluttering # KBUILD_OUTPUT with selftest objects and headers installed # by selftests Makefile or lib.mk. ifdef building_out_of_srctree @@ -100,7 +100,7 @@ override LDFLAGS = endif ifneq ($(O),) - BUILD := $(O) + BUILD := $(O)/kselftest else ifneq ($(KBUILD_OUTPUT),) BUILD := $(KBUILD_OUTPUT)/kselftest @@ -249,10 +249,17 @@ else $(error Error: set INSTALL_PATH to use install) endif +FORMAT ?= .gz +TAR_PATH = $(abspath ${INSTALL_PATH}/kselftest-packages/kselftest.tar${FORMAT}) +gen_tar: install + @mkdir -p ${INSTALL_PATH}/kselftest-packages/ + @tar caf ${TAR_PATH} --exclude=kselftest-packages -C ${INSTALL_PATH} . + @echo "Created ${TAR_PATH}" + clean: @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean +.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile index 7c462714b418..9258306cafe9 100644 --- a/tools/testing/selftests/android/Makefile +++ b/tools/testing/selftests/android/Makefile @@ -21,7 +21,7 @@ all: override define INSTALL_RULE mkdir -p $(INSTALL_PATH) - install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) +install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) @for SUBDIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \ diff --git a/tools/testing/selftests/android/ion/.gitignore b/tools/testing/selftests/android/ion/.gitignore index 95e8f4561474..78eae9972bb1 100644 --- a/tools/testing/selftests/android/ion/.gitignore +++ b/tools/testing/selftests/android/ion/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only ionapp_export ionapp_import ionmap_test diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile index 0eb7ab626e1c..42b71f005332 100644 --- a/tools/testing/selftests/android/ion/Makefile +++ b/tools/testing/selftests/android/ion/Makefile @@ -17,4 +17,4 @@ include ../../lib.mk $(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c $(OUTPUT)/ionapp_import: ionapp_import.c ipcsocket.c ionutils.c -$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c +$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c ipcsocket.c diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 3c5b4e8ff894..78c902045ca7 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only mangle_* fake_sigreturn_* !*.[ch] diff --git a/tools/testing/selftests/arm64/tags/.gitignore b/tools/testing/selftests/arm64/tags/.gitignore index e8fae8d61ed6..f4f6c5112463 100644 --- a/tools/testing/selftests/arm64/tags/.gitignore +++ b/tools/testing/selftests/arm64/tags/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only tags_test diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index ec464859c6b6..1bb204cee853 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only test_verifier test_maps test_lru_map @@ -29,12 +30,12 @@ test_tcpnotify_user test_libbpf test_tcp_check_syncookie_user test_sysctl -test_hashmap -test_btf_dump +test_current_pid_tgid_new_ns xdping test_cpp *.skel.h /no_alu32 /bpf_gcc /tools - +/runqslower +/bench diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 257a1aaaa37d..22aaec74ea0a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -2,6 +2,8 @@ include ../../../../scripts/Kbuild.include include ../../../scripts/Makefile.arch +CXX ?= $(CROSS_COMPILE)g++ + CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) LIBDIR := $(TOOLSDIR)/lib @@ -20,8 +22,10 @@ CLANG ?= clang LLC ?= llc LLVM_OBJCOPY ?= llvm-objcopy BPF_GCC ?= $(shell command -v bpf-gcc;) -CFLAGS += -g -Wall -O2 $(GENFLAGS) -I$(CURDIR) -I$(APIDIR) \ - -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) -I$(TOOLSINCDIR) \ +SAN_CFLAGS ?= +CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) \ -Dbpf_prog_load=bpf_prog_test_load \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread @@ -31,8 +35,9 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \ - test_progs-no_alu32 + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ + test_progs-no_alu32 \ + test_current_pid_tgid_new_ns # Also test bpf-gcc, if present ifneq ($(BPF_GCC),) @@ -62,7 +67,8 @@ TEST_PROGS := test_kmod.sh \ test_tc_tunnel.sh \ test_tc_edt.sh \ test_xdping.sh \ - test_bpftool_build.sh + test_bpftool_build.sh \ + test_bpftool.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ @@ -73,7 +79,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower + test_lirc_mode2_user xdping test_cpp runqslower bench TEST_CUSTOM_PROGS = urandom_read @@ -128,14 +134,18 @@ $(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ) $(call msg,CC,,$@) $(CC) -c $(CFLAGS) -o $@ $< -VMLINUX_BTF_PATHS := $(abspath ../../../../vmlinux) \ - /sys/kernel/btf/vmlinux \ - /boot/vmlinux-$(shell uname -r) -VMLINUX_BTF:= $(firstword $(wildcard $(VMLINUX_BTF_PATHS))) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF := $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + $(OUTPUT)/runqslower: $(BPFOBJ) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) + BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ + cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) @@ -171,6 +181,10 @@ $(BUILD_DIR)/libbpf $(BUILD_DIR)/bpftool $(INCLUDE_DIR): $(call msg,MKDIR,,$@) mkdir -p $@ +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) + $(call msg,GEN,,$@) + $(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ + # Get Clang's default includes on this system, as opposed to those seen by # '-target bpf'. This fixes "missing" files on some architectures/distros, # such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. @@ -189,8 +203,8 @@ MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ - -I$(INCLUDE_DIR) -I$(CURDIR) -I$(CURDIR)/include/uapi \ - -I$(APIDIR) -I$(abspath $(OUTPUT)/../usr/include) + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -209,7 +223,7 @@ define CLANG_BPF_BUILD_RULE $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -target bpf -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -mattr=dwarfris -march=bpf -mcpu=probe $4 -filetype=obj -o $2 + $(LLC) -mattr=dwarfris -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE @@ -223,7 +237,7 @@ define CLANG_NATIVE_BPF_BUILD_RULE $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2) ($(CLANG) $3 -O2 -emit-llvm \ -c $1 -o - || echo "BPF obj compilation failed") | \ - $(LLC) -march=bpf -mcpu=probe $4 -filetype=obj -o $2 + $(LLC) -march=bpf -mcpu=v3 $4 -filetype=obj -o $2 endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE @@ -231,7 +245,7 @@ define GCC_BPF_BUILD_RULE $(BPF_GCC) $3 $4 -O2 -c $1 -o $2 endef -SKEL_BLACKLIST := btf__% test_pinning_invalid.c +SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. @@ -253,6 +267,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST), \ $$(TRUNNER_BPF_SRCS))) +TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined $$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2)) @@ -279,6 +294,7 @@ $(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/%.c \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ + $$(INCLUDE_DIR)/vmlinux.h \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS), \ @@ -312,7 +328,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_BPF_SKELS) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - cd $$(@D) && $$(CC) $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ %.c \ @@ -341,6 +357,7 @@ endef TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ + network_helpers.c testing_helpers.c \ flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) @@ -392,6 +409,24 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) $(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ +# Benchmark runner +$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h + $(call msg,CC,,$@) + $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ +$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h +$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h +$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ + $(OUTPUT)/perfbuf_bench.skel.h +$(OUTPUT)/bench.o: bench.h testing_helpers.h +$(OUTPUT)/bench: LDLIBS += -lm +$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ + $(OUTPUT)/bench_count.o \ + $(OUTPUT)/bench_rename.o \ + $(OUTPUT)/bench_trigger.o \ + $(OUTPUT)/bench_ringbufs.o + $(call msg,BINARY,,$@) + $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) + EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst new file mode 100644 index 000000000000..e885d351595f --- /dev/null +++ b/tools/testing/selftests/bpf/README.rst @@ -0,0 +1,45 @@ +================== +BPF Selftest Notes +================== +General instructions on running selftests can be found in +`Documentation/bpf/bpf_devel_QA.rst`_. + +Additional information about selftest failures are +documented here. + +bpf_iter test failures with clang/llvm 10.0.0 +============================================= + +With clang/llvm 10.0.0, the following two bpf_iter tests failed: + * ``bpf_iter/ipv6_route`` + * ``bpf_iter/netlink`` + +The symptom for ``bpf_iter/ipv6_route`` looks like + +.. code-block:: c + + 2: (79) r8 = *(u64 *)(r1 +8) + ... + 14: (bf) r2 = r8 + 15: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + 16: (7b) *(u64 *)(r8 +64) = r2 + only read is supported + +The symptom for ``bpf_iter/netlink`` looks like + +.. code-block:: c + + ; struct netlink_sock *nlk = ctx->sk; + 2: (79) r7 = *(u64 *)(r1 +8) + ... + 15: (bf) r2 = r7 + 16: (0f) r2 += r1 + ; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + 17: (7b) *(u64 *)(r7 +0) = r2 + only read is supported + +This is due to a llvm BPF backend bug. The fix + https://reviews.llvm.org/D78466 +has been pushed to llvm 10.x release branch and will be +available in 10.0.1. The fix is available in llvm 11.0.0 trunk. diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c new file mode 100644 index 000000000000..944ad4721c83 --- /dev/null +++ b/tools/testing/selftests/bpf/bench.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define _GNU_SOURCE +#include <argp.h> +#include <linux/compiler.h> +#include <sys/time.h> +#include <sched.h> +#include <fcntl.h> +#include <pthread.h> +#include <sys/sysinfo.h> +#include <sys/resource.h> +#include <signal.h> +#include "bench.h" +#include "testing_helpers.h" + +struct env env = { + .warmup_sec = 1, + .duration_sec = 5, + .affinity = false, + .consumer_cnt = 1, + .producer_cnt = 1, +}; + +static int libbpf_print_fn(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static int bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + return setrlimit(RLIMIT_MEMLOCK, &rlim_new); +} + +void setup_libbpf() +{ + int err; + + libbpf_set_print(libbpf_print_fn); + + err = bump_memlock_rlimit(); + if (err) + fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d", err); +} + +void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns) +{ + double hits_per_sec, drops_per_sec; + double hits_per_prod; + + hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0); + hits_per_prod = hits_per_sec / env.producer_cnt; + drops_per_sec = res->drops / 1000000.0 / (delta_ns / 1000000000.0); + + printf("Iter %3d (%7.3lfus): ", + iter, (delta_ns - 1000000000) / 1000.0); + + printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n", + hits_per_sec, hits_per_prod, drops_per_sec); +} + +void hits_drops_report_final(struct bench_res res[], int res_cnt) +{ + int i; + double hits_mean = 0.0, drops_mean = 0.0; + double hits_stddev = 0.0, drops_stddev = 0.0; + + for (i = 0; i < res_cnt; i++) { + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt); + } + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) { + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + drops_stddev += (drops_mean - res[i].drops / 1000000.0) * + (drops_mean - res[i].drops / 1000000.0) / + (res_cnt - 1.0); + } + hits_stddev = sqrt(hits_stddev); + drops_stddev = sqrt(drops_stddev); + } + printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ", + hits_mean, hits_stddev, hits_mean / env.producer_cnt); + printf("drops %8.3lf \u00B1 %5.3lfM/s\n", + drops_mean, drops_stddev); +} + +const char *argp_program_version = "benchmark"; +const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; +const char argp_program_doc[] = +"benchmark Generic benchmarking framework.\n" +"\n" +"This tool runs benchmarks.\n" +"\n" +"USAGE: benchmark <bench-name>\n" +"\n" +"EXAMPLES:\n" +" # run 'count-local' benchmark with 1 producer and 1 consumer\n" +" benchmark count-local\n" +" # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n" +" benchmark -p16 -c8 -a count-local\n"; + +enum { + ARG_PROD_AFFINITY_SET = 1000, + ARG_CONS_AFFINITY_SET = 1001, +}; + +static const struct argp_option opts[] = { + { "list", 'l', NULL, 0, "List available benchmarks"}, + { "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"}, + { "warmup", 'w', "SEC", 0, "Warm-up period, seconds"}, + { "producers", 'p', "NUM", 0, "Number of producer threads"}, + { "consumers", 'c', "NUM", 0, "Number of consumer threads"}, + { "verbose", 'v', NULL, 0, "Verbose debug output"}, + { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, + { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for producer threads; implies --affinity"}, + { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, + "Set of CPUs for consumer threads; implies --affinity"}, + {}, +}; + +extern struct argp bench_ringbufs_argp; + +static const struct argp_child bench_parsers[] = { + { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + + switch (key) { + case 'v': + env.verbose = true; + break; + case 'l': + env.list = true; + break; + case 'd': + env.duration_sec = strtol(arg, NULL, 10); + if (env.duration_sec <= 0) { + fprintf(stderr, "Invalid duration: %s\n", arg); + argp_usage(state); + } + break; + case 'w': + env.warmup_sec = strtol(arg, NULL, 10); + if (env.warmup_sec <= 0) { + fprintf(stderr, "Invalid warm-up duration: %s\n", arg); + argp_usage(state); + } + break; + case 'p': + env.producer_cnt = strtol(arg, NULL, 10); + if (env.producer_cnt <= 0) { + fprintf(stderr, "Invalid producer count: %s\n", arg); + argp_usage(state); + } + break; + case 'c': + env.consumer_cnt = strtol(arg, NULL, 10); + if (env.consumer_cnt <= 0) { + fprintf(stderr, "Invalid consumer count: %s\n", arg); + argp_usage(state); + } + break; + case 'a': + env.affinity = true; + break; + case ARG_PROD_AFFINITY_SET: + env.affinity = true; + if (parse_num_list(arg, &env.prod_cpus.cpus, + &env.prod_cpus.cpus_len)) { + fprintf(stderr, "Invalid format of CPU set for producers."); + argp_usage(state); + } + break; + case ARG_CONS_AFFINITY_SET: + env.affinity = true; + if (parse_num_list(arg, &env.cons_cpus.cpus, + &env.cons_cpus.cpus_len)) { + fprintf(stderr, "Invalid format of CPU set for consumers."); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + "Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + env.bench_name = strdup(arg); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static void parse_cmdline_args(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + .children = bench_parsers, + }; + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) + exit(1); + if (!env.list && !env.bench_name) { + argp_help(&argp, stderr, ARGP_HELP_DOC, "bench"); + exit(1); + } +} + +static void collect_measurements(long delta_ns); + +static __u64 last_time_ns; +static void sigalarm_handler(int signo) +{ + long new_time_ns = get_time_ns(); + long delta_ns = new_time_ns - last_time_ns; + + collect_measurements(delta_ns); + + last_time_ns = new_time_ns; +} + +/* set up periodic 1-second timer */ +static void setup_timer() +{ + static struct sigaction sigalarm_action = { + .sa_handler = sigalarm_handler, + }; + struct itimerval timer_settings = {}; + int err; + + last_time_ns = get_time_ns(); + err = sigaction(SIGALRM, &sigalarm_action, NULL); + if (err < 0) { + fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno); + exit(1); + } + timer_settings.it_interval.tv_sec = 1; + timer_settings.it_value.tv_sec = 1; + err = setitimer(ITIMER_REAL, &timer_settings, NULL); + if (err < 0) { + fprintf(stderr, "failed to arm interval timer: %d\n", -errno); + exit(1); + } +} + +static void set_thread_affinity(pthread_t thread, int cpu) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if (pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset)) { + fprintf(stderr, "setting affinity to CPU #%d failed: %d\n", + cpu, errno); + exit(1); + } +} + +static int next_cpu(struct cpu_set *cpu_set) +{ + if (cpu_set->cpus) { + int i; + + /* find next available CPU */ + for (i = cpu_set->next_cpu; i < cpu_set->cpus_len; i++) { + if (cpu_set->cpus[i]) { + cpu_set->next_cpu = i + 1; + return i; + } + } + fprintf(stderr, "Not enough CPUs specified, need CPU #%d or higher.\n", i); + exit(1); + } + + return cpu_set->next_cpu++; +} + +static struct bench_state { + int res_cnt; + struct bench_res *results; + pthread_t *consumers; + pthread_t *producers; +} state; + +const struct bench *bench = NULL; + +extern const struct bench bench_count_global; +extern const struct bench bench_count_local; +extern const struct bench bench_rename_base; +extern const struct bench bench_rename_kprobe; +extern const struct bench bench_rename_kretprobe; +extern const struct bench bench_rename_rawtp; +extern const struct bench bench_rename_fentry; +extern const struct bench bench_rename_fexit; +extern const struct bench bench_rename_fmodret; +extern const struct bench bench_trig_base; +extern const struct bench bench_trig_tp; +extern const struct bench bench_trig_rawtp; +extern const struct bench bench_trig_kprobe; +extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_fmodret; +extern const struct bench bench_rb_libbpf; +extern const struct bench bench_rb_custom; +extern const struct bench bench_pb_libbpf; +extern const struct bench bench_pb_custom; + +static const struct bench *benchs[] = { + &bench_count_global, + &bench_count_local, + &bench_rename_base, + &bench_rename_kprobe, + &bench_rename_kretprobe, + &bench_rename_rawtp, + &bench_rename_fentry, + &bench_rename_fexit, + &bench_rename_fmodret, + &bench_trig_base, + &bench_trig_tp, + &bench_trig_rawtp, + &bench_trig_kprobe, + &bench_trig_fentry, + &bench_trig_fmodret, + &bench_rb_libbpf, + &bench_rb_custom, + &bench_pb_libbpf, + &bench_pb_custom, +}; + +static void setup_benchmark() +{ + int i, err; + + if (!env.bench_name) { + fprintf(stderr, "benchmark name is not specified\n"); + exit(1); + } + + for (i = 0; i < ARRAY_SIZE(benchs); i++) { + if (strcmp(benchs[i]->name, env.bench_name) == 0) { + bench = benchs[i]; + break; + } + } + if (!bench) { + fprintf(stderr, "benchmark '%s' not found\n", env.bench_name); + exit(1); + } + + printf("Setting up benchmark '%s'...\n", bench->name); + + state.producers = calloc(env.producer_cnt, sizeof(*state.producers)); + state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers)); + state.results = calloc(env.duration_sec + env.warmup_sec + 2, + sizeof(*state.results)); + if (!state.producers || !state.consumers || !state.results) + exit(1); + + if (bench->validate) + bench->validate(); + if (bench->setup) + bench->setup(); + + for (i = 0; i < env.consumer_cnt; i++) { + err = pthread_create(&state.consumers[i], NULL, + bench->consumer_thread, (void *)(long)i); + if (err) { + fprintf(stderr, "failed to create consumer thread #%d: %d\n", + i, -errno); + exit(1); + } + if (env.affinity) + set_thread_affinity(state.consumers[i], + next_cpu(&env.cons_cpus)); + } + + /* unless explicit producer CPU list is specified, continue after + * last consumer CPU + */ + if (!env.prod_cpus.cpus) + env.prod_cpus.next_cpu = env.cons_cpus.next_cpu; + + for (i = 0; i < env.producer_cnt; i++) { + err = pthread_create(&state.producers[i], NULL, + bench->producer_thread, (void *)(long)i); + if (err) { + fprintf(stderr, "failed to create producer thread #%d: %d\n", + i, -errno); + exit(1); + } + if (env.affinity) + set_thread_affinity(state.producers[i], + next_cpu(&env.prod_cpus)); + } + + printf("Benchmark '%s' started.\n", bench->name); +} + +static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER; + +static void collect_measurements(long delta_ns) { + int iter = state.res_cnt++; + struct bench_res *res = &state.results[iter]; + + bench->measure(res); + + if (bench->report_progress) + bench->report_progress(iter, res, delta_ns); + + if (iter == env.duration_sec + env.warmup_sec) { + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_signal(&bench_done); + pthread_mutex_unlock(&bench_done_mtx); + } +} + +int main(int argc, char **argv) +{ + parse_cmdline_args(argc, argv); + + if (env.list) { + int i; + + printf("Available benchmarks:\n"); + for (i = 0; i < ARRAY_SIZE(benchs); i++) { + printf("- %s\n", benchs[i]->name); + } + return 0; + } + + setup_benchmark(); + + setup_timer(); + + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_wait(&bench_done, &bench_done_mtx); + pthread_mutex_unlock(&bench_done_mtx); + + if (bench->report_final) + /* skip first sample */ + bench->report_final(state.results + env.warmup_sec, + state.res_cnt - env.warmup_sec); + + return 0; +} + diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h new file mode 100644 index 000000000000..c1f48a473b02 --- /dev/null +++ b/tools/testing/selftests/bpf/bench.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#pragma once +#include <stdlib.h> +#include <stdbool.h> +#include <linux/err.h> +#include <errno.h> +#include <unistd.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <math.h> +#include <time.h> +#include <sys/syscall.h> + +struct cpu_set { + bool *cpus; + int cpus_len; + int next_cpu; +}; + +struct env { + char *bench_name; + int duration_sec; + int warmup_sec; + bool verbose; + bool list; + bool affinity; + int consumer_cnt; + int producer_cnt; + struct cpu_set prod_cpus; + struct cpu_set cons_cpus; +}; + +struct bench_res { + long hits; + long drops; +}; + +struct bench { + const char *name; + void (*validate)(); + void (*setup)(); + void *(*producer_thread)(void *ctx); + void *(*consumer_thread)(void *ctx); + void (*measure)(struct bench_res* res); + void (*report_progress)(int iter, struct bench_res* res, long delta_ns); + void (*report_final)(struct bench_res res[], int res_cnt); +}; + +struct counter { + long value; +} __attribute__((aligned(128))); + +extern struct env env; +extern const struct bench *bench; + +void setup_libbpf(); +void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); +void hits_drops_report_final(struct bench_res res[], int res_cnt); + +static inline __u64 get_time_ns() { + struct timespec t; + + clock_gettime(CLOCK_MONOTONIC, &t); + + return (u64)t.tv_sec * 1000000000 + t.tv_nsec; +} + +static inline void atomic_inc(long *value) +{ + (void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED); +} + +static inline void atomic_add(long *value, long n) +{ + (void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED); +} + +static inline long atomic_swap(long *value, long n) +{ + return __atomic_exchange_n(value, n, __ATOMIC_RELAXED); +} diff --git a/tools/testing/selftests/bpf/benchs/bench_count.c b/tools/testing/selftests/bpf/benchs/bench_count.c new file mode 100644 index 000000000000..befba7a82643 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_count.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bench.h" + +/* COUNT-GLOBAL benchmark */ + +static struct count_global_ctx { + struct counter hits; +} count_global_ctx; + +static void *count_global_producer(void *input) +{ + struct count_global_ctx *ctx = &count_global_ctx; + + while (true) { + atomic_inc(&ctx->hits.value); + } + return NULL; +} + +static void *count_global_consumer(void *input) +{ + return NULL; +} + +static void count_global_measure(struct bench_res *res) +{ + struct count_global_ctx *ctx = &count_global_ctx; + + res->hits = atomic_swap(&ctx->hits.value, 0); +} + +/* COUNT-local benchmark */ + +static struct count_local_ctx { + struct counter *hits; +} count_local_ctx; + +static void count_local_setup() +{ + struct count_local_ctx *ctx = &count_local_ctx; + + ctx->hits = calloc(env.consumer_cnt, sizeof(*ctx->hits)); + if (!ctx->hits) + exit(1); +} + +static void *count_local_producer(void *input) +{ + struct count_local_ctx *ctx = &count_local_ctx; + int idx = (long)input; + + while (true) { + atomic_inc(&ctx->hits[idx].value); + } + return NULL; +} + +static void *count_local_consumer(void *input) +{ + return NULL; +} + +static void count_local_measure(struct bench_res *res) +{ + struct count_local_ctx *ctx = &count_local_ctx; + int i; + + for (i = 0; i < env.producer_cnt; i++) { + res->hits += atomic_swap(&ctx->hits[i].value, 0); + } +} + +const struct bench bench_count_global = { + .name = "count-global", + .producer_thread = count_global_producer, + .consumer_thread = count_global_consumer, + .measure = count_global_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_count_local = { + .name = "count-local", + .setup = count_local_setup, + .producer_thread = count_local_producer, + .consumer_thread = count_local_consumer, + .measure = count_local_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c new file mode 100644 index 000000000000..e74cff40f4fe --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_rename.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <fcntl.h> +#include "bench.h" +#include "test_overhead.skel.h" + +/* BPF triggering benchmarks */ +static struct ctx { + struct test_overhead *skel; + struct counter hits; + int fd; +} ctx; + +static void validate() +{ + if (env.producer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-producer!\n"); + exit(1); + } + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *producer(void *input) +{ + char buf[] = "test_overhead"; + int err; + + while (true) { + err = write(ctx.fd, buf, sizeof(buf)); + if (err < 0) { + fprintf(stderr, "write failed\n"); + exit(1); + } + atomic_inc(&ctx.hits.value); + } +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.hits.value, 0); +} + +static void setup_ctx() +{ + setup_libbpf(); + + ctx.skel = test_overhead__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); + if (ctx.fd < 0) { + fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno); + exit(1); + } +} + +static void attach_bpf(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void setup_base() +{ + setup_ctx(); +} + +static void setup_kprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog1); +} + +static void setup_kretprobe() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog2); +} + +static void setup_rawtp() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog3); +} + +static void setup_fentry() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog4); +} + +static void setup_fexit() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog5); +} + +static void setup_fmodret() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.prog6); +} + +static void *consumer(void *input) +{ + return NULL; +} + +const struct bench bench_rename_base = { + .name = "rename-base", + .validate = validate, + .setup = setup_base, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kprobe = { + .name = "rename-kprobe", + .validate = validate, + .setup = setup_kprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_kretprobe = { + .name = "rename-kretprobe", + .validate = validate, + .setup = setup_kretprobe, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_rawtp = { + .name = "rename-rawtp", + .validate = validate, + .setup = setup_rawtp, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fentry = { + .name = "rename-fentry", + .validate = validate, + .setup = setup_fentry, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fexit = { + .name = "rename-fexit", + .validate = validate, + .setup = setup_fexit, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rename_fmodret = { + .name = "rename-fmodret", + .validate = validate, + .setup = setup_fmodret, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c new file mode 100644 index 000000000000..da87c7f31891 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <asm/barrier.h> +#include <linux/perf_event.h> +#include <linux/ring_buffer.h> +#include <sys/epoll.h> +#include <sys/mman.h> +#include <argp.h> +#include <stdlib.h> +#include "bench.h" +#include "ringbuf_bench.skel.h" +#include "perfbuf_bench.skel.h" + +static struct { + bool back2back; + int batch_cnt; + bool sampled; + int sample_rate; + int ringbuf_sz; /* per-ringbuf, in bytes */ + bool ringbuf_use_output; /* use slower output API */ + int perfbuf_sz; /* per-CPU size, in pages */ +} args = { + .back2back = false, + .batch_cnt = 500, + .sampled = false, + .sample_rate = 500, + .ringbuf_sz = 512 * 1024, + .ringbuf_use_output = false, + .perfbuf_sz = 128, +}; + +enum { + ARG_RB_BACK2BACK = 2000, + ARG_RB_USE_OUTPUT = 2001, + ARG_RB_BATCH_CNT = 2002, + ARG_RB_SAMPLED = 2003, + ARG_RB_SAMPLE_RATE = 2004, +}; + +static const struct argp_option opts[] = { + { "rb-b2b", ARG_RB_BACK2BACK, NULL, 0, "Back-to-back mode"}, + { "rb-use-output", ARG_RB_USE_OUTPUT, NULL, 0, "Use bpf_ringbuf_output() instead of bpf_ringbuf_reserve()"}, + { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"}, + { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"}, + { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_RB_BACK2BACK: + args.back2back = true; + break; + case ARG_RB_USE_OUTPUT: + args.ringbuf_use_output = true; + break; + case ARG_RB_BATCH_CNT: + args.batch_cnt = strtol(arg, NULL, 10); + if (args.batch_cnt < 0) { + fprintf(stderr, "Invalid batch count."); + argp_usage(state); + } + break; + case ARG_RB_SAMPLED: + args.sampled = true; + break; + case ARG_RB_SAMPLE_RATE: + args.sample_rate = strtol(arg, NULL, 10); + if (args.sample_rate < 0) { + fprintf(stderr, "Invalid perfbuf sample rate."); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +/* exported into benchmark runner */ +const struct argp bench_ringbufs_argp = { + .options = opts, + .parser = parse_arg, +}; + +/* RINGBUF-LIBBPF benchmark */ + +static struct counter buf_hits; + +static inline void bufs_trigger_batch() +{ + (void)syscall(__NR_getpgid); +} + +static void bufs_validate() +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n"); + exit(1); + } + + if (args.back2back && env.producer_cnt > 1) { + fprintf(stderr, "back-to-back mode makes sense only for single-producer case!\n"); + exit(1); + } +} + +static void *bufs_sample_producer(void *input) +{ + if (args.back2back) { + /* initial batch to get everything started */ + bufs_trigger_batch(); + return NULL; + } + + while (true) + bufs_trigger_batch(); + return NULL; +} + +static struct ringbuf_libbpf_ctx { + struct ringbuf_bench *skel; + struct ring_buffer *ringbuf; +} ringbuf_libbpf_ctx; + +static void ringbuf_libbpf_measure(struct bench_res *res) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct ringbuf_bench *ringbuf_setup_skeleton() +{ + struct ringbuf_bench *skel; + + setup_libbpf(); + + skel = ringbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0; + + if (args.sampled) + /* record data + header take 16 bytes */ + skel->rodata->wakeup_data_size = args.sample_rate * 16; + + bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz); + + if (ringbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static int buf_process_sample(void *ctx, void *data, size_t len) +{ + atomic_inc(&buf_hits.value); + return 0; +} + +static void ringbuf_libbpf_setup() +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + struct bpf_link *link; + + ctx->skel = ringbuf_setup_skeleton(); + ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf), + buf_process_sample, NULL, NULL); + if (!ctx->ringbuf) { + fprintf(stderr, "failed to create ringbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void *ringbuf_libbpf_consumer(void *input) +{ + struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx; + + while (ring_buffer__poll(ctx->ringbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "ringbuf polling failed!\n"); + return NULL; +} + +/* RINGBUF-CUSTOM benchmark */ +struct ringbuf_custom { + __u64 *consumer_pos; + __u64 *producer_pos; + __u64 mask; + void *data; + int map_fd; +}; + +static struct ringbuf_custom_ctx { + struct ringbuf_bench *skel; + struct ringbuf_custom ringbuf; + int epoll_fd; + struct epoll_event event; +} ringbuf_custom_ctx; + +static void ringbuf_custom_measure(struct bench_res *res) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static void ringbuf_custom_setup() +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + const size_t page_size = getpagesize(); + struct bpf_link *link; + struct ringbuf_custom *r; + void *tmp; + int err; + + ctx->skel = ringbuf_setup_skeleton(); + + ctx->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (ctx->epoll_fd < 0) { + fprintf(stderr, "failed to create epoll fd: %d\n", -errno); + exit(1); + } + + r = &ctx->ringbuf; + r->map_fd = bpf_map__fd(ctx->skel->maps.ringbuf); + r->mask = args.ringbuf_sz - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + r->map_fd, 0); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap consumer page: %d\n", -errno); + exit(1); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. */ + tmp = mmap(NULL, page_size + 2 * args.ringbuf_sz, PROT_READ, MAP_SHARED, + r->map_fd, page_size); + if (tmp == MAP_FAILED) { + fprintf(stderr, "failed to mmap data pages: %d\n", -errno); + exit(1); + } + r->producer_pos = tmp; + r->data = tmp + page_size; + + ctx->event.events = EPOLLIN; + err = epoll_ctl(ctx->epoll_fd, EPOLL_CTL_ADD, r->map_fd, &ctx->event); + if (err < 0) { + fprintf(stderr, "failed to epoll add ringbuf: %d\n", -errno); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_ringbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +#define RINGBUF_BUSY_BIT (1 << 31) +#define RINGBUF_DISCARD_BIT (1 << 30) +#define RINGBUF_META_LEN 8 + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += RINGBUF_META_LEN; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static void ringbuf_custom_process_ring(struct ringbuf_custom *r) +{ + unsigned long cons_pos, prod_pos; + int *len_ptr, len; + bool got_new_data; + + cons_pos = smp_load_acquire(r->consumer_pos); + while (true) { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & RINGBUF_BUSY_BIT) + return; + + got_new_data = true; + cons_pos += roundup_len(len); + + atomic_inc(&buf_hits.value); + } + if (got_new_data) + smp_store_release(r->consumer_pos, cons_pos); + else + break; + }; +} + +static void *ringbuf_custom_consumer(void *input) +{ + struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx; + int cnt; + + do { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(ctx->epoll_fd, &ctx->event, 1, -1); + if (cnt > 0) + ringbuf_custom_process_ring(&ctx->ringbuf); + } while (cnt >= 0); + fprintf(stderr, "ringbuf polling failed!\n"); + return 0; +} + +/* PERFBUF-LIBBPF benchmark */ +static struct perfbuf_libbpf_ctx { + struct perfbuf_bench *skel; + struct perf_buffer *perfbuf; +} perfbuf_libbpf_ctx; + +static void perfbuf_measure(struct bench_res *res) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + res->hits = atomic_swap(&buf_hits.value, 0); + res->drops = atomic_swap(&ctx->skel->bss->dropped, 0); +} + +static struct perfbuf_bench *perfbuf_setup_skeleton() +{ + struct perfbuf_bench *skel; + + setup_libbpf(); + + skel = perfbuf_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + skel->rodata->batch_cnt = args.batch_cnt; + + if (perfbuf_bench__load(skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + + return skel; +} + +static enum bpf_perf_event_ret +perfbuf_process_sample_raw(void *input_ctx, int cpu, + struct perf_event_header *e) +{ + switch (e->type) { + case PERF_RECORD_SAMPLE: + atomic_inc(&buf_hits.value); + break; + case PERF_RECORD_LOST: + break; + default: + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static void perfbuf_libbpf_setup() +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_event_attr attr; + struct perf_buffer_raw_opts pb_opts = { + .event_cb = perfbuf_process_sample_raw, + .ctx = (void *)(long)0, + .attr = &attr, + }; + struct bpf_link *link; + + ctx->skel = perfbuf_setup_skeleton(); + + memset(&attr, 0, sizeof(attr)); + attr.config = PERF_COUNT_SW_BPF_OUTPUT, + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + /* notify only every Nth sample */ + if (args.sampled) { + attr.sample_period = args.sample_rate; + attr.wakeup_events = args.sample_rate; + } else { + attr.sample_period = 1; + attr.wakeup_events = 1; + } + + if (args.sample_rate > args.batch_cnt) { + fprintf(stderr, "sample rate %d is too high for given batch count %d\n", + args.sample_rate, args.batch_cnt); + exit(1); + } + + ctx->perfbuf = perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf), + args.perfbuf_sz, &pb_opts); + if (!ctx->perfbuf) { + fprintf(stderr, "failed to create perfbuf\n"); + exit(1); + } + + link = bpf_program__attach(ctx->skel->progs.bench_perfbuf); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program\n"); + exit(1); + } +} + +static void *perfbuf_libbpf_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + + while (perf_buffer__poll(ctx->perfbuf, -1) >= 0) { + if (args.back2back) + bufs_trigger_batch(); + } + fprintf(stderr, "perfbuf polling failed!\n"); + return NULL; +} + +/* PERFBUF-CUSTOM benchmark */ + +/* copies of internal libbpf definitions */ +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void *perfbuf_custom_consumer(void *input) +{ + struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx; + struct perf_buffer *pb = ctx->perfbuf; + struct perf_cpu_buf *cpu_buf; + struct perf_event_mmap_page *header; + size_t mmap_mask = pb->mmap_size - 1; + struct perf_event_header *ehdr; + __u64 data_head, data_tail; + size_t ehdr_size; + void *base; + int i, cnt; + + while (true) { + if (args.back2back) + bufs_trigger_batch(); + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, -1); + if (cnt <= 0) { + fprintf(stderr, "perf epoll failed: %d\n", -errno); + exit(1); + } + + for (i = 0; i < cnt; ++i) { + cpu_buf = pb->events[i].data.ptr; + header = cpu_buf->base; + base = ((void *)header) + pb->page_size; + + data_head = ring_buffer_read_head(header); + data_tail = header->data_tail; + while (data_head != data_tail) { + ehdr = base + (data_tail & mmap_mask); + ehdr_size = ehdr->size; + + if (ehdr->type == PERF_RECORD_SAMPLE) + atomic_inc(&buf_hits.value); + + data_tail += ehdr_size; + } + ring_buffer_write_tail(header, data_tail); + } + } + return NULL; +} + +const struct bench bench_rb_libbpf = { + .name = "rb-libbpf", + .validate = bufs_validate, + .setup = ringbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_libbpf_consumer, + .measure = ringbuf_libbpf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_rb_custom = { + .name = "rb-custom", + .validate = bufs_validate, + .setup = ringbuf_custom_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = ringbuf_custom_consumer, + .measure = ringbuf_custom_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_libbpf = { + .name = "pb-libbpf", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_libbpf_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_pb_custom = { + .name = "pb-custom", + .validate = bufs_validate, + .setup = perfbuf_libbpf_setup, + .producer_thread = bufs_sample_producer, + .consumer_thread = perfbuf_custom_consumer, + .measure = perfbuf_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c new file mode 100644 index 000000000000..49c22832f216 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bench.h" +#include "trigger_bench.skel.h" + +/* BPF triggering benchmarks */ +static struct trigger_ctx { + struct trigger_bench *skel; +} ctx; + +static struct counter base_hits; + +static void trigger_validate() +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *trigger_base_producer(void *input) +{ + while (true) { + (void)syscall(__NR_getpgid); + atomic_inc(&base_hits.value); + } + return NULL; +} + +static void trigger_base_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&base_hits.value, 0); +} + +static void *trigger_producer(void *input) +{ + while (true) + (void)syscall(__NR_getpgid); + return NULL; +} + +static void trigger_measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void setup_ctx() +{ + setup_libbpf(); + + ctx.skel = trigger_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } +} + +static void attach_bpf(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (IS_ERR(link)) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } +} + +static void trigger_tp_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_tp); +} + +static void trigger_rawtp_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_raw_tp); +} + +static void trigger_kprobe_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_kprobe); +} + +static void trigger_fentry_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fentry); +} + +static void trigger_fmodret_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fmodret); +} + +static void *trigger_consumer(void *input) +{ + return NULL; +} + +const struct bench bench_trig_base = { + .name = "trig-base", + .validate = trigger_validate, + .producer_thread = trigger_base_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_base_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_tp = { + .name = "trig-tp", + .validate = trigger_validate, + .setup = trigger_tp_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_rawtp = { + .name = "trig-rawtp", + .validate = trigger_validate, + .setup = trigger_rawtp_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_kprobe = { + .name = "trig-kprobe", + .validate = trigger_validate, + .setup = trigger_kprobe_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_fentry = { + .name = "trig-fentry", + .validate = trigger_validate, + .setup = trigger_fentry_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_fmodret = { + .name = "trig-fmodret", + .validate = trigger_validate, + .setup = trigger_fmodret_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_rename.sh b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh new file mode 100755 index 000000000000..16f774b1cdbe --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -eufo pipefail + +for i in base kprobe kretprobe rawtp fentry fexit fmodret +do + summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-10s: %s\n" $i "$summary" +done diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh new file mode 100755 index 000000000000..af4aa04caba6 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +set -eufo pipefail + +RUN_BENCH="sudo ./bench -w3 -d10 -a" + +function hits() +{ + echo "$*" | sed -E "s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function drops() +{ + echo "$*" | sed -E "s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" +} + +function header() +{ + local len=${#1} + + printf "\n%s\n" "$1" + for i in $(seq 1 $len); do printf '='; done + printf '\n' +} + +function summarize() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)" +} + +header "Single-producer, parallel producer" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH $b)" +done + +header "Single-producer, parallel producer, sampled notification" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-sampled $b)" +done + +header "Single-producer, back-to-back mode" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-b2b $b)" + summarize $b-sampled "$($RUN_BENCH --rb-sampled --rb-b2b $b)" +done + +header "Ringbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b rb-custom)" +done +header "Perfbuf back-to-back, effect of sample rate" +for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do + summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b pb-custom)" +done + +header "Ringbuf back-to-back, reserve+commit vs output" +summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)" +summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)" + +header "Ringbuf sampled, reserve+commit vs output" +summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)" +summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)" + +header "Single-producer, consumer/producer competing on the same CPU, low batch count" +for b in rb-libbpf rb-custom pb-libbpf pb-custom; do + summarize $b "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 $b)" +done + +header "Ringbuf, multi-producer contention" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" +done + diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh new file mode 100755 index 000000000000..78e83f243294 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -eufo pipefail + +for i in base tp rawtp kprobe fentry fmodret +do + summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) + printf "%-10s: %s\n" $i "$summary" +done diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 8f21965ffc6c..5bf2fe9b1efa 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -6,7 +6,7 @@ #include <linux/types.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_core_read.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ diff --git a/tools/testing/selftests/bpf/bpf_trace_helpers.h b/tools/testing/selftests/bpf/bpf_trace_helpers.h deleted file mode 100644 index c6f1354d93fb..000000000000 --- a/tools/testing/selftests/bpf/bpf_trace_helpers.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#ifndef __BPF_TRACE_HELPERS_H -#define __BPF_TRACE_HELPERS_H - -#include <bpf/bpf_helpers.h> - -#define ___bpf_concat(a, b) a ## b -#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) -#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N -#define ___bpf_narg(...) \ - ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) -#define ___bpf_empty(...) \ - ___bpf_nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) - -#define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] -#define ___bpf_ctx_cast(args...) \ - ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) - -/* - * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and - * similar kinds of BPF programs, that accept input arguments as a single - * pointer to untyped u64 array, where each u64 can actually be a typed - * pointer or integer of different size. Instead of requring user to write - * manual casts and work with array elements by index, BPF_PROG macro - * allows user to declare a list of named and typed input arguments in the - * same syntax as for normal C function. All the casting is hidden and - * performed transparently, while user code can just assume working with - * function arguments of specified type and name. - * - * Original raw context argument is preserved as well as 'ctx' argument. - * This is useful when using BPF helpers that expect original context - * as one of the parameters (e.g., for bpf_perf_event_output()). - */ -#define BPF_PROG(name, args...) \ -name(unsigned long long *ctx); \ -static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx, ##args); \ -typeof(name(0)) name(unsigned long long *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_ctx_cast(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) \ -____##name(unsigned long long *ctx, ##args) - -struct pt_regs; - -#define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) \ - ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) \ - ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) \ - ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) \ - ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) \ - ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args(args...) \ - ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) - -/* - * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for - * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific - * low-level way of getting kprobe input arguments from struct pt_regs, and - * provides a familiar typed and named function arguments syntax and - * semantics of accessing kprobe input paremeters. - * - * Original struct pt_regs* context is preserved as 'ctx' argument. This might - * be necessary when using BPF helpers like bpf_perf_event_output(). - */ -#define BPF_KPROBE(name, args...) \ -name(struct pt_regs *ctx); \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ -typeof(name(0)) name(struct pt_regs *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_kprobe_args(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) - -#define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_argsN(x, args...) \ - ___bpf_kprobe_args(args), (void *)PT_REGS_RET(ctx) -#define ___bpf_kretprobe_args(args...) \ - ___bpf_apply(___bpf_kretprobe_args, ___bpf_empty(args))(args) - -/* - * BPF_KRETPROBE is similar to BPF_KPROBE, except, in addition to listing all - * input kprobe arguments, one last extra argument has to be specified, which - * captures kprobe return value. - */ -#define BPF_KRETPROBE(name, args...) \ -name(struct pt_regs *ctx); \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args);\ -typeof(name(0)) name(struct pt_regs *ctx) \ -{ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_kretprobe_args(args)); \ - _Pragma("GCC diagnostic pop") \ -} \ -static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) -#endif diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 5dc109f4c097..2118e23ac07a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y CONFIG_FTRACE_SYSCALLS=y CONFIG_IPV6_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_SEG6_BPF=y CONFIG_NET_FOU=m CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_IPV6_FOU=m @@ -35,3 +36,6 @@ CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y +CONFIG_BPF_LSM=y +CONFIG_SECURITY=y +CONFIG_LIRC=y diff --git a/tools/testing/selftests/bpf/include/uapi/linux/types.h b/tools/testing/selftests/bpf/include/uapi/linux/types.h deleted file mode 100644 index 91fa51a9c31d..000000000000 --- a/tools/testing/selftests/bpf/include/uapi/linux/types.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _UAPI_LINUX_TYPES_H -#define _UAPI_LINUX_TYPES_H - -#include <asm-generic/int-ll64.h> - -/* copied from linux:include/uapi/linux/types.h */ -#define __bitwise -typedef __u16 __bitwise __le16; -typedef __u16 __bitwise __be16; -typedef __u32 __bitwise __le32; -typedef __u32 __bitwise __be32; -typedef __u64 __bitwise __le64; -typedef __u64 __bitwise __be64; - -typedef __u16 __bitwise __sum16; -typedef __u32 __bitwise __wsum; - -#define __aligned_u64 __u64 __attribute__((aligned(8))) -#define __aligned_be64 __be64 __attribute__((aligned(8))) -#define __aligned_le64 __le64 __attribute__((aligned(8))) - -#endif /* _UAPI_LINUX_TYPES_H */ diff --git a/tools/testing/selftests/bpf/map_tests/.gitignore b/tools/testing/selftests/bpf/map_tests/.gitignore index 45984a364647..89c4a3d37544 100644 --- a/tools/testing/selftests/bpf/map_tests/.gitignore +++ b/tools/testing/selftests/bpf/map_tests/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only tests.h diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c new file mode 100644 index 000000000000..e36dd1a1780d --- /dev/null +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> + +#include <sys/epoll.h> + +#include <linux/err.h> +#include <linux/in.h> +#include <linux/in6.h> + +#include "bpf_util.h" +#include "network_helpers.h" + +#define clean_errno() (errno == 0 ? "None" : strerror(errno)) +#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ + __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) + +struct ipv4_packet pkt_v4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +struct ipv6_packet pkt_v6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +int start_server_with_port(int family, int type, __u16 port) +{ + struct sockaddr_storage addr = {}; + socklen_t len; + int fd; + + if (family == AF_INET) { + struct sockaddr_in *sin = (void *)&addr; + + sin->sin_family = AF_INET; + sin->sin_port = htons(port); + len = sizeof(*sin); + } else { + struct sockaddr_in6 *sin6 = (void *)&addr; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + len = sizeof(*sin6); + } + + fd = socket(family, type | SOCK_NONBLOCK, 0); + if (fd < 0) { + log_err("Failed to create server socket"); + return -1; + } + + if (bind(fd, (const struct sockaddr *)&addr, len) < 0) { + log_err("Failed to bind socket"); + close(fd); + return -1; + } + + if (type == SOCK_STREAM) { + if (listen(fd, 1) < 0) { + log_err("Failed to listed on socket"); + close(fd); + return -1; + } + } + + return fd; +} + +int start_server(int family, int type) +{ + return start_server_with_port(family, type, 0); +} + +static const struct timeval timeo_sec = { .tv_sec = 3 }; +static const size_t timeo_optlen = sizeof(timeo_sec); + +int connect_to_fd(int family, int type, int server_fd) +{ + int fd, save_errno; + + fd = socket(family, type, 0); + if (fd < 0) { + log_err("Failed to create client socket"); + return -1; + } + + if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) { + save_errno = errno; + close(fd); + errno = save_errno; + return -1; + } + + return fd; +} + +int connect_fd_to_fd(int client_fd, int server_fd) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + int save_errno; + + if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, + timeo_optlen)) { + log_err("Failed to set SO_RCVTIMEO"); + return -1; + } + + if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) { + if (errno != EINPROGRESS) { + save_errno = errno; + log_err("Failed to connect to server"); + errno = save_errno; + } + return -1; + } + + return 0; +} + +int connect_wait(int fd) +{ + struct epoll_event ev = {}, events[2]; + int timeout_ms = 1000; + int efd, nfd; + + efd = epoll_create1(EPOLL_CLOEXEC); + if (efd < 0) { + log_err("Failed to open epoll fd"); + return -1; + } + + ev.events = EPOLLRDHUP | EPOLLOUT; + ev.data.fd = fd; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) { + log_err("Failed to register fd=%d on epoll fd=%d", fd, efd); + close(efd); + return -1; + } + + nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms); + if (nfd < 0) + log_err("Failed to wait for I/O event on epoll fd=%d", efd); + + close(efd); + return nfd; +} diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h new file mode 100644 index 000000000000..6a8009605670 --- /dev/null +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETWORK_HELPERS_H +#define __NETWORK_HELPERS_H +#include <sys/socket.h> +#include <sys/types.h> +#include <linux/types.h> +typedef __u16 __sum16; +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <netinet/tcp.h> +#include <bpf/bpf_endian.h> + +#define MAGIC_VAL 0x1234 +#define NUM_ITER 100000 +#define VIP_NUM 5 +#define MAGIC_BYTES 123 + +/* ipv4 test vector */ +struct ipv4_packet { + struct ethhdr eth; + struct iphdr iph; + struct tcphdr tcp; +} __packed; +extern struct ipv4_packet pkt_v4; + +/* ipv6 test vector */ +struct ipv6_packet { + struct ethhdr eth; + struct ipv6hdr iph; + struct tcphdr tcp; +} __packed; +extern struct ipv6_packet pkt_v6; + +int start_server(int family, int type); +int start_server_with_port(int family, int type, __u16 port); +int connect_to_fd(int family, int type, int server_fd); +int connect_fd_to_fd(int client_fd, int server_fd); +int connect_wait(int client_fd); + +#endif diff --git a/tools/testing/selftests/bpf/prog_tests/.gitignore b/tools/testing/selftests/bpf/prog_tests/.gitignore index 45984a364647..89c4a3d37544 100644 --- a/tools/testing/selftests/bpf/prog_tests/.gitignore +++ b/tools/testing/selftests/bpf/prog_tests/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only tests.h diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 0262f7b374f9..c548aded6585 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -1,24 +1,5 @@ -#include <asm/types.h> -#include <linux/types.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <stddef.h> -#include <stdbool.h> - -#include <linux/unistd.h> -#include <linux/filter.h> -#include <linux/bpf_perf_event.h> -#include <linux/bpf.h> - -#include <bpf/bpf.h> - -#include "../../../include/linux/filter.h" -#include "bpf_rlimit.h" -#include "bpf_util.h" +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> #define MAX_INSNS 512 #define MAX_MATCHES 16 @@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = { * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, }, }, { @@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ @@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = { /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, }, }, { @@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = { .matches = { {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, } }, { @@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n) */ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"}, + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + }, }, { @@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"}, + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; @@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test) return ret; } -static int do_test(unsigned int from, unsigned int to) +void test_align(void) { - int all_pass = 0; - int all_fail = 0; unsigned int i; - for (i = from; i < to; i++) { + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_align_test *test = &tests[i]; - int fail; - printf("Test %3d: %s ... ", - i, test->descr); - fail = do_test_single(test); - if (fail) { - all_fail++; - printf("FAIL\n"); - } else { - all_pass++; - printf("PASS\n"); - } - } - printf("Results: %d pass %d fail\n", - all_pass, all_fail); - return all_fail ? EXIT_FAILURE : EXIT_SUCCESS; -} + if (!test__start_subtest(test->descr)) + continue; -int main(int argc, char **argv) -{ - unsigned int from = 0, to = ARRAY_SIZE(tests); - - if (argc == 3) { - unsigned int l = atoi(argv[argc - 2]); - unsigned int u = atoi(argv[argc - 1]); - - if (l < to && u < to) { - from = l; - to = u + 1; - } - } else if (argc == 2) { - unsigned int t = atoi(argv[argc - 1]); - - if (t < to) { - from = t; - to = t + 1; - } + CHECK_FAIL(do_test_single(test)); } - return do_test(from, to); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c new file mode 100644 index 000000000000..87c29dde1cf9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include "bpf_iter_ipv6_route.skel.h" +#include "bpf_iter_netlink.skel.h" +#include "bpf_iter_bpf_map.skel.h" +#include "bpf_iter_task.skel.h" +#include "bpf_iter_task_file.skel.h" +#include "bpf_iter_test_kern1.skel.h" +#include "bpf_iter_test_kern2.skel.h" +#include "bpf_iter_test_kern3.skel.h" +#include "bpf_iter_test_kern4.skel.h" + +static int duration; + +static void test_btf_id_or_null(void) +{ + struct bpf_iter_test_kern3 *skel; + + skel = bpf_iter_test_kern3__open_and_load(); + if (CHECK(skel, "bpf_iter_test_kern3__open_and_load", + "skeleton open_and_load unexpectedly succeeded\n")) { + bpf_iter_test_kern3__destroy(skel); + return; + } +} + +static void do_dummy_read(struct bpf_program *prog) +{ + struct bpf_link *link; + char buf[16] = {}; + int iter_fd, len; + + link = bpf_program__attach_iter(prog, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + return; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* not check contents, but ensure read() ends without error */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +} + +static void test_ipv6_route(void) +{ + struct bpf_iter_ipv6_route *skel; + + skel = bpf_iter_ipv6_route__open_and_load(); + if (CHECK(!skel, "bpf_iter_ipv6_route__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_ipv6_route); + + bpf_iter_ipv6_route__destroy(skel); +} + +static void test_netlink(void) +{ + struct bpf_iter_netlink *skel; + + skel = bpf_iter_netlink__open_and_load(); + if (CHECK(!skel, "bpf_iter_netlink__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_netlink); + + bpf_iter_netlink__destroy(skel); +} + +static void test_bpf_map(void) +{ + struct bpf_iter_bpf_map *skel; + + skel = bpf_iter_bpf_map__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_map__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_bpf_map); + + bpf_iter_bpf_map__destroy(skel); +} + +static void test_task(void) +{ + struct bpf_iter_task *skel; + + skel = bpf_iter_task__open_and_load(); + if (CHECK(!skel, "bpf_iter_task__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task); + + bpf_iter_task__destroy(skel); +} + +static void test_task_file(void) +{ + struct bpf_iter_task_file *skel; + + skel = bpf_iter_task_file__open_and_load(); + if (CHECK(!skel, "bpf_iter_task_file__open_and_load", + "skeleton open_and_load failed\n")) + return; + + do_dummy_read(skel->progs.dump_task_file); + + bpf_iter_task_file__destroy(skel); +} + +/* The expected string is less than 16 bytes */ +static int do_read_with_fd(int iter_fd, const char *expected, + bool read_one_char) +{ + int err = -1, len, read_buf_len, start; + char buf[16] = {}; + + read_buf_len = read_one_char ? 1 : 16; + start = 0; + while ((len = read(iter_fd, buf + start, read_buf_len)) > 0) { + start += len; + if (CHECK(start >= 16, "read", "read len %d\n", len)) + return -1; + read_buf_len = read_one_char ? 1 : 16 - start; + } + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + return -1; + + err = strcmp(buf, expected); + if (CHECK(err, "read", "incorrect read result: buf %s, expected %s\n", + buf, expected)) + return -1; + + return 0; +} + +static void test_anon_iter(bool read_one_char) +{ + struct bpf_iter_test_kern1 *skel; + struct bpf_link *link; + int iter_fd, err; + + skel = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + err = bpf_iter_test_kern1__attach(skel); + if (CHECK(err, "bpf_iter_test_kern1__attach", + "skeleton attach failed\n")) { + goto out; + } + + link = skel->links.dump_task; + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto out; + + do_read_with_fd(iter_fd, "abcd", read_one_char); + close(iter_fd); + +out: + bpf_iter_test_kern1__destroy(skel); +} + +static int do_read(const char *path, const char *expected) +{ + int err, iter_fd; + + iter_fd = open(path, O_RDONLY); + if (CHECK(iter_fd < 0, "open", "open %s failed: %s\n", + path, strerror(errno))) + return -1; + + err = do_read_with_fd(iter_fd, expected, false); + close(iter_fd); + return err; +} + +static void test_file_iter(void) +{ + const char *path = "/sys/fs/bpf/bpf_iter_test1"; + struct bpf_iter_test_kern1 *skel1; + struct bpf_iter_test_kern2 *skel2; + struct bpf_link *link; + int err; + + skel1 = bpf_iter_test_kern1__open_and_load(); + if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load", + "skeleton open_and_load failed\n")) + return; + + link = bpf_program__attach_iter(skel1->progs.dump_task, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + /* unlink this path if it exists. */ + unlink(path); + + err = bpf_link__pin(link, path); + if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err)) + goto free_link; + + err = do_read(path, "abcd"); + if (err) + goto unlink_path; + + /* file based iterator seems working fine. Let us a link update + * of the underlying link and `cat` the iterator again, its content + * should change. + */ + skel2 = bpf_iter_test_kern2__open_and_load(); + if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load", + "skeleton open_and_load failed\n")) + goto unlink_path; + + err = bpf_link__update_program(link, skel2->progs.dump_task); + if (CHECK(err, "update_prog", "update_prog failed\n")) + goto destroy_skel2; + + do_read(path, "ABCD"); + +destroy_skel2: + bpf_iter_test_kern2__destroy(skel2); +unlink_path: + unlink(path); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_test_kern1__destroy(skel1); +} + +static void test_overflow(bool test_e2big_overflow, bool ret1) +{ + __u32 map_info_len, total_read_len, expected_read_len; + int err, iter_fd, map1_fd, map2_fd, len; + struct bpf_map_info map_info = {}; + struct bpf_iter_test_kern4 *skel; + struct bpf_link *link; + __u32 page_size; + char *buf; + + skel = bpf_iter_test_kern4__open(); + if (CHECK(!skel, "bpf_iter_test_kern4__open", + "skeleton open failed\n")) + return; + + /* create two maps: bpf program will only do bpf_seq_write + * for these two maps. The goal is one map output almost + * fills seq_file buffer and then the other will trigger + * overflow and needs restart. + */ + map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map1_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto out; + map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (CHECK(map2_fd < 0, "bpf_create_map", + "map_creation failed: %s\n", strerror(errno))) + goto free_map1; + + /* bpf_seq_printf kernel buffer is one page, so one map + * bpf_seq_write will mostly fill it, and the other map + * will partially fill and then trigger overflow and need + * bpf_seq_read restart. + */ + page_size = sysconf(_SC_PAGE_SIZE); + + if (test_e2big_overflow) { + skel->rodata->print_len = (page_size + 8) / 8; + expected_read_len = 2 * (page_size + 8); + } else if (!ret1) { + skel->rodata->print_len = (page_size - 8) / 8; + expected_read_len = 2 * (page_size - 8); + } else { + skel->rodata->print_len = 1; + expected_read_len = 2 * 8; + } + skel->rodata->ret1 = ret1; + + if (CHECK(bpf_iter_test_kern4__load(skel), + "bpf_iter_test_kern4__load", "skeleton load failed\n")) + goto free_map2; + + /* setup filtering map_id in bpf program */ + map_info_len = sizeof(map_info); + err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map1_id = map_info.id; + + err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len); + if (CHECK(err, "get_map_info", "get map info failed: %s\n", + strerror(errno))) + goto free_map2; + skel->bss->map2_id = map_info.id; + + link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto free_map2; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + buf = malloc(expected_read_len); + if (!buf) + goto close_iter; + + /* do read */ + total_read_len = 0; + if (test_e2big_overflow) { + while ((len = read(iter_fd, buf, expected_read_len)) > 0) + total_read_len += len; + + CHECK(len != -1 || errno != E2BIG, "read", + "expected ret -1, errno E2BIG, but get ret %d, error %s\n", + len, strerror(errno)); + goto free_buf; + } else if (!ret1) { + while ((len = read(iter_fd, buf, expected_read_len)) > 0) + total_read_len += len; + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } else { + do { + len = read(iter_fd, buf, expected_read_len); + if (len > 0) + total_read_len += len; + } while (len > 0 || len == -EAGAIN); + + if (CHECK(len < 0, "read", "read failed: %s\n", + strerror(errno))) + goto free_buf; + } + + if (CHECK(total_read_len != expected_read_len, "read", + "total len %u, expected len %u\n", total_read_len, + expected_read_len)) + goto free_buf; + + if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed", + "expected 1 actual %d\n", skel->bss->map1_accessed)) + goto free_buf; + + if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed", + "expected 2 actual %d\n", skel->bss->map2_accessed)) + goto free_buf; + + CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2, + "map2_seqnum", "two different seqnum %lld %lld\n", + skel->bss->map2_seqnum1, skel->bss->map2_seqnum2); + +free_buf: + free(buf); +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +free_map2: + close(map2_fd); +free_map1: + close(map1_fd); +out: + bpf_iter_test_kern4__destroy(skel); +} + +void test_bpf_iter(void) +{ + if (test__start_subtest("btf_id_or_null")) + test_btf_id_or_null(); + if (test__start_subtest("ipv6_route")) + test_ipv6_route(); + if (test__start_subtest("netlink")) + test_netlink(); + if (test__start_subtest("bpf_map")) + test_bpf_map(); + if (test__start_subtest("task")) + test_task(); + if (test__start_subtest("task_file")) + test_task_file(); + if (test__start_subtest("anon")) + test_anon_iter(false); + if (test__start_subtest("anon-read-one-char")) + test_anon_iter(true); + if (test__start_subtest("file")) + test_file_iter(); + if (test__start_subtest("overflow")) + test_overflow(false, false); + if (test__start_subtest("overflow-e2big")) + test_overflow(true, false); + if (test__start_subtest("prog-ret-1")) + test_overflow(false, true); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c index f10029821e16..7afa4160416f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c @@ -1,26 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#define nr_iters 2 + void test_bpf_obj_id(void) { const __u64 array_magic_value = 0xfaceb00c; const __u32 array_key = 0; - const int nr_iters = 2; const char *file = "./test_obj_id.o"; const char *expected_prog_name = "test_obj_id"; const char *expected_map_name = "test_map_id"; const __u64 nsec_per_sec = 1000000000; - struct bpf_object *objs[nr_iters]; + struct bpf_object *objs[nr_iters] = {}; + struct bpf_link *links[nr_iters] = {}; + struct bpf_program *prog; int prog_fds[nr_iters], map_fds[nr_iters]; /* +1 to test for the info_len returned by kernel */ struct bpf_prog_info prog_infos[nr_iters + 1]; struct bpf_map_info map_infos[nr_iters + 1]; + struct bpf_link_info link_infos[nr_iters + 1]; /* Each prog only uses one map. +1 to test nr_map_ids * returned by kernel. */ __u32 map_ids[nr_iters + 1]; - char jited_insns[128], xlated_insns[128], zeros[128]; + char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; struct timespec real_time_ts, boot_time_ts; int err = 0; @@ -36,14 +40,15 @@ void test_bpf_obj_id(void) CHECK(err >= 0 || errno != ENOENT, "get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno); - for (i = 0; i < nr_iters; i++) - objs[i] = NULL; + err = bpf_link_get_fd_by_id(0); + CHECK(err >= 0 || errno != ENOENT, + "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno); /* Check bpf_obj_get_info_by_fd() */ bzero(zeros, sizeof(zeros)); for (i = 0; i < nr_iters; i++) { now = time(NULL); - err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER, + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &objs[i], &prog_fds[i]); /* test_obj_id.o is a dumb prog. It should never fail * to load. @@ -60,6 +65,17 @@ void test_bpf_obj_id(void) if (CHECK_FAIL(err)) goto done; + prog = bpf_object__find_program_by_title(objs[i], + "raw_tp/sys_enter"); + if (CHECK_FAIL(!prog)) + goto done; + links[i] = bpf_program__attach(prog); + err = libbpf_get_error(links[i]); + if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) { + links[i] = NULL; + goto done; + } + /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; bzero(&map_infos[i], info_len); @@ -107,7 +123,7 @@ void test_bpf_obj_id(void) load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + (prog_infos[i].load_time / nsec_per_sec); if (CHECK(err || - prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER || + prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT || info_len != sizeof(struct bpf_prog_info) || (env.jit_enabled && !prog_infos[i].jited_prog_len) || (env.jit_enabled && @@ -120,7 +136,11 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids != map_infos[i].id || strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d type %d(%d) info_len %u(%zu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", + "err %d errno %d i %d type %d(%d) info_len %u(%zu) " + "jit_enabled %d jited_prog_len %u xlated_prog_len %u " + "jited_prog %d xlated_prog %d load_time %lu(%lu) " + "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) " + "name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@ -135,6 +155,33 @@ void test_bpf_obj_id(void) *(int *)(long)prog_infos[i].map_ids, map_infos[i].id, prog_infos[i].name, expected_prog_name)) goto done; + + /* Check getting link info */ + info_len = sizeof(struct bpf_link_info) * 2; + bzero(&link_infos[i], info_len); + link_infos[i].raw_tracepoint.tp_name = (__u64)&tp_name; + link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name); + err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]), + &link_infos[i], &info_len); + if (CHECK(err || + link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT || + link_infos[i].prog_id != prog_infos[i].id || + link_infos[i].raw_tracepoint.tp_name != (__u64)&tp_name || + strcmp((char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter") || + info_len != sizeof(struct bpf_link_info), + "get-link-info(fd)", + "err %d errno %d info_len %u(%zu) type %d(%d) id %d " + "prog_id %d (%d) tp_name %s(%s)\n", + err, errno, + info_len, sizeof(struct bpf_link_info), + link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT, + link_infos[i].id, + link_infos[i].prog_id, prog_infos[i].id, + (char *)link_infos[i].raw_tracepoint.tp_name, + "sys_enter")) + goto done; + } /* Check bpf_prog_get_next_id() */ @@ -247,7 +294,52 @@ void test_bpf_obj_id(void) "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + /* Check bpf_link_get_next_id() */ + nr_id_found = 0; + next_id = 0; + while (!bpf_link_get_next_id(next_id, &next_id)) { + struct bpf_link_info link_info; + int link_fd, cmp_res; + + info_len = sizeof(link_info); + memset(&link_info, 0, info_len); + + link_fd = bpf_link_get_fd_by_id(next_id); + if (link_fd < 0 && errno == ENOENT) + /* The bpf_link is in the dead row */ + continue; + if (CHECK(link_fd < 0, "get-link-fd(next_id)", + "link_fd %d next_id %u errno %d\n", + link_fd, next_id, errno)) + break; + + for (i = 0; i < nr_iters; i++) + if (link_infos[i].id == next_id) + break; + + if (i == nr_iters) + continue; + + nr_id_found++; + + err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len); + cmp_res = memcmp(&link_info, &link_infos[i], + offsetof(struct bpf_link_info, raw_tracepoint)); + CHECK(err || info_len != sizeof(link_info) || cmp_res, + "check get-link-info(next_id->fd)", + "err %d errno %d info_len %u(%zu) memcmp %d\n", + err, errno, info_len, sizeof(struct bpf_link_info), + cmp_res); + + close(link_fd); + } + CHECK(nr_id_found != nr_iters, + "check total link id found by get_next_id", + "nr_id_found %u(%u)\n", nr_id_found, nr_iters); + done: - for (i = 0; i < nr_iters; i++) + for (i = 0; i < nr_iters; i++) { + bpf_link__destroy(links[i]); bpf_object__close(objs[i]); + } } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 8482bbc67eec..9a8f47fc0b91 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -11,6 +11,7 @@ static const unsigned int total_bytes = 10 * 1024 * 1024; static const struct timeval timeo_sec = { .tv_sec = 10 }; static const size_t timeo_optlen = sizeof(timeo_sec); +static int expected_stg = 0xeB9F; static int stop, duration; static int settimeo(int fd) @@ -88,7 +89,7 @@ done: return NULL; } -static void do_test(const char *tcp_ca) +static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) { struct sockaddr_in6 sa6 = {}; ssize_t nr_recv = 0, bytes = 0; @@ -126,14 +127,34 @@ static void do_test(const char *tcp_ca) err = listen(lfd, 1); if (CHECK(err == -1, "listen", "errno:%d\n", errno)) goto done; - err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); - if (CHECK(err != 0, "pthread_create", "err:%d\n", err)) - goto done; + + if (sk_stg_map) { + err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd, + &expected_stg, BPF_NOEXIST); + if (CHECK(err, "bpf_map_update_elem(sk_stg_map)", + "err:%d errno:%d\n", err, errno)) + goto done; + } /* connect to server */ err = connect(fd, (struct sockaddr *)&sa6, addrlen); if (CHECK(err == -1, "connect", "errno:%d\n", errno)) - goto wait_thread; + goto done; + + if (sk_stg_map) { + int tmp_stg; + + err = bpf_map_lookup_elem(bpf_map__fd(sk_stg_map), &fd, + &tmp_stg); + if (CHECK(!err || errno != ENOENT, + "bpf_map_lookup_elem(sk_stg_map)", + "err:%d errno:%d\n", err, errno)) + goto done; + } + + err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd); + if (CHECK(err != 0, "pthread_create", "err:%d errno:%d\n", err, errno)) + goto done; /* recv total_bytes */ while (bytes < total_bytes && !READ_ONCE(stop)) { @@ -149,7 +170,6 @@ static void do_test(const char *tcp_ca) CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n", bytes, total_bytes, nr_recv, errno); -wait_thread: WRITE_ONCE(stop, 1); pthread_join(srv_thread, &thread_ret); CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld", @@ -175,7 +195,7 @@ static void test_cubic(void) return; } - do_test("bpf_cubic"); + do_test("bpf_cubic", NULL); bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); @@ -197,7 +217,10 @@ static void test_dctcp(void) return; } - do_test("bpf_dctcp"); + do_test("bpf_dctcp", dctcp_skel->maps.sk_stg_map); + CHECK(dctcp_skel->bss->stg_result != expected_stg, + "Unexpected stg_result", "stg_result (%x) != expected_stg (%x)\n", + dctcp_skel->bss->stg_result, expected_stg); bpf_link__destroy(link); bpf_dctcp__destroy(dctcp_skel); diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 7390d3061065..cb33a7ee4e04 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -125,6 +125,6 @@ void test_btf_dump() { if (!test__start_subtest(t->name)) continue; - test_btf_dump_case(i, &btf_dump_test_cases[i]); + test_btf_dump_case(i, &btf_dump_test_cases[i]); } } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c new file mode 100644 index 000000000000..f7ee8fa377ad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> + +#include "test_btf_map_in_map.skel.h" + +void test_btf_map_in_map(void) +{ + int duration = 0, err, key = 0, val; + struct test_btf_map_in_map* skel; + + skel = test_btf_map_in_map__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n")) + return; + + err = test_btf_map_in_map__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* inner1 = input, inner2 = input + 1 */ + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 1; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2); + + /* inner1 = input + 1, inner2 = input */ + val = bpf_map__fd(skel->maps.inner_map2); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_arr), &key, &val, 0); + val = bpf_map__fd(skel->maps.inner_map1); + bpf_map_update_elem(bpf_map__fd(skel->maps.outer_hash), &key, &val, 0); + skel->bss->input = 3; + usleep(1); + + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map1), &key, &val); + CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.inner_map2), &key, &val); + CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3); + +cleanup: + test_btf_map_in_map__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c index 5b13f2c6c402..70e94e783070 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c @@ -6,7 +6,7 @@ #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int prog_load(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 2ff21dbce179..139f8e82c7c6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -6,7 +6,7 @@ #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int map_fd = -1; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c index 9d8cb48b99de..9e96f8d87fea 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c @@ -8,7 +8,7 @@ #define BAR "/foo/bar/" #define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" -char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static char bpf_log_buf[BPF_LOG_BUF_SIZE]; static int prog_load(int verdict) { diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c new file mode 100644 index 000000000000..6e04f8d1d15b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "test_cgroup_link.skel.h" + +static __u32 duration = 0; +#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null" + +static struct test_cgroup_link *skel = NULL; + +int ping_and_check(int exp_calls, int exp_alt_calls) +{ + skel->bss->calls = 0; + skel->bss->alt_calls = 0; + CHECK_FAIL(system(PING_CMD)); + if (CHECK(skel->bss->calls != exp_calls, "call_cnt", + "exp %d, got %d\n", exp_calls, skel->bss->calls)) + return -EINVAL; + if (CHECK(skel->bss->alt_calls != exp_alt_calls, "alt_call_cnt", + "exp %d, got %d\n", exp_alt_calls, skel->bss->alt_calls)) + return -EINVAL; + return 0; +} + +void test_cgroup_link(void) +{ + struct { + const char *path; + int fd; + } cgs[] = { + { "/cg1" }, + { "/cg1/cg2" }, + { "/cg1/cg2/cg3" }, + { "/cg1/cg2/cg3/cg4" }, + }; + int last_cg = ARRAY_SIZE(cgs) - 1, cg_nr = ARRAY_SIZE(cgs); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, link_upd_opts); + struct bpf_link *links[ARRAY_SIZE(cgs)] = {}, *tmp_link; + __u32 prog_ids[ARRAY_SIZE(cgs)], prog_cnt = 0, attach_flags; + int i = 0, err, prog_fd; + bool detach_legacy = false; + + skel = test_cgroup_link__open_and_load(); + if (CHECK(!skel, "skel_open_load", "failed to open/load skeleton\n")) + return; + prog_fd = bpf_program__fd(skel->progs.egress); + + err = setup_cgroup_environment(); + if (CHECK(err, "cg_init", "failed: %d\n", err)) + goto cleanup; + + for (i = 0; i < cg_nr; i++) { + cgs[i].fd = create_and_get_cgroup(cgs[i].path); + if (CHECK(cgs[i].fd < 0, "cg_create", "fail: %d\n", cgs[i].fd)) + goto cleanup; + } + + err = join_cgroup(cgs[last_cg].path); + if (CHECK(err, "cg_join", "fail: %d\n", err)) + goto cleanup; + + for (i = 0; i < cg_nr; i++) { + links[i] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[i].fd); + if (CHECK(IS_ERR(links[i]), "cg_attach", "i: %d, err: %ld\n", + i, PTR_ERR(links[i]))) + goto cleanup; + } + + ping_and_check(cg_nr, 0); + + /* query the number of effective progs and attach flags in root cg */ + err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL, + &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt)) + goto cleanup; + + /* query the number of effective progs in last cg */ + err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, NULL, NULL, + &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", + cg_nr, prog_cnt)) + goto cleanup; + + /* query the effective prog IDs in last cg */ + err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS, + BPF_F_QUERY_EFFECTIVE, &attach_flags, + prog_ids, &prog_cnt); + CHECK_FAIL(err); + CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI); + if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n", + cg_nr, prog_cnt)) + goto cleanup; + for (i = 1; i < prog_cnt; i++) { + CHECK(prog_ids[i - 1] != prog_ids[i], "prog_id_check", + "idx %d, prev id %d, cur id %d\n", + i, prog_ids[i - 1], prog_ids[i]); + } + + /* detach bottom program and ping again */ + bpf_link__destroy(links[last_cg]); + links[last_cg] = NULL; + + ping_and_check(cg_nr - 1, 0); + + /* mix in with non link-based multi-attachments */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, BPF_F_ALLOW_MULTI); + if (CHECK(err, "cg_attach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = true; + + links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", + PTR_ERR(links[last_cg]))) + goto cleanup; + + ping_and_check(cg_nr + 1, 0); + + /* detach link */ + bpf_link__destroy(links[last_cg]); + links[last_cg] = NULL; + + /* detach legacy */ + err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = false; + + /* attach legacy exclusive prog attachment */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, 0); + if (CHECK(err, "cg_attach_exclusive", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = true; + + /* attempt to mix in with multi-attach bpf_link */ + tmp_link = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(!IS_ERR(tmp_link), "cg_attach_fail", "unexpected success!\n")) { + bpf_link__destroy(tmp_link); + goto cleanup; + } + + ping_and_check(cg_nr, 0); + + /* detach */ + err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno)) + goto cleanup; + detach_legacy = false; + + ping_and_check(cg_nr - 1, 0); + + /* attach back link-based one */ + links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress, + cgs[last_cg].fd); + if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n", + PTR_ERR(links[last_cg]))) + goto cleanup; + + ping_and_check(cg_nr, 0); + + /* check legacy exclusive prog can't be attached */ + err = bpf_prog_attach(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS, 0); + if (CHECK(!err, "cg_attach_exclusive", "unexpected success")) { + bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS); + goto cleanup; + } + + /* replace BPF programs inside their links for all but first link */ + for (i = 1; i < cg_nr; i++) { + err = bpf_link__update_program(links[i], skel->progs.egress_alt); + if (CHECK(err, "prog_upd", "link #%d\n", i)) + goto cleanup; + } + + ping_and_check(1, cg_nr - 1); + + /* Attempt program update with wrong expected BPF program */ + link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress_alt); + link_upd_opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(links[0]), + bpf_program__fd(skel->progs.egress_alt), + &link_upd_opts); + if (CHECK(err == 0 || errno != EPERM, "prog_cmpxchg1", + "unexpectedly succeeded, err %d, errno %d\n", err, -errno)) + goto cleanup; + + /* Compare-exchange single link program from egress to egress_alt */ + link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress); + link_upd_opts.flags = BPF_F_REPLACE; + err = bpf_link_update(bpf_link__fd(links[0]), + bpf_program__fd(skel->progs.egress_alt), + &link_upd_opts); + if (CHECK(err, "prog_cmpxchg2", "errno %d\n", -errno)) + goto cleanup; + + /* ping */ + ping_and_check(0, cg_nr); + + /* close cgroup FDs before detaching links */ + for (i = 0; i < cg_nr; i++) { + if (cgs[i].fd > 0) { + close(cgs[i].fd); + cgs[i].fd = -1; + } + } + + /* BPF programs should still get called */ + ping_and_check(0, cg_nr); + + /* leave cgroup and remove them, don't detach programs */ + cleanup_cgroup_environment(); + + /* BPF programs should have been auto-detached */ + ping_and_check(0, 0); + +cleanup: + if (detach_legacy) + bpf_prog_detach2(prog_fd, cgs[last_cg].fd, + BPF_CGROUP_INET_EGRESS); + + for (i = 0; i < cg_nr; i++) { + if (!IS_ERR(links[i])) + bpf_link__destroy(links[i]); + } + test_cgroup_link__destroy(skel); + + for (i = 0; i < cg_nr; i++) { + if (cgs[i].fd > 0) + close(cgs[i].fd); + } + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c new file mode 100644 index 000000000000..059047af7df3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <test_progs.h> + +#include "network_helpers.h" +#include "cgroup_skb_sk_lookup_kern.skel.h" + +static void run_lookup_test(__u16 *g_serv_port, int out_sk) +{ + int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err; + struct sockaddr_in6 addr = {}; + socklen_t addr_len = sizeof(addr); + __u32 duration = 0; + + serv_sk = start_server(AF_INET6, SOCK_STREAM); + if (CHECK(serv_sk < 0, "start_server", "failed to start server\n")) + return; + + err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len); + if (CHECK(err, "getsockname", "errno %d\n", errno)) + goto cleanup; + + *g_serv_port = addr.sin6_port; + + /* Client outside of test cgroup should fail to connect by timeout. */ + err = connect_fd_to_fd(out_sk, serv_sk); + if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd", + "unexpected result err %d errno %d\n", err, errno)) + goto cleanup; + + err = connect_wait(out_sk); + if (CHECK(err, "connect_wait", "unexpected result %d\n", err)) + goto cleanup; + + /* Client inside test cgroup should connect just fine. */ + in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk); + if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno)) + goto cleanup; + + serv_in_sk = accept(serv_sk, NULL, NULL); + if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno)) + goto cleanup; + +cleanup: + close(serv_in_sk); + close(in_sk); + close(serv_sk); +} + +static void run_cgroup_bpf_test(const char *cg_path, int out_sk) +{ + struct cgroup_skb_sk_lookup_kern *skel; + struct bpf_link *link; + __u32 duration = 0; + int cgfd = -1; + + skel = cgroup_skb_sk_lookup_kern__open_and_load(); + if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + return; + + cgfd = test__join_cgroup(cg_path); + if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n")) + goto cleanup; + + link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd); + if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + run_lookup_test(&skel->bss->g_serv_port, out_sk); + + bpf_link__destroy(link); + +cleanup: + close(cgfd); + cgroup_skb_sk_lookup_kern__destroy(skel); +} + +void test_cgroup_skb_sk_lookup(void) +{ + const char *cg_path = "/foo"; + int out_sk; + + /* Create a socket before joining testing cgroup so that its cgroup id + * differs from that of testing cgroup. Moving selftests process to + * testing cgroup won't change cgroup id of an already created socket. + */ + out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (CHECK_FAIL(out_sk < 0)) + return; + + run_cgroup_bpf_test(cg_path, out_sk); + + close(out_sk); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c new file mode 100644 index 000000000000..f259085cca6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2020 Cloudflare + +#define _GNU_SOURCE + +#include <arpa/inet.h> +#include <string.h> + +#include <linux/pkt_cls.h> + +#include <test_progs.h> + +#include "progs/test_cls_redirect.h" +#include "test_cls_redirect.skel.h" + +#define ENCAP_IP INADDR_LOOPBACK +#define ENCAP_PORT (1234) + +struct addr_port { + in_port_t port; + union { + struct in_addr in_addr; + struct in6_addr in6_addr; + }; +}; + +struct tuple { + int family; + struct addr_port src; + struct addr_port dst; +}; + +static int start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto err; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static int connect_to_server(const struct sockaddr *addr, socklen_t len, + int type) +{ + int fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + return -1; + if (CHECK_FAIL(connect(fd, addr, len))) + goto err; + + return fd; + +err: + close(fd); + return -1; +} + +static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) +{ + const struct sockaddr_in6 *in6; + const struct sockaddr_in *in; + + switch (sa->sa_family) { + case AF_INET: + in = (const struct sockaddr_in *)sa; + ap->in_addr = in->sin_addr; + ap->port = in->sin_port; + return true; + + case AF_INET6: + in6 = (const struct sockaddr_in6 *)sa; + ap->in6_addr = in6->sin6_addr; + ap->port = in6->sin6_port; + return true; + + default: + return false; + } +} + +static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type, + int *server, int *conn, struct tuple *tuple) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + struct sockaddr *sa = (struct sockaddr *)&ss; + + *server = start_server(addr, len, type); + if (*server < 0) + return false; + + if (CHECK_FAIL(getsockname(*server, sa, &slen))) + goto close_server; + + *conn = connect_to_server(sa, slen, type); + if (*conn < 0) + goto close_server; + + /* We want to simulate packets arriving at conn, so we have to + * swap src and dst. + */ + slen = sizeof(ss); + if (CHECK_FAIL(getsockname(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) + goto close_conn; + + slen = sizeof(ss); + if (CHECK_FAIL(getpeername(*conn, sa, &slen))) + goto close_conn; + + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) + goto close_conn; + + tuple->family = ss.ss_family; + return true; + +close_conn: + close(*conn); + *conn = -1; +close_server: + close(*server); + *server = -1; + return false; +} + +static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + return sizeof(*addr4); + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_addr = in6addr_loopback; + return sizeof(*addr6); + default: + fprintf(stderr, "Invalid family %d", family); + return 0; + } +} + +static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr) +{ + return tattr->data_size_out < tattr->data_size_in; +} + +enum type { + UDP, + TCP, + __NR_KIND, +}; + +enum hops { + NO_HOPS, + ONE_HOP, +}; + +enum flags { + NONE, + SYN, + ACK, +}; + +enum conn { + KNOWN_CONN, + UNKNOWN_CONN, +}; + +enum result { + ACCEPT, + FORWARD, +}; + +struct test_cfg { + enum type type; + enum result result; + enum conn conn; + enum hops hops; + enum flags flags; +}; + +static int test_str(void *buf, size_t len, const struct test_cfg *test, + int family) +{ + const char *family_str, *type, *conn, *hops, *result, *flags; + + family_str = "IPv4"; + if (family == AF_INET6) + family_str = "IPv6"; + + type = "TCP"; + if (test->type == UDP) + type = "UDP"; + + conn = "known"; + if (test->conn == UNKNOWN_CONN) + conn = "unknown"; + + hops = "no hops"; + if (test->hops == ONE_HOP) + hops = "one hop"; + + result = "accept"; + if (test->result == FORWARD) + result = "forward"; + + flags = "none"; + if (test->flags == SYN) + flags = "SYN"; + else if (test->flags == ACK) + flags = "ACK"; + + return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str, + type, result, conn, hops, flags); +} + +static struct test_cfg tests[] = { + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN }, + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK }, + { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK }, + { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK }, + { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE }, + { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE }, + { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE }, +}; + +static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) +{ + const uint8_t hlen = + (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count; + *encap = (encap_headers_t){ + .eth = { .h_proto = htons(ETH_P_IP) }, + .ip = { + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = IPPROTO_UDP, + .daddr = htonl(ENCAP_IP) + }, + .udp = { + .dest = htons(ENCAP_PORT), + }, + .gue = { + .hlen = hlen, + .proto_ctype = proto + }, + .unigue = { + .hop_count = hop_count + }, + }; +} + +static size_t build_input(const struct test_cfg *test, void *const buf, + const struct tuple *tuple) +{ + in_port_t sport = tuple->src.port; + encap_headers_t encap; + struct iphdr ip; + struct ipv6hdr ipv6; + struct tcphdr tcp; + struct udphdr udp; + struct in_addr next_hop; + uint8_t *p = buf; + int proto; + + proto = IPPROTO_IPIP; + if (tuple->family == AF_INET6) + proto = IPPROTO_IPV6; + + encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto); + p = mempcpy(p, &encap, sizeof(encap)); + + if (test->hops == ONE_HOP) { + next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) }; + p = mempcpy(p, &next_hop, sizeof(next_hop)); + } + + proto = IPPROTO_TCP; + if (test->type == UDP) + proto = IPPROTO_UDP; + + switch (tuple->family) { + case AF_INET: + ip = (struct iphdr){ + .ihl = 5, + .version = 4, + .ttl = IPDEFTTL, + .protocol = proto, + .saddr = tuple->src.in_addr.s_addr, + .daddr = tuple->dst.in_addr.s_addr, + }; + p = mempcpy(p, &ip, sizeof(ip)); + break; + case AF_INET6: + ipv6 = (struct ipv6hdr){ + .version = 6, + .hop_limit = IPDEFTTL, + .nexthdr = proto, + .saddr = tuple->src.in6_addr, + .daddr = tuple->dst.in6_addr, + }; + p = mempcpy(p, &ipv6, sizeof(ipv6)); + break; + default: + return 0; + } + + if (test->conn == UNKNOWN_CONN) + sport--; + + switch (test->type) { + case TCP: + tcp = (struct tcphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + if (test->flags == SYN) + tcp.syn = true; + if (test->flags == ACK) + tcp.ack = true; + p = mempcpy(p, &tcp, sizeof(tcp)); + break; + case UDP: + udp = (struct udphdr){ + .source = sport, + .dest = tuple->dst.port, + }; + p = mempcpy(p, &udp, sizeof(udp)); + break; + default: + return 0; + } + + return (void *)p - buf; +} + +static void close_fds(int *fds, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (fds[i] > 0) + close(fds[i]); +} + +void test_cls_redirect(void) +{ + struct test_cls_redirect *skel = NULL; + struct bpf_prog_test_run_attr tattr = {}; + int families[] = { AF_INET, AF_INET6 }; + struct sockaddr_storage ss; + struct sockaddr *addr; + socklen_t slen; + int i, j, err; + + int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; + int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; + struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; + + skel = test_cls_redirect__open(); + if (CHECK_FAIL(!skel)) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + if (CHECK_FAIL(test_cls_redirect__load(skel))) + goto cleanup; + + addr = (struct sockaddr *)&ss; + for (i = 0; i < ARRAY_SIZE(families); i++) { + slen = prepare_addr(&ss, families[i]); + if (CHECK_FAIL(!slen)) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, + &servers[UDP][i], &conns[UDP][i], + &tuples[UDP][i]))) + goto cleanup; + + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, + &servers[TCP][i], &conns[TCP][i], + &tuples[TCP][i]))) + goto cleanup; + } + + tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct test_cfg *test = &tests[i]; + + for (j = 0; j < ARRAY_SIZE(families); j++) { + struct tuple *tuple = &tuples[test->type][j]; + char input[256]; + char tmp[256]; + + test_str(tmp, sizeof(tmp), test, tuple->family); + if (!test__start_subtest(tmp)) + continue; + + tattr.data_out = tmp; + tattr.data_size_out = sizeof(tmp); + + tattr.data_in = input; + tattr.data_size_in = build_input(test, input, tuple); + if (CHECK_FAIL(!tattr.data_size_in)) + continue; + + err = bpf_prog_test_run_xattr(&tattr); + if (CHECK_FAIL(err)) + continue; + + if (tattr.retval != TC_ACT_REDIRECT) { + PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n", + tattr.retval); + continue; + } + + switch (test->result) { + case ACCEPT: + if (CHECK_FAIL(!was_decapsulated(&tattr))) + continue; + break; + case FORWARD: + if (CHECK_FAIL(was_decapsulated(&tattr))) + continue; + break; + default: + PRINT_FAIL("unknown result %d\n", test->result); + continue; + } + } + } + +cleanup: + test_cls_redirect__destroy(skel); + close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); + close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); +} diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..17bbf76812ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_ports(int family, int fd, + __u16 expected_local, __u16 expected_peer) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_local) { + log_err("Unexpected local port %d, expected %d", ntohs(port), + expected_local); + return -1; + } + + if (getpeername(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get peer addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_peer) { + log_err("Unexpected peer port %d, expected %d", ntohs(port), + expected_peer); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + bool v4 = family == AF_INET; + __u16 expected_local_port = v4 ? 22222 : 22223; + __u16 expected_peer_port = 60000; + struct bpf_prog_load_attr attr = { + .file = v4 ? "./connect_force_port4.o" : + "./connect_force_port6.o", + }; + struct bpf_program *prog; + struct bpf_object *obj; + int xlate_fd, fd, err; + __u32 duration = 0; + + err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/connect4" : + "cgroup/connect6"); + if (CHECK(!prog, "find_prog", "connect prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_CONNECT : + BPF_CGROUP_INET6_CONNECT, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getpeername4" : + "cgroup/getpeername6"); + if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET6_GETPEERNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getsockname4" : + "cgroup/getsockname6"); + if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETSOCKNAME : + BPF_CGROUP_INET6_GETSOCKNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_ports(family, fd, expected_local_port, + expected_peer_port); + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 31e177adbdf1..084ed26a7d78 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -392,7 +392,7 @@ static struct core_reloc_test_case test_cases[] = { .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) { .a = 42, }, - .input_len = sizeof(struct core_reloc_existence), + .input_len = sizeof(struct core_reloc_existence___minimal), .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { .a_exists = 1, .b_exists = 0, diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c new file mode 100644 index 000000000000..2cb2085917e7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "test_enable_stats.skel.h" + +void test_enable_stats(void) +{ + struct test_enable_stats *skel; + int stats_fd, err, prog_fd; + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + int duration = 0; + + skel = test_enable_stats__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) { + test_enable_stats__destroy(skel); + return; + } + + err = test_enable_stats__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + test_enable_stats__detach(skel); + + prog_fd = bpf_program__fd(skel->progs.test_enable_stats); + memset(&info, 0, info_len); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", + "failed to get bpf_prog_info for fd %d\n", prog_fd)) + goto cleanup; + if (CHECK(info.run_time_ns == 0, "check_stats_enabled", + "failed to enable run_time_ns stats\n")) + goto cleanup; + + CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid", + "invalid run_cnt stats\n"); + +cleanup: + test_enable_stats__destroy(skel); + close(stats_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 235ac4f67f5b..83493bd5745c 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,22 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" #include "fexit_test.skel.h" void test_fentry_fexit(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; struct fexit_test *fexit_skel = NULL; __u64 *fentry_res, *fexit_res; __u32 duration = 0, retval; - int err, pkt_fd, i; + int err, prog_fd, i; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; @@ -31,8 +26,8 @@ void test_fentry_fexit(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", @@ -49,7 +44,6 @@ void test_fentry_fexit(void) } close_prog: - test_pkt_access__destroy(pkt_skel); fentry_test__destroy(fentry_skel); fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 5cc06021f27d..04ebbf1cb390 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,20 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" void test_fentry_test(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; - int err, pkt_fd, i; + int err, prog_fd, i; __u32 duration = 0, retval; __u64 *result; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto cleanup; @@ -23,10 +18,10 @@ void test_fentry_test(void) if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) goto cleanup; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fentry_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); @@ -39,5 +34,4 @@ void test_fentry_test(void) cleanup: fentry_test__destroy(fentry_skel); - test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index cde463af7071..a895bfed55db 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include <network_helpers.h> static void test_fexit_bpf2bpf_common(const char *obj_file, const char *target_obj_file, int prog_cnt, - const char **prog_name) + const char **prog_name, + bool run_prog) { struct bpf_object *obj = NULL, *pkt_obj; int err, pkt_fd, i; @@ -18,7 +20,8 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC, &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) + if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n", + target_obj_file, err, errno)) return; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .attach_prog_fd = pkt_fd, @@ -33,7 +36,7 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, obj = bpf_object__open_file(obj_file, &opts); if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open fexit_bpf2bpf: %ld\n", + "failed to open %s: %ld\n", obj_file, PTR_ERR(obj))) goto close_prog; @@ -49,6 +52,10 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) goto close_prog; } + + if (!run_prog) + goto close_prog; + data_map = bpf_object__find_map_by_name(obj, "fexit_bp.bss"); if (CHECK(!data_map, "find_data_map", "data map not found\n")) goto close_prog; @@ -89,7 +96,7 @@ static void test_target_no_callees(void) test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.o", "./test_pkt_md_access.o", ARRAY_SIZE(prog_name), - prog_name); + prog_name, true); } static void test_target_yes_callees(void) @@ -103,7 +110,7 @@ static void test_target_yes_callees(void) test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", "./test_pkt_access.o", ARRAY_SIZE(prog_name), - prog_name); + prog_name, true); } static void test_func_replace(void) @@ -120,7 +127,18 @@ static void test_func_replace(void) test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o", "./test_pkt_access.o", ARRAY_SIZE(prog_name), - prog_name); + prog_name, true); +} + +static void test_func_replace_verify(void) +{ + const char *prog_name[] = { + "freplace/do_bind", + }; + test_fexit_bpf2bpf_common("./freplace_connect4.o", + "./connect4_prog.o", + ARRAY_SIZE(prog_name), + prog_name, false); } void test_fexit_bpf2bpf(void) @@ -128,4 +146,5 @@ void test_fexit_bpf2bpf(void) test_target_no_callees(); test_target_yes_callees(); test_func_replace(); + test_func_replace_verify(); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index d2c3655dd7a3..78d7a2765c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -1,64 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include <test_progs.h> +#include "fexit_test.skel.h" void test_fexit_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fexit_test.o", - }; - - char prog_name[] = "fexit/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct fexit_test *fexit_skel = NULL; + int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) - return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fexit_skel = fexit_test__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fexit_te.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; + result = (__u64 *)fexit_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 92563898867c..ea14e3ece812 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> #include <error.h> #include <linux/if.h> #include <linux/if_tun.h> #include <sys/uio.h> +#include "bpf_flow.skel.h" + #ifndef IP_MF #define IP_MF 0x2000 #endif @@ -100,6 +103,7 @@ struct test { #define VLAN_HLEN 4 +static __u32 duration; struct test tests[] = { { .name = "ipv4", @@ -443,17 +447,130 @@ static int ifup(const char *ifname) return 0; } +static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array) +{ + int i, err, map_fd, prog_fd; + struct bpf_program *prog; + char prog_name[32]; + + map_fd = bpf_map__fd(prog_array); + if (map_fd < 0) + return -1; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "flow_dissector/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (!prog) + return -1; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) + return -1; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (err) + return -1; + } + return 0; +} + +static void run_tests_skb_less(int tap_fd, struct bpf_map *keys) +{ + int i, err, keys_fd; + + keys_fd = bpf_map__fd(keys); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + /* Keep in sync with 'flags' from eth_get_headlen. */ + __u32 eth_get_headlen_flags = + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; + struct bpf_prog_test_run_attr tattr = {}; + struct bpf_flow_keys flow_keys = {}; + __u32 key = (__u32)(tests[i].keys.sport) << 16 | + tests[i].keys.dport; + + /* For skb-less case we can't pass input flags; run + * only the tests that have a matching set of flags. + */ + + if (tests[i].flags != eth_get_headlen_flags) + continue; + + err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); + CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); + CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); + + CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); + CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); + + err = bpf_map_delete_elem(keys_fd, &key); + CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); + } +} + +static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) +{ + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + return; + + err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno)) + return; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); + CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno); +} + +static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) +{ + struct bpf_link *link; + int err, net_fd; + + net_fd = open("/proc/self/ns/net", O_RDONLY); + if (CHECK(net_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno)) + return; + + link = bpf_program__attach_netns(skel->progs._dissect, net_fd); + if (CHECK(IS_ERR(link), "attach_netns", "err %ld\n", PTR_ERR(link))) + goto out_close; + + run_tests_skb_less(tap_fd, skel->maps.last_dissection); + + err = bpf_link__destroy(link); + CHECK(err, "bpf_link__destroy", "err %d\n", err); +out_close: + close(net_fd); +} + void test_flow_dissector(void) { int i, err, prog_fd, keys_fd = -1, tap_fd; - struct bpf_object *obj; - __u32 duration = 0; + struct bpf_flow *skel; - err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector", - "jmp_table", "last_dissection", &prog_fd, &keys_fd); - if (CHECK_FAIL(err)) + skel = bpf_flow__open_and_load(); + if (CHECK(!skel, "skel", "failed to open/load skeleton\n")) return; + prog_fd = bpf_program__fd(skel->progs._dissect); + if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd)) + goto out_destroy_skel; + keys_fd = bpf_map__fd(skel->maps.last_dissection); + if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd)) + goto out_destroy_skel; + err = init_prog_array(skel->obj, skel->maps.jmp_table); + if (CHECK(err, "init_prog_array", "err %d\n", err)) + goto out_destroy_skel; + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_flow_keys flow_keys; struct bpf_prog_test_run_attr tattr = { @@ -486,43 +603,17 @@ void test_flow_dissector(void) * via BPF map in this case. */ - err = bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0); - CHECK(err, "bpf_prog_attach", "err %d errno %d\n", err, errno); - tap_fd = create_tap("tap0"); CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n", tap_fd, errno); err = ifup("tap0"); CHECK(err, "ifup", "err %d errno %d\n", err, errno); - for (i = 0; i < ARRAY_SIZE(tests); i++) { - /* Keep in sync with 'flags' from eth_get_headlen. */ - __u32 eth_get_headlen_flags = - BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG; - struct bpf_prog_test_run_attr tattr = {}; - struct bpf_flow_keys flow_keys = {}; - __u32 key = (__u32)(tests[i].keys.sport) << 16 | - tests[i].keys.dport; - - /* For skb-less case we can't pass input flags; run - * only the tests that have a matching set of flags. - */ - - if (tests[i].flags != eth_get_headlen_flags) - continue; - - err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt)); - CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno); - - err = bpf_map_lookup_elem(keys_fd, &key, &flow_keys); - CHECK_ATTR(err, tests[i].name, "bpf_map_lookup_elem %d\n", err); - - CHECK_ATTR(err, tests[i].name, "skb-less err %d\n", err); - CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys); - - err = bpf_map_delete_elem(keys_fd, &key); - CHECK_ATTR(err, tests[i].name, "bpf_map_delete_elem %d\n", err); - } + /* Test direct prog attachment */ + test_skb_less_prog_attach(skel, tap_fd); + /* Test indirect prog attachment via link */ + test_skb_less_link_create(skel, tap_fd); - bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); - bpf_object__close(obj); + close(tap_fd); +out_destroy_skel: + bpf_flow__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c index dc5ef155ec28..0e8a4d2f023d 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_flow_dissector_load_bytes(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 1f51ba66b98b..15cb554a66d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -11,6 +11,7 @@ #include <fcntl.h> #include <sched.h> #include <stdbool.h> +#include <sys/stat.h> #include <unistd.h> #include <linux/bpf.h> @@ -18,21 +19,30 @@ #include "test_progs.h" -static bool is_attached(int netns) +static int init_net = -1; + +static __u32 query_attached_prog_id(int netns) { - __u32 cnt; + __u32 prog_ids[1] = {}; + __u32 prog_cnt = ARRAY_SIZE(prog_ids); int err; - err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, NULL, &cnt); + err = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, + prog_ids, &prog_cnt); if (CHECK_FAIL(err)) { perror("bpf_prog_query"); - return true; /* fail-safe */ + return 0; } - return cnt > 0; + return prog_cnt == 1 ? prog_ids[0] : 0; +} + +static bool prog_is_attached(int netns) +{ + return query_attached_prog_id(netns) > 0; } -static int load_prog(void) +static int load_prog(enum bpf_prog_type type) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_0, BPF_OK), @@ -40,61 +50,566 @@ static int load_prog(void) }; int fd; - fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog, - ARRAY_SIZE(prog), "GPL", 0, NULL, 0); + fd = bpf_load_program(type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); if (CHECK_FAIL(fd < 0)) perror("bpf_load_program"); return fd; } -static void do_flow_dissector_reattach(void) +static __u32 query_prog_id(int prog) { - int prog_fd[2] = { -1, -1 }; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; - prog_fd[0] = load_prog(); - if (prog_fd[0] < 0) - return; + err = bpf_obj_get_info_by_fd(prog, &info, &info_len); + if (CHECK_FAIL(err || info_len != sizeof(info))) { + perror("bpf_obj_get_info_by_fd"); + return 0; + } - prog_fd[1] = load_prog(); - if (prog_fd[1] < 0) - goto out_close; + return info.id; +} + +static int unshare_net(int old_net) +{ + int err, new_net; - err = bpf_prog_attach(prog_fd[0], 0, BPF_FLOW_DISSECTOR, 0); + err = unshare(CLONE_NEWNET); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-0"); - goto out_close; + perror("unshare(CLONE_NEWNET)"); + return -1; + } + new_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(new_net < 0)) { + perror("open(/proc/self/ns/net)"); + setns(old_net, CLONE_NEWNET); + return -1; } + return new_net; +} + +static void test_prog_attach_prog_attach(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); /* Expect success when attaching a different program */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(err)) { - perror("bpf_prog_attach-1"); + perror("bpf_prog_attach(prog2) #1"); goto out_detach; } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); /* Expect failure when attaching the same program twice */ - err = bpf_prog_attach(prog_fd[1], 0, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(!err || errno != EINVAL)) - perror("bpf_prog_attach-2"); + perror("bpf_prog_attach(prog2) #2"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); out_detach: err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link1, link2; + + link1 = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when another link exists */ + errno = 0; + link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link2 != -1 || errno != E2BIG)) + perror("bpf_prog_attach(prog2) expected E2BIG"); + if (link2 != -1) + close(link2); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link1); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_link_create(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + err = bpf_prog_attach(prog1, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure creating link when prog attached */ + errno = 0; + link = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link != -1 || errno != EEXIST)) + perror("bpf_link_create(prog2) expected EEXIST"); + if (link != -1) + close(link); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) + perror("bpf_prog_detach"); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_attach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure attaching prog when link exists */ + errno = 0; + err = bpf_prog_attach(prog2, -1, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(!err || errno != EEXIST)) + perror("bpf_prog_attach(prog2) expected EEXIST"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_prog_detach(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure detaching prog when link exists */ + errno = 0; + err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_prog_detach expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_prog_attach_detach_query(int netns, int prog1, int prog2) +{ + int err; + + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach"); + return; + } + + /* Expect no prog attached after successful detach */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_create_close_query(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + int link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + /* Expect no prog attached after closing last link FD */ + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_no_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success replacing the prog when old prog not specified */ + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_replace_old_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success F_REPLACE and old prog specified to succeed */ + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_opts(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail w/ old prog FD but w/o F_REPLACE*/ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = prog1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) { + perror("bpf_link_update expected EINVAL"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail on old prog FD mismatch */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = prog2; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EPERM)) { + perror("bpf_link_update expected EPERM"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail for invalid old prog FD */ + errno = 0; + update_opts.flags = BPF_F_REPLACE; + update_opts.old_prog_fd = -1; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EBADF"); + goto out_close; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect update to fail with invalid flags */ + errno = 0; + update_opts.flags = BPF_F_ALLOW_MULTI; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + +out_close: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_invalid_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, prog3; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect failure when new prog FD is not valid */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, -1, &update_opts); + if (CHECK_FAIL(!err || errno != EBADF)) { + perror("bpf_link_update expected EINVAL"); + goto out_close_link; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + prog3 = load_prog(BPF_PROG_TYPE_SOCKET_FILTER); + if (prog3 < 0) + goto out_close_link; + + /* Expect failure when new prog FD type doesn't match */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog3, &update_opts); + if (CHECK_FAIL(!err || errno != EINVAL)) + perror("bpf_link_update expected EINVAL"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(prog3); +out_close_link: + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + +static void test_link_update_netns_gone(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(netns); + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(CLONE_NEWNET)"); + close(link); + return; + } + + /* Expect failure when netns destroyed */ + errno = 0; + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(!err || errno != ENOLINK)) + perror("bpf_link_update"); + + close(link); +} + +static void test_link_get_info(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + struct bpf_link_info info = {}; + struct stat netns_stat = {}; + __u32 info_len, link_id; + int err, link, old_net; + + old_net = netns; + netns = unshare_net(old_net); + if (netns < 0) + return; + + err = fstat(netns, &netns_stat); + if (CHECK_FAIL(err)) { + perror("stat(netns)"); + goto out_resetns; + } + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + goto out_resetns; + } + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect link info to be sane and match prog and netns details */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id == 0); + CHECK_FAIL(info.prog_id != query_prog_id(prog1)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog2, &update_opts); + if (CHECK_FAIL(err)) { + perror("bpf_link_update(prog2)"); + goto out_unlink; + } + + link_id = info.id; + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect no info change after update except in prog id */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + + /* Leave netns link is attached to and close last FD to it */ + err = setns(old_net, CLONE_NEWNET); + if (CHECK_FAIL(err)) { + perror("setns(NEWNET)"); + goto out_unlink; + } + close(netns); + old_net = -1; + netns = -1; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(link, &info, &info_len); + if (CHECK_FAIL(err)) { + perror("bpf_obj_get_info"); + goto out_unlink; + } + CHECK_FAIL(info_len != sizeof(info)); + + /* Expect netns_ino to change to 0 */ + CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS); + CHECK_FAIL(info.id != link_id); + CHECK_FAIL(info.prog_id != query_prog_id(prog2)); + CHECK_FAIL(info.netns.netns_ino != 0); + CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR); + +out_unlink: + close(link); +out_resetns: + if (old_net != -1) + setns(old_net, CLONE_NEWNET); + if (netns != -1) + close(netns); +} + +static void run_tests(int netns) +{ + struct test { + const char *test_name; + void (*test_func)(int netns, int prog1, int prog2); + } tests[] = { + { "prog attach, prog attach", + test_prog_attach_prog_attach }, + { "link create, link create", + test_link_create_link_create }, + { "prog attach, link create", + test_prog_attach_link_create }, + { "link create, prog attach", + test_link_create_prog_attach }, + { "link create, prog detach", + test_link_create_prog_detach }, + { "prog attach, detach, query", + test_prog_attach_detach_query }, + { "link create, close, query", + test_link_create_close_query }, + { "link update no old prog", + test_link_update_no_old_prog }, + { "link update with replace old prog", + test_link_update_replace_old_prog }, + { "link update invalid opts", + test_link_update_invalid_opts }, + { "link update invalid prog", + test_link_update_invalid_prog }, + { "link update netns gone", + test_link_update_netns_gone }, + { "link get info", + test_link_get_info }, + }; + int i, progs[2] = { -1, -1 }; + char test_name[80]; + + for (i = 0; i < ARRAY_SIZE(progs); i++) { + progs[i] = load_prog(BPF_PROG_TYPE_FLOW_DISSECTOR); + if (progs[i] < 0) + goto out_close; + } + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + snprintf(test_name, sizeof(test_name), + "flow dissector %s%s", + tests[i].test_name, + netns == init_net ? " (init_net)" : ""); + if (test__start_subtest(test_name)) + tests[i].test_func(netns, progs[0], progs[1]); + } out_close: - close(prog_fd[1]); - close(prog_fd[0]); + for (i = 0; i < ARRAY_SIZE(progs); i++) { + if (progs[i] != -1) + CHECK_FAIL(close(progs[i])); + } } void test_flow_dissector_reattach(void) { - int init_net, self_net, err; + int err, new_net, saved_net; - self_net = open("/proc/self/ns/net", O_RDONLY); - if (CHECK_FAIL(self_net < 0)) { + saved_net = open("/proc/self/ns/net", O_RDONLY); + if (CHECK_FAIL(saved_net < 0)) { perror("open(/proc/self/ns/net"); return; } @@ -111,30 +626,29 @@ void test_flow_dissector_reattach(void) goto out_close; } - if (is_attached(init_net)) { + if (prog_is_attached(init_net)) { test__skip(); printf("Can't test with flow dissector attached to init_net\n"); goto out_setns; } /* First run tests in root network namespace */ - do_flow_dissector_reattach(); + run_tests(init_net); /* Then repeat tests in a non-root namespace */ - err = unshare(CLONE_NEWNET); - if (CHECK_FAIL(err)) { - perror("unshare(CLONE_NEWNET)"); + new_net = unshare_net(init_net); + if (new_net < 0) goto out_setns; - } - do_flow_dissector_reattach(); + run_tests(new_net); + close(new_net); out_setns: /* Move back to netns we started in. */ - err = setns(self_net, CLONE_NEWNET); + err = setns(saved_net, CLONE_NEWNET); if (CHECK_FAIL(err)) perror("setns(/proc/self/ns/net)"); out_close: close(init_net); - close(self_net); + close(saved_net); } diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index eba9a970703b..925722217edf 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -82,6 +82,7 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) void test_get_stack_raw_tp(void) { const char *file = "./test_get_stack_rawtp.o"; + const char *file_err = "./test_get_stack_rawtp_err.o"; const char *prog_name = "raw_tracepoint/sys_enter"; int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP; struct perf_buffer_opts pb_opts = {}; @@ -93,6 +94,10 @@ void test_get_stack_raw_tp(void) struct bpf_map *map; cpu_set_t cpu_set; + err = bpf_prog_load(file_err, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err >= 0, "prog_load raw tp", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) return; diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c index c680926fce73..e3cb62b0a110 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void test_global_data_number(struct bpf_object *obj, __u32 duration) { diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c new file mode 100644 index 000000000000..3bdaa5a40744 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +void test_global_data_init(void) +{ + const char *file = "./test_global_data.o"; + int err = -ENOMEM, map_fd, zero = 0; + __u8 *buff = NULL, *newval = NULL; + struct bpf_object *obj; + struct bpf_map *map; + __u32 duration = 0; + size_t sz; + + obj = bpf_object__open_file(file, NULL); + if (CHECK_FAIL(!obj)) + return; + + map = bpf_object__find_map_by_name(obj, "test_glo.rodata"); + if (CHECK_FAIL(!map || !bpf_map__is_internal(map))) + goto out; + + sz = bpf_map__def(map)->value_size; + newval = malloc(sz); + if (CHECK_FAIL(!newval)) + goto out; + + memset(newval, 0, sz); + /* wrong size, should fail */ + err = bpf_map__set_initial_value(map, newval, sz - 1); + if (CHECK(!err, "reject set initial value wrong size", "err %d\n", err)) + goto out; + + err = bpf_map__set_initial_value(map, newval, sz); + if (CHECK(err, "set initial value", "err %d\n", err)) + goto out; + + err = bpf_object__load(obj); + if (CHECK_FAIL(err)) + goto out; + + map_fd = bpf_map__fd(map); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + buff = malloc(sz); + if (buff) + err = bpf_map_lookup_elem(map_fd, &zero, buff); + if (CHECK(!buff || err || memcmp(buff, newval, sz), + "compare .rodata map data override", + "err %d errno %d\n", err, errno)) + goto out; + + memset(newval, 1, sz); + /* object loaded - should fail */ + err = bpf_map__set_initial_value(map, newval, sz); + CHECK(!err, "reject set initial value after load", "err %d\n", err); +out: + free(buff); + free(newval); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/test_hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c index c490e012c23f..428d488830c6 100644 --- a/tools/testing/selftests/bpf/test_hashmap.c +++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c @@ -5,26 +5,17 @@ * * Copyright (c) 2019 Facebook */ -#include <stdio.h> -#include <errno.h> -#include <linux/err.h> +#include "test_progs.h" #include "bpf/hashmap.h" -#define CHECK(condition, format...) ({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) +static int duration = 0; -size_t hash_fn(const void *k, void *ctx) +static size_t hash_fn(const void *k, void *ctx) { return (long)k; } -bool equal_fn(const void *a, const void *b, void *ctx) +static bool equal_fn(const void *a, const void *b, void *ctx) { return (long)a == (long)b; } @@ -49,53 +40,55 @@ static inline size_t exp_cap(size_t sz) #define ELEM_CNT 62 -int test_hashmap_generic(void) +static void test_hashmap_generic(void) { struct hashmap_entry *entry, *tmp; int err, bkt, found_cnt, i; long long found_msk; struct hashmap *map; - fprintf(stderr, "%s: ", __func__); - map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; for (i = 0; i < ELEM_CNT; i++) { const void *oldk, *k = (const void *)(long)i; void *oldv, *v = (void *)(long)(1024 + i); err = hashmap__update(map, k, v, &oldk, &oldv); - if (CHECK(err != -ENOENT, "unexpected result: %d\n", err)) - return 1; + if (CHECK(err != -ENOENT, "hashmap__update", + "unexpected result: %d\n", err)) + goto cleanup; if (i % 2) { err = hashmap__add(map, k, v); } else { err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(oldk != NULL || oldv != NULL, + if (CHECK(oldk != NULL || oldv != NULL, "check_kv", "unexpected k/v: %p=%p\n", oldk, oldv)) - return 1; + goto cleanup; } - if (CHECK(err, "failed to add k/v %ld = %ld: %d\n", + if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n", (long)k, (long)v, err)) - return 1; + goto cleanup; - if (CHECK(!hashmap__find(map, k, &oldv), + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; } - if (CHECK(hashmap__size(map) != ELEM_CNT, + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap_cap", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; found_msk = 0; hashmap__for_each_entry(map, entry, bkt) { @@ -103,42 +96,47 @@ int test_hashmap_generic(void) long v = (long)entry->value; found_msk |= 1ULL << k; - if (CHECK(v - k != 1024, "invalid k/v pair: %ld = %ld\n", k, v)) - return 1; + if (CHECK(v - k != 1024, "check_kv", + "invalid k/v pair: %ld = %ld\n", k, v)) + goto cleanup; } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", "not all keys iterated: %llx\n", found_msk)) - return 1; + goto cleanup; for (i = 0; i < ELEM_CNT; i++) { const void *oldk, *k = (const void *)(long)i; void *oldv, *v = (void *)(long)(256 + i); err = hashmap__add(map, k, v); - if (CHECK(err != -EEXIST, "unexpected add result: %d\n", err)) - return 1; + if (CHECK(err != -EEXIST, "hashmap__add", + "unexpected add result: %d\n", err)) + goto cleanup; if (i % 2) err = hashmap__update(map, k, v, &oldk, &oldv); else err = hashmap__set(map, k, v, &oldk, &oldv); - if (CHECK(err, "failed to update k/v %ld = %ld: %d\n", - (long)k, (long)v, err)) - return 1; - if (CHECK(!hashmap__find(map, k, &oldv), + if (CHECK(err, "elem_upd", + "failed to update k/v %ld = %ld: %d\n", + (long)k, (long)v, err)) + goto cleanup; + if (CHECK(!hashmap__find(map, k, &oldv), "elem_find", "failed to find key %ld\n", (long)k)) - return 1; - if (CHECK(oldv != v, "found value is wrong: %ld\n", (long)oldv)) - return 1; + goto cleanup; + if (CHECK(oldv != v, "elem_val", + "found value is wrong: %ld\n", (long)oldv)) + goto cleanup; } - if (CHECK(hashmap__size(map) != ELEM_CNT, + if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size", "invalid updated map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; found_msk = 0; hashmap__for_each_entry_safe(map, entry, tmp, bkt) { @@ -146,20 +144,21 @@ int test_hashmap_generic(void) long v = (long)entry->value; found_msk |= 1ULL << k; - if (CHECK(v - k != 256, + if (CHECK(v - k != 256, "elem_check", "invalid updated k/v pair: %ld = %ld\n", k, v)) - return 1; + goto cleanup; } - if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, + if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt", "not all keys iterated after update: %llx\n", found_msk)) - return 1; + goto cleanup; found_cnt = 0; hashmap__for_each_key_entry(map, entry, (void *)0) { found_cnt++; } - if (CHECK(!found_cnt, "didn't find any entries for key 0\n")) - return 1; + if (CHECK(!found_cnt, "found_cnt", + "didn't find any entries for key 0\n")) + goto cleanup; found_msk = 0; found_cnt = 0; @@ -173,30 +172,31 @@ int test_hashmap_generic(void) found_cnt++; found_msk |= 1ULL << (long)k; - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", "failed to delete k/v %ld = %ld\n", (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, + goto cleanup; + if (CHECK(oldk != k || oldv != v, "check_old", "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n", (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", "unexpectedly deleted k/v %ld = %ld\n", (long)oldk, (long)oldv)) - return 1; + goto cleanup; } - if (CHECK(!found_cnt || !found_msk, + if (CHECK(!found_cnt || !found_msk, "found_entries", "didn't delete any key entries\n")) - return 1; - if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, + goto cleanup; + if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt", "invalid updated map size (already deleted: %d): %zu\n", found_cnt, hashmap__size(map))) - return 1; + goto cleanup; if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)), + "hashmap__capacity", "unexpected map capacity: %zu\n", hashmap__capacity(map))) - return 1; + goto cleanup; hashmap__for_each_entry_safe(map, entry, tmp, bkt) { const void *oldk, *k; @@ -208,53 +208,56 @@ int test_hashmap_generic(void) found_cnt++; found_msk |= 1ULL << (long)k; - if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), + if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del", "failed to delete k/v %ld = %ld\n", (long)k, (long)v)) - return 1; - if (CHECK(oldk != k || oldv != v, + goto cleanup; + if (CHECK(oldk != k || oldv != v, "elem_check", "invalid old k/v: expect %ld = %ld, got %ld = %ld\n", (long)k, (long)v, (long)oldk, (long)oldv)) - return 1; - if (CHECK(hashmap__delete(map, k, &oldk, &oldv), + goto cleanup; + if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del", "unexpectedly deleted k/v %ld = %ld\n", (long)k, (long)v)) - return 1; + goto cleanup; } if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1, + "found_cnt", "not all keys were deleted: found_cnt:%d, found_msk:%llx\n", found_cnt, found_msk)) - return 1; - if (CHECK(hashmap__size(map) != 0, + goto cleanup; + if (CHECK(hashmap__size(map) != 0, "hashmap__size", "invalid updated map size (already deleted: %d): %zu\n", found_cnt, hashmap__size(map))) - return 1; + goto cleanup; found_cnt = 0; hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; } - hashmap__free(map); + hashmap__clear(map); hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected map entries left: %ld = %ld\n", - (long)entry->key, (long)entry->value); - return 1; + CHECK(false, "elem_exists", + "unexpected map entries left: %ld = %ld\n", + (long)entry->key, (long)entry->value); + goto cleanup; } - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -size_t collision_hash_fn(const void *k, void *ctx) +static size_t collision_hash_fn(const void *k, void *ctx) { return 0; } -int test_hashmap_multimap(void) +static void test_hashmap_multimap(void) { void *k1 = (void *)0, *k2 = (void *)1; struct hashmap_entry *entry; @@ -262,121 +265,116 @@ int test_hashmap_multimap(void) long found_msk; int err, bkt; - fprintf(stderr, "%s: ", __func__); - /* force collisions */ map = hashmap__new(collision_hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; - + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + return; /* set up multimap: * [0] -> 1, 2, 4; * [1] -> 8, 16, 32; */ err = hashmap__append(map, k1, (void *)1); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k1, (void *)2); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k1, (void *)4); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, (void *)8); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, (void *)16); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; err = hashmap__append(map, k2, (void *)32); - if (CHECK(err, "failed to add k/v: %d\n", err)) - return 1; + if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err)) + goto cleanup; - if (CHECK(hashmap__size(map) != 6, + if (CHECK(hashmap__size(map) != 6, "hashmap_size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; + goto cleanup; /* verify global iteration still works and sees all values */ found_msk = 0; hashmap__for_each_entry(map, entry, bkt) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (1 << 6) - 1, + if (CHECK(found_msk != (1 << 6) - 1, "found_msk", "not all keys iterated: %lx\n", found_msk)) - return 1; + goto cleanup; /* iterate values for key 1 */ found_msk = 0; hashmap__for_each_key_entry(map, entry, k1) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (1 | 2 | 4), + if (CHECK(found_msk != (1 | 2 | 4), "found_msk", "invalid k1 values: %lx\n", found_msk)) - return 1; + goto cleanup; /* iterate values for key 2 */ found_msk = 0; hashmap__for_each_key_entry(map, entry, k2) { found_msk |= (long)entry->value; } - if (CHECK(found_msk != (8 | 16 | 32), + if (CHECK(found_msk != (8 | 16 | 32), "found_msk", "invalid k2 values: %lx\n", found_msk)) - return 1; + goto cleanup; - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -int test_hashmap_empty() +static void test_hashmap_empty() { struct hashmap_entry *entry; int bkt; struct hashmap *map; void *k = (void *)0; - fprintf(stderr, "%s: ", __func__); - /* force collisions */ map = hashmap__new(hash_fn, equal_fn, NULL); - if (CHECK(IS_ERR(map), "failed to create map: %ld\n", PTR_ERR(map))) - return 1; + if (CHECK(IS_ERR(map), "hashmap__new", + "failed to create map: %ld\n", PTR_ERR(map))) + goto cleanup; - if (CHECK(hashmap__size(map) != 0, + if (CHECK(hashmap__size(map) != 0, "hashmap__size", "invalid map size: %zu\n", hashmap__size(map))) - return 1; - if (CHECK(hashmap__capacity(map) != 0, + goto cleanup; + if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity", "invalid map capacity: %zu\n", hashmap__capacity(map))) - return 1; - if (CHECK(hashmap__find(map, k, NULL), "unexpected find\n")) - return 1; - if (CHECK(hashmap__delete(map, k, NULL, NULL), "unexpected delete\n")) - return 1; + goto cleanup; + if (CHECK(hashmap__find(map, k, NULL), "elem_find", + "unexpected find\n")) + goto cleanup; + if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del", + "unexpected delete\n")) + goto cleanup; hashmap__for_each_entry(map, entry, bkt) { - CHECK(false, "unexpected iterated entry\n"); - return 1; + CHECK(false, "elem_found", "unexpected iterated entry\n"); + goto cleanup; } hashmap__for_each_key_entry(map, entry, k) { - CHECK(false, "unexpected key entry\n"); - return 1; + CHECK(false, "key_found", "unexpected key entry\n"); + goto cleanup; } - fprintf(stderr, "OK\n"); - return 0; +cleanup: + hashmap__free(map); } -int main(int argc, char **argv) +void test_hashmap() { - bool failed = false; - - if (test_hashmap_generic()) - failed = true; - if (test_hashmap_multimap()) - failed = true; - if (test_hashmap_empty()) - failed = true; - - return failed; + if (test__start_subtest("generic")) + test_hashmap_generic(); + if (test__start_subtest("multimap")) + test_hashmap_multimap(); + if (test__start_subtest("empty")) + test_hashmap_empty(); } diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 7507c8f689bc..42c3a3103c26 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> struct meta { int ifindex; diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index eaf64595be88..c2d373e294bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void test_l4lb(const char *file) { diff --git a/tools/testing/selftests/bpf/prog_tests/link_pinning.c b/tools/testing/selftests/bpf/prog_tests/link_pinning.c new file mode 100644 index 000000000000..a743288cf384 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/link_pinning.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include <sys/stat.h> + +#include "test_link_pinning.skel.h" + +static int duration = 0; + +void test_link_pinning_subtest(struct bpf_program *prog, + struct test_link_pinning__bss *bss) +{ + const char *link_pin_path = "/sys/fs/bpf/pinned_link_test"; + struct stat statbuf = {}; + struct bpf_link *link; + int err, i; + + link = bpf_program__attach(prog); + if (CHECK(IS_ERR(link), "link_attach", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + bss->in = 1; + usleep(1); + CHECK(bss->out != 1, "res_check1", "exp %d, got %d\n", 1, bss->out); + + /* pin link */ + err = bpf_link__pin(link, link_pin_path); + if (CHECK(err, "link_pin", "err: %d\n", err)) + goto cleanup; + + CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path1", + "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link)); + + /* check that link was pinned */ + err = stat(link_pin_path, &statbuf); + if (CHECK(err, "stat_link", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss->in = 2; + usleep(1); + CHECK(bss->out != 2, "res_check2", "exp %d, got %d\n", 2, bss->out); + + /* destroy link, pinned link should keep program attached */ + bpf_link__destroy(link); + link = NULL; + + bss->in = 3; + usleep(1); + CHECK(bss->out != 3, "res_check3", "exp %d, got %d\n", 3, bss->out); + + /* re-open link from BPFFS */ + link = bpf_link__open(link_pin_path); + if (CHECK(IS_ERR(link), "link_open", "err: %ld\n", PTR_ERR(link))) + goto cleanup; + + CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path2", + "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link)); + + /* unpin link from BPFFS, program still attached */ + err = bpf_link__unpin(link); + if (CHECK(err, "link_unpin", "err: %d\n", err)) + goto cleanup; + + /* still active, as we have FD open now */ + bss->in = 4; + usleep(1); + CHECK(bss->out != 4, "res_check4", "exp %d, got %d\n", 4, bss->out); + + bpf_link__destroy(link); + link = NULL; + + /* Validate it's finally detached. + * Actual detachment might get delayed a bit, so there is no reliable + * way to validate it immediately here, let's count up for long enough + * and see if eventually output stops being updated + */ + for (i = 5; i < 10000; i++) { + bss->in = i; + usleep(1); + if (bss->out == i - 1) + break; + } + CHECK(i == 10000, "link_attached", "got to iteration #%d\n", i); + +cleanup: + if (!IS_ERR(link)) + bpf_link__destroy(link); +} + +void test_link_pinning(void) +{ + struct test_link_pinning* skel; + + skel = test_link_pinning__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + if (test__start_subtest("pin_raw_tp")) + test_link_pinning_subtest(skel->progs.raw_tp_prog, skel->bss); + if (test__start_subtest("pin_tp_btf")) + test_link_pinning_subtest(skel->progs.tp_btf_prog, skel->bss); + + test_link_pinning__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c index 8f91f1881d11..ce17b1ed8709 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} static void *parallel_map_access(void *arg) { diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 16a814eb4d64..43d0b5578f46 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -19,15 +19,16 @@ void test_mmap(void) const size_t map_sz = roundup_page(sizeof(struct map_data)); const int zero = 0, one = 1, two = 2, far = 1500; const long page_size = sysconf(_SC_PAGE_SIZE); - int err, duration = 0, i, data_map_fd; + int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd, rdmap_fd; struct bpf_map *data_map, *bss_map; void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2; struct test_mmap__bss *bss_data; + struct bpf_map_info map_info; + __u32 map_info_sz = sizeof(map_info); struct map_data *map_data; struct test_mmap *skel; __u64 val = 0; - skel = test_mmap__open_and_load(); if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) return; @@ -36,6 +37,25 @@ void test_mmap(void) data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); + rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); + tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { + munmap(tmp1, 4096); + goto cleanup; + } + /* now double-check if it's mmap()'able at all */ + tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) + goto cleanup; + + /* get map's ID */ + memset(&map_info, 0, map_info_sz); + err = bpf_obj_get_info_by_fd(data_map_fd, &map_info, &map_info_sz); + if (CHECK(err, "map_get_info", "failed %d\n", errno)) + goto cleanup; + data_map_id = map_info.id; + + /* mmap BSS map */ bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED, bpf_map__fd(bss_map), 0); if (CHECK(bss_mmaped == MAP_FAILED, "bss_mmap", @@ -98,6 +118,10 @@ void test_mmap(void) "data_map freeze succeeded: err=%d, errno=%d\n", err, errno)) goto cleanup; + err = mprotect(map_mmaped, map_sz, PROT_READ); + if (CHECK(err, "mprotect_ro", "mprotect to r/o failed %d\n", errno)) + goto cleanup; + /* unmap R/W mapping */ err = munmap(map_mmaped, map_sz); map_mmaped = NULL; @@ -111,6 +135,12 @@ void test_mmap(void) map_mmaped = NULL; goto cleanup; } + err = mprotect(map_mmaped, map_sz, PROT_WRITE); + if (CHECK(!err, "mprotect_wr", "mprotect() succeeded unexpectedly!\n")) + goto cleanup; + err = mprotect(map_mmaped, map_sz, PROT_EXEC); + if (CHECK(!err, "mprotect_ex", "mprotect() succeeded unexpectedly!\n")) + goto cleanup; map_data = map_mmaped; /* map/unmap in a loop to test ref counting */ @@ -197,6 +227,53 @@ void test_mmap(void) CHECK_FAIL(map_data->val[far] != 3 * 321); munmap(tmp2, 4 * page_size); + + /* map all 4 pages, but with pg_off=1 page, should fail */ + tmp1 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, + data_map_fd, page_size /* initial page shift */); + if (CHECK(tmp1 != MAP_FAILED, "adv_mmap7", "unexpected success")) { + munmap(tmp1, 4 * page_size); + goto cleanup; + } + + tmp1 = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, data_map_fd, 0); + if (CHECK(tmp1 == MAP_FAILED, "last_mmap", "failed %d\n", errno)) + goto cleanup; + + test_mmap__destroy(skel); + skel = NULL; + CHECK_FAIL(munmap(bss_mmaped, bss_sz)); + bss_mmaped = NULL; + CHECK_FAIL(munmap(map_mmaped, map_sz)); + map_mmaped = NULL; + + /* map should be still held by active mmap */ + tmp_fd = bpf_map_get_fd_by_id(data_map_id); + if (CHECK(tmp_fd < 0, "get_map_by_id", "failed %d\n", errno)) { + munmap(tmp1, map_sz); + goto cleanup; + } + close(tmp_fd); + + /* this should release data map finally */ + munmap(tmp1, map_sz); + + /* we need to wait for RCU grace period */ + for (i = 0; i < 10000; i++) { + __u32 id = data_map_id - 1; + if (bpf_map_get_next_id(id, &id) || id > data_map_id) + break; + usleep(1); + } + + /* should fail to get map FD by non-existing ID */ + tmp_fd = bpf_map_get_fd_by_id(data_map_id); + if (CHECK(tmp_fd >= 0, "get_map_by_id_after", + "unexpectedly succeeded %d\n", tmp_fd)) { + close(tmp_fd); + goto cleanup; + } + cleanup: if (bss_mmaped) CHECK_FAIL(munmap(bss_mmaped, bss_sz)); diff --git a/tools/testing/selftests/bpf/prog_tests/modify_return.c b/tools/testing/selftests/bpf/prog_tests/modify_return.c new file mode 100644 index 000000000000..97fec70c600b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/modify_return.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include <test_progs.h> +#include "modify_return.skel.h" + +#define LOWER(x) ((x) & 0xffff) +#define UPPER(x) ((x) >> 16) + + +static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret) +{ + struct modify_return *skel = NULL; + int err, prog_fd; + __u32 duration = 0, retval; + __u16 side_effect; + __s16 ret; + + skel = modify_return__open_and_load(); + if (CHECK(!skel, "skel_load", "modify_return skeleton failed\n")) + goto cleanup; + + err = modify_return__attach(skel); + if (CHECK(err, "modify_return", "attach failed: %d\n", err)) + goto cleanup; + + skel->bss->input_retval = input_retval; + prog_fd = bpf_program__fd(skel->progs.fmod_ret_test); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, 0, + &retval, &duration); + + CHECK(err, "test_run", "err %d errno %d\n", err, errno); + + side_effect = UPPER(retval); + ret = LOWER(retval); + + CHECK(ret != want_ret, "test_run", + "unexpected ret: %d, expected: %d\n", ret, want_ret); + CHECK(side_effect != want_side_effect, "modify_return", + "unexpected side_effect: %d\n", side_effect); + + CHECK(skel->bss->fentry_result != 1, "modify_return", + "fentry failed\n"); + CHECK(skel->bss->fexit_result != 1, "modify_return", + "fexit failed\n"); + CHECK(skel->bss->fmod_ret_result != 1, "modify_return", + "fmod_ret failed\n"); + +cleanup: + modify_return__destroy(skel); +} + +void test_modify_return(void) +{ + run_test(0 /* input_retval */, + 1 /* want_side_effect */, + 4 /* want_ret */); + run_test(-EINVAL /* input_retval */, + 0 /* want_side_effect */, + -EINVAL /* want_ret */); +} + diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c new file mode 100644 index 000000000000..e74dc501b27f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */ +#include <test_progs.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/syscall.h> + +struct bss { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +}; + +void test_ns_current_pid_tgid(void) +{ + const char *probe_name = "raw_tracepoint/sys_enter"; + const char *file = "test_ns_current_pid_tgid.o"; + int err, key = 0, duration = 0; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct bpf_object *obj; + struct bss bss; + struct stat st; + __u64 id; + + obj = bpf_object__open_file(file, NULL); + if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) + return; + + err = bpf_object__load(obj); + if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); + if (CHECK(!bss_map, "find_bss_map", "failed\n")) + goto cleanup; + + prog = bpf_object__find_program_by_title(obj, probe_name); + if (CHECK(!prog, "find_prog", "prog '%s' not found\n", + probe_name)) + goto cleanup; + + memset(&bss, 0, sizeof(bss)); + pid_t tid = syscall(SYS_gettid); + pid_t pid = getpid(); + + id = (__u64) tid << 32 | pid; + bss.user_pid_tgid = id; + + if (CHECK_FAIL(stat("/proc/self/ns/pid", &st))) { + perror("Failed to stat /proc/self/ns/pid"); + goto cleanup; + } + + bss.dev = st.st_dev; + bss.ino = st.st_ino; + + err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); + if (CHECK(err, "setting_bss", "failed to set bss : %d\n", err)) + goto cleanup; + + link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); + if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", + PTR_ERR(link))) { + link = NULL; + goto cleanup; + } + + /* trigger some syscalls */ + usleep(1); + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); + if (CHECK(err, "set_bss", "failed to get bss : %d\n", err)) + goto cleanup; + + if (CHECK(id != bss.pid_tgid, "Compare user pid/tgid vs. bpf pid/tgid", + "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) + goto cleanup; +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c new file mode 100644 index 000000000000..e35c444902a7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <sys/socket.h> +#include <test_progs.h> +#include "bpf/libbpf_internal.h" +#include "test_perf_branches.skel.h" + +static void check_good_sample(struct test_perf_branches *skel) +{ + int written_global = skel->bss->written_global_out; + int required_size = skel->bss->required_size_out; + int written_stack = skel->bss->written_stack_out; + int pbe_size = sizeof(struct perf_branch_entry); + int duration = 0; + + if (CHECK(!skel->bss->valid, "output not valid", + "no valid sample from prog")) + return; + + /* + * It's hard to validate the contents of the branch entries b/c it + * would require some kind of disassembler and also encoding the + * valid jump instructions for supported architectures. So just check + * the easy stuff for now. + */ + CHECK(required_size <= 0, "read_branches_size", "err %d\n", required_size); + CHECK(written_stack < 0, "read_branches_stack", "err %d\n", written_stack); + CHECK(written_stack % pbe_size != 0, "read_branches_stack", + "stack bytes written=%d not multiple of struct size=%d\n", + written_stack, pbe_size); + CHECK(written_global < 0, "read_branches_global", "err %d\n", written_global); + CHECK(written_global % pbe_size != 0, "read_branches_global", + "global bytes written=%d not multiple of struct size=%d\n", + written_global, pbe_size); + CHECK(written_global < written_stack, "read_branches_size", + "written_global=%d < written_stack=%d\n", written_global, written_stack); +} + +static void check_bad_sample(struct test_perf_branches *skel) +{ + int written_global = skel->bss->written_global_out; + int required_size = skel->bss->required_size_out; + int written_stack = skel->bss->written_stack_out; + int duration = 0; + + if (CHECK(!skel->bss->valid, "output not valid", + "no valid sample from prog")) + return; + + CHECK((required_size != -EINVAL && required_size != -ENOENT), + "read_branches_size", "err %d\n", required_size); + CHECK((written_stack != -EINVAL && written_stack != -ENOENT), + "read_branches_stack", "written %d\n", written_stack); + CHECK((written_global != -EINVAL && written_global != -ENOENT), + "read_branches_global", "written %d\n", written_global); +} + +static void test_perf_branches_common(int perf_fd, + void (*cb)(struct test_perf_branches *)) +{ + struct test_perf_branches *skel; + int err, i, duration = 0; + bool detached = false; + struct bpf_link *link; + volatile int j = 0; + cpu_set_t cpu_set; + + skel = test_perf_branches__open_and_load(); + if (CHECK(!skel, "test_perf_branches_load", + "perf_branches skeleton failed\n")) + return; + + /* attach perf_event */ + link = bpf_program__attach_perf_event(skel->progs.perf_branches, perf_fd); + if (CHECK(IS_ERR(link), "attach_perf_event", "err %ld\n", PTR_ERR(link))) + goto out_destroy_skel; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err)) + goto out_destroy; + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; + + test_perf_branches__detach(skel); + detached = true; + + cb(skel); +out_destroy: + bpf_link__destroy(link); +out_destroy_skel: + if (!detached) + test_perf_branches__detach(skel); + test_perf_branches__destroy(skel); +} + +static void test_perf_branches_hw(void) +{ + struct perf_event_attr attr = {0}; + int duration = 0; + int pfd; + + /* create perf event */ + attr.size = sizeof(attr); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.freq = 1; + attr.sample_freq = 4000; + attr.sample_type = PERF_SAMPLE_BRANCH_STACK; + attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + + /* + * Some setups don't support branch records (virtual machines, !x86), + * so skip test in this case. + */ + if (pfd == -1) { + if (errno == ENOENT || errno == EOPNOTSUPP) { + printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n", + __func__); + test__skip(); + return; + } + if (CHECK(pfd < 0, "perf_event_open", "err %d errno %d\n", + pfd, errno)) + return; + } + + test_perf_branches_common(pfd, check_good_sample); + + close(pfd); +} + +/* + * Tests negative case -- run bpf_read_branch_records() on improperly configured + * perf event. + */ +static void test_perf_branches_no_hw(void) +{ + struct perf_event_attr attr = {0}; + int duration = 0; + int pfd; + + /* create perf event */ + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (CHECK(pfd < 0, "perf_event_open", "err %d\n", pfd)) + return; + + test_perf_branches_common(pfd, check_bad_sample); + + close(pfd); +} + +void test_perf_branches(void) +{ + if (test__start_subtest("perf_branches_hw")) + test_perf_branches_hw(); + if (test__start_subtest("perf_branches_no_hw")) + test_perf_branches_no_hw(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c index 1450ea2dd4cc..a122ce3b360e 100644 --- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -6,6 +6,11 @@ #include <test_progs.h> #include "bpf/libbpf_internal.h" +/* AddressSanitizer sometimes crashes due to data dereference below, due to + * this being mmap()'ed memory. Disable instrumentation with + * no_sanitize_address attribute + */ +__attribute__((no_sanitize_address)) static void on_sample(void *ctx, int cpu, void *data, __u32 size) { int cpu_data = *(int *)data, duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c index a2537dfa899c..44b514fabccd 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_pkt_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c index 5f7aea605019..939015cd6dba 100644 --- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c +++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_pkt_md_access(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 5dd89b941f53..dde2b7ae7bc9 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_prog_run_xattr(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index faccc66f4e39..f47e7b1cb32c 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> enum { QUEUE, diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c new file mode 100644 index 000000000000..2bba908dfa63 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <linux/compiler.h> +#include <asm/barrier.h> +#include <test_progs.h> +#include <sys/mman.h> +#include <sys/epoll.h> +#include <time.h> +#include <sched.h> +#include <signal.h> +#include <pthread.h> +#include <sys/sysinfo.h> +#include <linux/perf_event.h> +#include <linux/ring_buffer.h> +#include "test_ringbuf.skel.h" + +#define EDONE 7777 + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static volatile int sample_cnt; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct sample *s = data; + + sample_cnt++; + + switch (s->seq) { + case 0: + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + return 0; + case 1: + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + return -EDONE; + default: + /* we don't care about the rest */ + return 0; + } +} + +static struct test_ringbuf *skel; +static struct ring_buffer *ringbuf; + +static void trigger_samples() +{ + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + /* trigger exactly two samples */ + skel->bss->value = 333; + syscall(__NR_getpgid); + skel->bss->value = 777; + syscall(__NR_getpgid); +} + +static void *poll_thread(void *input) +{ + long timeout = (long)input; + + return (void *)(long)ring_buffer__poll(ringbuf, timeout); +} + +void test_ringbuf(void) +{ + const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); + pthread_t thread; + long bg_ret = -1; + int err; + + skel = test_ringbuf__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), + process_sample, NULL, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = test_ringbuf__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + trigger_samples(); + + /* 2 submitted + 1 discarded records */ + CHECK(skel->bss->avail_data != 3 * rec_sz, + "err_avail_size", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->avail_data); + CHECK(skel->bss->ring_size != 4096, + "err_ring_size", "exp %ld, got %ld\n", + 4096L, skel->bss->ring_size); + CHECK(skel->bss->cons_pos != 0, + "err_cons_pos", "exp %ld, got %ld\n", + 0L, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos != 3 * rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->prod_pos); + + /* poll for samples */ + err = ring_buffer__poll(ringbuf, -1); + + /* -EDONE is used as an indicator that we are done */ + if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) + goto cleanup; + + /* we expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* now validate consumer position is updated and returned */ + trigger_samples(); + CHECK(skel->bss->cons_pos != 3 * rec_sz, + "err_cons_pos", "exp %ld, got %ld\n", + 3L * rec_sz, skel->bss->cons_pos); + err = ring_buffer__poll(ringbuf, -1); + CHECK(err <= 0, "poll_err", "err %d\n", err); + + /* start poll in background w/ long timeout */ + err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); + if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err)) + goto cleanup; + + /* turn off notifications now */ + skel->bss->flags = BPF_RB_NO_WAKEUP; + + /* give background thread a bit of a time */ + usleep(50000); + trigger_samples(); + /* sleeping arbitrarily is bad, but no better way to know that + * epoll_wait() **DID NOT** unblock in background thread + */ + usleep(50000); + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + /* clear flags to return to "adaptive" notification mode */ + skel->bss->flags = 0; + + /* produce new samples, no notification should be triggered, because + * consumer is now behind + */ + trigger_samples(); + + /* background poll should still be blocked */ + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) + goto cleanup; + + /* now force notifications */ + skel->bss->flags = BPF_RB_FORCE_WAKEUP; + sample_cnt = 0; + trigger_samples(); + + /* now we should get a pending notification */ + usleep(50000); + err = pthread_tryjoin_np(thread, (void **)&bg_ret); + if (CHECK(err, "join_bg", "err %d\n", err)) + goto cleanup; + + if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + goto cleanup; + + /* 3 rounds, 2 samples each */ + CHECK(sample_cnt != 6, "wrong_sample_cnt", + "expected to see %d samples, got %d\n", 6, sample_cnt); + + /* BPF side did everything right */ + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", + 1L, skel->bss->discarded); + + test_ringbuf__detach(skel); +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c new file mode 100644 index 000000000000..78e450609803 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <test_progs.h> +#include <sys/epoll.h> +#include "test_ringbuf_multi.skel.h" + +static int duration = 0; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + int ring = (unsigned long)ctx; + struct sample *s = data; + + switch (s->seq) { + case 0: + CHECK(ring != 1, "sample1_ring", "exp %d, got %d\n", 1, ring); + CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n", + 333L, s->value); + break; + case 1: + CHECK(ring != 2, "sample2_ring", "exp %d, got %d\n", 2, ring); + CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n", + 777L, s->value); + break; + default: + CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n", + s->seq, s->value); + return -1; + } + + return 0; +} + +void test_ringbuf_multi(void) +{ + struct test_ringbuf_multi *skel; + struct ring_buffer *ringbuf; + int err; + + skel = test_ringbuf_multi__open_and_load(); + if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + return; + + /* only trigger BPF program for current process */ + skel->bss->pid = getpid(); + + ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf1), + process_sample, (void *)(long)1, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + err = ring_buffer__add(ringbuf, bpf_map__fd(skel->maps.ringbuf2), + process_sample, (void *)(long)2); + if (CHECK(err, "ringbuf_add", "failed to add another ring\n")) + goto cleanup; + + err = test_ringbuf_multi__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + /* trigger few samples, some will be skipped */ + skel->bss->target_ring = 0; + skel->bss->value = 333; + syscall(__NR_getpgid); + + /* skipped, no ringbuf in slot 1 */ + skel->bss->target_ring = 1; + skel->bss->value = 555; + syscall(__NR_getpgid); + + skel->bss->target_ring = 2; + skel->bss->value = 777; + syscall(__NR_getpgid); + + /* poll for samples, should get 2 ringbufs back */ + err = ring_buffer__poll(ringbuf, -1); + if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + goto cleanup; + + /* expect extra polling to return nothing */ + err = ring_buffer__poll(ringbuf, 0); + if (CHECK(err < 0, "extra_samples", "poll result: %d\n", err)) + goto cleanup; + + CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", + 0L, skel->bss->dropped); + CHECK(skel->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n", + 1L, skel->bss->skipped); + CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n", + 2L, skel->bss->total); + +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf_multi__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/section_names.c b/tools/testing/selftests/bpf/prog_tests/section_names.c index 9d9351dc2ded..713167449c98 100644 --- a/tools/testing/selftests/bpf/prog_tests/section_names.c +++ b/tools/testing/selftests/bpf/prog_tests/section_names.c @@ -43,18 +43,18 @@ static struct sec_name_test tests[] = { {"lwt_seg6local", {0, BPF_PROG_TYPE_LWT_SEG6LOCAL, 0}, {-EINVAL, 0} }, { "cgroup_skb/ingress", - {0, BPF_PROG_TYPE_CGROUP_SKB, 0}, + {0, BPF_PROG_TYPE_CGROUP_SKB, BPF_CGROUP_INET_INGRESS}, {0, BPF_CGROUP_INET_INGRESS}, }, { "cgroup_skb/egress", - {0, BPF_PROG_TYPE_CGROUP_SKB, 0}, + {0, BPF_PROG_TYPE_CGROUP_SKB, BPF_CGROUP_INET_EGRESS}, {0, BPF_CGROUP_INET_EGRESS}, }, {"cgroup/skb", {0, BPF_PROG_TYPE_CGROUP_SKB, 0}, {-EINVAL, 0} }, { "cgroup/sock", - {0, BPF_PROG_TYPE_CGROUP_SOCK, 0}, + {0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE}, {0, BPF_CGROUP_INET_SOCK_CREATE}, }, { @@ -69,26 +69,38 @@ static struct sec_name_test tests[] = { }, { "cgroup/dev", - {0, BPF_PROG_TYPE_CGROUP_DEVICE, 0}, + {0, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_CGROUP_DEVICE}, {0, BPF_CGROUP_DEVICE}, }, - {"sockops", {0, BPF_PROG_TYPE_SOCK_OPS, 0}, {0, BPF_CGROUP_SOCK_OPS} }, + { + "sockops", + {0, BPF_PROG_TYPE_SOCK_OPS, BPF_CGROUP_SOCK_OPS}, + {0, BPF_CGROUP_SOCK_OPS}, + }, { "sk_skb/stream_parser", - {0, BPF_PROG_TYPE_SK_SKB, 0}, + {0, BPF_PROG_TYPE_SK_SKB, BPF_SK_SKB_STREAM_PARSER}, {0, BPF_SK_SKB_STREAM_PARSER}, }, { "sk_skb/stream_verdict", - {0, BPF_PROG_TYPE_SK_SKB, 0}, + {0, BPF_PROG_TYPE_SK_SKB, BPF_SK_SKB_STREAM_VERDICT}, {0, BPF_SK_SKB_STREAM_VERDICT}, }, {"sk_skb", {0, BPF_PROG_TYPE_SK_SKB, 0}, {-EINVAL, 0} }, - {"sk_msg", {0, BPF_PROG_TYPE_SK_MSG, 0}, {0, BPF_SK_MSG_VERDICT} }, - {"lirc_mode2", {0, BPF_PROG_TYPE_LIRC_MODE2, 0}, {0, BPF_LIRC_MODE2} }, + { + "sk_msg", + {0, BPF_PROG_TYPE_SK_MSG, BPF_SK_MSG_VERDICT}, + {0, BPF_SK_MSG_VERDICT}, + }, + { + "lirc_mode2", + {0, BPF_PROG_TYPE_LIRC_MODE2, BPF_LIRC_MODE2}, + {0, BPF_LIRC_MODE2}, + }, { "flow_dissector", - {0, BPF_PROG_TYPE_FLOW_DISSECTOR, 0}, + {0, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_FLOW_DISSECTOR}, {0, BPF_FLOW_DISSECTOR}, }, { @@ -158,17 +170,17 @@ static void test_prog_type_by_name(const struct sec_name_test *test) &expected_attach_type); CHECK(rc != test->expected_load.rc, "check_code", - "prog: unexpected rc=%d for %s", rc, test->sec_name); + "prog: unexpected rc=%d for %s\n", rc, test->sec_name); if (rc) return; CHECK(prog_type != test->expected_load.prog_type, "check_prog_type", - "prog: unexpected prog_type=%d for %s", + "prog: unexpected prog_type=%d for %s\n", prog_type, test->sec_name); CHECK(expected_attach_type != test->expected_load.expected_attach_type, - "check_attach_type", "prog: unexpected expected_attach_type=%d for %s", + "check_attach_type", "prog: unexpected expected_attach_type=%d for %s\n", expected_attach_type, test->sec_name); } @@ -180,13 +192,13 @@ static void test_attach_type_by_name(const struct sec_name_test *test) rc = libbpf_attach_type_by_name(test->sec_name, &attach_type); CHECK(rc != test->expected_attach.rc, "check_ret", - "attach: unexpected rc=%d for %s", rc, test->sec_name); + "attach: unexpected rc=%d for %s\n", rc, test->sec_name); if (rc) return; CHECK(attach_type != test->expected_attach.attach_type, - "check_attach_type", "attach: unexpected attach_type=%d for %s", + "check_attach_type", "attach: unexpected attach_type=%d for %s\n", attach_type, test->sec_name); } diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 0800036ed654..821b4146b7b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -36,6 +36,7 @@ static int result_map, tmp_index_ovr_map, linum_map, data_check_map; static __u32 expected_results[NR_RESULTS]; static int sk_fds[REUSEPORT_ARRAY_SIZE]; static int reuseport_array = -1, outer_map = -1; +static enum bpf_map_type inner_map_type; static int select_by_skb_data_prog; static int saved_tcp_syncookie = -1; static struct bpf_object *obj; @@ -63,13 +64,15 @@ static union sa46 { } \ }) -static int create_maps(void) +static int create_maps(enum bpf_map_type inner_type) { struct bpf_create_map_attr attr = {}; + inner_map_type = inner_type; + /* Creating reuseport_array */ attr.name = "reuseport_array"; - attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + attr.map_type = inner_type; attr.key_size = sizeof(__u32); attr.value_size = sizeof(__u32); attr.max_entries = REUSEPORT_ARRAY_SIZE; @@ -506,11 +509,6 @@ static void test_syncookie(int type, sa_family_t family) .pass_on_failure = 0, }; - if (type != SOCK_STREAM) { - test__skip(); - return; - } - /* * +1 for TCP-SYN and * +1 for the TCP-ACK (ack the syncookie) @@ -728,12 +726,36 @@ static void cleanup_per_test(bool no_inner_map) static void cleanup(void) { - if (outer_map != -1) + if (outer_map != -1) { close(outer_map); - if (reuseport_array != -1) + outer_map = -1; + } + + if (reuseport_array != -1) { close(reuseport_array); - if (obj) + reuseport_array = -1; + } + + if (obj) { bpf_object__close(obj); + obj = NULL; + } + + memset(expected_results, 0, sizeof(expected_results)); +} + +static const char *maptype_str(enum bpf_map_type type) +{ + switch (type) { + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + return "reuseport_sockarray"; + case BPF_MAP_TYPE_SOCKMAP: + return "sockmap"; + case BPF_MAP_TYPE_SOCKHASH: + return "sockhash"; + default: + return "unknown"; + } } static const char *family_str(sa_family_t family) @@ -760,7 +782,7 @@ static const char *sotype_str(int sotype) } } -#define TEST_INIT(fn, ...) { fn, #fn, __VA_ARGS__ } +#define TEST_INIT(fn_, ...) { .fn = fn_, .name = #fn_, __VA_ARGS__ } static void test_config(int sotype, sa_family_t family, bool inany) { @@ -768,12 +790,15 @@ static void test_config(int sotype, sa_family_t family, bool inany) void (*fn)(int sotype, sa_family_t family); const char *name; bool no_inner_map; + int need_sotype; } tests[] = { - TEST_INIT(test_err_inner_map, true /* no_inner_map */), + TEST_INIT(test_err_inner_map, + .no_inner_map = true), TEST_INIT(test_err_skb_data), TEST_INIT(test_err_sk_select_port), TEST_INIT(test_pass), - TEST_INIT(test_syncookie), + TEST_INIT(test_syncookie, + .need_sotype = SOCK_STREAM), TEST_INIT(test_pass_on_err), TEST_INIT(test_detach_bpf), }; @@ -781,7 +806,11 @@ static void test_config(int sotype, sa_family_t family, bool inany) const struct test *t; for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { - snprintf(s, sizeof(s), "%s/%s %s %s", + if (t->need_sotype && t->need_sotype != sotype) + continue; /* test not compatible with socket type */ + + snprintf(s, sizeof(s), "%s %s/%s %s %s", + maptype_str(inner_map_type), family_str(family), sotype_str(sotype), inany ? "INANY" : "LOOPBACK", t->name); @@ -816,13 +845,20 @@ static void test_all(void) test_config(c->sotype, c->family, c->inany); } -void test_select_reuseport(void) +void test_map_type(enum bpf_map_type mt) { - if (create_maps()) + if (create_maps(mt)) goto out; if (prepare_bpf_obj()) goto out; + test_all(); +out: + cleanup(); +} + +void test_select_reuseport(void) +{ saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); if (saved_tcp_fo < 0) goto out; @@ -835,8 +871,9 @@ void test_select_reuseport(void) if (disable_syncookie()) goto out; - test_all(); + test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY); + test_map_type(BPF_MAP_TYPE_SOCKMAP); + test_map_type(BPF_MAP_TYPE_SOCKHASH); out: - cleanup(); restore_sysctls(); } diff --git a/tools/testing/selftests/bpf/prog_tests/signal_pending.c b/tools/testing/selftests/bpf/prog_tests/signal_pending.c index 996e808f43a2..dfcbddcbe4d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/signal_pending.c +++ b/tools/testing/selftests/bpf/prog_tests/signal_pending.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> static void sigalrm_handler(int s) {} static struct sigaction sigalrm_action = { diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c new file mode 100644 index 000000000000..47fa04adc147 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook +// Copyright (c) 2019 Cloudflare +// Copyright (c) 2020 Isovalent, Inc. +/* + * Test that the socket assign program is able to redirect traffic towards a + * socket, regardless of whether the port or address destination of the traffic + * matches the port. + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_progs.h" + +#define BIND_PORT 1234 +#define CONNECT_PORT 4321 +#define TEST_DADDR (0xC0A80203) +#define NS_SELF "/proc/self/ns/net" +#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map" + +static const struct timeval timeo_sec = { .tv_sec = 3 }; +static const size_t timeo_optlen = sizeof(timeo_sec); +static int stop, duration; + +static bool +configure_stack(void) +{ + char tc_cmd[BUFSIZ]; + + /* Move to a new networking namespace */ + if (CHECK_FAIL(unshare(CLONE_NEWNET))) + return false; + + /* Configure necessary links, routes */ + if (CHECK_FAIL(system("ip link set dev lo up"))) + return false; + if (CHECK_FAIL(system("ip route add local default dev lo"))) + return false; + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) + return false; + + /* Load qdisc, BPF program */ + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) + return false; + sprintf(tc_cmd, "%s %s %s %s", "tc filter add dev lo ingress bpf", + "direct-action object-file ./test_sk_assign.o", + "section classifier/sk_assign_test", + (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : ""); + if (CHECK(system(tc_cmd), "BPF load failed;", + "run with -vv for more info\n")) + return false; + + return true; +} + +static int +start_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd; + + fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, + timeo_optlen))) + goto close_out; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto close_out; + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) + goto close_out; + + goto out; +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int +connect_to_server(const struct sockaddr *addr, socklen_t len, int type) +{ + int fd = -1; + + fd = socket(addr->sa_family, type, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, + timeo_optlen))) + goto close_out; + if (CHECK_FAIL(connect(fd, addr, len))) + goto close_out; + + goto out; +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static in_port_t +get_port(int fd) +{ + struct sockaddr_storage ss; + socklen_t slen = sizeof(ss); + in_port_t port = 0; + + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&ss, &slen))) + return port; + + switch (ss.ss_family) { + case AF_INET: + port = ((struct sockaddr_in *)&ss)->sin_port; + break; + case AF_INET6: + port = ((struct sockaddr_in6 *)&ss)->sin6_port; + break; + default: + CHECK(1, "Invalid address family", "%d\n", ss.ss_family); + } + return port; +} + +static ssize_t +rcv_msg(int srv_client, int type) +{ + struct sockaddr_storage ss; + char buf[BUFSIZ]; + socklen_t slen; + + if (type == SOCK_STREAM) + return read(srv_client, &buf, sizeof(buf)); + else + return recvfrom(srv_client, &buf, sizeof(buf), 0, + (struct sockaddr *)&ss, &slen); +} + +static int +run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type) +{ + int client = -1, srv_client = -1; + char buf[] = "testing"; + in_port_t port; + int ret = 1; + + client = connect_to_server(addr, len, type); + if (client == -1) { + perror("Cannot connect to server"); + goto out; + } + + if (type == SOCK_STREAM) { + srv_client = accept(server_fd, NULL, NULL); + if (CHECK_FAIL(srv_client == -1)) { + perror("Can't accept connection"); + goto out; + } + } else { + srv_client = server_fd; + } + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { + perror("Can't write on client"); + goto out; + } + if (CHECK_FAIL(rcv_msg(srv_client, type) != sizeof(buf))) { + perror("Can't read on server"); + goto out; + } + + port = get_port(srv_client); + if (CHECK_FAIL(!port)) + goto out; + /* SOCK_STREAM is connected via accept(), so the server's local address + * will be the CONNECT_PORT rather than the BIND port that corresponds + * to the listen socket. SOCK_DGRAM on the other hand is connectionless + * so we can't really do the same check there; the server doesn't ever + * create a socket with CONNECT_PORT. + */ + if (type == SOCK_STREAM && + CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u", + CONNECT_PORT, ntohs(port))) + goto out; + else if (type == SOCK_DGRAM && + CHECK(port != htons(BIND_PORT), "Expected", + "port %u but got %u", BIND_PORT, ntohs(port))) + goto out; + + ret = 0; +out: + close(client); + if (srv_client != server_fd) + close(srv_client); + if (ret) + WRITE_ONCE(stop, 1); + return ret; +} + +static void +prepare_addr(struct sockaddr *addr, int family, __u16 port, bool rewrite_addr) +{ + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + + switch (family) { + case AF_INET: + addr4 = (struct sockaddr_in *)addr; + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = family; + addr4->sin_port = htons(port); + if (rewrite_addr) + addr4->sin_addr.s_addr = htonl(TEST_DADDR); + else + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + addr6 = (struct sockaddr_in6 *)addr; + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = family; + addr6->sin6_port = htons(port); + addr6->sin6_addr = in6addr_loopback; + if (rewrite_addr) + addr6->sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); + break; + default: + fprintf(stderr, "Invalid family %d", family); + } +} + +struct test_sk_cfg { + const char *name; + int family; + struct sockaddr *addr; + socklen_t len; + int type; + bool rewrite_addr; +}; + +#define TEST(NAME, FAMILY, TYPE, REWRITE) \ +{ \ + .name = NAME, \ + .family = FAMILY, \ + .addr = (FAMILY == AF_INET) ? (struct sockaddr *)&addr4 \ + : (struct sockaddr *)&addr6, \ + .len = (FAMILY == AF_INET) ? sizeof(addr4) : sizeof(addr6), \ + .type = TYPE, \ + .rewrite_addr = REWRITE, \ +} + +void test_sk_assign(void) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + struct test_sk_cfg tests[] = { + TEST("ipv4 tcp port redir", AF_INET, SOCK_STREAM, false), + TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true), + TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false), + TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true), + TEST("ipv4 udp port redir", AF_INET, SOCK_DGRAM, false), + TEST("ipv4 udp addr redir", AF_INET, SOCK_DGRAM, true), + TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), + TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), + }; + int server = -1; + int server_map; + int self_net; + + self_net = open(NS_SELF, O_RDONLY); + if (CHECK_FAIL(self_net < 0)) { + perror("Unable to open "NS_SELF); + return; + } + + if (!configure_stack()) { + perror("configure_stack"); + goto cleanup; + } + + server_map = bpf_obj_get(SERVER_MAP_PATH); + if (CHECK_FAIL(server_map < 0)) { + perror("Unable to open " SERVER_MAP_PATH); + goto cleanup; + } + + for (int i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) { + struct test_sk_cfg *test = &tests[i]; + const struct sockaddr *addr; + const int zero = 0; + int err; + + if (!test__start_subtest(test->name)) + continue; + prepare_addr(test->addr, test->family, BIND_PORT, false); + addr = (const struct sockaddr *)test->addr; + server = start_server(addr, test->len, test->type); + if (server == -1) + goto close; + + err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY); + if (CHECK_FAIL(err)) { + perror("Unable to update server_map"); + goto close; + } + + /* connect to unbound ports */ + prepare_addr(test->addr, test->family, CONNECT_PORT, + test->rewrite_addr); + if (run_test(server, addr, test->len, test->type)) + goto close; + + close(server); + server = -1; + } + +close: + close(server); + close(server_map); +cleanup: + if (CHECK_FAIL(unlink(SERVER_MAP_PATH))) + perror("Unable to unlink " SERVER_MAP_PATH); + if (CHECK_FAIL(setns(self_net, CLONE_NEWNET))) + perror("Failed to setns("NS_SELF")"); + close(self_net); +} diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c index c6d6b685a946..7021b92af313 100644 --- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c +++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_skb_ctx(void) { @@ -14,6 +15,7 @@ void test_skb_ctx(void) .wire_len = 100, .gso_segs = 8, .mark = 9, + .gso_size = 10, }; struct bpf_prog_test_run_attr tattr = { .data_in = &pkt_v4, diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c new file mode 100644 index 000000000000..f302ad84a298 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h> + +void test_skb_helpers(void) +{ + struct __sk_buff skb = { + .wire_len = 100, + .gso_segs = 8, + .gso_size = 10, + }; + struct bpf_prog_test_run_attr tattr = { + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .ctx_in = &skb, + .ctx_size_in = sizeof(skb), + .ctx_out = &skb, + .ctx_size_out = sizeof(skb), + }; + struct bpf_object *obj; + int err; + + err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj, + &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + err = bpf_prog_test_run_xattr(&tattr); + CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno); + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index aa43e0bd210c..96e7b7f84c65 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare +#include <error.h> #include "test_progs.h" +#include "test_skmsg_load_helpers.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -70,10 +72,43 @@ out: close(s); } +static void test_skmsg_helpers(enum bpf_map_type map_type) +{ + struct test_skmsg_load_helpers *skel; + int err, map, verdict; + + skel = test_skmsg_load_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_skmsg_load_helpers__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + test_skmsg_load_helpers__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash sk_msg load helpers")) + test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c new file mode 100644 index 000000000000..06b86addc181 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare +/* + * Tests for sockmap/sockhash holding kTLS sockets. + */ + +#include "test_progs.h" + +#define MAX_TEST_NAME 80 +#define TCP_ULP 31 + +static int tcp_server(int family) +{ + int err, s; + + s = socket(family, SOCK_STREAM, 0); + if (CHECK_FAIL(s == -1)) { + perror("socket"); + return -1; + } + + err = listen(s, SOMAXCONN); + if (CHECK_FAIL(err)) { + perror("listen"); + return -1; + } + + return s; +} + +static int disconnect(int fd) +{ + struct sockaddr unspec = { AF_UNSPEC }; + + return connect(fd, &unspec, sizeof(unspec)); +} + +/* Disconnect (unhash) a kTLS socket after removing it from sockmap. */ +static void test_sockmap_ktls_disconnect_after_delete(int family, int map) +{ + struct sockaddr_storage addr = {0}; + socklen_t len = sizeof(addr); + int err, cli, srv, zero = 0; + + srv = tcp_server(family); + if (srv == -1) + return; + + err = getsockname(srv, (struct sockaddr *)&addr, &len); + if (CHECK_FAIL(err)) { + perror("getsockopt"); + goto close_srv; + } + + cli = socket(family, SOCK_STREAM, 0); + if (CHECK_FAIL(cli == -1)) { + perror("socket"); + goto close_srv; + } + + err = connect(cli, (struct sockaddr *)&addr, len); + if (CHECK_FAIL(err)) { + perror("connect"); + goto close_cli; + } + + err = bpf_map_update_elem(map, &zero, &cli, 0); + if (CHECK_FAIL(err)) { + perror("bpf_map_update_elem"); + goto close_cli; + } + + err = setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (CHECK_FAIL(err)) { + perror("setsockopt(TCP_ULP)"); + goto close_cli; + } + + err = bpf_map_delete_elem(map, &zero); + if (CHECK_FAIL(err)) { + perror("bpf_map_delete_elem"); + goto close_cli; + } + + err = disconnect(cli); + if (CHECK_FAIL(err)) + perror("disconnect"); + +close_cli: + close(cli); +close_srv: + close(srv); +} + +static void run_tests(int family, enum bpf_map_type map_type) +{ + char test_name[MAX_TEST_NAME]; + int map; + + map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + if (CHECK_FAIL(map == -1)) { + perror("bpf_map_create"); + return; + } + + snprintf(test_name, MAX_TEST_NAME, + "sockmap_ktls disconnect_after_delete %s %s", + family == AF_INET ? "IPv4" : "IPv6", + map_type == BPF_MAP_TYPE_SOCKMAP ? "SOCKMAP" : "SOCKHASH"); + if (!test__start_subtest(test_name)) + return; + + test_sockmap_ktls_disconnect_after_delete(family, map); + + close(map); +} + +void test_sockmap_ktls(void) +{ + run_tests(AF_INET, BPF_MAP_TYPE_SOCKMAP); + run_tests(AF_INET, BPF_MAP_TYPE_SOCKHASH); + run_tests(AF_INET6, BPF_MAP_TYPE_SOCKMAP); + run_tests(AF_INET6, BPF_MAP_TYPE_SOCKHASH); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c new file mode 100644 index 000000000000..d7d65a700799 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -0,0 +1,1635 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare +/* + * Test suite for SOCKMAP/SOCKHASH holding listening sockets. + * Covers: + * 1. BPF map operations - bpf_map_{update,lookup delete}_elem + * 2. BPF redirect helpers - bpf_{sk,msg}_redirect_map + * 3. BPF reuseport helper - bpf_sk_select_reuseport + */ + +#include <linux/compiler.h> +#include <errno.h> +#include <error.h> +#include <limits.h> +#include <netinet/in.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> +#include <sys/select.h> +#include <unistd.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_util.h" +#include "test_progs.h" +#include "test_sockmap_listen.skel.h" + +#define IO_TIMEOUT_SEC 30 +#define MAX_STRERR_LEN 256 +#define MAX_TEST_NAME 80 + +#define _FAIL(errnum, fmt...) \ + ({ \ + error_at_line(0, (errnum), __func__, __LINE__, fmt); \ + CHECK_FAIL(true); \ + }) +#define FAIL(fmt...) _FAIL(0, fmt) +#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt) +#define FAIL_LIBBPF(err, msg) \ + ({ \ + char __buf[MAX_STRERR_LEN]; \ + libbpf_strerror((err), __buf, sizeof(__buf)); \ + FAIL("%s: %s", (msg), __buf); \ + }) + +/* Wrappers that fail the test on error and report it. */ + +#define xaccept_nonblock(fd, addr, len) \ + ({ \ + int __ret = \ + accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("accept"); \ + __ret; \ + }) + +#define xbind(fd, addr, len) \ + ({ \ + int __ret = bind((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("bind"); \ + __ret; \ + }) + +#define xclose(fd) \ + ({ \ + int __ret = close((fd)); \ + if (__ret == -1) \ + FAIL_ERRNO("close"); \ + __ret; \ + }) + +#define xconnect(fd, addr, len) \ + ({ \ + int __ret = connect((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("connect"); \ + __ret; \ + }) + +#define xgetsockname(fd, addr, len) \ + ({ \ + int __ret = getsockname((fd), (addr), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockname"); \ + __ret; \ + }) + +#define xgetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = getsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("getsockopt(" #name ")"); \ + __ret; \ + }) + +#define xlisten(fd, backlog) \ + ({ \ + int __ret = listen((fd), (backlog)); \ + if (__ret == -1) \ + FAIL_ERRNO("listen"); \ + __ret; \ + }) + +#define xsetsockopt(fd, level, name, val, len) \ + ({ \ + int __ret = setsockopt((fd), (level), (name), (val), (len)); \ + if (__ret == -1) \ + FAIL_ERRNO("setsockopt(" #name ")"); \ + __ret; \ + }) + +#define xsend(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = send((fd), (buf), (len), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("send"); \ + __ret; \ + }) + +#define xrecv_nonblock(fd, buf, len, flags) \ + ({ \ + ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \ + IO_TIMEOUT_SEC); \ + if (__ret == -1) \ + FAIL_ERRNO("recv"); \ + __ret; \ + }) + +#define xsocket(family, sotype, flags) \ + ({ \ + int __ret = socket(family, sotype, flags); \ + if (__ret == -1) \ + FAIL_ERRNO("socket"); \ + __ret; \ + }) + +#define xbpf_map_delete_elem(fd, key) \ + ({ \ + int __ret = bpf_map_delete_elem((fd), (key)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_delete"); \ + __ret; \ + }) + +#define xbpf_map_lookup_elem(fd, key, val) \ + ({ \ + int __ret = bpf_map_lookup_elem((fd), (key), (val)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_lookup"); \ + __ret; \ + }) + +#define xbpf_map_update_elem(fd, key, val, flags) \ + ({ \ + int __ret = bpf_map_update_elem((fd), (key), (val), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("map_update"); \ + __ret; \ + }) + +#define xbpf_prog_attach(prog, target, type, flags) \ + ({ \ + int __ret = \ + bpf_prog_attach((prog), (target), (type), (flags)); \ + if (__ret == -1) \ + FAIL_ERRNO("prog_attach(" #type ")"); \ + __ret; \ + }) + +#define xbpf_prog_detach2(prog, target, type) \ + ({ \ + int __ret = bpf_prog_detach2((prog), (target), (type)); \ + if (__ret == -1) \ + FAIL_ERRNO("prog_detach2(" #type ")"); \ + __ret; \ + }) + +#define xpthread_create(thread, attr, func, arg) \ + ({ \ + int __ret = pthread_create((thread), (attr), (func), (arg)); \ + errno = __ret; \ + if (__ret) \ + FAIL_ERRNO("pthread_create"); \ + __ret; \ + }) + +#define xpthread_join(thread, retval) \ + ({ \ + int __ret = pthread_join((thread), (retval)); \ + errno = __ret; \ + if (__ret) \ + FAIL_ERRNO("pthread_join"); \ + __ret; \ + }) + +static int poll_read(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set rfds; + int r; + + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + + r = select(fd + 1, &rfds, NULL, NULL, &timeout); + if (r == 0) + errno = ETIME; + + return r == 1 ? 0 : -1; +} + +static int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return accept(fd, addr, len); +} + +static int recv_timeout(int fd, void *buf, size_t len, int flags, + unsigned int timeout_sec) +{ + if (poll_read(fd, timeout_sec)) + return -1; + + return recv(fd, buf, len, flags); +} + +static void init_addr_loopback4(struct sockaddr_storage *ss, socklen_t *len) +{ + struct sockaddr_in *addr4 = memset(ss, 0, sizeof(*ss)); + + addr4->sin_family = AF_INET; + addr4->sin_port = 0; + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + *len = sizeof(*addr4); +} + +static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len) +{ + struct sockaddr_in6 *addr6 = memset(ss, 0, sizeof(*ss)); + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = 0; + addr6->sin6_addr = in6addr_loopback; + *len = sizeof(*addr6); +} + +static void init_addr_loopback(int family, struct sockaddr_storage *ss, + socklen_t *len) +{ + switch (family) { + case AF_INET: + init_addr_loopback4(ss, len); + return; + case AF_INET6: + init_addr_loopback6(ss, len); + return; + default: + FAIL("unsupported address family %d", family); + } +} + +static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss) +{ + return (struct sockaddr *)ss; +} + +static int enable_reuseport(int s, int progfd) +{ + int err, one = 1; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); + if (err) + return -1; + err = xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd, + sizeof(progfd)); + if (err) + return -1; + + return 0; +} + +static int socket_loopback_reuseport(int family, int sotype, int progfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return -1; + + if (progfd >= 0) + enable_reuseport(s, progfd); + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + if (sotype & SOCK_DGRAM) + return s; + + err = xlisten(s, SOMAXCONN); + if (err) + goto close; + + return s; +close: + xclose(s); + return -1; +} + +static int socket_loopback(int family, int sotype) +{ + return socket_loopback_reuseport(family, sotype, -1); +} + +static void test_insert_invalid(int family, int sotype, int mapfd) +{ + u32 key = 0; + u64 value; + int err; + + value = -1; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EINVAL) + FAIL_ERRNO("map_update: expected EINVAL"); + + value = INT_MAX; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EBADF) + FAIL_ERRNO("map_update: expected EBADF"); +} + +static void test_insert_opened(int family, int sotype, int mapfd) +{ + u32 key = 0; + u64 value; + int err, s; + + s = xsocket(family, sotype, 0); + if (s == -1) + return; + + errno = 0; + value = s; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EOPNOTSUPP) + FAIL_ERRNO("map_update: expected EOPNOTSUPP"); + + xclose(s); +} + +static void test_insert_bound(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + u32 key = 0; + u64 value; + int err, s; + + init_addr_loopback(family, &addr, &len); + + s = xsocket(family, sotype, 0); + if (s == -1) + return; + + err = xbind(s, sockaddr(&addr), len); + if (err) + goto close; + + errno = 0; + value = s; + err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + if (!err || errno != EOPNOTSUPP) + FAIL_ERRNO("map_update: expected EOPNOTSUPP"); +close: + xclose(s); +} + +static void test_insert(int family, int sotype, int mapfd) +{ + u64 value; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xclose(s); +} + +static void test_delete_after_insert(int family, int sotype, int mapfd) +{ + u64 value; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + xclose(s); +} + +static void test_delete_after_close(int family, int sotype, int mapfd) +{ + int err, s; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + xclose(s); + + errno = 0; + err = bpf_map_delete_elem(mapfd, &key); + if (!err || (errno != EINVAL && errno != ENOENT)) + /* SOCKMAP and SOCKHASH return different error codes */ + FAIL_ERRNO("map_delete: expected EINVAL/EINVAL"); +} + +static void test_lookup_after_insert(int family, int sotype, int mapfd) +{ + u64 cookie, value; + socklen_t len; + u32 key; + int s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + len = sizeof(cookie); + xgetsockopt(s, SOL_SOCKET, SO_COOKIE, &cookie, &len); + + xbpf_map_lookup_elem(mapfd, &key, &value); + + if (value != cookie) { + FAIL("map_lookup: have %#llx, want %#llx", + (unsigned long long)value, (unsigned long long)cookie); + } + + xclose(s); +} + +static void test_lookup_after_delete(int family, int sotype, int mapfd) +{ + int err, s; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + + errno = 0; + err = bpf_map_lookup_elem(mapfd, &key, &value); + if (!err || errno != ENOENT) + FAIL_ERRNO("map_lookup: expected ENOENT"); + + xclose(s); +} + +static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +{ + u32 key, value32; + int err, s; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + mapfd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(key), + sizeof(value32), 1, 0); + if (mapfd < 0) { + FAIL_ERRNO("map_create"); + goto close; + } + + key = 0; + value32 = s; + xbpf_map_update_elem(mapfd, &key, &value32, BPF_NOEXIST); + + errno = 0; + err = bpf_map_lookup_elem(mapfd, &key, &value32); + if (!err || errno != ENOSPC) + FAIL_ERRNO("map_lookup: expected ENOSPC"); + + xclose(mapfd); +close: + xclose(s); +} + +static void test_update_existing(int family, int sotype, int mapfd) +{ + int s1, s2; + u64 value; + u32 key; + + s1 = socket_loopback(family, sotype); + if (s1 < 0) + return; + + s2 = socket_loopback(family, sotype); + if (s2 < 0) + goto close_s1; + + key = 0; + value = s1; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + value = s2; + xbpf_map_update_elem(mapfd, &key, &value, BPF_EXIST); + xclose(s2); +close_s1: + xclose(s1); +} + +/* Exercise the code path where we destroy child sockets that never + * got accept()'ed, aka orphans, when parent socket gets closed. + */ +static void test_destroy_orphan_child(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s, c; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + xconnect(c, sockaddr(&addr), len); + xclose(c); +close_srv: + xclose(s); +} + +/* Perform a passive open after removing listening socket from SOCKMAP + * to ensure that callbacks get restored properly. + */ +static void test_clone_after_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + socklen_t len; + int err, s, c; + u64 value; + u32 key; + + s = socket_loopback(family, sotype); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST); + xbpf_map_delete_elem(mapfd, &key); + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + + xconnect(c, sockaddr(&addr), len); + xclose(c); +close_srv: + xclose(s); +} + +/* Check that child socket that got created while parent was in a + * SOCKMAP, but got accept()'ed only after the parent has been removed + * from SOCKMAP, gets cloned without parent psock state or callbacks. + */ +static void test_accept_after_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + const u32 zero = 0; + int err, s, c, p; + socklen_t len; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s == -1) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + value = s; + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + /* Create child while parent is in sockmap */ + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + /* Remove parent from sockmap */ + err = xbpf_map_delete_elem(mapfd, &zero); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p == -1) + goto close_cli; + + /* Check that child sk_user_data is not set */ + value = p; + xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +/* Check that child socket that got created and accepted while parent + * was in a SOCKMAP is cloned without parent psock state or callbacks. + */ +static void test_accept_before_delete(int family, int sotype, int mapfd) +{ + struct sockaddr_storage addr; + const u32 zero = 0, one = 1; + int err, s, c, p; + socklen_t len; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s == -1) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + value = s; + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c == -1) + goto close_srv; + + /* Create & accept child while parent is in sockmap */ + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p == -1) + goto close_cli; + + /* Check that child sk_user_data is not set */ + value = p; + xbpf_map_update_elem(mapfd, &one, &value, BPF_NOEXIST); + + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +struct connect_accept_ctx { + int sockfd; + unsigned int done; + unsigned int nr_iter; +}; + +static bool is_thread_done(struct connect_accept_ctx *ctx) +{ + return READ_ONCE(ctx->done); +} + +static void *connect_accept_thread(void *arg) +{ + struct connect_accept_ctx *ctx = arg; + struct sockaddr_storage addr; + int family, socktype; + socklen_t len; + int err, i, s; + + s = ctx->sockfd; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto done; + + len = sizeof(family); + err = xgetsockopt(s, SOL_SOCKET, SO_DOMAIN, &family, &len); + if (err) + goto done; + + len = sizeof(socktype); + err = xgetsockopt(s, SOL_SOCKET, SO_TYPE, &socktype, &len); + if (err) + goto done; + + for (i = 0; i < ctx->nr_iter; i++) { + int c, p; + + c = xsocket(family, socktype, 0); + if (c < 0) + break; + + err = xconnect(c, (struct sockaddr *)&addr, sizeof(addr)); + if (err) { + xclose(c); + break; + } + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) { + xclose(c); + break; + } + + xclose(p); + xclose(c); + } +done: + WRITE_ONCE(ctx->done, 1); + return NULL; +} + +static void test_syn_recv_insert_delete(int family, int sotype, int mapfd) +{ + struct connect_accept_ctx ctx = { 0 }; + struct sockaddr_storage addr; + socklen_t len; + u32 zero = 0; + pthread_t t; + int err, s; + u64 value; + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close; + + ctx.sockfd = s; + ctx.nr_iter = 1000; + + err = xpthread_create(&t, NULL, connect_accept_thread, &ctx); + if (err) + goto close; + + value = s; + while (!is_thread_done(&ctx)) { + err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + if (err) + break; + + err = xbpf_map_delete_elem(mapfd, &zero); + if (err) + break; + } + + xpthread_join(t, NULL); +close: + xclose(s); +} + +static void *listen_thread(void *arg) +{ + struct sockaddr unspec = { AF_UNSPEC }; + struct connect_accept_ctx *ctx = arg; + int err, i, s; + + s = ctx->sockfd; + + for (i = 0; i < ctx->nr_iter; i++) { + err = xlisten(s, 1); + if (err) + break; + err = xconnect(s, &unspec, sizeof(unspec)); + if (err) + break; + } + + WRITE_ONCE(ctx->done, 1); + return NULL; +} + +static void test_race_insert_listen(int family, int socktype, int mapfd) +{ + struct connect_accept_ctx ctx = { 0 }; + const u32 zero = 0; + const int one = 1; + pthread_t t; + int err, s; + u64 value; + + s = xsocket(family, socktype, 0); + if (s < 0) + return; + + err = xsetsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + if (err) + goto close; + + ctx.sockfd = s; + ctx.nr_iter = 10000; + + err = pthread_create(&t, NULL, listen_thread, &ctx); + if (err) + goto close; + + value = s; + while (!is_thread_done(&ctx)) { + err = bpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST); + /* Expecting EOPNOTSUPP before listen() */ + if (err && errno != EOPNOTSUPP) { + FAIL_ERRNO("map_update"); + break; + } + + err = bpf_map_delete_elem(mapfd, &zero); + /* Expecting no entry after unhash on connect(AF_UNSPEC) */ + if (err && errno != EINVAL && errno != ENOENT) { + FAIL_ERRNO("map_delete"); + break; + } + } + + xpthread_join(t, NULL); +close: + xclose(s); +} + +static void zero_verdict_count(int mapfd) +{ + unsigned int zero = 0; + int key; + + key = SK_DROP; + xbpf_map_update_elem(mapfd, &key, &zero, BPF_ANY); + key = SK_PASS; + xbpf_map_update_elem(mapfd, &key, &zero, BPF_ANY); +} + +enum redir_mode { + REDIR_INGRESS, + REDIR_EGRESS, +}; + +static const char *redir_mode_str(enum redir_mode mode) +{ + switch (mode) { + case REDIR_INGRESS: + return "ingress"; + case REDIR_EGRESS: + return "egress"; + default: + return "unknown"; + } +} + +static void redir_to_connected(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int s, c0, c1, p0, p1; + unsigned int pass; + socklen_t len; + int err, n; + u64 value; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c0 = xsocket(family, sotype, 0); + if (c0 < 0) + goto close_srv; + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + p0 = xaccept_nonblock(s, NULL, NULL); + if (p0 < 0) + goto close_cli0; + + c1 = xsocket(family, sotype, 0); + if (c1 < 0) + goto close_peer0; + err = xconnect(c1, sockaddr(&addr), len); + if (err) + goto close_cli1; + + p1 = xaccept_nonblock(s, NULL, NULL); + if (p1 < 0) + goto close_cli1; + + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer1; + + key = 1; + value = p1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer1; + + n = write(mode == REDIR_INGRESS ? c1 : p1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_peer1; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close_peer1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_peer1: + xclose(p1); +close_cli1: + xclose(c1); +close_peer0: + xclose(p0); +close_cli0: + xclose(c0); +close_srv: + xclose(s); +} + +static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return; + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + goto detach; + + redir_to_connected(family, sotype, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT); +detach: + xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER); +} + +static void test_msg_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_MSG_VERDICT, 0); + if (err) + return; + + redir_to_connected(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); +} + +static void redir_to_listening(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int s, c, p, err, n; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_mapfd); + + s = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + goto close_cli; + + key = 0; + value = s; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer; + + key = 1; + value = p; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_peer; + + n = write(mode == REDIR_INGRESS ? c : p, "a", 1); + if (n < 0 && errno != EACCES) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_peer; + + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &drop); + if (err) + goto close_peer; + if (drop != 1) + FAIL("%s: want drop count 1, have %d", log_prefix, drop); + +close_peer: + xclose(p); +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (err) + return; + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) + goto detach; + + redir_to_listening(family, sotype, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT); +detach: + xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER); +} + +static void test_msg_redir_to_listening(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family, + int sotype) +{ + int verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_MSG_VERDICT, 0); + if (err) + return; + + redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT); +} + +static void test_reuseport_select_listening(int family, int sotype, + int sock_map, int verd_map, + int reuseport_prog) +{ + struct sockaddr_storage addr; + unsigned int pass; + int s, c, err; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + s = socket_loopback_reuseport(family, sotype | SOCK_NONBLOCK, + reuseport_prog); + if (s < 0) + return; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + key = 0; + value = s; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv; + err = xconnect(c, sockaddr(&addr), len); + if (err) + goto close_cli; + + if (sotype == SOCK_STREAM) { + int p; + + p = xaccept_nonblock(s, NULL, NULL); + if (p < 0) + goto close_cli; + xclose(p); + } else { + char b = 'a'; + ssize_t n; + + n = xsend(c, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + + n = xrecv_nonblock(s, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + } + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_map, &key, &pass); + if (err) + goto close_cli; + if (pass != 1) + FAIL("want pass count 1, have %d", pass); + +close_cli: + xclose(c); +close_srv: + xclose(s); +} + +static void test_reuseport_select_connected(int family, int sotype, + int sock_map, int verd_map, + int reuseport_prog) +{ + struct sockaddr_storage addr; + int s, c0, c1, p0, err; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + s = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s < 0) + return; + + /* Populate sock_map[0] to avoid ENOENT on first connection */ + key = 0; + value = s; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + if (err) + goto close_srv; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + c0 = xsocket(family, sotype, 0); + if (c0 < 0) + goto close_srv; + + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + if (sotype == SOCK_STREAM) { + p0 = xaccept_nonblock(s, NULL, NULL); + if (p0 < 0) + goto close_cli0; + } else { + p0 = xsocket(family, sotype, 0); + if (p0 < 0) + goto close_cli0; + + len = sizeof(addr); + err = xgetsockname(c0, sockaddr(&addr), &len); + if (err) + goto close_cli0; + + err = xconnect(p0, sockaddr(&addr), len); + if (err) + goto close_cli0; + } + + /* Update sock_map[0] to redirect to a connected socket */ + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_EXIST); + if (err) + goto close_peer0; + + c1 = xsocket(family, sotype, 0); + if (c1 < 0) + goto close_peer0; + + len = sizeof(addr); + err = xgetsockname(s, sockaddr(&addr), &len); + if (err) + goto close_srv; + + errno = 0; + err = connect(c1, sockaddr(&addr), len); + if (sotype == SOCK_DGRAM) { + char b = 'a'; + ssize_t n; + + n = xsend(c1, &b, sizeof(b), 0); + if (n == -1) + goto close_cli1; + + n = recv_timeout(c1, &b, sizeof(b), 0, IO_TIMEOUT_SEC); + err = n == -1; + } + if (!err || errno != ECONNREFUSED) + FAIL_ERRNO("connect: expected ECONNREFUSED"); + + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_map, &key, &drop); + if (err) + goto close_cli1; + if (drop != 1) + FAIL("want drop count 1, have %d", drop); + +close_cli1: + xclose(c1); +close_peer0: + xclose(p0); +close_cli0: + xclose(c0); +close_srv: + xclose(s); +} + +/* Check that redirecting across reuseport groups is not allowed. */ +static void test_reuseport_mixed_groups(int family, int sotype, int sock_map, + int verd_map, int reuseport_prog) +{ + struct sockaddr_storage addr; + int s1, s2, c, err; + unsigned int drop; + socklen_t len; + u64 value; + u32 key; + + zero_verdict_count(verd_map); + + /* Create two listeners, each in its own reuseport group */ + s1 = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s1 < 0) + return; + + s2 = socket_loopback_reuseport(family, sotype, reuseport_prog); + if (s2 < 0) + goto close_srv1; + + key = 0; + value = s1; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + if (err) + goto close_srv2; + + key = 1; + value = s2; + err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST); + + /* Connect to s2, reuseport BPF selects s1 via sock_map[0] */ + len = sizeof(addr); + err = xgetsockname(s2, sockaddr(&addr), &len); + if (err) + goto close_srv2; + + c = xsocket(family, sotype, 0); + if (c < 0) + goto close_srv2; + + err = connect(c, sockaddr(&addr), len); + if (sotype == SOCK_DGRAM) { + char b = 'a'; + ssize_t n; + + n = xsend(c, &b, sizeof(b), 0); + if (n == -1) + goto close_cli; + + n = recv_timeout(c, &b, sizeof(b), 0, IO_TIMEOUT_SEC); + err = n == -1; + } + if (!err || errno != ECONNREFUSED) { + FAIL_ERRNO("connect: expected ECONNREFUSED"); + goto close_cli; + } + + /* Expect drop, can't redirect outside of reuseport group */ + key = SK_DROP; + err = xbpf_map_lookup_elem(verd_map, &key, &drop); + if (err) + goto close_cli; + if (drop != 1) + FAIL("want drop count 1, have %d", drop); + +close_cli: + xclose(c); +close_srv2: + xclose(s2); +close_srv1: + xclose(s1); +} + +#define TEST(fn, ...) \ + { \ + fn, #fn, __VA_ARGS__ \ + } + +static void test_ops_cleanup(const struct bpf_map *map) +{ + const struct bpf_map_def *def; + int err, mapfd; + u32 key; + + def = bpf_map__def(map); + mapfd = bpf_map__fd(map); + + for (key = 0; key < def->max_entries; key++) { + err = bpf_map_delete_elem(mapfd, &key); + if (err && errno != EINVAL && errno != ENOENT) + FAIL_ERRNO("map_delete: expected EINVAL/ENOENT"); + } +} + +static const char *family_str(sa_family_t family) +{ + switch (family) { + case AF_INET: + return "IPv4"; + case AF_INET6: + return "IPv6"; + default: + return "unknown"; + } +} + +static const char *map_type_str(const struct bpf_map *map) +{ + const struct bpf_map_def *def; + + def = bpf_map__def(map); + if (IS_ERR(def)) + return "invalid"; + + switch (def->type) { + case BPF_MAP_TYPE_SOCKMAP: + return "sockmap"; + case BPF_MAP_TYPE_SOCKHASH: + return "sockhash"; + default: + return "unknown"; + } +} + +static const char *sotype_str(int sotype) +{ + switch (sotype) { + case SOCK_DGRAM: + return "UDP"; + case SOCK_STREAM: + return "TCP"; + default: + return "unknown"; + } +} + +static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, + int family, int sotype) +{ + const struct op_test { + void (*fn)(int family, int sotype, int mapfd); + const char *name; + int sotype; + } tests[] = { + /* insert */ + TEST(test_insert_invalid), + TEST(test_insert_opened), + TEST(test_insert_bound, SOCK_STREAM), + TEST(test_insert), + /* delete */ + TEST(test_delete_after_insert), + TEST(test_delete_after_close), + /* lookup */ + TEST(test_lookup_after_insert), + TEST(test_lookup_after_delete), + TEST(test_lookup_32_bit_value), + /* update */ + TEST(test_update_existing), + /* races with insert/delete */ + TEST(test_destroy_orphan_child, SOCK_STREAM), + TEST(test_syn_recv_insert_delete, SOCK_STREAM), + TEST(test_race_insert_listen, SOCK_STREAM), + /* child clone */ + TEST(test_clone_after_delete, SOCK_STREAM), + TEST(test_accept_after_delete, SOCK_STREAM), + TEST(test_accept_before_delete, SOCK_STREAM), + }; + const char *family_name, *map_name, *sotype_name; + const struct op_test *t; + char s[MAX_TEST_NAME]; + int map_fd; + + family_name = family_str(family); + map_name = map_type_str(map); + sotype_name = sotype_str(sotype); + map_fd = bpf_map__fd(map); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name, + sotype_name, t->name); + + if (t->sotype != 0 && t->sotype != sotype) + continue; + + if (!test__start_subtest(s)) + continue; + + t->fn(family, sotype, map_fd); + test_ops_cleanup(map); + } +} + +static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family, int sotype) +{ + const struct redir_test { + void (*fn)(struct test_sockmap_listen *skel, + struct bpf_map *map, int family, int sotype); + const char *name; + } tests[] = { + TEST(test_skb_redir_to_connected), + TEST(test_skb_redir_to_listening), + TEST(test_msg_redir_to_connected), + TEST(test_msg_redir_to_listening), + }; + const char *family_name, *map_name; + const struct redir_test *t; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, + t->name); + + if (!test__start_subtest(s)) + continue; + + t->fn(skel, map, family, sotype); + } +} + +static void test_reuseport(struct test_sockmap_listen *skel, + struct bpf_map *map, int family, int sotype) +{ + const struct reuseport_test { + void (*fn)(int family, int sotype, int socket_map, + int verdict_map, int reuseport_prog); + const char *name; + int sotype; + } tests[] = { + TEST(test_reuseport_select_listening), + TEST(test_reuseport_select_connected), + TEST(test_reuseport_mixed_groups), + }; + int socket_map, verdict_map, reuseport_prog; + const char *family_name, *map_name, *sotype_name; + const struct reuseport_test *t; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + sotype_name = sotype_str(sotype); + + socket_map = bpf_map__fd(map); + verdict_map = bpf_map__fd(skel->maps.verdict_map); + reuseport_prog = bpf_program__fd(skel->progs.prog_reuseport); + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name, + sotype_name, t->name); + + if (t->sotype != 0 && t->sotype != sotype) + continue; + + if (!test__start_subtest(s)) + continue; + + t->fn(family, sotype, socket_map, verdict_map, reuseport_prog); + } +} + +static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + test_ops(skel, map, family, SOCK_STREAM); + test_ops(skel, map, family, SOCK_DGRAM); + test_redir(skel, map, family, SOCK_STREAM); + test_reuseport(skel, map, family, SOCK_STREAM); + test_reuseport(skel, map, family, SOCK_DGRAM); +} + +void test_sockmap_listen(void) +{ + struct test_sockmap_listen *skel; + + skel = test_sockmap_listen__open_and_load(); + if (!skel) { + FAIL("skeleton open/load failed"); + return; + } + + skel->bss->test_sockmap = true; + run_tests(skel, skel->maps.sock_map, AF_INET); + run_tests(skel, skel->maps.sock_map, AF_INET6); + + skel->bss->test_sockmap = false; + run_tests(skel, skel->maps.sock_hash, AF_INET); + run_tests(skel, skel->maps.sock_hash, AF_INET6); + + test_sockmap_listen__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c index 1ae00cd3174e..7577a77a4c4c 100644 --- a/tools/testing/selftests/bpf/prog_tests/spinlock.c +++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c @@ -1,5 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> + +static void *spin_lock_thread(void *arg) +{ + __u32 duration, retval; + int err, prog_fd = *(u32 *) arg; + + err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(err || retval, "", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration); + pthread_exit(arg); +} void test_spinlock(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c index f4cd60d6fba2..9013a0c01eed 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include "cgroup_helpers.h" +#include "network_helpers.h" struct tcp_rtt_storage { __u32 invoked; @@ -87,34 +88,6 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, return err; } -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} static int run_test(int cgroup_fd, int server_fd) { @@ -145,7 +118,7 @@ static int run_test(int cgroup_fd, int server_fd) goto close_bpf_object; } - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); if (client_fd < 0) { err = -1; goto close_bpf_object; @@ -180,95 +153,22 @@ close_bpf_object: return err; } -static int start_server(void) -{ - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; -} - -static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER; - -static void *server_thread(void *arg) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd = *(int *)arg; - int client_fd; - int err; - - err = listen(fd, 1); - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_signal(&server_started); - pthread_mutex_unlock(&server_started_mtx); - - if (CHECK_FAIL(err < 0)) { - perror("Failed to listed on socket"); - return NULL; - } - - client_fd = accept(fd, (struct sockaddr *)&addr, &len); - if (CHECK_FAIL(client_fd < 0)) { - perror("Failed to accept client"); - return NULL; - } - - /* Wait for the next connection (that never arrives) - * to keep this thread alive to prevent calling - * close() on client_fd. - */ - if (CHECK_FAIL(accept(fd, (struct sockaddr *)&addr, &len) >= 0)) { - perror("Unexpected success in second accept"); - return NULL; - } - - close(client_fd); - - return NULL; -} - void test_tcp_rtt(void) { int server_fd, cgroup_fd; - pthread_t tid; cgroup_fd = test__join_cgroup("/tcp_rtt"); if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(); + server_fd = start_server(AF_INET, SOCK_STREAM); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; - if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) - goto close_server_fd; - - pthread_mutex_lock(&server_started_mtx); - pthread_cond_wait(&server_started, &server_started_mtx); - pthread_mutex_unlock(&server_started_mtx); - CHECK_FAIL(run_test(cgroup_fd, server_fd)); -close_server_fd: + close(server_fd); + close_cgroup_fd: close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c new file mode 100644 index 000000000000..b17eb2045c1d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <test_progs.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <unistd.h> +#include <malloc.h> +#include <stdlib.h> + +#include "lsm.skel.h" + +char *CMD_ARGS[] = {"true", NULL}; + +#define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \ + (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) + +int stack_mprotect(void) +{ + void *buf; + long sz; + int ret; + + sz = sysconf(_SC_PAGESIZE); + if (sz < 0) + return sz; + + buf = alloca(sz * 3); + ret = mprotect(GET_PAGE_ADDR(buf, sz), sz, + PROT_READ | PROT_WRITE | PROT_EXEC); + return ret; +} + +int exec_cmd(int *monitored_pid) +{ + int child_pid, child_status; + + child_pid = fork(); + if (child_pid == 0) { + *monitored_pid = getpid(); + execvp(CMD_ARGS[0], CMD_ARGS); + return -EINVAL; + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return child_status; + } + + return -EINVAL; +} + +void test_test_lsm(void) +{ + struct lsm *skel = NULL; + int err, duration = 0; + + skel = lsm__open_and_load(); + if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) + goto close_prog; + + err = lsm__attach(skel); + if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) + goto close_prog; + + err = exec_cmd(&skel->bss->monitored_pid); + if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) + goto close_prog; + + CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n", + skel->bss->bprm_count); + + skel->bss->monitored_pid = getpid(); + + err = stack_mprotect(); + if (CHECK(errno != EPERM, "stack_mprotect", "want err=EPERM, got %d\n", + errno)) + goto close_prog; + + CHECK(skel->bss->mprotect_count != 1, "mprotect_count", + "mprotect_count = %d\n", skel->bss->mprotect_count); + +close_prog: + lsm__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c index 465b371a561d..2702df2b2343 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_overhead.c +++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c @@ -61,9 +61,10 @@ void test_test_overhead(void) const char *raw_tp_name = "raw_tp/task_rename"; const char *fentry_name = "fentry/__set_task_comm"; const char *fexit_name = "fexit/__set_task_comm"; + const char *fmodret_name = "fmod_ret/__set_task_comm"; const char *kprobe_func = "__set_task_comm"; struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog; - struct bpf_program *fentry_prog, *fexit_prog; + struct bpf_program *fentry_prog, *fexit_prog, *fmodret_prog; struct bpf_object *obj; struct bpf_link *link; int err, duration = 0; @@ -96,6 +97,10 @@ void test_test_overhead(void) if (CHECK(!fexit_prog, "find_probe", "prog '%s' not found\n", fexit_name)) goto cleanup; + fmodret_prog = bpf_object__find_program_by_title(obj, fmodret_name); + if (CHECK(!fmodret_prog, "find_probe", + "prog '%s' not found\n", fmodret_name)) + goto cleanup; err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) @@ -142,6 +147,13 @@ void test_test_overhead(void) goto cleanup; test_run("fexit"); bpf_link__destroy(link); + + /* attach fmod_ret */ + link = bpf_program__attach_trace(fmodret_prog); + if (CHECK(IS_ERR(link), "attach fmod_ret", "err %ld\n", PTR_ERR(link))) + goto cleanup; + test_run("fmod_ret"); + bpf_link__destroy(link); cleanup: prctl(PR_SET_NAME, comm, 0L, 0L, 0L); bpf_object__close(obj); diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 1f6ccdaed1ac..781c8d11604b 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -55,31 +55,40 @@ void test_trampoline_count(void) /* attach 'allowed' 40 trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + obj = NULL; goto cleanup; + } err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) goto cleanup; inst[i].obj = obj; + obj = NULL; if (rand() % 2) { - link = load(obj, fentry_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) + link = load(inst[i].obj, fentry_name); + if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + link = NULL; goto cleanup; + } inst[i].link_fentry = link; } else { - link = load(obj, fexit_name); - if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) + link = load(inst[i].obj, fexit_name); + if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) { + link = NULL; goto cleanup; + } inst[i].link_fexit = link; } } /* and try 1 extra.. */ obj = bpf_object__open_file(object, NULL); - if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) + if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { + obj = NULL; goto cleanup; + } err = bpf_object__load(obj); if (CHECK(err, "obj_load", "err %d\n", err)) @@ -104,7 +113,9 @@ void test_trampoline_count(void) cleanup_extra: bpf_object__close(obj); cleanup: - while (--i) { + if (i >= MAX_TRAMP_PROGS) + i = MAX_TRAMP_PROGS - 1; + for (; i >= 0; i--) { bpf_link__destroy(inst[i].link_fentry); bpf_link__destroy(inst[i].link_fexit); bpf_object__close(inst[i].obj); diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c new file mode 100644 index 000000000000..72310cfc6474 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <test_progs.h> +#include <time.h> +#include "test_vmlinux.skel.h" + +#define MY_TV_NSEC 1337 + +static void nsleep() +{ + struct timespec ts = { .tv_nsec = MY_TV_NSEC }; + + (void)syscall(__NR_nanosleep, &ts, NULL); +} + +void test_vmlinux(void) +{ + int duration = 0, err; + struct test_vmlinux* skel; + struct test_vmlinux__bss *bss; + + skel = test_vmlinux__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + bss = skel->bss; + + err = test_vmlinux__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger everything */ + nsleep(); + + CHECK(!bss->tp_called, "tp", "not called\n"); + CHECK(!bss->raw_tp_called, "raw_tp", "not called\n"); + CHECK(!bss->tp_btf_called, "tp_btf", "not called\n"); + CHECK(!bss->kprobe_called, "kprobe", "not called\n"); + CHECK(!bss->fentry_called, "fentry", "not called\n"); + +cleanup: + test_vmlinux__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c index dcb5ecac778e..48921ff74850 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_xdp(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 3744196d7cba..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -20,10 +21,121 @@ void test_xdp_adjust_tail(void) "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + .data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size %d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + if (test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + test_xdp_adjust_tail_grow2(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c new file mode 100644 index 000000000000..15ef3531483e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +#define IFINDEX_LO 1 +#define XDP_FLAGS_REPLACE (1U << 4) + +void test_xdp_attach(void) +{ + __u32 duration = 0, id1, id2, id0 = 0, len; + struct bpf_object *obj1, *obj2, *obj3; + const char *file = "./test_xdp.o"; + struct bpf_prog_info info = {}; + int err, fd1, fd2, fd3; + DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, + .old_fd = -1); + + len = sizeof(info); + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1); + if (CHECK_FAIL(err)) + return; + err = bpf_obj_get_info_by_fd(fd1, &info, &len); + if (CHECK_FAIL(err)) + goto out_1; + id1 = info.id; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj2, &fd2); + if (CHECK_FAIL(err)) + goto out_1; + + memset(&info, 0, sizeof(info)); + err = bpf_obj_get_info_by_fd(fd2, &info, &len); + if (CHECK_FAIL(err)) + goto out_2; + id2 = info.id; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj3, &fd3); + if (CHECK_FAIL(err)) + goto out_2; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd1, XDP_FLAGS_REPLACE, + &opts); + if (CHECK(err, "load_ok", "initial load failed")) + goto out_close; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + if (CHECK(err || id0 != id1, "id1_check", + "loaded prog id %u != id1 %u, err %d", id0, id1, err)) + goto out_close; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, XDP_FLAGS_REPLACE, + &opts); + if (CHECK(!err, "load_fail", "load with expected id didn't fail")) + goto out; + + opts.old_fd = fd1; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, 0, &opts); + if (CHECK(err, "replace_ok", "replace valid old_fd failed")) + goto out; + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + if (CHECK(err || id0 != id2, "id2_check", + "loaded prog id %u != id2 %u, err %d", id0, id2, err)) + goto out_close; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd3, 0, &opts); + if (CHECK(!err, "replace_fail", "replace invalid old_fd didn't fail")) + goto out; + + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts); + if (CHECK(!err, "remove_fail", "remove invalid old_fd didn't fail")) + goto out; + + opts.old_fd = fd2; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts); + if (CHECK(err, "remove_ok", "remove valid old_fd failed")) + goto out; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + if (CHECK(err || id0 != 0, "unload_check", + "loaded prog id %u != 0, err %d", id0, err)) + goto out_close; +out: + bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0); +out_close: + bpf_object__close(obj3); +out_2: + bpf_object__close(obj2); +out_1: + bpf_object__close(obj1); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 6b56bdc73ebc..2c6c570b21f8 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -1,20 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> #include <net/if.h> #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" +struct meta { + int ifindex; + int pkt_len; +}; + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int duration = 0; + struct meta *meta = (struct meta *)data; + struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + + if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta), + "check_size", "size %u < %zu\n", + size, sizeof(pkt_v4) + sizeof(*meta))) + return; + + if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex", + "meta->ifindex = %d\n", meta->ifindex)) + return; + + if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len", + "meta->pkt_len = %zd\n", sizeof(pkt_v4))) + return; + + if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), + "check_packet_content", "content not the same\n")) + return; + + *(bool *)ctx = true; +} + void test_xdp_bpf2bpf(void) { __u32 duration = 0, retval, size; char buf[128]; int err, pkt_fd, map_fd; + bool passed = false; struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + struct bpf_program *prog; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; /* Load XDP program to introspect */ pkt_skel = test_xdp__open_and_load(); @@ -27,11 +62,21 @@ void test_xdp_bpf2bpf(void) bpf_map_update_elem(map_fd, &key4, &value4, 0); /* Load trace program */ - opts.attach_prog_fd = pkt_fd, - ftrace_skel = test_xdp_bpf2bpf__open_opts(&opts); + ftrace_skel = test_xdp_bpf2bpf__open(); if (CHECK(!ftrace_skel, "__open", "ftrace skeleton failed\n")) goto out; + /* Demonstrate the bpf_program__set_attach_target() API rather than + * the load with options, i.e. opts.attach_prog_fd. + */ + prog = ftrace_skel->progs.trace_on_entry; + bpf_program__set_expected_attach_type(prog, BPF_TRACE_FENTRY); + bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel"); + + prog = ftrace_skel->progs.trace_on_exit; + bpf_program__set_expected_attach_type(prog, BPF_TRACE_FEXIT); + bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel"); + err = test_xdp_bpf2bpf__load(ftrace_skel); if (CHECK(err, "__load", "ftrace skeleton failed\n")) goto out; @@ -40,6 +85,14 @@ void test_xdp_bpf2bpf(void) if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) goto out; + /* Set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &passed; + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), + 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out; + /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); @@ -50,6 +103,15 @@ void test_xdp_bpf2bpf(void) err, errno, retval, size)) goto out; + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. + */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out; + + CHECK_FAIL(!passed); + /* Verify test results */ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), "result", "fentry failed err %llu\n", @@ -60,6 +122,8 @@ void test_xdp_bpf2bpf(void) "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); out: + if (pb) + perf_buffer__free(pb); test_xdp__destroy(pkt_skel); test_xdp_bpf2bpf__destroy(ftrace_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c new file mode 100644 index 000000000000..d19dbd668f6a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <uapi/linux/bpf.h> +#include <linux/if_link.h> +#include <test_progs.h> + +#include "test_xdp_devmap_helpers.skel.h" +#include "test_xdp_with_devmap_helpers.skel.h" + +#define IFINDEX_LO 1 + +struct bpf_devmap_val { + u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; +}; + +void test_xdp_with_devmap_helpers(void) +{ + struct test_xdp_with_devmap_helpers *skel; + struct bpf_prog_info info = {}; + struct bpf_devmap_val val = { + .ifindex = IFINDEX_LO, + }; + __u32 len = sizeof(info); + __u32 duration = 0, idx = 0; + int err, dm_fd, map_fd; + + + skel = test_xdp_with_devmap_helpers__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_xdp_with_devmap_helpers__open_and_load"); + return; + } + + /* can not attach program with DEVMAPs that allow programs + * as xdp generic + */ + dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog); + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Generic attach of program with 8-byte devmap", + "should have failed\n"); + + dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm); + map_fd = bpf_map__fd(skel->maps.dm_ports); + err = bpf_obj_get_info_by_fd(dm_fd, &info, &len); + if (CHECK_FAIL(err)) + goto out_close; + + val.bpf_prog.fd = dm_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err, "Add program to devmap entry", + "err %d errno %d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &idx, &val); + CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno); + CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry", + "expected %u read %u\n", info.id, val.bpf_prog.id); + + /* can not attach BPF_XDP_DEVMAP program to a device */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE); + CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program", + "should have failed\n"); + + val.ifindex = 1; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry", + "should have failed\n"); + +out_close: + test_xdp_with_devmap_helpers__destroy(skel); +} + +void test_neg_xdp_devmap_helpers(void) +{ + struct test_xdp_devmap_helpers *skel; + __u32 duration = 0; + + skel = test_xdp_devmap_helpers__open_and_load(); + if (CHECK(skel, + "Load of XDP program accessing egress ifindex without attach type", + "should have failed\n")) { + test_xdp_devmap_helpers__destroy(skel); + } +} + + +void test_xdp_devmap_attach(void) +{ + if (test__start_subtest("DEVMAP with programs in entries")) + test_xdp_with_devmap_helpers(); + + if (test__start_subtest("Verifier check of DEVMAP programs")) + test_neg_xdp_devmap_helpers(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c new file mode 100644 index 000000000000..d2d7a283d72f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/if_link.h> +#include <test_progs.h> + +#define IFINDEX_LO 1 + +void test_xdp_info(void) +{ + __u32 len = sizeof(struct bpf_prog_info), duration = 0, prog_id; + const char *file = "./xdp_dummy.o"; + struct bpf_prog_info info = {}; + struct bpf_object *obj; + int err, prog_fd; + + /* Get prog_id for XDP_ATTACHED_NONE mode */ + + err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0); + if (CHECK(err, "get_xdp_none", "errno=%d\n", errno)) + return; + if (CHECK(prog_id, "prog_id_none", "unexpected prog_id=%u\n", prog_id)) + return; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE); + if (CHECK(err, "get_xdp_none_skb", "errno=%d\n", errno)) + return; + if (CHECK(prog_id, "prog_id_none_skb", "unexpected prog_id=%u\n", + prog_id)) + return; + + /* Setup prog */ + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &len); + if (CHECK(err, "get_prog_info", "errno=%d\n", errno)) + goto out_close; + + err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE); + if (CHECK(err, "set_xdp_skb", "errno=%d\n", errno)) + goto out_close; + + /* Get prog_id for single prog mode */ + + err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0); + if (CHECK(err, "get_xdp", "errno=%d\n", errno)) + goto out; + if (CHECK(prog_id != info.id, "prog_id", "prog_id not available\n")) + goto out; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE); + if (CHECK(err, "get_xdp_skb", "errno=%d\n", errno)) + goto out; + if (CHECK(prog_id != info.id, "prog_id_skb", "prog_id not available\n")) + goto out; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_DRV_MODE); + if (CHECK(err, "get_xdp_drv", "errno=%d\n", errno)) + goto out; + if (CHECK(prog_id, "prog_id_drv", "unexpected prog_id=%u\n", prog_id)) + goto out; + +out: + bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0); +out_close: + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index c9404e6b226e..f284f72158ef 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> void test_xdp_noinline(void) { diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index b631fb5032d2..3fb4260570b1 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,14 +6,24 @@ * the kernel BPF logic. */ +#include <stddef.h> #include <linux/bpf.h> #include <linux/types.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> #include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; +int stg_result = 0; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + #define DCTCP_MAX_ALPHA 1024U struct dctcp { @@ -43,12 +53,18 @@ void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); + int *stg; ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; ca->ce_state = 0; + stg = bpf_sk_storage_get(&sk_stg_map, (void *)tp, NULL, 0); + if (stg) { + stg_result = *stg; + bpf_sk_storage_delete(&sk_stg_map, (void *)tp); + } dctcp_reset(tp, ca); } diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 9941f0ba471e..de6de9221518 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -20,20 +20,20 @@ #include <bpf/bpf_endian.h> int _version SEC("version") = 1; -#define PROG(F) SEC(#F) int bpf_func_##F +#define PROG(F) PROG_(F, _##F) +#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME /* These are the identifiers of the BPF programs that will be used in tail * calls. Name is limited to 16 characters, with the terminating character and * bpf_func_ above, we have only 6 to work with, anything after will be cropped. */ -enum { - IP, - IPV6, - IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ - IPV6FR, /* Fragmentation IPv6 Extension Header */ - MPLS, - VLAN, -}; +#define IP 0 +#define IPV6 1 +#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. Header */ +#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */ +#define MPLS 4 +#define VLAN 5 +#define MAX_PROG 6 #define IP_MF 0x2000 #define IP_OFFSET 0x1FFF @@ -59,7 +59,7 @@ struct frag_hdr { struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); - __uint(max_entries, 8); + __uint(max_entries, MAX_PROG); __uint(key_size, sizeof(__u32)); __uint(value_size, sizeof(__u32)); } jmp_table SEC(".maps"); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c new file mode 100644 index 000000000000..b57bd6fef208 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (map == (void *)0) { + BPF_SEQ_PRINTF(seq, " %%%%%% END %%%%%%\n"); + return 0; + } + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id refcnt usercnt locked_vm\n"); + + BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, + map->usercnt.counter, + map->memory.user->locked_vm.counter); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c new file mode 100644 index 000000000000..c8e9ca74c87b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__ipv6_route +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__ipv6_route { + struct bpf_iter_meta *meta; + struct fib6_info *rt; +} __attribute__((preserve_access_index)); + +char _license[] SEC("license") = "GPL"; + +extern bool CONFIG_IPV6_SUBTREES __kconfig __weak; + +#define RTF_GATEWAY 0x0002 +#define IFNAMSIZ 16 +#define fib_nh_gw_family nh_common.nhc_gw_family +#define fib_nh_gw6 nh_common.nhc_gw.ipv6 +#define fib_nh_dev nh_common.nhc_dev + +SEC("iter/ipv6_route") +int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct fib6_info *rt = ctx->rt; + const struct net_device *dev; + struct fib6_nh *fib6_nh; + unsigned int flags; + struct nexthop *nh; + + if (rt == (void *)0) + return 0; + + fib6_nh = &rt->fib6_nh[0]; + flags = rt->fib6_flags; + + /* FIXME: nexthop_is_multipath is not handled here. */ + nh = rt->nh; + if (rt->nh) + fib6_nh = &nh->nh_info->fib6_nh; + + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + + if (CONFIG_IPV6_SUBTREES) + BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr, + rt->fib6_src.plen); + else + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 "); + + if (fib6_nh->fib_nh_gw_family) { + flags |= RTF_GATEWAY; + BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6); + } else { + BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 "); + } + + dev = fib6_nh->fib_nh_dev; + if (dev) + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags, dev->name); + else + BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric, + rt->fib6_ref.refs.counter, 0, flags); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c new file mode 100644 index 000000000000..e7b8753eac0b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__netlink bpf_iter__netlink___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__netlink +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +#define sk_rmem_alloc sk_backlog.rmem_alloc +#define sk_refcnt __sk_common.skc_refcnt + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__netlink { + struct bpf_iter_meta *meta; + struct netlink_sock *sk; +} __attribute__((preserve_access_index)); + +static inline struct inode *SOCK_INODE(struct socket *socket) +{ + return &container_of(socket, struct socket_alloc, socket)->vfs_inode; +} + +SEC("iter/netlink") +int dump_netlink(struct bpf_iter__netlink *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct netlink_sock *nlk = ctx->sk; + unsigned long group, ino; + struct inode *inode; + struct socket *sk; + struct sock *s; + + if (nlk == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, "sk Eth Pid Groups " + "Rmem Wmem Dump Locks Drops " + "Inode\n"); + + s = &nlk->sk; + BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol); + + if (!nlk->groups) { + group = 0; + } else { + /* FIXME: temporary use bpf_probe_read here, needs + * verifier support to do direct access. + */ + bpf_probe_read(&group, sizeof(group), &nlk->groups[0]); + } + BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ", + nlk->portid, (u32)group, + s->sk_rmem_alloc.counter, + s->sk_wmem_alloc.refs.counter - 1, + nlk->cb_running, s->sk_refcnt.refs.counter); + + sk = s->sk_socket; + if (!sk) { + ino = 0; + } else { + /* FIXME: container_of inside SOCK_INODE has a forced + * type conversion, and direct access cannot be used + * with current verifier. + */ + inode = SOCK_INODE(sk); + bpf_probe_read(&ino, sizeof(ino), &inode->i_ino); + } + BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c new file mode 100644 index 000000000000..ee754021f98e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + + if (task == (void *)0) { + BPF_SEQ_PRINTF(seq, " === END ===\n"); + return 0; + } + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c new file mode 100644 index 000000000000..0f0ec3db20ba --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task_file bpf_iter__task_file___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task_file +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task_file { + struct bpf_iter_meta *meta; + struct task_struct *task; + __u32 fd; + struct file *file; +} __attribute__((preserve_access_index)); + +SEC("iter/task_file") +int dump_task_file(struct bpf_iter__task_file *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + __u32 fd = ctx->fd; + struct file *file = ctx->file; + + if (task == (void *)0 || file == (void *)0) + return 0; + + if (ctx->meta->seq_num == 0) + BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + + BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, + (long)file->f_op); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c new file mode 100644 index 000000000000..c71a7c283108 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'a' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c new file mode 100644 index 000000000000..8bdc8dc07444 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define START_CHAR 'A' +#include "bpf_iter_test_kern_common.h" diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c new file mode 100644 index 000000000000..13c2c90c835f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + int tgid; + + tgid = task->tgid; + bpf_seq_write(seq, &tgid, sizeof(tgid)); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c new file mode 100644 index 000000000000..0aa71b333cf3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + +__u32 map1_id = 0, map2_id = 0; +__u32 map1_accessed = 0, map2_accessed = 0; +__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; + +static volatile const __u32 print_len; +static volatile const __u32 ret1; + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct bpf_map *map = ctx->map; + __u64 seq_num; + int i, ret = 0; + + if (map == (void *)0) + return 0; + + /* only dump map1_id and map2_id */ + if (map->id != map1_id && map->id != map2_id) + return 0; + + seq_num = ctx->meta->seq_num; + if (map->id == map1_id) { + map1_seqnum = seq_num; + map1_accessed++; + } + + if (map->id == map2_id) { + if (map2_accessed == 0) { + map2_seqnum1 = seq_num; + if (ret1) + ret = 1; + } else { + map2_seqnum2 = seq_num; + } + map2_accessed++; + } + + /* fill seq_file buffer */ + for (i = 0; i < print_len; i++) + bpf_seq_write(seq, &seq_num, sizeof(seq_num)); + + return ret; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h new file mode 100644 index 000000000000..dee1339e6905 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used +#include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; +int count = 0; + +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + +SEC("iter/task") +int dump_task(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + char c; + + if (count < 4) { + c = START_CHAR + count; + bpf_seq_write(seq, &c, sizeof(c)); + count++; + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index d4a02fe44a12..31975c96e2c9 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -13,7 +13,7 @@ enum e1 { enum e2 { C = 100, - D = -100, + D = 4294967295, E = 0, }; diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c new file mode 100644 index 000000000000..3f757e30d7a0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> + +#include <sys/types.h> +#include <sys/socket.h> + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +__u16 g_serv_port = 0; + +static inline void set_ip(__u32 *dst, const struct in6_addr *src) +{ + dst[0] = src->in6_u.u6_addr32[0]; + dst[1] = src->in6_u.u6_addr32[1]; + dst[2] = src->in6_u.u6_addr32[2]; + dst[3] = src->in6_u.u6_addr32[3]; +} + +static inline void set_tuple(struct bpf_sock_tuple *tuple, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + set_ip(tuple->ipv6.saddr, &ip6h->daddr); + set_ip(tuple->ipv6.daddr, &ip6h->saddr); + tuple->ipv6.sport = tcph->dest; + tuple->ipv6.dport = tcph->source; +} + +static inline int is_allowed_peer_cg(struct __sk_buff *skb, + const struct ipv6hdr *ip6h, + const struct tcphdr *tcph) +{ + __u64 cgid, acgid, peer_cgid, peer_acgid; + struct bpf_sock_tuple tuple; + size_t tuple_len = sizeof(tuple.ipv6); + struct bpf_sock *peer_sk; + + set_tuple(&tuple, ip6h, tcph); + + peer_sk = bpf_sk_lookup_tcp(skb, &tuple, tuple_len, + BPF_F_CURRENT_NETNS, 0); + if (!peer_sk) + return 0; + + cgid = bpf_skb_cgroup_id(skb); + peer_cgid = bpf_sk_cgroup_id(peer_sk); + + acgid = bpf_skb_ancestor_cgroup_id(skb, 2); + peer_acgid = bpf_sk_ancestor_cgroup_id(peer_sk, 2); + + bpf_sk_release(peer_sk); + + return cgid && cgid == peer_cgid && acgid && acgid == peer_acgid; +} + +SEC("cgroup_skb/ingress") +int ingress_lookup(struct __sk_buff *skb) +{ + __u32 serv_port_key = 0; + struct ipv6hdr ip6h; + struct tcphdr tcph; + + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return 1; + + /* For SYN packets coming to listening socket skb->remote_port will be + * zero, so IPv6/TCP headers are loaded to identify remote peer + * instead. + */ + if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h))) + return 1; + + if (ip6h.nexthdr != IPPROTO_TCP) + return 1; + + if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph))) + return 1; + + if (!g_serv_port) + return 0; + + if (tcph.dest != g_serv_port) + return 1; + + return is_allowed_peer_cg(skb, &ip6h, &tcph); +} diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 75085119c5bb..1ab2c5eba86c 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,9 @@ #include <linux/in.h> #include <linux/in6.h> #include <sys/socket.h> +#include <netinet/tcp.h> +#include <linux/if.h> +#include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> @@ -16,13 +19,95 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif + int _version SEC("version") = 1; +__attribute__ ((noinline)) +int do_bind(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(0); + sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} + +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char reno[TCP_CA_NAME_MAX] = "reno"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno))) + return 1; + if (verify_cc(ctx, reno)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + +static __inline int bind_to_device(struct bpf_sock_addr *ctx) +{ + char veth1[IFNAMSIZ] = "test_sock_addr1"; + char veth2[IFNAMSIZ] = "test_sock_addr2"; + char missing[IFNAMSIZ] = "nonexistent_dev"; + char del_bind[IFNAMSIZ] = ""; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth1, sizeof(veth1))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &veth2, sizeof(veth2))) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &missing, sizeof(missing)) != -ENODEV) + return 1; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE, + &del_bind, sizeof(del_bind))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { struct bpf_sock_tuple tuple = {}; - struct sockaddr_in sa; struct bpf_sock *sk; /* Verify that new destination is available. */ @@ -32,6 +117,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) tuple.ipv4.daddr = bpf_htonl(DST_REWRITE_IP4); tuple.ipv4.dport = bpf_htons(DST_REWRITE_PORT4); + /* Bind to device and unbind it. */ + if (bind_to_device(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) @@ -52,21 +141,15 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); - /* Rewrite source. */ - memset(&sa, 0, sizeof(sa)); - - sa.sin_family = AF_INET; - sa.sin_port = bpf_htons(0); - sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4); - - if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) - return 0; - - return 1; + return do_bind(ctx) ? 1 : 0; } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..7396308677a3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> +#include <stdbool.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +struct svc_addr { + __be32 addr; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + +SEC("cgroup/connect4") +int connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + struct svc_addr *orig; + + /* Force local address to 127.0.0.1:22222. */ + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr = ctx->user_ip4; + orig->port = ctx->user_port; + + ctx->user_ip4 = bpf_htonl(0x7f000001); + ctx->user_port = bpf_htons(60123); + } + return 1; +} + +SEC("cgroup/getsockname4") +int getsockname4(struct bpf_sock_addr *ctx) +{ + /* Expose local server as 1.2.3.4:60000 to client. */ + if (ctx->user_port == bpf_htons(60123)) { + ctx->user_ip4 = bpf_htonl(0x01020304); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername4") +int getpeername4(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60123)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip4 = orig->addr; + ctx->user_port = orig->port; + } + } + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..c1a2b555e9ad --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> + +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +struct svc_addr { + __be32 addr[4]; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + +SEC("cgroup/connect6") +int connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + struct svc_addr *orig; + + /* Force local address to [::1]:22223. */ + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr[0] = ctx->user_ip6[0]; + orig->addr[1] = ctx->user_ip6[1]; + orig->addr[2] = ctx->user_ip6[2]; + orig->addr[3] = ctx->user_ip6[3]; + orig->port = ctx->user_port; + + ctx->user_ip6[0] = 0; + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60124); + } + return 1; +} + +SEC("cgroup/getsockname6") +int getsockname6(struct bpf_sock_addr *ctx) +{ + /* Expose local server as [fc00::1]:60000 to client. */ + if (ctx->user_port == bpf_htons(60124)) { + ctx->user_ip6[0] = bpf_htonl(0xfc000000); + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername6") +int getpeername6(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service [fc00::1]:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60124)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip6[0] = orig->addr[0]; + ctx->user_ip6[1] = orig->addr[1]; + ctx->user_ip6[2] = orig->addr[2]; + ctx->user_ip6[3] = orig->addr[3]; + ctx->user_port = orig->port; + } + } + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 6d598cfbdb3e..34d84717c946 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -379,7 +379,7 @@ struct core_reloc_arrays___equiv_zero_sz_arr { struct core_reloc_arrays_substruct c[3]; struct core_reloc_arrays_substruct d[1][2]; /* equivalent to flexible array */ - struct core_reloc_arrays_substruct f[0][2]; + struct core_reloc_arrays_substruct f[][2]; }; struct core_reloc_arrays___fixed_arr { diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 38d3a82144ca..9365b686f84b 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index c329fccf9842..98e1efe14549 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -5,7 +5,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct sk_buff { unsigned int len; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c index 92f3fa47cf40..85c0b516d6ee 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct sk_buff { unsigned int len; diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 348109b9ea07..bd1e17d8024c 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/freplace_connect4.c b/tools/testing/selftests/bpf/progs/freplace_connect4.c new file mode 100644 index 000000000000..a0ae84230699 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/freplace_connect4.c @@ -0,0 +1,18 @@ +#include <linux/stddef.h> +#include <linux/ipv6.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +SEC("freplace/do_bind") +int new_do_bind(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + + bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c index 8f48a909f079..a46a264ce24e 100644 --- a/tools/testing/selftests/bpf/progs/kfree_skb.c +++ b/tools/testing/selftests/bpf/progs/kfree_skb.c @@ -4,7 +4,7 @@ #include <stdbool.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c new file mode 100644 index 000000000000..b4598d4bc4f7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <errno.h> + +char _license[] SEC("license") = "GPL"; + +int monitored_pid = 0; +int mprotect_count = 0; +int bprm_count = 0; + +SEC("lsm/file_mprotect") +int BPF_PROG(test_int_hook, struct vm_area_struct *vma, + unsigned long reqprot, unsigned long prot, int ret) +{ + if (ret != 0) + return ret; + + __u32 pid = bpf_get_current_pid_tgid() >> 32; + int is_stack = 0; + + is_stack = (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack); + + if (is_stack && monitored_pid == pid) { + mprotect_count++; + ret = -EPERM; + } + + return ret; +} + +SEC("lsm/bprm_committed_creds") +int BPF_PROG(test_void_hook, struct linux_binprm *bprm) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + + if (monitored_pid == pid) + bprm_count++; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/modify_return.c b/tools/testing/selftests/bpf/progs/modify_return.c new file mode 100644 index 000000000000..8b7466a15c6b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/modify_return.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 Google LLC. + */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +static int sequence = 0; +__s32 input_retval = 0; + +__u64 fentry_result = 0; +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(fentry_test, int a, __u64 b) +{ + sequence++; + fentry_result = (sequence == 1); + return 0; +} + +__u64 fmod_ret_result = 0; +SEC("fmod_ret/bpf_modify_return_test") +int BPF_PROG(fmod_ret_test, int a, int *b, int ret) +{ + sequence++; + /* This is the first fmod_ret program, the ret passed should be 0 */ + fmod_ret_result = (sequence == 2 && ret == 0); + return input_retval; +} + +__u64 fexit_result = 0; +SEC("fexit/bpf_modify_return_test") +int BPF_PROG(fexit_test, int a, __u64 b, int ret) +{ + sequence++; + /* If the input_reval is non-zero a successful modification should have + * occurred. + */ + if (input_retval) + fexit_result = (sequence == 3 && ret == input_retval); + else + fexit_result = (sequence == 3 && ret == 4); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/perfbuf_bench.c b/tools/testing/selftests/bpf/progs/perfbuf_bench.c new file mode 100644 index 000000000000..e5ab4836a641 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/perfbuf_bench.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(value_size, sizeof(int)); + __uint(key_size, sizeof(int)); +} perfbuf SEC(".maps"); + +const volatile int batch_cnt = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +SEC("fentry/__x64_sys_getpgid") +int bench_perfbuf(void *ctx) +{ + __u64 *sample; + int i; + + for (i = 0; i < batch_cnt; i++) { + if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU, + &sample_val, sizeof(sample_val))) + __sync_add_and_fetch(&dropped, 1); + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c new file mode 100644 index 000000000000..123607d314d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); +} ringbuf SEC(".maps"); + +const volatile int batch_cnt = 0; +const volatile long use_output = 0; + +long sample_val = 42; +long dropped __attribute__((aligned(128))) = 0; + +const volatile long wakeup_data_size = 0; + +static __always_inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_ringbuf(void *ctx) +{ + long *sample, flags; + int i; + + if (!use_output) { + for (i = 0; i < batch_cnt; i++) { + sample = bpf_ringbuf_reserve(&ringbuf, + sizeof(sample_val), 0); + if (!sample) { + __sync_add_and_fetch(&dropped, 1); + } else { + *sample = sample_val; + flags = get_flags(); + bpf_ringbuf_submit(sample, flags); + } + } + } else { + for (i = 0; i < batch_cnt; i++) { + flags = get_flags(); + if (bpf_ringbuf_output(&ringbuf, &sample_val, + sizeof(sample_val), flags)) + __sync_add_and_fetch(&dropped, 1); + } + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c index a5c6d5903b22..ca283af80d4e 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c @@ -12,7 +12,6 @@ int bpf_prog1(struct __sk_buff *skb) __u32 lport = skb->local_port; __u32 rport = skb->remote_port; __u8 *d = data; - __u32 len = (__u32) data_end - (__u32) data; int err; if (data + 10 > data_end) { diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index dd8fae6660ab..8056a4c6d918 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -4,6 +4,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> int kprobe_res = 0; int kretprobe_res = 0; @@ -18,7 +19,7 @@ int handle_kprobe(struct pt_regs *ctx) } SEC("kretprobe/sys_nanosleep") -int handle_kretprobe(struct pt_regs *ctx) +int BPF_KRETPROBE(handle_kretprobe) { kretprobe_res = 2; return 0; diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c index 88b0566da13d..31538c9ed193 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_haskv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_haskv.c @@ -20,20 +20,12 @@ struct bpf_map_def SEC("maps") btf_map = { BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); -struct dummy_tracepoint_args { - unsigned long long pad; - struct sock *sock; -}; - __attribute__((noinline)) -int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(void) { struct ipv_counts *counts; int key = 0; - if (!arg->sock) - return 0; - counts = bpf_map_lookup_elem(&btf_map, &key); if (!counts) return 0; @@ -44,15 +36,15 @@ int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(void) { - return test_long_fname_2(arg); + return test_long_fname_2(); } SEC("dummy_tracepoint") -int _dummy_tracepoint(struct dummy_tracepoint_args *arg) +int _dummy_tracepoint(void *arg) { - return test_long_fname_1(arg); + return test_long_fname_1(); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c new file mode 100644 index 000000000000..e5093796be97 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct inner_map { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} inner_map1 SEC(".maps"), + inner_map2 SEC(".maps"); + +struct outer_arr { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 3); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + /* it's possible to use anonymous struct as inner map definition here */ + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + /* changing max_entries to 2 will fail during load + * due to incompatibility with inner_map definition */ + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + }); +} outer_arr SEC(".maps") = { + /* (void *) cast is necessary because we didn't use `struct inner_map` + * in __inner(values, ...) + * Actually, a conscious effort is required to screw up initialization + * of inner map slots, which is a great thing! + */ + .values = { (void *)&inner_map1, 0, (void *)&inner_map2 }, +}; + +struct outer_hash { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 5); + __uint(key_size, sizeof(int)); + /* Here everything works flawlessly due to reuse of struct inner_map + * and compiler will complain at the attempt to use non-inner_map + * references below. This is great experience. + */ + __array(values, struct inner_map); +} outer_hash SEC(".maps") = { + .values = { + [0] = &inner_map2, + [4] = &inner_map1, + }, +}; + +int input = 0; + +SEC("raw_tp/sys_enter") +int handle__sys_enter(void *ctx) +{ + struct inner_map *inner_map; + int key = 0, val; + + inner_map = bpf_map_lookup_elem(&outer_arr, &key); + if (!inner_map) + return 1; + val = input; + bpf_map_update_elem(inner_map, &key, &val, 0); + + inner_map = bpf_map_lookup_elem(&outer_hash, &key); + if (!inner_map) + return 1; + val = input + 1; + bpf_map_update_elem(inner_map, &key, &val, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c index a924e53c8e9d..6c5560162746 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c @@ -28,20 +28,12 @@ struct { __type(value, struct ipv_counts); } btf_map SEC(".maps"); -struct dummy_tracepoint_args { - unsigned long long pad; - struct sock *sock; -}; - __attribute__((noinline)) -int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(void) { struct ipv_counts *counts; int key = 0; - if (!arg->sock) - return 0; - counts = bpf_map_lookup_elem(&btf_map, &key); if (!counts) return 0; @@ -57,15 +49,15 @@ int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(void) { - return test_long_fname_2(arg); + return test_long_fname_2(); } SEC("dummy_tracepoint") -int _dummy_tracepoint(struct dummy_tracepoint_args *arg) +int _dummy_tracepoint(void *arg) { - return test_long_fname_1(arg); + return test_long_fname_1(); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_nokv.c b/tools/testing/selftests/bpf/progs/test_btf_nokv.c index 983aedd1c072..506da7fd2da2 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_nokv.c +++ b/tools/testing/selftests/bpf/progs/test_btf_nokv.c @@ -17,20 +17,12 @@ struct bpf_map_def SEC("maps") btf_map = { .max_entries = 4, }; -struct dummy_tracepoint_args { - unsigned long long pad; - struct sock *sock; -}; - __attribute__((noinline)) -int test_long_fname_2(struct dummy_tracepoint_args *arg) +int test_long_fname_2(void) { struct ipv_counts *counts; int key = 0; - if (!arg->sock) - return 0; - counts = bpf_map_lookup_elem(&btf_map, &key); if (!counts) return 0; @@ -41,15 +33,15 @@ int test_long_fname_2(struct dummy_tracepoint_args *arg) } __attribute__((noinline)) -int test_long_fname_1(struct dummy_tracepoint_args *arg) +int test_long_fname_1(void) { - return test_long_fname_2(arg); + return test_long_fname_2(); } SEC("dummy_tracepoint") -int _dummy_tracepoint(struct dummy_tracepoint_args *arg) +int _dummy_tracepoint(void *arg) { - return test_long_fname_1(arg); + return test_long_fname_1(); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_cgroup_link.c b/tools/testing/selftests/bpf/progs/test_cgroup_link.c new file mode 100644 index 000000000000..77e47b9e4446 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cgroup_link.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +int calls = 0; +int alt_calls = 0; + +SEC("cgroup_skb/egress1") +int egress(struct __sk_buff *skb) +{ + __sync_fetch_and_add(&calls, 1); + return 1; +} + +SEC("cgroup_skb/egress2") +int egress_alt(struct __sk_buff *skb) +{ + __sync_fetch_and_add(&alt_calls, 1); + return 1; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c new file mode 100644 index 000000000000..f0b72e86bee5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2019, 2020 Cloudflare + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <linux/bpf.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/pkt_cls.h> +#include <linux/tcp.h> +#include <linux/udp.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include "test_cls_redirect.h" + +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) + +#define IP_OFFSET_MASK (0x1FFF) +#define IP_MF (0x2000) + +char _license[] SEC("license") = "Dual BSD/GPL"; + +/** + * Destination port and IP used for UDP encapsulation. + */ +static volatile const __be16 ENCAPSULATION_PORT; +static volatile const __be32 ENCAPSULATION_IP; + +typedef struct { + uint64_t processed_packets_total; + uint64_t l3_protocol_packets_total_ipv4; + uint64_t l3_protocol_packets_total_ipv6; + uint64_t l4_protocol_packets_total_tcp; + uint64_t l4_protocol_packets_total_udp; + uint64_t accepted_packets_total_syn; + uint64_t accepted_packets_total_syn_cookies; + uint64_t accepted_packets_total_last_hop; + uint64_t accepted_packets_total_icmp_echo_request; + uint64_t accepted_packets_total_established; + uint64_t forwarded_packets_total_gue; + uint64_t forwarded_packets_total_gre; + + uint64_t errors_total_unknown_l3_proto; + uint64_t errors_total_unknown_l4_proto; + uint64_t errors_total_malformed_ip; + uint64_t errors_total_fragmented_ip; + uint64_t errors_total_malformed_icmp; + uint64_t errors_total_unwanted_icmp; + uint64_t errors_total_malformed_icmp_pkt_too_big; + uint64_t errors_total_malformed_tcp; + uint64_t errors_total_malformed_udp; + uint64_t errors_total_icmp_echo_replies; + uint64_t errors_total_malformed_encapsulation; + uint64_t errors_total_encap_adjust_failed; + uint64_t errors_total_encap_buffer_too_small; + uint64_t errors_total_redirect_loop; +} metrics_t; + +typedef enum { + INVALID = 0, + UNKNOWN, + ECHO_REQUEST, + SYN, + SYN_COOKIE, + ESTABLISHED, +} verdict_t; + +typedef struct { + uint16_t src, dst; +} flow_ports_t; + +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv4.dport) - + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); +_Static_assert( + sizeof(flow_ports_t) != + offsetofend(struct bpf_sock_tuple, ipv6.dport) - + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); + +typedef int ret_t; + +/* This is a bit of a hack. We need a return value which allows us to + * indicate that the regular flow of the program should continue, + * while allowing functions to use XDP_PASS and XDP_DROP, etc. + */ +static const ret_t CONTINUE_PROCESSING = -1; + +/* Convenience macro to call functions which return ret_t. + */ +#define MAYBE_RETURN(x) \ + do { \ + ret_t __ret = x; \ + if (__ret != CONTINUE_PROCESSING) \ + return __ret; \ + } while (0) + +/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), + * or not aligned if the arch supports efficient unaligned access. + * + * Since the verifier ensures that eBPF packet accesses follow these rules, + * we can tell LLVM to emit code as if we always had a larger alignment. + * It will yell at us if we end up on a platform where this is not valid. + */ +typedef uint8_t *net_ptr __attribute__((align_value(8))); + +typedef struct buf { + struct __sk_buff *skb; + net_ptr head; + /* NB: tail musn't have alignment other than 1, otherwise + * LLVM will go and eliminate code, e.g. when checking packet lengths. + */ + uint8_t *const tail; +} buf_t; + +static size_t buf_off(const buf_t *buf) +{ + /* Clang seems to optimize constructs like + * a - b + c + * if c is known: + * r? = c + * r? -= b + * r? += a + * + * This is a problem if a and b are packet pointers, + * since the verifier allows subtracting two pointers to + * get a scalar, but not a scalar and a pointer. + * + * Use inline asm to break this optimization. + */ + size_t off = (size_t)buf->head; + asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); + return off; +} + +static bool buf_copy(buf_t *buf, void *dst, size_t len) +{ + if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { + return false; + } + + buf->head += len; + return true; +} + +static bool buf_skip(buf_t *buf, const size_t len) +{ + /* Check whether off + len is valid in the non-linear part. */ + if (buf_off(buf) + len > buf->skb->len) { + return false; + } + + buf->head += len; + return true; +} + +/* Returns a pointer to the start of buf, or NULL if len is + * larger than the remaining data. Consumes len bytes on a successful + * call. + * + * If scratch is not NULL, the function will attempt to load non-linear + * data via bpf_skb_load_bytes. On success, scratch is returned. + */ +static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +{ + if (buf->head + len > buf->tail) { + if (scratch == NULL) { + return NULL; + } + + return buf_copy(buf, scratch, len) ? scratch : NULL; + } + + void *ptr = buf->head; + buf->head += len; + return ptr; +} + +static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +{ + if (ipv4->ihl <= 5) { + return true; + } + + return buf_skip(buf, (ipv4->ihl - 5) * 4); +} + +static bool ipv4_is_fragment(const struct iphdr *ip) +{ + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; +} + +static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +{ + struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); + if (ipv4 == NULL) { + return NULL; + } + + if (ipv4->ihl < 5) { + return NULL; + } + + if (!pkt_skip_ipv4_options(pkt, ipv4)) { + return NULL; + } + + return ipv4; +} + +/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ +static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +{ + if (!buf_copy(pkt, ports, sizeof(*ports))) { + return false; + } + + /* Ports in the L4 headers are reversed, since we are parsing an ICMP + * payload which is going towards the eyeball. + */ + uint16_t dst = ports->src; + ports->src = ports->dst; + ports->dst = dst; + return true; +} + +static uint16_t pkt_checksum_fold(uint32_t csum) +{ + /* The highest reasonable value for an IPv4 header + * checksum requires two folds, so we just do that always. + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (uint16_t)~csum; +} + +static void pkt_ipv4_checksum(struct iphdr *iph) +{ + iph->check = 0; + + /* An IP header without options is 20 bytes. Two of those + * are the checksum, which we always set to zero. Hence, + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, + * which fits in 32 bit. + */ + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); + uint32_t acc = 0; + uint16_t *ipw = (uint16_t *)iph; + +#pragma clang loop unroll(full) + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { + acc += ipw[i]; + } + + iph->check = pkt_checksum_fold(acc); +} + +static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) +{ + /* We understand five extension headers. + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all + * headers should occur once, except Destination Options, which may + * occur twice. Hence we give up after 6 headers. + */ + struct { + uint8_t next; + uint8_t len; + } exthdr = { + .next = ipv6->nexthdr, + }; + *is_fragment = false; + +#pragma clang loop unroll(full) + for (int i = 0; i < 6; i++) { + switch (exthdr.next) { + case IPPROTO_FRAGMENT: + *is_fragment = true; + /* NB: We don't check that hdrlen == 0 as per spec. */ + /* fallthrough; */ + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + case IPPROTO_MH: + if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { + return false; + } + + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ + if (!buf_skip(pkt, + (exthdr.len + 1) * 8 - sizeof(exthdr))) { + return false; + } + + /* Decode next header */ + break; + + default: + /* The next header is not one of the known extension + * headers, treat it as the upper layer header. + * + * This handles IPPROTO_NONE. + * + * Encapsulating Security Payload (50) and Authentication + * Header (51) also end up here (and will trigger an + * unknown proto error later). They have a custom header + * format and seem too esoteric to care about. + */ + *upper_proto = exthdr.next; + return true; + } + } + + /* We never found an upper layer header. */ + return false; +} + +/* This function has to be inlined, because the verifier otherwise rejects it + * due to returning a pointer to the stack. This is technically correct, since + * scratch is allocated on the stack. However, this usage should be safe since + * it's the callers stack after all. + */ +static inline __attribute__((__always_inline__)) struct ipv6hdr * +pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, + bool *is_fragment) +{ + struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); + if (ipv6 == NULL) { + return NULL; + } + + if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { + return NULL; + } + + return ipv6; +} + +/* Global metrics, per CPU + */ +struct bpf_map_def metrics_map SEC("maps") = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(unsigned int), + .value_size = sizeof(metrics_t), + .max_entries = 1, +}; + +static metrics_t *get_global_metrics(void) +{ + uint64_t key = 0; + return bpf_map_lookup_elem(&metrics_map, &key); +} + +static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +{ + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); + + // Changing the ethertype if the encapsulated packet is ipv6 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + encap->eth.h_proto = bpf_htons(ETH_P_IPV6); + } + + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) + return TC_ACT_SHOT; + + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + metrics->forwarded_packets_total_gre++; + + const int payload_off = + sizeof(*encap) + + sizeof(struct in_addr) * encap->unigue.hop_count; + int32_t encap_overhead = + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; + uint16_t proto = ETH_P_IP; + + /* Loop protection: the inner packet's TTL is decremented as a safeguard + * against any forwarding loop. As the only interesting field is the TTL + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes + * as they handle the split packets if needed (no need for the data to be + * in the linear section). + */ + if (encap->gue.proto_ctype == IPPROTO_IPV6) { + proto = ETH_P_IPV6; + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), + &ttl, 1, 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } else { + uint8_t ttl; + int rc; + + rc = bpf_skb_load_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, + 1); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (ttl == 0) { + metrics->errors_total_redirect_loop++; + return TC_ACT_SHOT; + } + + /* IPv4 also has a checksum to patch. While the TTL is only one byte, + * this function only works for 2 and 4 bytes arguments (the result is + * the same). + */ + rc = bpf_l3_csum_replace( + skb, payload_off + offsetof(struct iphdr, check), ttl, + ttl - 1, 2); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + ttl--; + rc = bpf_skb_store_bytes( + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, + 0); + if (rc != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + } + + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, + BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || + bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { + metrics->errors_total_encap_adjust_failed++; + return TC_ACT_SHOT; + } + + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); + if (encap_gre == NULL) { + metrics->errors_total_encap_buffer_too_small++; + return TC_ACT_SHOT; + } + + encap_gre->ip.protocol = IPPROTO_GRE; + encap_gre->ip.daddr = next_hop->s_addr; + encap_gre->ip.saddr = ENCAPSULATION_IP; + encap_gre->ip.tot_len = + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); + encap_gre->gre.flags = 0; + encap_gre->gre.protocol = bpf_htons(proto); + pkt_ipv4_checksum((void *)&encap_gre->ip); + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) +{ + /* swap L2 addresses */ + /* This assumes that packets are received from a router. + * So just swapping the MAC addresses here will make the packet go back to + * the router, which will send it to the appropriate machine. + */ + unsigned char temp[ETH_ALEN]; + memcpy(temp, encap->eth.h_dest, sizeof(temp)); + memcpy(encap->eth.h_dest, encap->eth.h_source, + sizeof(encap->eth.h_dest)); + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); + + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && + encap->unigue.last_hop_gre) { + return forward_with_gre(skb, encap, next_hop, metrics); + } + + metrics->forwarded_packets_total_gue++; + uint32_t old_saddr = encap->ip.saddr; + encap->ip.saddr = encap->ip.daddr; + encap->ip.daddr = next_hop->s_addr; + if (encap->unigue.next_hop < encap->unigue.hop_count) { + encap->unigue.next_hop++; + } + + /* Remove ip->saddr, add next_hop->s_addr */ + const uint64_t off = offsetof(typeof(*encap), ip.check); + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); + if (ret < 0) { + return TC_ACT_SHOT; + } + + return bpf_redirect(skb->ifindex, 0); +} + +static ret_t skip_next_hops(buf_t *pkt, int n) +{ + switch (n) { + case 1: + if (!buf_skip(pkt, sizeof(struct in_addr))) + return TC_ACT_SHOT; + case 0: + return CONTINUE_PROCESSING; + + default: + return TC_ACT_SHOT; + } +} + +/* Get the next hop from the GLB header. + * + * Sets next_hop->s_addr to 0 if there are no more hops left. + * pkt is positioned just after the variable length GLB header + * iff the call is successful. + */ +static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) +{ + if (encap->unigue.next_hop > encap->unigue.hop_count) { + return TC_ACT_SHOT; + } + + /* Skip "used" next hops. */ + MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); + + if (encap->unigue.next_hop == encap->unigue.hop_count) { + /* No more next hops, we are at the end of the GLB header. */ + next_hop->s_addr = 0; + return CONTINUE_PROCESSING; + } + + if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { + return TC_ACT_SHOT; + } + + /* Skip the remainig next hops (may be zero). */ + return skip_next_hops(pkt, encap->unigue.hop_count - + encap->unigue.next_hop - 1); +} + +/* Fill a bpf_sock_tuple to be used with the socket lookup functions. + * This is a kludge that let's us work around verifier limitations: + * + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) + * + * clang will substitue a costant for sizeof, which allows the verifier + * to track it's value. Based on this, it can figure out the constant + * return value, and calling code works while still being "generic" to + * IPv4 and IPv6. + */ +static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) +{ + switch (iphlen) { + case sizeof(struct iphdr): { + struct iphdr *ipv4 = (struct iphdr *)iph; + tuple->ipv4.daddr = ipv4->daddr; + tuple->ipv4.saddr = ipv4->saddr; + tuple->ipv4.sport = sport; + tuple->ipv4.dport = dport; + return sizeof(tuple->ipv4); + } + + case sizeof(struct ipv6hdr): { + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, + sizeof(tuple->ipv6.daddr)); + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, + sizeof(tuple->ipv6.saddr)); + tuple->ipv6.sport = sport; + tuple->ipv6.dport = dport; + return sizeof(tuple->ipv6); + } + + default: + return 0; + } +} + +static verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) +{ + struct bpf_sock *sk = + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + if (iph != NULL && tcp != NULL) { + /* Kludge: we've run out of arguments, but need the length of the ip header. */ + uint64_t iphlen = sizeof(struct iphdr); + if (tuplen == sizeof(tuple->ipv6)) { + iphlen = sizeof(struct ipv6hdr); + } + + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, + sizeof(*tcp)) == 0) { + bpf_sk_release(sk); + return SYN_COOKIE; + } + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) +{ + struct bpf_sock *sk = + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); + if (sk == NULL) { + return UNKNOWN; + } + + if (sk->state == BPF_TCP_ESTABLISHED) { + bpf_sk_release(sk); + return ESTABLISHED; + } + + bpf_sk_release(sk); + return UNKNOWN; +} + +static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) +{ + switch (proto) { + case IPPROTO_TCP: + return classify_tcp(skb, tuple, tuplen, NULL, NULL); + + case IPPROTO_UDP: + return classify_udp(skb, tuple, tuplen); + + default: + metrics->errors_total_malformed_icmp++; + return INVALID; + } +} + +static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +{ + struct icmphdr icmp; + if (!buf_copy(pkt, &icmp, sizeof(icmp))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp.type == ICMP_ECHOREPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp.type == ICMP_ECHO) { + return ECHO_REQUEST; + } + + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + struct iphdr _ip4; + const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + /* The source address in the outer IP header is from the entity that + * originated the ICMP message. Use the original IP header to restore + * the correct flow tuple. + */ + struct bpf_sock_tuple tuple; + tuple.ipv4.saddr = ipv4->daddr; + tuple.ipv4.daddr = ipv4->saddr; + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, ipv4->protocol, &tuple, + sizeof(tuple.ipv4), metrics); +} + +static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +{ + struct icmp6hdr icmp6; + if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { + metrics->errors_total_malformed_icmp++; + return INVALID; + } + + /* We should never receive encapsulated echo replies. */ + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { + metrics->errors_total_icmp_echo_replies++; + return INVALID; + } + + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { + return ECHO_REQUEST; + } + + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { + metrics->errors_total_unwanted_icmp++; + return INVALID; + } + + bool is_fragment; + uint8_t l4_proto; + struct ipv6hdr _ipv6; + const struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + /* Swap source and dest addresses. */ + struct bpf_sock_tuple tuple; + memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); + memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); + + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { + metrics->errors_total_malformed_icmp_pkt_too_big++; + return INVALID; + } + + return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), + metrics); +} + +static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_tcp++; + + struct tcphdr _tcp; + struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); + if (tcp == NULL) { + metrics->errors_total_malformed_tcp++; + return INVALID; + } + + if (tcp->syn) { + return SYN; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); + return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); +} + +static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) +{ + metrics->l4_protocol_packets_total_udp++; + + struct udphdr _udp; + struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); + if (udph == NULL) { + metrics->errors_total_malformed_udp++; + return INVALID; + } + + struct bpf_sock_tuple tuple; + uint64_t tuplen = + fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); + return classify_udp(pkt->skb, &tuple, tuplen); +} + +static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv4++; + + struct iphdr _ip4; + struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); + if (ipv4 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4->version != 4) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv4_is_fragment(ipv4)) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (ipv4->protocol) { + case IPPROTO_ICMP: + return process_icmpv4(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +{ + metrics->l3_protocol_packets_total_ipv6++; + + uint8_t l4_proto; + bool is_fragment; + struct ipv6hdr _ipv6; + struct ipv6hdr *ipv6 = + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); + if (ipv6 == NULL) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (ipv6->version != 6) { + metrics->errors_total_malformed_ip++; + return INVALID; + } + + if (is_fragment) { + metrics->errors_total_fragmented_ip++; + return INVALID; + } + + switch (l4_proto) { + case IPPROTO_ICMPV6: + return process_icmpv6(pkt, metrics); + + case IPPROTO_TCP: + return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); + + case IPPROTO_UDP: + return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); + + default: + metrics->errors_total_unknown_l4_proto++; + return INVALID; + } +} + +SEC("classifier/cls_redirect") +int cls_redirect(struct __sk_buff *skb) +{ + metrics_t *metrics = get_global_metrics(); + if (metrics == NULL) { + return TC_ACT_SHOT; + } + + metrics->processed_packets_total++; + + /* Pass bogus packets as long as we're not sure they're + * destined for us. + */ + if (skb->protocol != bpf_htons(ETH_P_IP)) { + return TC_ACT_OK; + } + + encap_headers_t *encap; + + /* Make sure that all encapsulation headers are available in + * the linear portion of the skb. This makes it easy to manipulate them. + */ + if (bpf_skb_pull_data(skb, sizeof(*encap))) { + return TC_ACT_OK; + } + + buf_t pkt = { + .skb = skb, + .head = (uint8_t *)(long)skb->data, + .tail = (uint8_t *)(long)skb->data_end, + }; + + encap = buf_assign(&pkt, sizeof(*encap), NULL); + if (encap == NULL) { + return TC_ACT_OK; + } + + if (encap->ip.ihl != 5) { + /* We never have any options. */ + return TC_ACT_OK; + } + + if (encap->ip.daddr != ENCAPSULATION_IP || + encap->ip.protocol != IPPROTO_UDP) { + return TC_ACT_OK; + } + + /* TODO Check UDP length? */ + if (encap->udp.dest != ENCAPSULATION_PORT) { + return TC_ACT_OK; + } + + /* We now know that the packet is destined to us, we can + * drop bogus ones. + */ + if (ipv4_is_fragment((void *)&encap->ip)) { + metrics->errors_total_fragmented_ip++; + return TC_ACT_SHOT; + } + + if (encap->gue.variant != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.control != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.flags != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->gue.hlen != + sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.version != 0) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + + if (encap->unigue.reserved != 0) { + return TC_ACT_SHOT; + } + + struct in_addr next_hop; + MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); + + if (next_hop.s_addr == 0) { + metrics->accepted_packets_total_last_hop++; + return accept_locally(skb, encap); + } + + verdict_t verdict; + switch (encap->gue.proto_ctype) { + case IPPROTO_IPIP: + verdict = process_ipv4(&pkt, metrics); + break; + + case IPPROTO_IPV6: + verdict = process_ipv6(&pkt, metrics); + break; + + default: + metrics->errors_total_unknown_l3_proto++; + return TC_ACT_SHOT; + } + + switch (verdict) { + case INVALID: + /* metrics have already been bumped */ + return TC_ACT_SHOT; + + case UNKNOWN: + return forward_to_next_hop(skb, encap, &next_hop, metrics); + + case ECHO_REQUEST: + metrics->accepted_packets_total_icmp_echo_request++; + break; + + case SYN: + if (encap->unigue.forward_syn) { + return forward_to_next_hop(skb, encap, &next_hop, + metrics); + } + + metrics->accepted_packets_total_syn++; + break; + + case SYN_COOKIE: + metrics->accepted_packets_total_syn_cookies++; + break; + + case ESTABLISHED: + metrics->accepted_packets_total_established++; + break; + } + + return accept_locally(skb, encap); +} diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h new file mode 100644 index 000000000000..76eab0aacba0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright 2019, 2020 Cloudflare */ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> + +struct gre_base_hdr { + uint16_t flags; + uint16_t protocol; +} __attribute__((packed)); + +struct guehdr { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t hlen : 5, control : 1, variant : 2; +#else + uint8_t variant : 2, control : 1, hlen : 5; +#endif + uint8_t proto_ctype; + uint16_t flags; +}; + +struct unigue { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4; +#else + uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2; +#endif + uint8_t reserved; + uint8_t next_hop; + uint8_t hop_count; + // Next hops go here +} __attribute__((packed)); + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct gre_base_hdr gre; +} __attribute__((packed)) encap_gre_t; + +typedef struct { + struct ethhdr eth; + struct iphdr ip; + struct udphdr udp; + struct guehdr gue; + struct unigue unigue; +} __attribute__((packed)) encap_headers_t; diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c new file mode 100644 index 000000000000..01a002ade529 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_enable_stats.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <stdint.h> +#include <linux/types.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +__u64 count = 0; + +SEC("raw_tracepoint/sys_enter") +int test_enable_stats(void *ctx) +{ + count += 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c new file mode 100644 index 000000000000..8941a41c2a55 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +#define MAX_STACK_RAWTP 10 + +SEC("raw_tracepoint/sys_enter") +int bpf_prog2(void *ctx) +{ + __u64 stack[MAX_STACK_RAWTP]; + int error; + + /* set all the flags which should return -EINVAL */ + error = bpf_get_stack(ctx, stack, 0, -1); + if (error < 0) + goto loop; + + return error; +loop: + while (1) { + error++; + } +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_global_data.c b/tools/testing/selftests/bpf/progs/test_global_data.c index dd7a4d3dbc0d..1319be1c54ba 100644 --- a/tools/testing/selftests/bpf/progs/test_global_data.c +++ b/tools/testing/selftests/bpf/progs/test_global_data.c @@ -68,7 +68,7 @@ static struct foo struct3 = { bpf_map_update_elem(&result_##map, &key, var, 0); \ } while (0) -SEC("static_data_load") +SEC("classifier/static_data_load") int load_static_data(struct __sk_buff *skb) { static const __u64 bar = ~0; diff --git a/tools/testing/selftests/bpf/progs/test_link_pinning.c b/tools/testing/selftests/bpf/progs/test_link_pinning.c new file mode 100644 index 000000000000..bbf2a5264dc0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_link_pinning.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include <stdbool.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +int in = 0; +int out = 0; + +SEC("raw_tp/sys_enter") +int raw_tp_prog(const void *ctx) +{ + out = in; + return 0; +} + +SEC("tp_btf/sys_enter") +int tp_btf_prog(const void *ctx) +{ + out = in; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 6239596cd14e..4eb42cff5fe9 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,6 +9,14 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4096); + __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); + __type(key, __u32); + __type(value, char); +} rdonly_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c new file mode 100644 index 000000000000..1dca70a6de2f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Carlos Neira cneirabustos@gmail.com */ + +#include <linux/bpf.h> +#include <stdint.h> +#include <bpf/bpf_helpers.h> + +static volatile struct { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +} res; + +SEC("raw_tracepoint/sys_enter") +int trace(void *ctx) +{ + __u64 ns_pid_tgid, expected_pid; + struct bpf_pidns_info nsdata; + __u32 key = 0; + + if (bpf_get_ns_current_pid_tgid(res.dev, res.ino, &nsdata, + sizeof(struct bpf_pidns_info))) + return 0; + + ns_pid_tgid = (__u64)nsdata.tgid << 32 | nsdata.pid; + expected_pid = res.user_pid_tgid; + + if (expected_pid != ns_pid_tgid) + return 0; + + res.pid_tgid = ns_pid_tgid; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c index 98b9de2fafd0..ded71b3ff6b4 100644 --- a/tools/testing/selftests/bpf/progs/test_obj_id.c +++ b/tools/testing/selftests/bpf/progs/test_obj_id.c @@ -3,16 +3,8 @@ */ #include <stddef.h> #include <linux/bpf.h> -#include <linux/pkt_cls.h> #include <bpf/bpf_helpers.h> -/* It is a dumb bpf program such that it must have no - * issue to be loaded since testing the verifier is - * not the focus here. - */ - -int _version SEC("version") = 1; - struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); @@ -20,13 +12,13 @@ struct { __type(value, __u64); } test_map_id SEC(".maps"); -SEC("test_obj_id_dummy") -int test_obj_id(struct __sk_buff *skb) +SEC("raw_tp/sys_enter") +int test_obj_id(void *ctx) { __u32 key = 0; __u64 *value; value = bpf_map_lookup_elem(&test_map_id, &key); - return TC_ACT_OK; + return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index bfe9fbcb9684..42403d088abc 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -6,7 +6,6 @@ #include <linux/ptrace.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_trace_helpers.h" struct task_struct; @@ -17,11 +16,9 @@ int BPF_KPROBE(prog1, struct task_struct *tsk, const char *buf, bool exec) } SEC("kretprobe/__set_task_comm") -int BPF_KRETPROBE(prog2, - struct task_struct *tsk, const char *buf, bool exec, - int ret) +int BPF_KRETPROBE(prog2, int ret) { - return !PT_REGS_PARM1(ctx) && ret; + return ret; } SEC("raw_tp/task_rename") @@ -33,12 +30,18 @@ int prog3(struct bpf_raw_tracepoint_args *ctx) SEC("fentry/__set_task_comm") int BPF_PROG(prog4, struct task_struct *tsk, const char *buf, bool exec) { - return !tsk; + return 0; } SEC("fexit/__set_task_comm") int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec) { + return 0; +} + +SEC("fmod_ret/__set_task_comm") +int BPF_PROG(prog6, struct task_struct *tsk, const char *buf, bool exec) +{ return !tsk; } diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c new file mode 100644 index 000000000000..a1ccc831c882 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +#include <stddef.h> +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int valid = 0; +int required_size_out = 0; +int written_stack_out = 0; +int written_global_out = 0; + +struct { + __u64 _a; + __u64 _b; + __u64 _c; +} fpbe[30] = {0}; + +SEC("perf_event") +int perf_branches(void *ctx) +{ + __u64 entries[4 * 3] = {0}; + int required_size, written_stack, written_global; + + /* write to stack */ + written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0); + /* ignore spurious events */ + if (!written_stack) + return 1; + + /* get required size */ + required_size = bpf_read_branch_records(ctx, NULL, 0, + BPF_F_GET_BRANCH_RECORDS_SIZE); + + written_global = bpf_read_branch_records(ctx, fpbe, sizeof(fpbe), 0); + /* ignore spurious events */ + if (!written_global) + return 1; + + required_size_out = required_size; + written_stack_out = written_stack; + written_global_out = written_global; + valid = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c index ebfcc9f50c35..ad59c4c9aba8 100644 --- a/tools/testing/selftests/bpf/progs/test_perf_buffer.c +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c @@ -4,7 +4,7 @@ #include <linux/ptrace.h> #include <linux/bpf.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_tracing.h> struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); diff --git a/tools/testing/selftests/bpf/progs/test_probe_user.c b/tools/testing/selftests/bpf/progs/test_probe_user.c index d556b1572cc6..89b3532ccc75 100644 --- a/tools/testing/selftests/bpf/progs/test_probe_user.c +++ b/tools/testing/selftests/bpf/progs/test_probe_user.c @@ -7,7 +7,6 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -#include "bpf_trace_helpers.h" static struct sockaddr_in old; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c new file mode 100644 index 000000000000..8ba9959b036b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid = 0; +long value = 0; +long flags = 0; + +/* outputs */ +long total = 0; +long discarded = 0; +long dropped = 0; + +long avail_data = 0; +long ring_size = 0; +long cons_pos = 0; +long prod_pos = 0; + +/* inner state */ +long seq = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + int zero = 0; + + if (cur_pid != pid) + return 0; + + sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) { + __sync_fetch_and_add(&dropped, 1); + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = seq++; + __sync_fetch_and_add(&total, 1); + + if (sample->seq & 1) { + /* copy from reserved sample to a new one... */ + bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags); + /* ...and then discard reserved sample */ + bpf_ringbuf_discard(sample, flags); + __sync_fetch_and_add(&discarded, 1); + } else { + bpf_ringbuf_submit(sample, flags); + } + + avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c new file mode 100644 index 000000000000..edf3b6953533 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct sample { + int pid; + int seq; + long value; + char comm[16]; +}; + +struct ringbuf_map { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 12); +} ringbuf1 SEC(".maps"), + ringbuf2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 4); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_arr SEC(".maps") = { + .values = { + [0] = &ringbuf1, + [2] = &ringbuf2, + }, +}; + +/* inputs */ +int pid = 0; +int target_ring = 0; +long value = 0; + +/* outputs */ +long total = 0; +long dropped = 0; +long skipped = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int test_ringbuf(void *ctx) +{ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + void *rb; + int zero = 0; + + if (cur_pid != pid) + return 0; + + rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring); + if (!rb) { + skipped += 1; + return 1; + } + + sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0); + if (!sample) { + dropped += 1; + return 1; + } + + sample->pid = pid; + bpf_get_current_comm(sample->comm, sizeof(sample->comm)); + sample->value = value; + + sample->seq = total; + total += 1; + + bpf_ringbuf_submit(sample, 0); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c new file mode 100644 index 000000000000..1ecd987005d2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Cloudflare Ltd. +// Copyright (c) 2020 Isovalent, Inc. + +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/pkt_cls.h> +#include <linux/tcp.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +/* Pin map under /sys/fs/bpf/tc/globals/<map name> */ +#define PIN_GLOBAL_NS 2 + +/* Must match struct bpf_elf_map layout from iproute2 */ +struct { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +} server_map SEC("maps") = { + .type = BPF_MAP_TYPE_SOCKMAP, + .size_key = sizeof(int), + .size_value = sizeof(__u64), + .max_elem = 1, + .pinning = PIN_GLOBAL_NS, +}; + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ +static inline struct bpf_sock_tuple * +get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct bpf_sock_tuple *result; + struct ethhdr *eth; + __u64 tuple_len; + __u8 proto = 0; + __u64 ihl_len; + + eth = (struct ethhdr *)(data); + if (eth + 1 > data_end) + return NULL; + + if (eth->h_proto == bpf_htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(data + sizeof(*eth)); + + if (iph + 1 > data_end) + return NULL; + if (iph->ihl != 5) + /* Options are not supported */ + return NULL; + ihl_len = iph->ihl * 4; + proto = iph->protocol; + *ipv4 = true; + result = (struct bpf_sock_tuple *)&iph->saddr; + } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + sizeof(*eth)); + + if (ip6h + 1 > data_end) + return NULL; + ihl_len = sizeof(*ip6h); + proto = ip6h->nexthdr; + *ipv4 = false; + result = (struct bpf_sock_tuple *)&ip6h->saddr; + } else { + return (struct bpf_sock_tuple *)data; + } + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) + return NULL; + + *tcp = (proto == IPPROTO_TCP); + return result; +} + +static inline int +handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) +{ + struct bpf_sock_tuple ln = {0}; + struct bpf_sock *sk; + const int zero = 0; + size_t tuple_len; + __be16 dport; + int ret; + + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); + if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) + return TC_ACT_SHOT; + + sk = bpf_sk_lookup_udp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + if (sk) + goto assign; + + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; + + sk = bpf_map_lookup_elem(&server_map, &zero); + if (!sk) + return TC_ACT_SHOT; + +assign: + ret = bpf_sk_assign(skb, sk, 0); + bpf_sk_release(sk); + return ret; +} + +static inline int +handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) +{ + struct bpf_sock_tuple ln = {0}; + struct bpf_sock *sk; + const int zero = 0; + size_t tuple_len; + __be16 dport; + int ret; + + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); + if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) + return TC_ACT_SHOT; + + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + if (sk) { + if (sk->state != BPF_TCP_LISTEN) + goto assign; + bpf_sk_release(sk); + } + + dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport; + if (dport != bpf_htons(4321)) + return TC_ACT_OK; + + sk = bpf_map_lookup_elem(&server_map, &zero); + if (!sk) + return TC_ACT_SHOT; + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return TC_ACT_SHOT; + } + +assign: + ret = bpf_sk_assign(skb, sk, 0); + bpf_sk_release(sk); + return ret; +} + +SEC("classifier/sk_assign_test") +int bpf_sk_assign_test(struct __sk_buff *skb) +{ + struct bpf_sock_tuple *tuple, ln = {0}; + bool ipv4 = false; + bool tcp = false; + int tuple_len; + int ret = 0; + + tuple = get_tuple(skb, &ipv4, &tcp); + if (!tuple) + return TC_ACT_SHOT; + + /* Note that the verifier socket return type for bpf_skc_lookup_tcp() + * differs from bpf_sk_lookup_udp(), so even though the C-level type is + * the same here, if we try to share the implementations they will + * fail to verify because we're crossing pointer types. + */ + if (tcp) + ret = handle_tcp(skb, tuple, ipv4); + else + ret = handle_udp(skb, tuple, ipv4); + + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index d2b38fa6a5b0..e83d0b48d80c 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c index 202de3938494..b02ea589ce7e 100644 --- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c +++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c @@ -23,6 +23,8 @@ int process(struct __sk_buff *skb) return 1; if (skb->gso_segs != 8) return 1; + if (skb->gso_size != 10) + return 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_skb_helpers.c b/tools/testing/selftests/bpf/progs/test_skb_helpers.c new file mode 100644 index 000000000000..bb3fbf1a29e3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skb_helpers.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define TEST_COMM_LEN 16 + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u32); +} cgroup_map SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +SEC("classifier/test_skb_helpers") +int test_skb_helpers(struct __sk_buff *skb) +{ + struct task_struct *task; + char comm[TEST_COMM_LEN]; + __u32 tpid; + + task = (struct task_struct *)bpf_get_current_task(); + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + bpf_probe_read_kernel_str(&comm, sizeof(comm), &task->comm); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c new file mode 100644 index 000000000000..45e8fc75a739 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Isovalent, Inc. +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, __u32); + __type(value, __u64); +} socket_storage SEC(".maps"); + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int verdict = SK_PASS; + __u32 pid, tpid; + __u64 *sk_stg; + + pid = bpf_get_current_pid_tgid() >> 32; + sk_stg = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE); + if (!sk_stg) + return SK_DROP; + *sk_stg = pid; + bpf_probe_read_kernel(&tpid , sizeof(tpid), &task->tgid); + if (pid != tpid) + verdict = SK_DROP; + bpf_sk_storage_delete(&socket_storage, (void *)msg->sk); + return verdict; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 9b4d3a68a91a..057036ca1111 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -79,11 +79,18 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, 2); __type(key, int); __type(value, int); } sock_skb_opts SEC(".maps"); +struct { + __uint(type, TEST_MAP_TYPE); + __uint(max_entries, 20); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} tls_sock_map SEC(".maps"); + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -110,8 +117,6 @@ int bpf_prog2(struct __sk_buff *skb) flags = *f; } - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &sock_map, ret, flags); #else @@ -120,6 +125,43 @@ int bpf_prog2(struct __sk_buff *skb) } +SEC("sk_skb3") +int bpf_prog3(struct __sk_buff *skb) +{ + const int one = 1; + int err, *f, ret = SK_PASS; + void *data_end; + char *c; + + err = bpf_skb_pull_data(skb, 19); + if (err) + goto tls_out; + + c = (char *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + if (c + 18 < data_end) + memcpy(&c[13], "PASS", 4); + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) { + __u64 flags = 0; + + ret = 0; + flags = *f; +#ifdef SOCKMAP + return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); +#else + return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); +#endif + } + + f = bpf_map_lookup_elem(&sock_skb_opts, &one); + if (f && *f) + ret = SK_DROP; +tls_out: + return ret; +} + SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) { @@ -143,8 +185,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: @@ -160,8 +200,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; default: @@ -199,72 +237,6 @@ int bpf_prog4(struct sk_msg_md *msg) } SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; @@ -305,86 +277,7 @@ int bpf_prog6(struct sk_msg_md *msg) #endif } -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") +SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -401,7 +294,7 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } -SEC("sk_msg6") +SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -419,7 +312,7 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } -SEC("sk_msg7") +SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; @@ -443,7 +336,6 @@ int bpf_prog10(struct sk_msg_md *msg) pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); return SK_DROP; } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c new file mode 100644 index 000000000000..a3a366c57ce1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare + +#include <errno.h> +#include <stdbool.h> +#include <linux/bpf.h> + +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, unsigned int); +} verdict_map SEC(".maps"); + +static volatile bool test_sockmap; /* toggled by user-space */ + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, 0); + else + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + +SEC("sk_msg") +int prog_msg_verdict(struct sk_msg_md *msg) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_msg_redirect_map(msg, &sock_map, zero, 0); + else + verdict = bpf_msg_redirect_hash(msg, &sock_hash, &zero, 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + +SEC("sk_reuseport") +int prog_reuseport(struct sk_reuseport_md *reuse) +{ + unsigned int *count; + int err, verdict; + __u32 zero = 0; + + if (test_sockmap) + err = bpf_sk_select_reuseport(reuse, &sock_map, &zero, 0); + else + err = bpf_sk_select_reuseport(reuse, &sock_hash, &zero, 0); + verdict = err ? SK_DROP : SK_PASS; + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 2d0b0b82a78a..50525235380e 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -45,7 +45,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx) unsigned long tcp_mem[3] = {0, 0, 0}; char value[MAX_VALUE_STR_LEN]; unsigned char i, off = 0; - int ret; + volatile int ret; if (ctx->write) return 0; diff --git a/tools/testing/selftests/bpf/progs/test_trampoline_count.c b/tools/testing/selftests/bpf/progs/test_trampoline_count.c index e51e6e3a81c2..f030e469d05b 100644 --- a/tools/testing/selftests/bpf/progs/test_trampoline_count.c +++ b/tools/testing/selftests/bpf/progs/test_trampoline_count.c @@ -2,7 +2,8 @@ #include <stdbool.h> #include <stddef.h> #include <linux/bpf.h> -#include "bpf_trace_helpers.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> struct task_struct; diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c new file mode 100644 index 000000000000..5611b564d3b1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include "vmlinux.h" +#include <asm/unistd.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#define MY_TV_NSEC 1337 + +bool tp_called = false; +bool raw_tp_called = false; +bool tp_btf_called = false; +bool kprobe_called = false; +bool fentry_called = false; + +SEC("tp/syscalls/sys_enter_nanosleep") +int handle__tp(struct trace_event_raw_sys_enter *args) +{ + struct __kernel_timespec *ts; + + if (args->id != __NR_nanosleep) + return 0; + + ts = (void *)args->args[0]; + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + tp_called = true; + return 0; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handle__raw_tp, struct pt_regs *regs, long id) +{ + struct __kernel_timespec *ts; + + if (id != __NR_nanosleep) + return 0; + + ts = (void *)PT_REGS_PARM1_CORE(regs); + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + raw_tp_called = true; + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id) +{ + struct __kernel_timespec *ts; + + if (id != __NR_nanosleep) + return 0; + + ts = (void *)PT_REGS_PARM1_CORE(regs); + if (BPF_CORE_READ(ts, tv_nsec) != MY_TV_NSEC) + return 0; + + tp_btf_called = true; + return 0; +} + +SEC("kprobe/hrtimer_nanosleep") +int BPF_KPROBE(handle__kprobe, + ktime_t rqtp, enum hrtimer_mode mode, clockid_t clockid) +{ + if (rqtp == MY_TV_NSEC) + kprobe_called = true; + return 0; +} + +SEC("fentry/hrtimer_nanosleep") +int BPF_PROG(handle__fentry, + ktime_t rqtp, enum hrtimer_mode mode, clockid_t clockid) +{ + if (rqtp == MY_TV_NSEC) + fentry_called = true; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c index b7fc85769bdc..22065a9cfb25 100644 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -11,15 +11,15 @@ int _version SEC("version") = 1; -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) +SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; int offset = 0; - if (data_end - data == 54) - offset = 256; + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ else offset = 20; if (bpf_xdp_adjust_tail(xdp, 0 - offset)) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index cb8a04ab7a78..a038e827f850 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> -#include "bpf_trace_helpers.h" + +char _license[] SEC("license") = "GPL"; struct net_device { /* Structure does not need to contain all entries, @@ -27,16 +29,38 @@ struct xdp_buff { struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); +struct meta { + int ifindex; + int pkt_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + __u64 test_result_fentry = 0; -SEC("fentry/_xdp_tx_iptunnel") +SEC("fentry/FUNC") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { + struct meta meta; + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + + meta.ifindex = xdp->rxq->dev->ifindex; + meta.pkt_len = data_end - data; + bpf_xdp_output(xdp, &perf_buf_map, + ((__u64) meta.pkt_len << 32) | + BPF_F_CURRENT_CPU, + &meta, sizeof(meta)); + test_result_fentry = xdp->rxq->dev->ifindex; return 0; } __u64 test_result_fexit = 0; -SEC("fexit/_xdp_tx_iptunnel") +SEC("fexit/FUNC") int BPF_PROG(trace_on_exit, struct xdp_buff *xdp, int ret) { test_result_fexit = ret; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c new file mode 100644 index 000000000000..e5c0f131c8a7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* fails to load without expected_attach_type = BPF_XDP_DEVMAP + * because of access to egress_ifindex + */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +SEC("xdp_dm_log") +int xdpdm_devlog(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c new file mode 100644 index 000000000000..deef0e050863 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 4); +} dm_ports SEC(".maps"); + +SEC("xdp_redir") +int xdp_redir_prog(struct xdp_md *ctx) +{ + return bpf_redirect_map(&dm_ports, 1, 0); +} + +/* invalid program on DEVMAP entry; + * SEC name means expected attach type not set + */ +SEC("xdp_dummy") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +/* valid program on DEVMAP entry via SEC name; + * has access to egress and ingress ifindex + */ +SEC("xdp_devmap") +int xdp_dummy_dm(struct xdp_md *ctx) +{ + char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + unsigned int len = data_end - data; + + bpf_trace_printk(fmt, sizeof(fmt), + ctx->ingress_ifindex, ctx->egress_ifindex, len); + + return XDP_PASS; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c new file mode 100644 index 000000000000..8b36b6640e7e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include <linux/bpf.h> +#include <asm/unistd.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +long hits = 0; + +SEC("tp/syscalls/sys_enter_getpgid") +int bench_trigger_tp(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id) +{ + if (id == __NR_getpgid) + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("kprobe/__x64_sys_getpgid") +int bench_trigger_kprobe(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fentry/__x64_sys_getpgid") +int bench_trigger_fentry(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + +SEC("fmod_ret/__x64_sys_getpgid") +int bench_trigger_fmodret(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return -22; +} diff --git a/tools/testing/selftests/bpf/test_bpftool.py b/tools/testing/selftests/bpf/test_bpftool.py new file mode 100644 index 000000000000..4fed2dc25c0a --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2020 SUSE LLC. + +import collections +import functools +import json +import os +import socket +import subprocess +import unittest + + +# Add the source tree of bpftool and /usr/local/sbin to PATH +cur_dir = os.path.dirname(os.path.realpath(__file__)) +bpftool_dir = os.path.abspath(os.path.join(cur_dir, "..", "..", "..", "..", + "tools", "bpf", "bpftool")) +os.environ["PATH"] = bpftool_dir + ":/usr/local/sbin:" + os.environ["PATH"] + + +class IfaceNotFoundError(Exception): + pass + + +class UnprivilegedUserError(Exception): + pass + + +def _bpftool(args, json=True): + _args = ["bpftool"] + if json: + _args.append("-j") + _args.extend(args) + + return subprocess.check_output(_args) + + +def bpftool(args): + return _bpftool(args, json=False).decode("utf-8") + + +def bpftool_json(args): + res = _bpftool(args) + return json.loads(res) + + +def get_default_iface(): + for iface in socket.if_nameindex(): + if iface[1] != "lo": + return iface[1] + raise IfaceNotFoundError("Could not find any network interface to probe") + + +def default_iface(f): + @functools.wraps(f) + def wrapper(*args, **kwargs): + iface = get_default_iface() + return f(*args, iface, **kwargs) + return wrapper + + +class TestBpftool(unittest.TestCase): + @classmethod + def setUpClass(cls): + if os.getuid() != 0: + raise UnprivilegedUserError( + "This test suite needs root privileges") + + @default_iface + def test_feature_dev_json(self, iface): + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + res = bpftool_json(["feature", "probe", "dev", iface]) + # Check if the result has all expected keys. + self.assertCountEqual(res.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. + for helpers in res["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel"]), + bpftool_json(["feature", "probe"]), + bpftool_json(["feature"]), + ] + unexpected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + expected_keys = [ + "syscall_config", + "system_config", + "program_types", + "map_types", + "helpers", + "misc", + ] + + for tc in test_cases: + # Check if the result has all expected keys. + self.assertCountEqual(tc.keys(), expected_keys) + # Check if unexpected helpers are not included in helpers probes + # result. + for helpers in tc["helpers"].values(): + for unexpected_helper in unexpected_helpers: + self.assertNotIn(unexpected_helper, helpers) + + def test_feature_kernel_full(self): + test_cases = [ + bpftool_json(["feature", "probe", "kernel", "full"]), + bpftool_json(["feature", "probe", "full"]), + ] + expected_helpers = [ + "bpf_probe_write_user", + "bpf_trace_printk", + ] + + for tc in test_cases: + # Check if expected helpers are included at least once in any + # helpers list for any program type. Unfortunately we cannot assume + # that they will be included in all program types or a specific + # subset of programs. It depends on the kernel version and + # configuration. + found_helpers = False + + for helpers in tc["helpers"].values(): + if all(expected_helper in helpers + for expected_helper in expected_helpers): + found_helpers = True + break + + self.assertTrue(found_helpers) + + def test_feature_kernel_full_vs_not_full(self): + full_res = bpftool_json(["feature", "probe", "full"]) + not_full_res = bpftool_json(["feature", "probe"]) + not_full_set = set() + full_set = set() + + for helpers in full_res["helpers"].values(): + for helper in helpers: + full_set.add(helper) + + for helpers in not_full_res["helpers"].values(): + for helper in helpers: + not_full_set.add(helper) + + self.assertCountEqual(full_set - not_full_set, + {"bpf_probe_write_user", "bpf_trace_printk"}) + self.assertCountEqual(not_full_set - full_set, set()) + + def test_feature_macros(self): + expected_patterns = [ + r"/\*\*\* System call availability \*\*\*/", + r"#define HAVE_BPF_SYSCALL", + r"/\*\*\* eBPF program types \*\*\*/", + r"#define HAVE.*PROG_TYPE", + r"/\*\*\* eBPF map types \*\*\*/", + r"#define HAVE.*MAP_TYPE", + r"/\*\*\* eBPF helper functions \*\*\*/", + r"#define HAVE.*HELPER", + r"/\*\*\* eBPF misc features \*\*\*/", + ] + + res = bpftool(["feature", "probe", "macros"]) + for pattern in expected_patterns: + self.assertRegex(res, pattern) diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh new file mode 100755 index 000000000000..66690778e36d --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2020 SUSE LLC. + +python3 -m unittest -v test_bpftool.TestBpftool diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 8da77cda5f4a..305fae8f80a9 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -2854,7 +2854,7 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .err_str = "vlen != 0", + .err_str = "Invalid func linkage", }, { diff --git a/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c new file mode 100644 index 000000000000..ed253f252cd0 --- /dev/null +++ b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */ +#define _GNU_SOURCE +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sched.h> +#include <sys/wait.h> +#include <sys/mount.h> +#include "test_progs.h" + +#define CHECK_NEWNS(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s:FAIL:%s ", __func__, tag); \ + printf(format); \ + } else { \ + printf("%s:PASS:%s\n", __func__, tag); \ + } \ + __ret; \ +}) + +struct bss { + __u64 dev; + __u64 ino; + __u64 pid_tgid; + __u64 user_pid_tgid; +}; + +int main(int argc, char **argv) +{ + pid_t pid; + int exit_code = 1; + struct stat st; + + printf("Testing bpf_get_ns_current_pid_tgid helper in new ns\n"); + + if (stat("/proc/self/ns/pid", &st)) { + perror("stat failed on /proc/self/ns/pid ns\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (CHECK_NEWNS(unshare(CLONE_NEWPID | CLONE_NEWNS), + "unshare CLONE_NEWPID | CLONE_NEWNS", "error errno=%d\n", errno)) + return exit_code; + + pid = fork(); + if (pid == -1) { + perror("Fork() failed\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (pid > 0) { + int status; + + usleep(5); + waitpid(pid, &status, 0); + return 0; + } else { + + pid = fork(); + if (pid == -1) { + perror("Fork() failed\n"); + printf("%s:FAILED\n", argv[0]); + return exit_code; + } + + if (pid > 0) { + int status; + waitpid(pid, &status, 0); + return 0; + } else { + if (CHECK_NEWNS(mount("none", "/proc", NULL, MS_PRIVATE|MS_REC, NULL), + "Unmounting proc", "Cannot umount proc! errno=%d\n", errno)) + return exit_code; + + if (CHECK_NEWNS(mount("proc", "/proc", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL), + "Mounting proc", "Cannot mount proc! errno=%d\n", errno)) + return exit_code; + + const char *probe_name = "raw_tracepoint/sys_enter"; + const char *file = "test_ns_current_pid_tgid.o"; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_map *bss_map; + struct bpf_object *obj; + int exit_code = 1; + int err, key = 0; + struct bss bss; + struct stat st; + __u64 id; + + obj = bpf_object__open_file(file, NULL); + if (CHECK_NEWNS(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj))) + return exit_code; + + err = bpf_object__load(obj); + if (CHECK_NEWNS(err, "obj_load", "err %d errno %d\n", err, errno)) + goto cleanup; + + bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss"); + if (CHECK_NEWNS(!bss_map, "find_bss_map", "failed\n")) + goto cleanup; + + prog = bpf_object__find_program_by_title(obj, probe_name); + if (CHECK_NEWNS(!prog, "find_prog", "prog '%s' not found\n", + probe_name)) + goto cleanup; + + memset(&bss, 0, sizeof(bss)); + pid_t tid = syscall(SYS_gettid); + pid_t pid = getpid(); + + id = (__u64) tid << 32 | pid; + bss.user_pid_tgid = id; + + if (CHECK_NEWNS(stat("/proc/self/ns/pid", &st), + "stat new ns", "Failed to stat /proc/self/ns/pid errno=%d\n", errno)) + goto cleanup; + + bss.dev = st.st_dev; + bss.ino = st.st_ino; + + err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0); + if (CHECK_NEWNS(err, "setting_bss", "failed to set bss : %d\n", err)) + goto cleanup; + + link = bpf_program__attach_raw_tracepoint(prog, "sys_enter"); + if (CHECK_NEWNS(IS_ERR(link), "attach_raw_tp", "err %ld\n", + PTR_ERR(link))) { + link = NULL; + goto cleanup; + } + + /* trigger some syscalls */ + usleep(1); + + err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss); + if (CHECK_NEWNS(err, "set_bss", "failed to get bss : %d\n", err)) + goto cleanup; + + if (CHECK_NEWNS(id != bss.pid_tgid, "Compare user pid/tgid vs. bpf pid/tgid", + "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid)) + goto cleanup; + + exit_code = 0; + printf("%s:PASS\n", argv[0]); +cleanup: + if (!link) { + bpf_link__destroy(link); + link = NULL; + } + bpf_object__close(obj); + } + } +} diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 02eae1e864c2..6a12a0e01e07 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -756,11 +756,7 @@ static void test_sockmap(unsigned int tasks, void *data) /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (i < 2 && !err) { - printf("Allowed update sockmap '%i:%i' not in ESTABLISHED\n", - i, sfd[i]); - goto out_sockmap; - } else if (i >= 2 && err) { + if (err) { printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; @@ -1398,23 +1394,25 @@ static void test_map_rdonly(void) key = 1; value = 1234; - /* Insert key=1 element. */ + /* Try to insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 && errno == EPERM); - /* Check that key=2 is not found. */ + /* Check that key=1 is not found. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT); + + close(fd); } -static void test_map_wronly(void) +static void test_map_wronly_hash(void) { int fd, key = 0, value = 0; fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), MAP_SIZE, map_flags | BPF_F_WRONLY); if (fd < 0) { - printf("Failed to create map for read only test '%s'!\n", + printf("Failed to create map for write only test '%s'!\n", strerror(errno)); exit(1); } @@ -1424,9 +1422,49 @@ static void test_map_wronly(void) /* Insert key=1 element. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0); - /* Check that key=2 is not found. */ + /* Check that reading elements and keys from the map is not allowed. */ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM); assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); + + close(fd); +} + +static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) +{ + int fd, value = 0; + + assert(map_type == BPF_MAP_TYPE_QUEUE || + map_type == BPF_MAP_TYPE_STACK); + fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE, + map_flags | BPF_F_WRONLY); + /* Stack/Queue maps do not support BPF_F_NO_PREALLOC */ + if (map_flags & BPF_F_NO_PREALLOC) { + assert(fd < 0 && errno == EINVAL); + return; + } + if (fd < 0) { + printf("Failed to create map '%s'!\n", strerror(errno)); + exit(1); + } + + value = 1234; + assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0); + + /* Peek element should fail */ + assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM); + + /* Pop element should fail */ + assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 && + errno == EPERM); + + close(fd); +} + +static void test_map_wronly(void) +{ + test_map_wronly_hash(); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_STACK); + test_map_wronly_stack_or_queue(BPF_MAP_TYPE_QUEUE); } static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size, diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index bab1e6f1d8f1..54fa5fa688ce 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1,11 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ +#define _GNU_SOURCE #include "test_progs.h" #include "cgroup_helpers.h" #include "bpf_rlimit.h" #include <argp.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> #include <string.h> +#include <execinfo.h> /* backtrace */ /* defined in test_progs.h */ struct test_env env = {}; @@ -27,6 +32,20 @@ struct prog_test_def { int old_error_cnt; }; +/* Override C runtime library's usleep() implementation to ensure nanosleep() + * is always called. Usleep is frequently used in selftests as a way to + * trigger kprobe and tracepoints. + */ +int usleep(useconds_t usec) +{ + struct timespec ts = { + .tv_sec = usec / 1000000, + .tv_nsec = (usec % 1000000) * 1000, + }; + + return syscall(__NR_nanosleep, &ts, NULL); +} + static bool should_run(struct test_selector *sel, int num, const char *name) { int i; @@ -74,6 +93,34 @@ static void skip_account(void) } } +static void stdio_restore(void); + +/* A bunch of tests set custom affinity per-thread and/or per-process. Reset + * it after each test/sub-test. + */ +static void reset_affinity() { + + cpu_set_t cpuset; + int i, err; + + CPU_ZERO(&cpuset); + for (i = 0; i < env.nr_cpus; i++) + CPU_SET(i, &cpuset); + + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + stdio_restore(); + fprintf(stderr, "Failed to reset process affinity: %d!\n", err); + exit(-1); + } + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (err < 0) { + stdio_restore(); + fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); + exit(-1); + } +} + void test__end_subtest() { struct prog_test_def *test = env.test; @@ -91,6 +138,8 @@ void test__end_subtest() test->test_num, test->subtest_num, test->subtest_name, sub_error_cnt ? "FAIL" : "OK"); + reset_affinity(); + free(test->subtest_name); test->subtest_name = NULL; } @@ -173,30 +222,13 @@ int test__join_cgroup(const char *path) return fd; } -struct ipv4_packet pkt_v4 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IP), - .iph.ihl = 5, - .iph.protocol = IPPROTO_TCP, - .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - -struct ipv6_packet pkt_v6 = { - .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), - .iph.nexthdr = IPPROTO_TCP, - .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), - .tcp.urg_ptr = 123, - .tcp.doff = 5, -}; - int bpf_find_map(const char *test, struct bpf_object *obj, const char *name) { struct bpf_map *map; map = bpf_object__find_map_by_name(obj, name); if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); + fprintf(stdout, "%s:FAIL:map '%s' not found\n", test, name); test__fail(); return -1; } @@ -302,25 +334,13 @@ int extract_build_id(char *build_id, size_t size) len = size; memcpy(build_id, line, len); build_id[len] = '\0'; + free(line); return 0; err: fclose(fp); return -1; } -void *spin_lock_thread(void *arg) -{ - __u32 duration, retval; - int err, prog_fd = *(u32 *) arg; - - err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); - pthread_exit(arg); -} - /* extern declarations for test funcs */ #define DEFINE_TEST(name) extern void test_##name(void); #include <prog_tests/tests.h> @@ -367,10 +387,22 @@ static int libbpf_print_fn(enum libbpf_print_level level, { if (env.verbosity < VERBOSE_VERY && level == LIBBPF_DEBUG) return 0; - vprintf(format, args); + vfprintf(stdout, format, args); return 0; } +static void free_str_set(const struct str_set *set) +{ + int i; + + if (!set) + return; + + for (i = 0; i < set->cnt; i++) + free((void *)set->strs[i]); + free(set->strs); +} + static int parse_str_list(const char *s, struct str_set *set) { char *input, *state = NULL, *next, **tmp, **strs = NULL; @@ -406,66 +438,6 @@ err: return -ENOMEM; } -int parse_num_list(const char *s, struct test_selector *sel) -{ - int i, set_len = 0, num, start = 0, end = -1; - bool *set = NULL, *tmp, parsing_end = false; - char *next; - - while (s[0]) { - errno = 0; - num = strtol(s, &next, 10); - if (errno) - return -errno; - - if (parsing_end) - end = num; - else - start = num; - - if (!parsing_end && *next == '-') { - s = next + 1; - parsing_end = true; - continue; - } else if (*next == ',') { - parsing_end = false; - s = next + 1; - end = num; - } else if (*next == '\0') { - parsing_end = false; - s = next; - end = num; - } else { - return -EINVAL; - } - - if (start > end) - return -EINVAL; - - if (end + 1 > set_len) { - set_len = end + 1; - tmp = realloc(set, set_len); - if (!tmp) { - free(set); - return -ENOMEM; - } - set = tmp; - } - for (i = start; i <= end; i++) { - set[i] = true; - } - - } - - if (!set) - return -EINVAL; - - sel->num_set = set; - sel->num_set_len = set_len; - - return 0; -} - extern int extra_prog_load_log_flags; static error_t parse_arg(int key, char *arg, struct argp_state *state) @@ -479,13 +451,15 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (subtest_str) { *subtest_str = '\0'; if (parse_num_list(subtest_str + 1, - &env->subtest_selector)) { + &env->subtest_selector.num_set, + &env->subtest_selector.num_set_len)) { fprintf(stderr, "Failed to parse subtest numbers.\n"); return -EINVAL; } } - if (parse_num_list(arg, &env->test_selector)) { + if (parse_num_list(arg, &env->test_selector.num_set, + &env->test_selector.num_set_len)) { fprintf(stderr, "Failed to parse test numbers.\n"); return -EINVAL; } @@ -613,10 +587,27 @@ int cd_flavor_subdir(const char *exec_name) if (!flavor) return 0; flavor++; - printf("Switching to flavor '%s' subdirectory...\n", flavor); + fprintf(stdout, "Switching to flavor '%s' subdirectory...\n", flavor); return chdir(flavor); } +#define MAX_BACKTRACE_SZ 128 +void crash_handler(int signum) +{ + void *bt[MAX_BACKTRACE_SZ]; + size_t sz; + + sz = backtrace(bt, ARRAY_SIZE(bt)); + + if (env.test) + dump_test_log(env.test, true); + if (env.stdout) + stdio_restore(); + + fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum); + backtrace_symbols_fd(bt, sz, STDERR_FILENO); +} + int main(int argc, char **argv) { static const struct argp argp = { @@ -624,8 +615,14 @@ int main(int argc, char **argv) .parser = parse_arg, .doc = argp_program_doc, }; + struct sigaction sigact = { + .sa_handler = crash_handler, + .sa_flags = SA_RESETHAND, + }; int err, i; + sigaction(SIGSEGV, &sigact, NULL); + err = argp_parse(&argp, argc, argv, 0, NULL, &env); if (err) return err; @@ -639,6 +636,12 @@ int main(int argc, char **argv) srand(time(NULL)); env.jit_enabled = is_jit_enabled(); + env.nr_cpus = libbpf_num_possible_cpus(); + if (env.nr_cpus < 0) { + fprintf(stderr, "Failed to get number of CPUs: %d!\n", + env.nr_cpus); + return -1; + } stdio_hijack(); for (i = 0; i < prog_test_cnt; i++) { @@ -669,18 +672,19 @@ int main(int argc, char **argv) test->test_num, test->test_name, test->error_cnt ? "FAIL" : "OK"); + reset_affinity(); if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } stdio_restore(); - printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", - env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); + fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", + env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - free(env.test_selector.blacklist.strs); - free(env.test_selector.whitelist.strs); + free_str_set(&env.test_selector.blacklist); + free_str_set(&env.test_selector.whitelist); free(env.test_selector.num_set); - free(env.subtest_selector.blacklist.strs); - free(env.subtest_selector.whitelist.strs); + free_str_set(&env.subtest_selector.blacklist); + free_str_set(&env.subtest_selector.whitelist); free(env.subtest_selector.num_set); return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index bcfa9ef23fda..f4503c926aca 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -37,6 +37,7 @@ typedef __u16 __sum16; #include "bpf_util.h" #include <bpf/bpf_endian.h> #include "trace_helpers.h" +#include "testing_helpers.h" #include "flow_dissector_load.h" enum verbosity { @@ -71,6 +72,7 @@ struct test_env { FILE *stderr; char *log_buf; size_t log_cnt; + int nr_cpus; int succ_cnt; /* successful tests */ int sub_succ_cnt; /* successful sub-tests */ @@ -86,33 +88,22 @@ extern void test__skip(void); extern void test__fail(void); extern int test__join_cgroup(const char *path); -#define MAGIC_BYTES 123 - -/* ipv4 test vector */ -struct ipv4_packet { - struct ethhdr eth; - struct iphdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv4_packet pkt_v4; - -/* ipv6 test vector */ -struct ipv6_packet { - struct ethhdr eth; - struct ipv6hdr iph; - struct tcphdr tcp; -} __packed; -extern struct ipv6_packet pkt_v6; +#define PRINT_FAIL(format...) \ + ({ \ + test__fail(); \ + fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \ + fprintf(stdout, ##format); \ + }) #define _CHECK(condition, tag, duration, format...) ({ \ int __ret = !!(condition); \ int __save_errno = errno; \ if (__ret) { \ test__fail(); \ - printf("%s:FAIL:%s ", __func__, tag); \ - printf(format); \ + fprintf(stdout, "%s:FAIL:%s ", __func__, tag); \ + fprintf(stdout, ##format); \ } else { \ - printf("%s:PASS:%s %d nsec\n", \ + fprintf(stdout, "%s:PASS:%s %d nsec\n", \ __func__, tag, duration); \ } \ errno = __save_errno; \ @@ -124,7 +115,7 @@ extern struct ipv6_packet pkt_v6; int __save_errno = errno; \ if (__ret) { \ test__fail(); \ - printf("%s:FAIL:%d\n", __func__, __LINE__); \ + fprintf(stdout, "%s:FAIL:%d\n", __func__, __LINE__); \ } \ errno = __save_errno; \ __ret; \ @@ -135,10 +126,6 @@ extern struct ipv6_packet pkt_v6; #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) -#define MAGIC_VAL 0x1234 -#define NUM_ITER 100000 -#define VIP_NUM 5 - static inline __u64 ptr_to_u64(const void *ptr) { return (__u64) (unsigned long) ptr; @@ -148,7 +135,6 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name); int compare_map_keys(int map1_fd, int map2_fd); int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len); int extract_build_id(char *build_id, size_t size); -void *spin_lock_thread(void *arg); #ifdef __x86_64__ #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 61fd95b89af8..0358814c67dc 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -677,7 +677,7 @@ static int bind4_prog_load(const struct sock_addr_test *test) uint8_t u4_addr8[4]; uint16_t u4_addr16[2]; uint32_t u4_addr32; - } ip4; + } ip4, port; struct sockaddr_in addr4_rw; if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { @@ -685,6 +685,8 @@ static int bind4_prog_load(const struct sock_addr_test *test) return -1; } + port.u4_addr32 = htons(SERV4_PORT); + if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) return -1; @@ -696,49 +698,65 @@ static int bind4_prog_load(const struct sock_addr_test *test) /* if (sk.family == AF_INET && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 24), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32), /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, type)), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), BPF_JMP_A(1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 20), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28), /* 1st_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 18), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26), /* 2nd_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 1), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 16), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24), /* 3rd_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 14), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22), /* 4th_byte_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 3), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20), /* 1st_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 10), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18), /* 2nd_half_of_user_ip4 == expected && */ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4) + 2), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 8), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16), - /* whole_user_ip4 == expected) { */ + /* whole_user_ip4 == expected && */ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct bpf_sock_addr, user_ip4)), BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12), + + /* 1st_byte_of_user_port == expected && */ + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10), + + /* 1st_half_of_user_port == expected && */ + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8), + + /* user_port == expected) { */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_port)), + BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), /* user_ip4 = addr4_rw.sin_addr */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 779e11da979c..37695fc8096a 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -54,7 +54,7 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" @@ -63,14 +63,12 @@ int s1, s2, c1, c2, p1, p2; int test_cnt; int passed; int failed; -int map_fd[8]; -struct bpf_map *maps[8]; +int map_fd[9]; +struct bpf_map *maps[9]; int prog_fd[11]; int txmsg_pass; -int txmsg_noisy; int txmsg_redir; -int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -81,7 +79,10 @@ int txmsg_end_push; int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; -int txmsg_skb; +int txmsg_redir_skb; +int txmsg_ktls_skb; +int txmsg_ktls_skb_drop; +int txmsg_ktls_skb_redir; int ktls; int peek_flag; @@ -89,15 +90,13 @@ static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, - {"verbose", no_argument, NULL, 'v' }, + {"verbose", optional_argument, NULL, 'v' }, {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, @@ -108,12 +107,108 @@ static const struct option long_options[] = { {"txmsg_start_pop", required_argument, NULL, 'w'}, {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, - {"txmsg_skb", no_argument, &txmsg_skb, 1 }, + {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"whitelist", required_argument, NULL, 'n' }, + {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } }; +struct test_env { + const char *type; + const char *subtest; + const char *prepend; + + int test_num; + int subtest_num; + + int succ_cnt; + int fail_cnt; + int fail_last; +}; + +struct test_env env; + +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; + int iov_count; + int iov_length; + int rate; + char *map; + char *whitelist; + char *blacklist; + char *prepend; +}; + +struct _test { + char *title; + void (*tester)(int cg_fd, struct sockmap_options *opt); +}; + +static void test_start(void) +{ + env.subtest_num++; +} + +static void test_fail(void) +{ + env.fail_cnt++; +} + +static void test_pass(void) +{ + env.succ_cnt++; +} + +static void test_reset(void) +{ + txmsg_start = txmsg_end = 0; + txmsg_start_pop = txmsg_pop = 0; + txmsg_start_push = txmsg_end_push = 0; + txmsg_pass = txmsg_drop = txmsg_redir = 0; + txmsg_apply = txmsg_cork = 0; + txmsg_ingress = txmsg_redir_skb = 0; + txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; +} + +static int test_start_subtest(const struct _test *t, struct sockmap_options *o) +{ + env.type = o->map; + env.subtest = t->title; + env.prepend = o->prepend; + env.test_num++; + env.subtest_num = 0; + env.fail_last = env.fail_cnt; + test_reset(); + return 0; +} + +static void test_end_subtest(void) +{ + int error = env.fail_cnt - env.fail_last; + int type = strcmp(env.type, BPF_SOCKMAP_FILENAME); + + if (!error) + test_pass(); + + fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n", + env.test_num, env.subtest_num, + !type ? "sockmap" : "sockhash", + env.prepend ? : "", + env.subtest, error ? "FAIL" : "OK"); +} + +static void test_print_results(void) +{ + fprintf(stdout, "Pass: %d Fail: %d\n", + env.succ_cnt, env.fail_cnt); +} + static void usage(char *argv[]) { int i; @@ -296,7 +391,7 @@ static int sockmap_init_sockets(int verbose) return errno; } - if (verbose) { + if (verbose > 1) { printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); @@ -311,17 +406,6 @@ struct msg_stats { struct timespec end; }; -struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; - bool drop_expected; - int iov_count; - int iov_length; - int rate; -}; - static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s, struct sockmap_options *opt) @@ -345,14 +429,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, NULL, iov_length); + int sent; + + errno = 0; + sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendpage loop error"); fclose(file); return sent; } else if (drop && sent >= 0) { - printf("sendpage loop error expected: %i\n", sent); + printf("sendpage loop error expected: %i errno %i\n", + sent, errno); fclose(file); return -EIO; } @@ -418,14 +506,41 @@ unwind_iov: static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) { - int i, j, bytes_cnt = 0; + int i, j = 0, bytes_cnt = 0; unsigned char k = 0; for (i = 0; i < msg->msg_iovlen; i++) { unsigned char *d = msg->msg_iov[i].iov_base; - for (j = 0; - j < msg->msg_iov[i].iov_len && size; j++) { + /* Special case test for skb ingress + ktls */ + if (i == 0 && txmsg_ktls_skb) { + if (msg->msg_iov[i].iov_len < 4) + return -EIO; + if (txmsg_ktls_skb_redir) { + if (memcmp(&d[13], "PASS", 4) != 0) { + fprintf(stderr, + "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]); + return -EIO; + } + d[13] = 0; + d[14] = 1; + d[15] = 2; + d[16] = 3; + j = 13; + } else if (txmsg_ktls_skb) { + if (memcmp(d, "PASS", 4) != 0) { + fprintf(stderr, + "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]); + return -EIO; + } + d[0] = 0; + d[1] = 1; + d[2] = 2; + d[3] = 3; + } + } + + for (; j < msg->msg_iov[i].iov_len && size; j++) { if (d[j] != k++) { fprintf(stderr, "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", @@ -464,13 +579,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendmsg(fd, &msg, flags); + int sent; + + errno = 0; + sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { - printf("send loop error expected: %i\n", sent); + fprintf(stderr, + "sendmsg loop error expected: %i errno %i\n", + sent, errno); errno = -EIO; goto out_errno; } @@ -497,9 +617,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, * paths. */ total_bytes = (float)iov_count * (float)iov_length * (float)cnt; - txmsg_pop_total = txmsg_pop; if (txmsg_apply) - txmsg_pop_total *= (total_bytes / txmsg_apply); + txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply); + else + txmsg_pop_total = txmsg_pop * cnt; total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) @@ -633,14 +754,18 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { - if (opt->drop_expected) - exit(0); + iov_buf -= (txmsg_pop - txmsg_start_pop + 1); + if (opt->drop_expected || txmsg_ktls_skb_drop) + _exit(0); + + if (!iov_buf) /* zero bytes sent case */ + _exit(0); if (opt->sendpage) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false, opt); - if (opt->verbose) + if (opt->verbose > 1) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); @@ -648,7 +773,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -678,7 +803,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -694,14 +819,14 @@ static int sendmsg_test(struct sockmap_options *opt) if (WIFEXITED(rx_status)) { err = WEXITSTATUS(rx_status); if (err) { - fprintf(stderr, "rx thread exited with err %d. ", err); + fprintf(stderr, "rx thread exited with err %d.\n", err); goto out; } } if (WIFEXITED(tx_status)) { err = WEXITSTATUS(tx_status); if (err) - fprintf(stderr, "tx thread exited with err %d. ", err); + fprintf(stderr, "tx thread exited with err %d.\n", err); } out: return err; @@ -783,6 +908,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) } enum { + SELFTESTS, PING_PONG, SENDMSG, BASE, @@ -816,8 +942,28 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) return err; } + /* Attach programs to TLS sockmap */ + if (txmsg_ktls_skb) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[2], map_fd[8], + BPF_SK_SKB_STREAM_VERDICT, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + /* Attach to cgroups */ - err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0); + err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", err, strerror(errno)); @@ -833,19 +979,14 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) - tx_prog_fd = prog_fd[3]; - else if (txmsg_noisy) tx_prog_fd = prog_fd[4]; else if (txmsg_redir) tx_prog_fd = prog_fd[5]; - else if (txmsg_redir_noisy) - tx_prog_fd = prog_fd[6]; - else if (txmsg_drop) - tx_prog_fd = prog_fd[9]; - /* apply and cork must be last */ else if (txmsg_apply) - tx_prog_fd = prog_fd[7]; + tx_prog_fd = prog_fd[6]; else if (txmsg_cork) + tx_prog_fd = prog_fd[7]; + else if (txmsg_drop) tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -870,7 +1011,7 @@ run: goto out; } - if (txmsg_redir || txmsg_redir_noisy) + if (txmsg_redir) redir_fd = c2; else redir_fd = c1; @@ -1018,7 +1159,35 @@ run: } } - if (txmsg_skb) { + if (txmsg_ktls_skb) { + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + + if (txmsg_ktls_skb_redir) { + i = 1; + err = bpf_map_update_elem(map_fd[7], + &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_ktls_skb_drop) { + i = 1; + err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY); + } + } + + if (txmsg_redir_skb) { int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; int ingress = BPF_F_INGRESS; @@ -1033,8 +1202,7 @@ run: } i = 3; - err = bpf_map_update_elem(map_fd[0], - &i, &skb_fd, BPF_ANY); + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); if (err) { fprintf(stderr, "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", @@ -1068,9 +1236,12 @@ run: fprintf(stderr, "unknown test\n"); out: /* Detatch and zero all the maps */ - bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); + bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); + bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); + bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); + if (tx_prog_fd >= 0) bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); @@ -1112,12 +1283,8 @@ static void test_options(char *options) if (txmsg_pass) strncat(options, "pass,", OPTSTRING); - if (txmsg_noisy) - strncat(options, "pass_noisy,", OPTSTRING); if (txmsg_redir) strncat(options, "redir,", OPTSTRING); - if (txmsg_redir_noisy) - strncat(options, "redir_noisy,", OPTSTRING); if (txmsg_drop) strncat(options, "drop,", OPTSTRING); if (txmsg_apply) { @@ -1143,8 +1310,10 @@ static void test_options(char *options) } if (txmsg_ingress) strncat(options, "ingress,", OPTSTRING); - if (txmsg_skb) - strncat(options, "skb,", OPTSTRING); + if (txmsg_redir_skb) + strncat(options, "redir_skb,", OPTSTRING); + if (txmsg_ktls_skb) + strncat(options, "ktls_skb,", OPTSTRING); if (ktls) strncat(options, "ktls,", OPTSTRING); if (peek_flag) @@ -1168,416 +1337,317 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt) test_options(options); - fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", - test_cnt, opt->rate, opt->iov_count, opt->iov_length, - test_to_str(test), options); - fflush(stdout); + if (opt->verbose) { + fprintf(stdout, + " [TEST %i]: (%i, %i, %i, %s, %s): ", + test_cnt, opt->rate, opt->iov_count, opt->iov_length, + test_to_str(test), options); + fflush(stdout); + } err = run_options(opt, cgrp, test); - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + if (opt->verbose) + fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? passed++ : failed++; free(options); return err; } -static int test_exec(int cgrp, struct sockmap_options *opt) -{ - int err = __test_exec(cgrp, SENDMSG, opt); - - if (err) - goto out; - - err = __test_exec(cgrp, SENDPAGE, opt); -out: - return err; -} - -static int test_loop(int cgrp) -{ - struct sockmap_options opt; - - int err, i, l, r; - - opt.verbose = 0; - opt.base = false; - opt.sendpage = false; - opt.data_test = false; - opt.drop_expected = false; - opt.iov_count = 0; - opt.iov_length = 0; - opt.rate = 0; - - r = 1; - for (i = 1; i < 100; i += 33) { - for (l = 1; l < 100; l += 33) { - opt.rate = r; - opt.iov_count = i; - opt.iov_length = l; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - } - sched_yield(); -out: - return err; -} - -static int test_txmsg(int cgrp) +static void test_exec(int cgrp, struct sockmap_options *opt) { + int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME); int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; - - txmsg_pass = 1; - err = test_loop(cgrp); - txmsg_pass = 0; - if (err) - goto out; - - txmsg_redir = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - if (err) - goto out; - - txmsg_drop = 1; - err = test_loop(cgrp); - txmsg_drop = 0; - if (err) - goto out; - - txmsg_redir = 1; - txmsg_ingress = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - txmsg_ingress = 0; - if (err) - goto out; -out: - txmsg_pass = 0; - txmsg_redir = 0; - txmsg_drop = 0; - return err; + if (type == 0) { + test_start(); + err = __test_exec(cgrp, SENDMSG, opt); + if (err) + test_fail(); + } else { + test_start(); + err = __test_exec(cgrp, SENDPAGE, opt); + if (err) + test_fail(); + } } -static int test_send(struct sockmap_options *opt, int cgrp) +static void test_send_one(struct sockmap_options *opt, int cgrp) { - int err; - opt->iov_length = 1; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1; opt->iov_count = 1024; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1024; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); - opt->iov_length = 1; +} + +static void test_send_many(struct sockmap_options *opt, int cgrp) +{ + opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; - err = test_exec(cgrp, opt); - if (err) - goto out; - - opt->iov_length = 256; - opt->iov_count = 1024; - opt->rate = 2; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->rate = 100; opt->iov_count = 1; opt->iov_length = 5; - err = test_exec(cgrp, opt); - if (err) - goto out; -out: - sched_yield(); - return err; + test_exec(cgrp, opt); } -static int test_mixed(int cgrp) +static void test_send_large(struct sockmap_options *opt, int cgrp) { - struct sockmap_options opt = {0}; - int err; + opt->iov_length = 256; + opt->iov_count = 1024; + opt->rate = 2; + test_exec(cgrp, opt); +} - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_start = txmsg_end = 0; - txmsg_start_push = txmsg_end_push = 0; - txmsg_start_pop = txmsg_pop = 0; +static void test_send(struct sockmap_options *opt, int cgrp) +{ + test_send_one(opt, cgrp); + test_send_many(opt, cgrp); + test_send_large(opt, cgrp); + sched_yield(); +} +static void test_txmsg_pass(int cgrp, struct sockmap_options *opt) +{ /* Test small and large iov_count values with pass/redir/apply/cork */ txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) +{ + txmsg_drop = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) +{ + bool data = opt->data_test; + int k = ktls; - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + opt->data_test = true; + ktls = 1; + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 0; + txmsg_ktls_skb = 1; txmsg_pass = 1; - txmsg_redir = 0; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Using data verification so ensure iov layout is + * expected from test receiver side. e.g. has enough + * bytes to write test code. + */ + opt->iov_length = 100; + opt->iov_count = 1; + opt->rate = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_ktls_skb_drop = 0; + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + opt->data_test = data; + ktls = k; +} + + +/* Test cork with hung data. This tests poor usage patterns where + * cork can leave data on the ring if user program is buggy and + * doesn't flush them somehow. They do take some time however + * because they wait for a timeout. Test pass, redir and cork with + * apply logic. Use cork size of 4097 with send_large to avoid + * aligning cork size with send size. + */ +static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_cork = 4097; + txmsg_apply = 4097; + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_apply = 0; + txmsg_cork = 4097; + test_send_large(opt, cgrp); txmsg_pass = 0; txmsg_redir = 1; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; -out: - return err; + txmsg_apply = 4097; + txmsg_cork = 4097; + test_send_large(opt, cgrp); } -static int test_start_end(int cgrp) +static void test_txmsg_pull(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {0}; - int err, i; + /* Test basic start/end */ + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); + + /* Test >4k pull */ + txmsg_start = 4096; + txmsg_end = 9182; + test_send_large(opt, cgrp); - /* Test basic start/end with lots of iov_count and iov_lengths */ + /* Test pull + redirect */ + txmsg_redir = 0; txmsg_start = 1; txmsg_end = 2; - txmsg_start_push = 1; - txmsg_end_push = 2; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + test_send(opt, cgrp); - /* Cut a byte of pushed data but leave reamining in place */ + /* Test pull + cork */ + txmsg_redir = 0; + txmsg_cork = 512; txmsg_start = 1; txmsg_end = 2; - txmsg_start_push = 1; - txmsg_end_push = 3; + test_send_many(opt, cgrp); + + /* Test pull + cork + redirect */ + txmsg_redir = 1; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); +} + +static void test_txmsg_pop(int cgrp, struct sockmap_options *opt) +{ + /* Test basic pop */ txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test start/end with cork */ - opt.rate = 16; - opt.iov_count = 1; - opt.iov_length = 100; - txmsg_cork = 1600; - - txmsg_start_pop = 0; - txmsg_pop = 0; - - for (i = 99; i <= 1600; i += 500) { - txmsg_start = 0; - txmsg_end = i; - txmsg_start_push = 0; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } + /* Test pop with >4k */ + txmsg_start_pop = 4096; + txmsg_pop = 4096; + test_send_large(opt, cgrp); - /* Test pop data in middle of cork */ - for (i = 99; i <= 1600; i += 500) { - txmsg_start_pop = 10; - txmsg_pop = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end with cork but pull data in middle */ - for (i = 199; i <= 1600; i += 500) { - txmsg_start = 100; - txmsg_end = i; - txmsg_start_push = 100; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } + /* Test pop + redirect */ + txmsg_redir = 1; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test start/end with cork pulling last sg entry */ - txmsg_start = 1500; - txmsg_end = 1600; - txmsg_start_push = 1500; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test pop + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - /* Test pop with cork pulling last sg entry */ - txmsg_start_pop = 1500; - txmsg_pop = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end pull of single byte in last page */ - txmsg_start = 1111; - txmsg_end = 1112; - txmsg_start_push = 1111; - txmsg_end_push = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test pop + redirect + cork */ + txmsg_redir = 1; + txmsg_cork = 4; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); +} - /* Test pop of single byte in last page */ - txmsg_start_pop = 1111; - txmsg_pop = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push(int cgrp, struct sockmap_options *opt) +{ + /* Test basic push */ + txmsg_start_push = 1; + txmsg_end_push = 1; + test_send(opt, cgrp); - /* Test start/end with end < start */ - txmsg_start = 1111; - txmsg_end = 0; - txmsg_start_push = 1111; - txmsg_end_push = 0; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push 4kB >4k */ + txmsg_start_push = 4096; + txmsg_end_push = 4096; + test_send_large(opt, cgrp); - /* Test start/end with end > data */ - txmsg_start = 0; - txmsg_end = 1601; - txmsg_start_push = 0; - txmsg_end_push = 1601; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push + redirect */ + txmsg_redir = 1; + txmsg_start_push = 1; + txmsg_end_push = 2; + test_send_many(opt, cgrp); - /* Test start/end with start > data */ - txmsg_start = 1601; - txmsg_end = 1600; - txmsg_start_push = 1601; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + /* Test push + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_push = 1; + txmsg_end_push = 2; + test_send_many(opt, cgrp); +} - /* Test pop with start > data */ - txmsg_start_pop = 1601; - txmsg_pop = 1; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt) +{ + txmsg_start_push = 1; + txmsg_end_push = 10; + txmsg_start_pop = 5; + txmsg_pop = 4; + test_send_large(opt, cgrp); +} - /* Test pop with pop range > data */ - txmsg_start_pop = 1599; - txmsg_pop = 10; - err = test_exec(cgrp, &opt); -out: - txmsg_start = 0; - txmsg_end = 0; - sched_yield(); - return err; +static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); + + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); +} + +static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 0; + txmsg_cork = 1; + test_send(opt, cgrp); + + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 1; + test_send(opt, cgrp); } char *map_names[] = { @@ -1589,11 +1659,13 @@ char *map_names[] = { "sock_bytes", "sock_redir_flags", "sock_skb_opts", + "tls_sock_map", }; int prog_attach_type[] = { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_SOCK_OPS, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, @@ -1607,6 +1679,7 @@ int prog_attach_type[] = { int prog_type[] = { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, @@ -1662,73 +1735,117 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(int cg_fd, char *bpf_file) +struct _test test[] = { + {"txmsg test passthrough", test_txmsg_pass}, + {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test drop", test_txmsg_drop}, + {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test skb", test_txmsg_skb}, + {"txmsg test apply", test_txmsg_apply}, + {"txmsg test cork", test_txmsg_cork}, + {"txmsg test hanging corks", test_txmsg_cork_hangs}, + {"txmsg test push_data", test_txmsg_push}, + {"txmsg test pull-data", test_txmsg_pull}, + {"txmsg test pop-data", test_txmsg_pop}, + {"txmsg test push/pop data", test_txmsg_push_pop}, +}; + +static int check_whitelist(struct _test *t, struct sockmap_options *opt) { - int err, cleanup = cg_fd; + char *entry, *ptr; + + if (!opt->whitelist) + return 0; + ptr = strdup(opt->whitelist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} - err = populate_progs(bpf_file); +static int check_blacklist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->blacklist) + return -EINVAL; + ptr = strdup(opt->blacklist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + +static int __test_selftests(int cg_fd, struct sockmap_options *opt) +{ + int i, err; + + err = populate_progs(opt->map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; } - if (cg_fd < 0) { - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } + /* Tests basic commands and APIs */ + for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { + struct _test t = test[i]; - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path failed: %s\n", - cg_fd, optarg); - return cg_fd; - } + if (check_whitelist(&t, opt) != 0) + continue; + if (check_blacklist(&t, opt) == 0) + continue; - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } + test_start_subtest(&t, opt); + t.tester(cg_fd, opt); + test_end_subtest(); } - /* Tests basic commands and APIs with range of iov values */ - txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0; - err = test_txmsg(cg_fd); - if (err) - goto out; + return err; +} - /* Tests interesting combinations of APIs used together */ - err = test_mixed(cg_fd); - if (err) - goto out; +static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKMAP_FILENAME; + __test_selftests(cg_fd, opt); +} - /* Tests pull_data API using start/end API */ - err = test_start_end(cg_fd); - if (err) - goto out; +static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + __test_selftests(cg_fd, opt); +} -out: - printf("Summary: %i PASSED %i FAILED\n", passed, failed); - if (cleanup < 0) { - cleanup_cgroup_environment(); - close(cg_fd); - } - return err; +static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + opt->prepend = "ktls"; + ktls = 1; + __test_selftests(cg_fd, opt); + ktls = 0; } -static int test_suite(int cg_fd) +static int test_selftest(int cg_fd, struct sockmap_options *opt) { - int err; - err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); - if (err) - goto out; - err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); -out: - if (cg_fd > -1) - close(cg_fd); - return err; + test_selftests_sockmap(cg_fd, opt); + test_selftests_sockhash(cg_fd, opt); + test_selftests_ktls(cg_fd, opt); + test_print_results(); + return 0; } int main(int argc, char **argv) @@ -1737,12 +1854,10 @@ int main(int argc, char **argv) struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; - int test = PING_PONG; - - if (argc < 2) - return test_suite(-1); + int test = SELFTESTS; + bool cg_created = 0; - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1783,6 +1898,8 @@ int main(int argc, char **argv) break; case 'v': options.verbose = 1; + if (optarg) + options.verbose = atoi(optarg); break; case 'i': iov_count = atoi(optarg); @@ -1809,6 +1926,15 @@ int main(int argc, char **argv) return -1; } break; + case 'n': + options.whitelist = strdup(optarg); + if (!options.whitelist) + return -ENOMEM; + break; + case 'b': + options.blacklist = strdup(optarg); + if (!options.blacklist) + return -ENOMEM; case 0: break; case 'h': @@ -1818,13 +1944,30 @@ int main(int argc, char **argv) } } - if (argc <= 3 && cg_fd) - return test_suite(cg_fd); - if (!cg_fd) { - fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", - argv[0]); - return -1; + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, strerror(errno)); + return cg_fd; + } + + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } + cg_created = 1; + } + + if (test == SELFTESTS) { + err = test_selftest(cg_fd, &options); + goto out; } err = populate_progs(bpf_file); @@ -1843,6 +1986,13 @@ int main(int argc, char **argv) options.rate = rate; err = run_options(&options, cg_fd, test); +out: + if (options.whitelist) + free(options.whitelist); + if (options.blacklist) + free(options.blacklist); + if (cg_created) + cleanup_cgroup_environment(); close(cg_fd); return err; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 87eaa49609a0..78a6bae56ea6 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 19 +#define MAX_NR_MAPS 20 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -86,6 +86,7 @@ struct bpf_test { int fixup_map_array_small[MAX_FIXUPS]; int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; + int fixup_map_reuseport_array[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -637,6 +638,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_map_array_small = test->fixup_map_array_small; int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; + int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -806,12 +808,28 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_event_output++; } while (*fixup_map_event_output); } + if (*fixup_map_reuseport_array) { + map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(u32), sizeof(u64), 1, 0); + do { + prog[*fixup_map_reuseport_array].imm = map_fds[19]; + fixup_map_reuseport_array++; + } while (*fixup_map_reuseport_array); + } } +struct libcap { + struct __user_cap_header_struct hdr; + struct __user_cap_data_struct data[2]; +}; + static int set_admin(bool admin) { cap_t caps; - const cap_value_t cap_val = CAP_SYS_ADMIN; + /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */ + const cap_value_t cap_net_admin = CAP_NET_ADMIN; + const cap_value_t cap_sys_admin = CAP_SYS_ADMIN; + struct libcap *cap; int ret = -1; caps = cap_get_proc(); @@ -819,11 +837,26 @@ static int set_admin(bool admin) perror("cap_get_proc"); return -1; } - if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val, + cap = (struct libcap *)caps; + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) { + perror("cap_set_flag clear admin"); + goto out; + } + if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin, admin ? CAP_SET : CAP_CLEAR)) { - perror("cap_set_flag"); + perror("cap_set_flag set_or_clear net"); goto out; } + /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON, + * so update effective bits manually + */ + if (admin) { + cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32); + cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32); + } else { + cap->data[1].effective &= ~(1 << (38 - 32)); + cap->data[1].effective &= ~(1 << (39 - 32)); + } if (cap_set_proc(caps)) { perror("cap_set_proc"); goto out; @@ -943,7 +976,12 @@ static void do_test_single(struct bpf_test *test, bool unpriv, attr.insns = prog; attr.insns_cnt = prog_len; attr.license = "GPL"; - attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4; + if (verbose) + attr.log_level = 1; + else if (expected_ret == VERBOSE_ACCEPT) + attr.log_level = 2; + else + attr.log_level = 4; attr.prog_flags = pflags; fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog)); @@ -1052,9 +1090,11 @@ fail_log: static bool is_admin(void) { + cap_flag_value_t net_priv = CAP_CLEAR; + bool perfmon_priv = false; + bool bpf_priv = false; + struct libcap *cap; cap_t caps; - cap_flag_value_t sysadmin = CAP_CLEAR; - const cap_value_t cap_val = CAP_SYS_ADMIN; #ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { @@ -1067,11 +1107,14 @@ static bool is_admin(void) perror("cap_get_proc"); return false; } - if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin)) - perror("cap_get_flag"); + cap = (struct libcap *)caps; + bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32)); + perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32)); + if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv)) + perror("cap_get_flag NET"); if (cap_free(caps)) perror("cap_free"); - return (sysadmin == CAP_SET); + return bpf_priv && perfmon_priv && net_priv == CAP_SET; } static void get_unpriv_disabled() diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c new file mode 100644 index 000000000000..0af6337a8962 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook, Inc. */ +#include <stdlib.h> +#include <errno.h> +#include "testing_helpers.h" + +int parse_num_list(const char *s, bool **num_set, int *num_set_len) +{ + int i, set_len = 0, new_len, num, start = 0, end = -1; + bool *set = NULL, *tmp, parsing_end = false; + char *next; + + while (s[0]) { + errno = 0; + num = strtol(s, &next, 10); + if (errno) + return -errno; + + if (parsing_end) + end = num; + else + start = num; + + if (!parsing_end && *next == '-') { + s = next + 1; + parsing_end = true; + continue; + } else if (*next == ',') { + parsing_end = false; + s = next + 1; + end = num; + } else if (*next == '\0') { + parsing_end = false; + s = next; + end = num; + } else { + return -EINVAL; + } + + if (start > end) + return -EINVAL; + + if (end + 1 > set_len) { + new_len = end + 1; + tmp = realloc(set, new_len); + if (!tmp) { + free(set); + return -ENOMEM; + } + for (i = set_len; i < start; i++) + tmp[i] = false; + set = tmp; + set_len = new_len; + } + for (i = start; i <= end; i++) + set[i] = true; + } + + if (!set) + return -EINVAL; + + *num_set = set; + *num_set_len = set_len; + + return 0; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h new file mode 100644 index 000000000000..923b51762759 --- /dev/null +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2020 Facebook, Inc. */ +#include <stdbool.h> + +int parse_num_list(const char *s, bool **set, int *set_len); diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 7f989b3e4e22..4d0e913bbb22 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -4,12 +4,15 @@ #include <string.h> #include <assert.h> #include <errno.h> +#include <fcntl.h> #include <poll.h> #include <unistd.h> #include <linux/perf_event.h> #include <sys/mman.h> #include "trace_helpers.h" +#define DEBUGFS "/sys/kernel/debug/tracing/" + #define MAX_SYMS 300000 static struct ksym syms[MAX_SYMS]; static int sym_cnt; @@ -86,3 +89,23 @@ long ksym_get_addr(const char *name) return 0; } + +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[4096]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 0383c9b8adc1..25ef597dd03f 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -12,5 +12,6 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); long ksym_get_addr(const char *name); +void read_trace_pipe(void); #endif diff --git a/tools/testing/selftests/bpf/verifier/.gitignore b/tools/testing/selftests/bpf/verifier/.gitignore index 45984a364647..89c4a3d37544 100644 --- a/tools/testing/selftests/bpf/verifier/.gitignore +++ b/tools/testing/selftests/bpf/verifier/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only tests.h diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index e0fad1548737..d781bc86e100 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -15,7 +15,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -44,7 +44,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index f3c33e128709..1c4b1939f5a8 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -117,7 +117,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -137,7 +137,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R0 unbounded memory access, make sure to bounds check any such access", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index d55f476f2237..4d6645f2874c 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -20,7 +20,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT, }, { @@ -146,7 +146,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result = REJECT }, { @@ -238,7 +238,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -253,22 +253,18 @@ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), - /* no-op or OOB pointer computation */ + /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* potentially OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "R0 unbounded memory access", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after truncation of boundary-crossing range (2)", @@ -278,7 +274,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r1 = [0x00, 0xff] */ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), @@ -295,22 +291,18 @@ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] */ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), - /* r1 = 0 or - * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] - */ - BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), - /* no-op or OOB pointer computation */ + /* error on OOB pointer computation */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* potentially OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, /* not actually fully unbounded, but the bound is very high */ - .errstr = "R0 unbounded memory access", - .result = REJECT + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root", + .result_unpriv = REJECT, + .errstr = "value -4294967168 makes map_value pointer be out of bounds", + .result = REJECT, }, { "bounds check after wrapping 32-bit addition", @@ -362,7 +354,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 max value is outside of the array range", + .errstr = "R0 max value is outside of the allowed memory range", .result = REJECT }, { @@ -411,16 +403,14 @@ BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 31), /* r1 = 0xffff'fffe (NOT 0!) */ BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 2), - /* computes OOB pointer */ + /* error on computing OOB pointer */ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), - /* OOB access */ - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* exit */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, - .errstr = "R0 invalid mem access", + .errstr = "math between map_value pointer and 4294967294 is not allowed", .result = REJECT, }, { @@ -506,3 +496,64 @@ .errstr = "map_value pointer and 1000000000000", .result = REJECT }, +{ + "bounds check mixed 32bit and 64bit arithmetic. test1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 1, 3), + /* check ALU64 op keeps 32bit bounds */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, +{ + "bounds check mixed 32bit and 64bit arithmetic. test2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), + /* r1 = 0xffffFFFF00000001 */ + BPF_MOV64_IMM(BPF_REG_2, 3), + /* r1 = 0x2 */ + BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1), + /* check ALU32 op zero extends 64bit bounds */ + BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 1), + BPF_JMP_A(1), + /* invalid ldx if bounds are lost above */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT +}, +{ + "assigning 32bit bounds to 64bit for wA = 0, wB = wA", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV32_IMM(BPF_REG_9, 0), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_8, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, +}, diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index f24d50f09dbe..69b048cf46d9 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -9,17 +9,17 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28), BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), - BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)/2), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), - BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)/2), BPF_MOV64_IMM(BPF_REG_4, 256), BPF_EMIT_CALL(BPF_FUNC_get_stack), BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32), - BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16), + BPF_JMP_REG(BPF_JSGT, BPF_REG_1, BPF_REG_8, 16), BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8), @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), - BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)), + BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)/2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5), BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 2d752c4f8d9d..94258c6b5235 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -105,7 +105,7 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_hash_8b = { 16 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "calls: overlapping caller/callee", @@ -315,7 +315,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = POINTER_VALUE, @@ -346,7 +346,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1064,7 +1064,7 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "allowed for root only", + .errstr_unpriv = "allowed for", .result_unpriv = REJECT, .errstr = "R0 !read_ok", .result = REJECT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c index 84446dfc7c1d..6c214c58e8d4 100644 --- a/tools/testing/selftests/bpf/verifier/const_or.c +++ b/tools/testing/selftests/bpf/verifier/const_or.c @@ -6,7 +6,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -20,7 +20,7 @@ BPF_MOV64_IMM(BPF_REG_2, 34), BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", @@ -36,7 +36,7 @@ BPF_MOV64_IMM(BPF_REG_4, 13), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -51,7 +51,7 @@ BPF_MOV64_IMM(BPF_REG_4, 24), BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid stack type R1 off=-48 access_size=58", diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index 92762c08f5e3..93d6b1641481 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -91,3 +91,108 @@ .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, +{ + "pass ctx or null check, 1: ctx", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 2: null", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 3: 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = REJECT, + .errstr = "R1 type=inv expected=ctx", +}, +{ + "pass ctx or null check, 4: ctx - const", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", +}, +{ + "pass ctx or null check, 5: null (connect)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .expected_attach_type = BPF_CGROUP_INET4_CONNECT, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 6: null (bind)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_netns_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 7: ctx (bind)", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ACCEPT, +}, +{ + "pass ctx or null check, 8: null (bind)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = REJECT, + .errstr = "R1 type=inv expected=ctx", +}, diff --git a/tools/testing/selftests/bpf/verifier/ctx_skb.c b/tools/testing/selftests/bpf/verifier/ctx_skb.c index d438193804b2..2e16b8e268f2 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_skb.c +++ b/tools/testing/selftests/bpf/verifier/ctx_skb.c @@ -1011,6 +1011,53 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { + "read gso_size from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "read gso_size from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "write gso_size from CGROUP_SKB", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .result_unpriv = REJECT, + .errstr = "invalid bpf_context access off=176 size=4", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, +}, +{ + "read gso_size from CLS", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_size)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, +{ "check wire_len is not readable by sockets", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 50a8a63be4ac..5cf361d8eb1c 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .errstr_unpriv = "function calls to other bpf functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index b9fb28e8e224..988f46a1a4c7 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -68,7 +68,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 7", @@ -220,7 +220,7 @@ }, .fixup_map_array_small = { 1 }, .result = REJECT, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", }, { "direct map access, write test 19", diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c index 67ab12410050..87c4e7900083 100644 --- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c +++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -36,7 +36,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "invalid indirect read from stack off -64+0 size 64", @@ -55,7 +55,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -84,7 +84,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -112,7 +112,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -132,7 +132,7 @@ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -152,7 +152,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -190,7 +190,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -208,7 +208,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128), BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -233,7 +233,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -259,7 +259,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -286,7 +286,7 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -313,12 +313,12 @@ BPF_MOV64_IMM(BPF_REG_4, 0), BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 4 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -468,7 +468,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -481,7 +481,7 @@ BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .errstr = "R1 type=inv expected=fp", @@ -495,7 +495,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -513,7 +513,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -534,7 +534,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -554,7 +554,7 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_8b = { 3 }, @@ -580,7 +580,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, @@ -607,7 +607,7 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c index 7572e403ddb9..1c7882ddfa63 100644 --- a/tools/testing/selftests/bpf/verifier/helper_value_access.c +++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c @@ -10,7 +10,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -29,7 +29,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -67,7 +67,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -87,7 +87,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -109,7 +109,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -129,7 +129,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -170,7 +170,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -191,7 +191,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -212,7 +212,7 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -235,7 +235,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -256,7 +256,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -280,7 +280,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -300,7 +300,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -322,7 +322,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -344,7 +344,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, -1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -368,7 +368,7 @@ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) - offsetof(struct test_val, foo)), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -390,7 +390,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -415,7 +415,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, - .errstr = "R1 min value is outside of the array range", + .errstr = "R1 min value is outside of the allowed memory range", .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, @@ -433,7 +433,7 @@ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), BPF_MOV64_IMM(BPF_REG_2, 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -458,7 +458,7 @@ sizeof(struct test_val) - offsetof(struct test_val, foo) + 1), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, @@ -926,7 +926,7 @@ }, .fixup_map_hash_16b = { 3, 10 }, .result = REJECT, - .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", + .errstr = "R2 unbounded memory access, make sure to bounds check any such access", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 02151f8c940f..6dc8003ffc70 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -31,14 +31,14 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, .fixup_map_array_48b = { 1 }, .result = VERBOSE_ACCEPT, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 20\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ @@ -91,7 +91,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, @@ -99,7 +99,7 @@ .result = VERBOSE_ACCEPT, .flags = BPF_F_TEST_STATE_FREQ, .errstr = - "26: (85) call bpf_probe_read#4\ + "26: (85) call bpf_probe_read_kernel#113\ last_idx 26 first_idx 22\ regs=4 stack=0 before 25\ regs=4 stack=0 before 24\ diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c index da7a4b37cb98..fc4e301260f6 100644 --- a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c +++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c @@ -1,34 +1,4 @@ { - "prevent map lookup in sockmap", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockmap = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ - "prevent map lookup in sockhash", - .insns = { - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - }, - .fixup_map_sockhash = { 3 }, - .result = REJECT, - .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem", - .prog_type = BPF_PROG_TYPE_SOCK_OPS, -}, -{ "prevent map lookup in stack trace", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 604b46151736..056e0273bf12 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -821,3 +821,36 @@ .result = REJECT, .errstr = "invalid mem access", }, +{ + "reference tracking: branch tracking valid pointer null comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, +{ + "reference tracking: branch tracking valid pointer value comparison", + .insns = { + BPF_SK_LOOKUP(sk_lookup_tcp), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .errstr = "Unreleased reference", + .result = REJECT, +}, diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index 9ed192e14f5f..b1aac2641498 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -222,7 +222,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, state)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, rx_queue_mapping)), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -516,3 +516,118 @@ .prog_type = BPF_PROG_TYPE_XDP, .result = ACCEPT, }, +{ + "bpf_map_lookup_elem(sockmap, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockhash, &key)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = REJECT, + .errstr = "Unreleased reference id=2 alloc_insn=5", +}, +{ + "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)", + .insns = { + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_EXIT_INSN(), + }, + .fixup_map_sockhash = { 3 }, + .prog_type = BPF_PROG_TYPE_SK_SKB, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_reuseport_array = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, +{ + "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)", + .insns = { + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), + BPF_EXIT_INSN(), + }, + .fixup_map_sockmap = { 4 }, + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/stack_ptr.c b/tools/testing/selftests/bpf/verifier/stack_ptr.c index 7276620ef242..8bfeb77c60bd 100644 --- a/tools/testing/selftests/bpf/verifier/stack_ptr.c +++ b/tools/testing/selftests/bpf/verifier/stack_ptr.c @@ -315,3 +315,43 @@ }, .result = ACCEPT, }, +{ + "store PTR_TO_STACK in R10 to array map using BPF_B", + .insns = { + /* Load pointer to map. */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + /* Copy R10 to R9. */ + BPF_MOV64_REG(BPF_REG_9, BPF_REG_10), + /* Pollute other registers with unaligned values. */ + BPF_MOV64_IMM(BPF_REG_2, -1), + BPF_MOV64_IMM(BPF_REG_3, -1), + BPF_MOV64_IMM(BPF_REG_4, -1), + BPF_MOV64_IMM(BPF_REG_5, -1), + BPF_MOV64_IMM(BPF_REG_6, -1), + BPF_MOV64_IMM(BPF_REG_7, -1), + BPF_MOV64_IMM(BPF_REG_8, -1), + /* Store both R9 and R10 with BPF_B and read back. */ + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_10, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_1, 0), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_9, 0), + BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_1, 0), + /* Should read back as same value. */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 42, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c index 7f6c232cd842..ed1c2cea1dea 100644 --- a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c +++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c @@ -88,6 +88,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_48b = { 3 }, + .errstr_unpriv = "leaking pointer from stack off -8", .errstr = "R0 invalid mem access 'inv'", .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c index 860d4a71cd83..3ecb70a3d939 100644 --- a/tools/testing/selftests/bpf/verifier/value_or_null.c +++ b/tools/testing/selftests/bpf/verifier/value_or_null.c @@ -150,3 +150,22 @@ .result_unpriv = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, +{ + "map lookup and null branch prediction", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 4 }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index a53d99cebd9f..97ee658e1242 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -50,7 +50,7 @@ .fixup_map_array_48b = { 8 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R0 min value is outside of the array range", + .errstr_unpriv = "R0 min value is outside of the allowed memory range", .retval = 1, }, { @@ -325,7 +325,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", .result_unpriv = REJECT, .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", }, @@ -601,7 +601,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R1 max value is outside of the array range", + .errstr = "R1 max value is outside of the allowed memory range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -726,7 +726,7 @@ }, .fixup_map_array_48b = { 3 }, .result = REJECT, - .errstr = "R0 min value is outside of the array range", + .errstr = "R0 min value is outside of the allowed memory range", }, { "map access: value_ptr -= known scalar, 2", diff --git a/tools/testing/selftests/breakpoints/.gitignore b/tools/testing/selftests/breakpoints/.gitignore index a23bb4a6f06c..def2e97dab9a 100644 --- a/tools/testing/selftests/breakpoints/.gitignore +++ b/tools/testing/selftests/breakpoints/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only breakpoint_test step_after_suspend_test diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore index b732dd0d4738..426d9adca67c 100644 --- a/tools/testing/selftests/capabilities/.gitignore +++ b/tools/testing/selftests/capabilities/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only test_execve validate_cap diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 7f9835624793..aa6de65b0838 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only test_memcontrol test_core test_freezer diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 66aafe1f5746..967f268fde74 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer include ../lib.mk -$(OUTPUT)/test_memcontrol: cgroup_util.c -$(OUTPUT)/test_core: cgroup_util.c -$(OUTPUT)/test_freezer: cgroup_util.c +$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h +$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h +$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 8f7131dcf1ff..8a637ca7d73a 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -15,6 +15,7 @@ #include <unistd.h> #include "cgroup_util.h" +#include "../clone3/clone3_selftests.h" static ssize_t read_text(const char *path, char *buf, size_t max_len) { @@ -331,12 +332,112 @@ int cg_run(const char *cgroup, } } +pid_t clone_into_cgroup(int cgroup_fd) +{ +#ifdef CLONE_ARGS_SIZE_VER2 + pid_t pid; + + struct clone_args args = { + .flags = CLONE_INTO_CGROUP, + .exit_signal = SIGCHLD, + .cgroup = cgroup_fd, + }; + + pid = sys_clone3(&args, sizeof(struct clone_args)); + /* + * Verify that this is a genuine test failure: + * ENOSYS -> clone3() not available + * E2BIG -> CLONE_INTO_CGROUP not available + */ + if (pid < 0 && (errno == ENOSYS || errno == E2BIG)) + goto pretend_enosys; + + return pid; + +pretend_enosys: +#endif + errno = ENOSYS; + return -ENOSYS; +} + +int clone_reap(pid_t pid, int options) +{ + int ret; + siginfo_t info = { + .si_signo = 0, + }; + +again: + ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD); + if (ret < 0) { + if (errno == EINTR) + goto again; + return -1; + } + + if (options & WEXITED) { + if (WIFEXITED(info.si_status)) + return WEXITSTATUS(info.si_status); + } + + if (options & WSTOPPED) { + if (WIFSTOPPED(info.si_status)) + return WSTOPSIG(info.si_status); + } + + if (options & WCONTINUED) { + if (WIFCONTINUED(info.si_status)) + return 0; + } + + return -1; +} + +int dirfd_open_opath(const char *dir) +{ + return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH); +} + +#define close_prot_errno(fd) \ + if (fd >= 0) { \ + int _e_ = errno; \ + close(fd); \ + errno = _e_; \ + } + +static int clone_into_cgroup_run_nowait(const char *cgroup, + int (*fn)(const char *cgroup, void *arg), + void *arg) +{ + int cgroup_fd; + pid_t pid; + + cgroup_fd = dirfd_open_opath(cgroup); + if (cgroup_fd < 0) + return -1; + + pid = clone_into_cgroup(cgroup_fd); + close_prot_errno(cgroup_fd); + if (pid == 0) + exit(fn(cgroup, arg)); + + return pid; +} + int cg_run_nowait(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg) { int pid; + pid = clone_into_cgroup_run_nowait(cgroup, fn, arg); + if (pid > 0) + return pid; + + /* Genuine test failure. */ + if (pid < 0 && errno != ENOSYS) + return -1; + pid = fork(); if (pid == 0) { char buf[64]; @@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) return strstr(buf, needle) ? 0 : -1; } + +int clone_into_cgroup_run_wait(const char *cgroup) +{ + int cgroup_fd; + pid_t pid; + + cgroup_fd = dirfd_open_opath(cgroup); + if (cgroup_fd < 0) + return -1; + + pid = clone_into_cgroup(cgroup_fd); + close_prot_errno(cgroup_fd); + if (pid < 0) + return -1; + + if (pid == 0) + exit(EXIT_SUCCESS); + + /* + * We don't care whether this fails. We only care whether the initial + * clone succeeded. + */ + (void)clone_reap(pid, WEXITED); + return 0; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 49c54fbdb229..5a1305dd1f0b 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); +extern pid_t clone_into_cgroup(int cgroup_fd); +extern int clone_reap(pid_t pid, int options); +extern int clone_into_cgroup_run_wait(const char *cgroup); +extern int dirfd_open_opath(const char *dir); diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index e19ce940cd6a..3df648c37876 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -2,7 +2,10 @@ #include <linux/limits.h> #include <sys/types.h> +#include <sys/mman.h> +#include <sys/wait.h> #include <unistd.h> +#include <fcntl.h> #include <stdio.h> #include <errno.h> #include <signal.h> @@ -12,6 +15,115 @@ #include "../kselftest.h" #include "cgroup_util.h" +static int touch_anon(char *buf, size_t size) +{ + int fd; + char *pos = buf; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + return -1; + + while (size > 0) { + ssize_t ret = read(fd, pos, size); + + if (ret < 0) { + if (errno != EINTR) { + close(fd); + return -1; + } + } else { + pos += ret; + size -= ret; + } + } + close(fd); + + return 0; +} + +static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg) +{ + int ppid = getppid(); + size_t size = (size_t)arg; + void *buf; + + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, + 0, 0); + if (buf == MAP_FAILED) + return -1; + + if (touch_anon((char *)buf, size)) { + munmap(buf, size); + return -1; + } + + while (getppid() == ppid) + sleep(1); + + munmap(buf, size); + return 0; +} + +/* + * Create a child process that allocates and touches 100MB, then waits to be + * killed. Wait until the child is attached to the cgroup, kill all processes + * in that cgroup and wait until "cgroup.procs" is empty. At this point try to + * destroy the empty cgroup. The test helps detect race conditions between + * dying processes leaving the cgroup and cgroup destruction path. + */ +static int test_cgcore_destroy(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_test = NULL; + int child_pid; + char buf[PAGE_SIZE]; + + cg_test = cg_name(root, "cg_test"); + + if (!cg_test) + goto cleanup; + + for (int i = 0; i < 10; i++) { + if (cg_create(cg_test)) + goto cleanup; + + child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit, + (void *) MB(100)); + + if (child_pid < 0) + goto cleanup; + + /* wait for the child to enter cgroup */ + if (cg_wait_for_proc_count(cg_test, 1)) + goto cleanup; + + if (cg_killall(cg_test)) + goto cleanup; + + /* wait for cgroup to be empty */ + while (1) { + if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf))) + goto cleanup; + if (buf[0] == '\0') + break; + usleep(1000); + } + + if (rmdir(cg_test)) + goto cleanup; + + if (waitpid(child_pid, NULL, 0) < 0) + goto cleanup; + } + ret = KSFT_PASS; +cleanup: + if (cg_test) + cg_destroy(cg_test); + free(cg_test); + return ret; +} + /* * A(0) - B(0) - C(1) * \ D(0) @@ -25,8 +137,11 @@ static int test_cgcore_populated(const char *root) { int ret = KSFT_FAIL; + int err; char *cg_test_a = NULL, *cg_test_b = NULL; char *cg_test_c = NULL, *cg_test_d = NULL; + int cgroup_fd = -EBADF; + pid_t pid; cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_a/cg_test_b"); @@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root) if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n")) goto cleanup; + /* Test that we can directly clone into a new cgroup. */ + cgroup_fd = dirfd_open_opath(cg_test_d); + if (cgroup_fd < 0) + goto cleanup; + + pid = clone_into_cgroup(cgroup_fd); + if (pid < 0) { + if (errno == ENOSYS) + goto cleanup_pass; + goto cleanup; + } + + if (pid == 0) { + if (raise(SIGSTOP)) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n"); + + (void)clone_reap(pid, WSTOPPED); + (void)kill(pid, SIGCONT); + (void)clone_reap(pid, WEXITED); + + if (err) + goto cleanup; + + if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n")) + goto cleanup; + + /* Remove cgroup. */ + if (cg_test_d) { + cg_destroy(cg_test_d); + free(cg_test_d); + cg_test_d = NULL; + } + + pid = clone_into_cgroup(cgroup_fd); + if (pid < 0) + goto cleanup_pass; + if (pid == 0) + exit(EXIT_SUCCESS); + (void)clone_reap(pid, WEXITED); + goto cleanup; + +cleanup_pass: ret = KSFT_PASS; cleanup: @@ -93,6 +254,8 @@ cleanup: free(cg_test_c); free(cg_test_b); free(cg_test_a); + if (cgroup_fd >= 0) + close(cgroup_fd); return ret; } @@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root) if (errno != EOPNOTSUPP) goto cleanup; + if (!clone_into_cgroup_run_wait(child)) + goto cleanup; + + if (errno == ENOSYS) + goto cleanup_pass; + + if (errno != EOPNOTSUPP) + goto cleanup; + +cleanup_pass: ret = KSFT_PASS; cleanup: @@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root) if (!cg_enter_current(parent)) goto cleanup; + if (!clone_into_cgroup_run_wait(parent)) + goto cleanup; + ret = KSFT_PASS; cleanup: @@ -512,6 +688,7 @@ struct corecg_test { T(test_cgcore_populated), T(test_cgcore_proc_migration), T(test_cgcore_thread_migration), + T(test_cgcore_destroy), }; #undef T diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore index 0dc4f32c6cb8..a81085742d40 100644 --- a/tools/testing/selftests/clone3/.gitignore +++ b/tools/testing/selftests/clone3/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only clone3 clone3_clear_sighand clone3_set_tid diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index a3f2c8ad8bcc..91c1a78ddb39 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -5,12 +5,24 @@ #define _GNU_SOURCE #include <sched.h> +#include <linux/sched.h> +#include <linux/types.h> #include <stdint.h> #include <syscall.h> -#include <linux/types.h> +#include <sys/wait.h> + +#include "../kselftest.h" #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +#ifndef CLONE_INTO_CGROUP +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#endif + +#ifndef CLONE_ARGS_SIZE_VER0 +#define CLONE_ARGS_SIZE_VER0 64 +#endif + #ifndef __NR_clone3 #define __NR_clone3 -1 struct clone_args { @@ -22,10 +34,13 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; +#define CLONE_ARGS_SIZE_VER1 80 __aligned_u64 set_tid; __aligned_u64 set_tid_size; +#define CLONE_ARGS_SIZE_VER2 88 + __aligned_u64 cgroup; }; -#endif +#endif /* __NR_clone3 */ static pid_t sys_clone3(struct clone_args *args, size_t size) { diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c index cd5e1f602ac9..909da9cdda97 100644 --- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c +++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c @@ -351,6 +351,7 @@ static int test_alloc_errors(char *heap_name) } printf("Expected error checking passed\n"); + ret = 0; out: if (dmabuf_fd >= 0) close(dmabuf_fd); diff --git a/tools/testing/selftests/drivers/.gitignore b/tools/testing/selftests/drivers/.gitignore index f6aebcc27b76..ca74f2e1c719 100644 --- a/tools/testing/selftests/drivers/.gitignore +++ b/tools/testing/selftests/drivers/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only /dma-buf/udmabuf diff --git a/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh index 5ba5bef44d5b..bdffe698e1d1 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh @@ -45,6 +45,7 @@ ALL_TESTS=" blackhole_ipv6 " NUM_NETIFS=4 +: ${TIMEOUT:=20000} # ms source $lib_dir/tc_common.sh source $lib_dir/lib.sh @@ -123,7 +124,7 @@ blackhole_ipv4() skip_hw dst_ip 198.51.100.1 src_ip 192.0.2.1 ip_proto icmp \ action pass - ip -4 route show 198.51.100.0/30 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip -4 route show 198.51.100.0/30 check_err $? "route not marked as offloaded when should" ping_do $h1 198.51.100.1 @@ -147,7 +148,7 @@ blackhole_ipv6() skip_hw dst_ip 2001:db8:2::1 src_ip 2001:db8:1::1 \ ip_proto icmpv6 action pass - ip -6 route show 2001:db8:2::/120 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip -6 route show 2001:db8:2::/120 check_err $? "route not marked as offloaded when should" ping6_do $h1 2001:db8:2::1 diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh new file mode 100755 index 000000000000..b32ba5fec59d --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap ACL drops functionality over mlxsw. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + ingress_flow_action_drop_test + egress_flow_action_drop_test +" +NUM_NETIFS=4 +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 +} + +h1_destroy() +{ + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 +} + +h2_destroy() +{ + simple_if_fini $h2 +} + +switch_create() +{ + ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0 + + ip link set dev $swp1 master br0 + ip link set dev $swp2 master br0 + + ip link set dev br0 up + ip link set dev $swp1 up + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 down + ip link set dev $swp1 down + + ip link del dev br0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h1mac=$(mac_get $h1) + h2mac=$(mac_get $h2) + + vrf_prepare + + h1_create + h2_create + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ingress_flow_action_drop_test() +{ + local mz_pid + + tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ + flower src_mac $h1mac action pass + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 \ + flower dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -d 1msec -q & + mz_pid=$! + + RET=0 + + devlink_trap_drop_test ingress_flow_action_drop $swp2 101 + + log_test "ingress_flow_action_drop" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 +} + +egress_flow_action_drop_test() +{ + local mz_pid + + tc filter add dev $swp2 egress protocol ip pref 2 handle 102 \ + flower src_mac $h1mac action pass + + tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ + flower dst_ip 192.0.2.2 action drop + + $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -d 1msec -q & + mz_pid=$! + + RET=0 + + devlink_trap_drop_test egress_flow_action_drop $swp2 102 + + log_test "egress_flow_action_drop" + + tc filter del dev $swp2 egress protocol ip pref 1 handle 101 flower + + devlink_trap_drop_cleanup $mz_pid $swp2 ip 2 102 +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh new file mode 100755 index 000000000000..a37273473c1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh @@ -0,0 +1,688 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap control trap functionality over mlxsw. Each registered +# control packet trap is tested to make sure it is triggered under the right +# conditions. +# +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | 2001:db8:1::1/64 | +# | | | +# | | default via 192.0.2.2 | +# | | default via 2001:db8:1::2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | 2001:db8:1::2/64 | +# | | +# | 2001:db8:2::2/64 | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | default via 2001:db8:2::2 | +# | | | +# | | 2001:db8:2::1/64 | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + stp_test + lacp_test + lldp_test + igmp_query_test + igmp_v1_report_test + igmp_v2_report_test + igmp_v3_report_test + igmp_v2_leave_test + mld_query_test + mld_v1_report_test + mld_v2_report_test + mld_v1_done_test + ipv4_dhcp_test + ipv6_dhcp_test + arp_request_test + arp_response_test + ipv6_neigh_solicit_test + ipv6_neigh_advert_test + ipv4_bfd_test + ipv6_bfd_test + ipv4_ospf_test + ipv6_ospf_test + ipv4_bgp_test + ipv6_bgp_test + ipv4_vrrp_test + ipv6_vrrp_test + ipv4_pim_test + ipv6_pim_test + uc_loopback_test + local_route_test + external_route_test + ipv6_uc_dip_link_local_scope_test + ipv4_router_alert_test + ipv6_router_alert_test + ipv6_dip_all_nodes_test + ipv6_dip_all_routers_test + ipv6_router_solicit_test + ipv6_router_advert_test + ipv6_redirect_test + ptp_event_test + ptp_general_test + flow_action_sample_test + flow_action_trap_test +" +NUM_NETIFS=4 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 + ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2 +} + +h1_destroy() +{ + ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2 + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 + ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2 +} + +h2_destroy() +{ + ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2 + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 + __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64 +} + +router_destroy() +{ + __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64 + __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + router_create +} + +cleanup() +{ + pre_cleanup + + router_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +stp_test() +{ + devlink_trap_stats_test "STP" "stp" $MZ $h1 -c 1 -t bpdu -q +} + +lacp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:02:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:09:"$( : ETH type + ) + echo $p +} + +lacp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LACP" "lacp" $MZ $h1 -c 1 \ + $(lacp_payload_get $h1mac) -p 100 -q +} + +lldp_payload_get() +{ + local source_mac=$1; shift + local p + + p=$(: + )"01:80:C2:00:00:0E:"$( : ETH daddr + )"$source_mac:"$( : ETH saddr + )"88:CC:"$( : ETH type + ) + echo $p +} + +lldp_test() +{ + local h1mac=$(mac_get $h1) + + devlink_trap_stats_test "LLDP" "lldp" $MZ $h1 -c 1 \ + $(lldp_payload_get $h1mac) -p 100 -q +} + +igmp_query_test() +{ + # IGMP (IP Protocol 2) Membership Query (Type 0x11) + devlink_trap_stats_test "IGMP Membership Query" "igmp_query" \ + $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=11 -p 100 -q +} + +igmp_v1_report_test() +{ + # IGMP (IP Protocol 2) Version 1 Membership Report (Type 0x12) + devlink_trap_stats_test "IGMP Version 1 Membership Report" \ + "igmp_v1_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=12 -p 100 -q +} + +igmp_v2_report_test() +{ + # IGMP (IP Protocol 2) Version 2 Membership Report (Type 0x16) + devlink_trap_stats_test "IGMP Version 2 Membership Report" \ + "igmp_v2_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=16 -p 100 -q +} + +igmp_v3_report_test() +{ + # IGMP (IP Protocol 2) Version 3 Membership Report (Type 0x22) + devlink_trap_stats_test "IGMP Version 3 Membership Report" \ + "igmp_v3_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \ + -A 192.0.2.1 -B 244.0.0.1 -t ip proto=2,p=22 -p 100 -q +} + +igmp_v2_leave_test() +{ + # IGMP (IP Protocol 2) Version 2 Leave Group (Type 0x17) + devlink_trap_stats_test "IGMP Version 2 Leave Group" \ + "igmp_v2_leave" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:02 \ + -A 192.0.2.1 -B 224.0.0.2 -t ip proto=2,p=17 -p 100 -q +} + +mld_payload_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"3A:"$( : Next Header - ICMPv6 + )"00:"$( : Hdr Ext Len + )"00:00:00:00:00:00:"$( : Options and Padding + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +mld_query_test() +{ + # MLD Multicast Listener Query (Type 130) + devlink_trap_stats_test "MLD Multicast Listener Query" "mld_query" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 130) -p 100 -q +} + +mld_v1_report_test() +{ + # MLD Version 1 Multicast Listener Report (Type 131) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Report" \ + "mld_v1_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 131) -p 100 -q +} + +mld_v2_report_test() +{ + # MLD Version 2 Multicast Listener Report (Type 143) + devlink_trap_stats_test "MLD Version 2 Multicast Listener Report" \ + "mld_v2_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 143) -p 100 -q +} + +mld_v1_done_test() +{ + # MLD Version 1 Multicast Listener Done (Type 132) + devlink_trap_stats_test "MLD Version 1 Multicast Listener Done" \ + "mld_v1_done" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \ + -t ip hop=1,next=0,payload=$(mld_payload_get 132) -p 100 -q +} + +ipv4_dhcp_test() +{ + devlink_trap_stats_test "IPv4 DHCP Port 67" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b bcast -A 0.0.0.0 -B 255.255.255.255 \ + -t udp sp=68,dp=67 -p 100 -q + + devlink_trap_stats_test "IPv4 DHCP Port 68" "ipv4_dhcp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -A 192.0.2.1 \ + -B 255.255.255.255 -t udp sp=67,dp=68 -p 100 -q +} + +ipv6_dhcp_test() +{ + devlink_trap_stats_test "IPv6 DHCP Port 547" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=546,dp=547 \ + -p 100 -q + + devlink_trap_stats_test "IPv6 DHCP Port 546" "ipv6_dhcp" \ + $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=547,dp=546 \ + -p 100 -q +} + +arp_request_test() +{ + devlink_trap_stats_test "ARP Request" "arp_request" \ + $MZ $h1 -c 1 -a own -b bcast -t arp request -p 100 -q +} + +arp_response_test() +{ + devlink_trap_stats_test "ARP Response" "arp_response" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -t arp reply -p 100 -q +} + +icmpv6_header_get() +{ + local type=$1; shift + local p + + type=$(printf "%x" $type) + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"00:"$( : ICMPv6.checksum + ) + echo $p +} + +ipv6_neigh_solicit_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Solicitation" \ + "ipv6_neigh_solicit" $MZ $h1 -6 -c 1 \ + -A fe80::1 -B ff02::1:ff00:02 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 135) -p 100 -q +} + +ipv6_neigh_advert_test() +{ + devlink_trap_stats_test "IPv6 Neighbour Advertisement" \ + "ipv6_neigh_advert" $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 136) -p 100 -q +} + +ipv4_bfd_test() +{ + devlink_trap_stats_test "IPv4 BFD Control - Port 3784" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv4 BFD Echo - Port 3785" "ipv4_bfd" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv6_bfd_test() +{ + devlink_trap_stats_test "IPv6 BFD Control - Port 3784" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3784 -p 100 -q + + devlink_trap_stats_test "IPv6 BFD Echo - Port 3785" "ipv6_bfd" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t udp sp=49153,dp=3785 -p 100 -q +} + +ipv4_ospf_test() +{ + devlink_trap_stats_test "IPv4 OSPF - Multicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:05 \ + -A 192.0.2.1 -B 224.0.0.5 -t ip proto=89 -p 100 -q + + devlink_trap_stats_test "IPv4 OSPF - Unicast" "ipv4_ospf" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=89 -p 100 -q +} + +ipv6_ospf_test() +{ + devlink_trap_stats_test "IPv6 OSPF - Multicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:05 \ + -A fe80::1 -B ff02::5 -t ip next=89 -p 100 -q + + devlink_trap_stats_test "IPv6 OSPF - Unicast" "ipv6_ospf" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t ip next=89 -p 100 -q +} + +ipv4_bgp_test() +{ + devlink_trap_stats_test "IPv4 BGP" "ipv4_bgp" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t tcp sp=54321,dp=179,flags=rst \ + -p 100 -q +} + +ipv6_bgp_test() +{ + devlink_trap_stats_test "IPv6 BGP" "ipv6_bgp" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::2 \ + -t tcp sp=54321,dp=179,flags=rst -p 100 -q +} + +ipv4_vrrp_test() +{ + devlink_trap_stats_test "IPv4 VRRP" "ipv4_vrrp" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:12 \ + -A 192.0.2.1 -B 224.0.0.18 -t ip proto=112 -p 100 -q +} + +ipv6_vrrp_test() +{ + devlink_trap_stats_test "IPv6 VRRP" "ipv6_vrrp" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:12 \ + -A fe80::1 -B ff02::12 -t ip next=112 -p 100 -q +} + +ipv4_pim_test() +{ + devlink_trap_stats_test "IPv4 PIM - Multicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:0d \ + -A 192.0.2.1 -B 224.0.0.13 -t ip proto=103 -p 100 -q + + devlink_trap_stats_test "IPv4 PIM - Unicast" "ipv4_pim" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.2 -t ip proto=103 -p 100 -q +} + +ipv6_pim_test() +{ + devlink_trap_stats_test "IPv6 PIM - Multicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:0d \ + -A fe80::1 -B ff02::d -t ip next=103 -p 100 -q + + devlink_trap_stats_test "IPv6 PIM - Unicast" "ipv6_pim" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 -t ip next=103 -p 100 -q +} + +uc_loopback_test() +{ + # Add neighbours to the fake destination IPs, so that the packets are + # routed in the device and not trapped due to an unresolved neighbour + # exception. + ip -4 neigh add 192.0.2.3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + ip -6 neigh add 2001:db8:1::3 lladdr 00:11:22:33:44:55 nud permanent \ + dev $rp1 + + devlink_trap_stats_test "IPv4 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 192.0.2.3 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Unicast Loopback" "uc_loopback" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 -t udp sp=54321,dp=12345 \ + -p 100 -q + + ip -6 neigh del 2001:db8:1::3 dev $rp1 + ip -4 neigh del 192.0.2.3 dev $rp1 +} + +local_route_test() +{ + # Use a fake source IP to prevent the trap from being triggered twice + # when the router sends back a port unreachable message. + devlink_trap_stats_test "IPv4 Local Route" "local_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.3 -B 192.0.2.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 Local Route" "local_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::3 -B 2001:db8:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q +} + +external_route_test() +{ + # Add a dummy device through which the incoming packets should be + # routed. + ip link add name dummy10 up type dummy + ip address add 203.0.113.1/24 dev dummy10 + ip -6 address add 2001:db8:10::1/64 dev dummy10 + + devlink_trap_stats_test "IPv4 External Route" "external_route" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 203.0.113.2 -t udp sp=54321,dp=12345 -p 100 -q + + devlink_trap_stats_test "IPv6 External Route" "external_route" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:10::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 address del 2001:db8:10::1/64 dev dummy10 + ip address del 203.0.113.1/24 dev dummy10 + ip link del dev dummy10 +} + +ipv6_uc_dip_link_local_scope_test() +{ + # Add a dummy link-local prefix route to allow the packet to be routed. + ip -6 route add fe80:1::/64 dev $rp2 + + devlink_trap_stats_test \ + "IPv6 Unicast Destination IP With Link-Local Scope" \ + "ipv6_uc_dip_link_local_scope" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B fe80:1::2 -t udp sp=54321,sp=12345 \ + -p 100 -q + + ip -6 route del fe80:1::/64 dev $rp2 +} + +ipv4_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv4#Options + p=$(: + )"94:"$( : Option Number + )"04:"$( : Option Length + )"00:00:"$( : Option Data + ) + echo $p +} + +ipv4_router_alert_test() +{ + devlink_trap_stats_test "IPv4 Router Alert" "ipv4_router_alert" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.3 \ + -t ip option=$(ipv4_router_alert_get) -p 100 -q +} + +ipv6_router_alert_get() +{ + local p + + # https://en.wikipedia.org/wiki/IPv6_packet#Hop-by-hop_options_and_destination_options + # https://tools.ietf.org/html/rfc2711#section-2.1 + p=$(: + )"11:"$( : Next Header - UDP + )"00:"$( : Hdr Ext Len + )"05:02:00:00:00:00:"$( : Option Data + ) + echo $p +} + +ipv6_router_alert_test() +{ + devlink_trap_stats_test "IPv6 Router Alert" "ipv6_router_alert" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A 2001:db8:1::1 -B 2001:db8:1::3 \ + -t ip next=0,payload=$(ipv6_router_alert_get) -p 100 -q +} + +ipv6_dip_all_nodes_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Nodes Address\"" \ + "ipv6_dip_all_nodes" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A 2001:db8:1::1 -B ff02::1 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_dip_all_routers_test() +{ + devlink_trap_stats_test "IPv6 Destination IP \"All Routers Address\"" \ + "ipv6_dip_all_routers" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A 2001:db8:1::1 -B ff02::2 -t udp sp=12345,dp=54321 -p 100 -q +} + +ipv6_router_solicit_test() +{ + devlink_trap_stats_test "IPv6 Router Solicitation" \ + "ipv6_router_solicit" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \ + -A fe80::1 -B ff02::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 133) -p 100 -q +} + +ipv6_router_advert_test() +{ + devlink_trap_stats_test "IPv6 Router Advertisement" \ + "ipv6_router_advert" \ + $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \ + -A fe80::1 -B ff02::1 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 134) -p 100 -q +} + +ipv6_redirect_test() +{ + devlink_trap_stats_test "IPv6 Redirect Message" \ + "ipv6_redirect" \ + $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \ + -A fe80::1 -B 2001:db8:1::2 \ + -t ip hop=1,next=58,payload=$(icmpv6_header_get 137) -p 100 -q +} + +ptp_event_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Sync (0) + devlink_trap_stats_test "PTP Time-Critical Event Message" "ptp_event" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=319,payload=10 -p 100 -q +} + +ptp_general_test() +{ + # PTP is only supported on Spectrum-1, for now. + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return + + # PTP Announce (b) + devlink_trap_stats_test "PTP General Message" "ptp_general" \ + $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \ + -A 192.0.2.1 -B 224.0.1.129 \ + -t udp sp=12345,dp=320,payload=1b -p 100 -q +} + +flow_action_sample_test() +{ + # Install a filter that samples every incoming packet. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto all pref 1 handle 101 matchall \ + skip_sw action sample rate 1 group 1 + + devlink_trap_stats_test "Flow Sampling" "flow_action_sample" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto all pref 1 handle 101 matchall + tc qdisc del dev $rp1 clsact +} + +flow_action_trap_test() +{ + # Install a filter that traps a specific flow. + tc qdisc add dev $rp1 clsact + tc filter add dev $rp1 ingress proto ip pref 1 handle 101 flower \ + skip_sw ip_proto udp src_port 12345 dst_port 54321 action trap + + devlink_trap_stats_test "Flow Trapping (Logging)" "flow_action_trap" \ + $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \ + -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q + + tc filter del dev $rp1 ingress proto ip pref 1 handle 101 flower + tc qdisc del dev $rp1 clsact +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh index 58cdbfb608e9..a4c2812e9807 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh @@ -96,7 +96,6 @@ source_mac_is_multicast_test() { local trap_name="source_mac_is_multicast" local smac=01:02:03:04:05:06 - local group_name="l2_drops" local mz_pid tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \ @@ -107,18 +106,17 @@ source_mac_is_multicast_test() RET=0 - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 log_test "Source MAC is multicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } __vlan_tag_mismatch_test() { local trap_name="vlan_tag_mismatch" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local opt=$1; shift local mz_pid @@ -132,7 +130,7 @@ __vlan_tag_mismatch_test() $MZ $h1 "$opt" -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Add PVID and make sure packets are no longer dropped. bridge vlan add vid 1 dev $swp1 pvid untagged master @@ -140,7 +138,7 @@ __vlan_tag_mismatch_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -148,7 +146,7 @@ __vlan_tag_mismatch_test() devlink_trap_action_set $trap_name "drop" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } vlan_tag_mismatch_untagged_test() @@ -179,7 +177,6 @@ ingress_vlan_filter_test() { local trap_name="ingress_vlan_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid local vid=10 @@ -193,7 +190,7 @@ ingress_vlan_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Add the VLAN on the bridge port and make sure packets are no longer # dropped. @@ -202,7 +199,7 @@ ingress_vlan_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -212,7 +209,7 @@ ingress_vlan_filter_test() log_test "Ingress VLAN filter" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 bridge vlan del vid $vid dev $swp1 master bridge vlan del vid $vid dev $swp2 master @@ -222,7 +219,6 @@ __ingress_stp_filter_test() { local trap_name="ingress_spanning_tree_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local state=$1; shift local mz_pid local vid=20 @@ -237,7 +233,7 @@ __ingress_stp_filter_test() $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Change STP state to forwarding and make sure packets are no longer # dropped. @@ -246,7 +242,7 @@ __ingress_stp_filter_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -254,7 +250,7 @@ __ingress_stp_filter_test() devlink_trap_action_set $trap_name "drop" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 bridge vlan del vid $vid dev $swp1 master bridge vlan del vid $vid dev $swp2 master @@ -292,7 +288,6 @@ port_list_is_empty_uc_test() { local trap_name="port_list_is_empty" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Disable unicast flooding on both ports, so that packets cannot egress @@ -308,7 +303,7 @@ port_list_is_empty_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave flood on @@ -316,7 +311,7 @@ port_list_is_empty_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -326,7 +321,7 @@ port_list_is_empty_uc_test() log_test "Port list is empty - unicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 ip link set dev $swp1 type bridge_slave flood on } @@ -335,7 +330,6 @@ port_list_is_empty_mc_test() { local trap_name="port_list_is_empty" local dmac=01:00:5e:00:00:01 - local group_name="l2_drops" local dip=239.0.0.1 local mz_pid @@ -354,7 +348,7 @@ port_list_is_empty_mc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded to one port. ip link set dev $swp2 type bridge_slave mcast_flood on @@ -362,7 +356,7 @@ port_list_is_empty_mc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -372,7 +366,7 @@ port_list_is_empty_mc_test() log_test "Port list is empty - multicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 ip link set dev $swp1 type bridge_slave mcast_flood on } @@ -387,7 +381,6 @@ port_loopback_filter_uc_test() { local trap_name="port_loopback_filter" local dmac=de:ad:be:ef:13:37 - local group_name="l2_drops" local mz_pid # Make sure packets can only egress the input port. @@ -401,7 +394,7 @@ port_loopback_filter_uc_test() $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp2 + devlink_trap_drop_test $trap_name $swp2 101 # Allow packets to be flooded. ip link set dev $swp2 type bridge_slave flood on @@ -409,7 +402,7 @@ port_loopback_filter_uc_test() devlink_trap_stats_idle_test $trap_name check_err $? "Trap stats not idle when packets should not be dropped" - devlink_trap_group_stats_idle_test $group_name + devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name) check_err $? "Trap group stats not idle with when packets should not be dropped" tc_check_packets "dev $swp2 egress" 101 0 @@ -419,7 +412,7 @@ port_loopback_filter_uc_test() log_test "Port loopback filter - unicast" - devlink_trap_drop_cleanup $mz_pid $swp2 ip + devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 } port_loopback_filter_test() diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index d88d8e47d11b..f5abb1ebd392 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -161,7 +161,6 @@ ping_check() non_ip_test() { local trap_name="non_ip" - local group_name="l3_drops" local mz_pid RET=0 @@ -176,11 +175,11 @@ non_ip_test() 00:00 de:ad:be:ef" & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Non IP" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } __uc_dip_over_mc_dmac_test() @@ -190,7 +189,6 @@ __uc_dip_over_mc_dmac_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="uc_dip_over_mc_dmac" - local group_name="l3_drops" local dmac=01:02:03:04:05:06 local mz_pid @@ -206,11 +204,11 @@ __uc_dip_over_mc_dmac_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Unicast destination IP over multicast destination MAC: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } uc_dip_over_mc_dmac_test() @@ -227,7 +225,6 @@ __sip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -242,11 +239,11 @@ __sip_is_loopback_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is loopback address: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } sip_is_loopback_test() @@ -262,7 +259,6 @@ __dip_is_loopback_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="dip_is_loopback_address" - local group_name="l3_drops" local mz_pid RET=0 @@ -277,11 +273,11 @@ __dip_is_loopback_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Destination IP is loopback address: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } dip_is_loopback_test() @@ -298,7 +294,6 @@ __sip_is_mc_test() local dip=$1; shift local flags=${1:-""}; shift local trap_name="sip_is_mc" - local group_name="l3_drops" local mz_pid RET=0 @@ -313,11 +308,11 @@ __sip_is_mc_test() -b $rp1mac -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Source IP is multicast: $desc" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 } sip_is_mc_test() @@ -329,7 +324,6 @@ sip_is_mc_test() ipv4_sip_is_limited_bc_test() { local trap_name="ipv4_sip_is_limited_bc" - local group_name="l3_drops" local sip=255.255.255.255 local mz_pid @@ -345,11 +339,11 @@ ipv4_sip_is_limited_bc_test() -B $h2_ipv4 -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv4 source IP is limited broadcast" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ipv4_payload_get() @@ -382,7 +376,6 @@ __ipv4_header_corrupted_test() local ihl=$1; shift local checksum=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -399,11 +392,11 @@ __ipv4_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv4" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ipv6_payload_get() @@ -429,7 +422,6 @@ __ipv6_header_corrupted_test() local desc=$1; shift local ipver=$1; shift local trap_name="ip_header_corrupted" - local group_name="l3_drops" local payload local mz_pid @@ -446,11 +438,11 @@ __ipv6_header_corrupted_test() $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IP header corrupted: $desc: IPv6" - devlink_trap_drop_cleanup $mz_pid $rp2 "ip" + devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101 } ip_header_corrupted_test() @@ -469,7 +461,6 @@ ip_header_corrupted_test() ipv6_mc_dip_reserved_scope_test() { local trap_name="ipv6_mc_dip_reserved_scope" - local group_name="l3_drops" local dip=FF00:: local mz_pid @@ -485,17 +476,16 @@ ipv6_mc_dip_reserved_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP reserved scope" - devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" + devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101 } ipv6_mc_dip_interface_local_scope_test() { local trap_name="ipv6_mc_dip_interface_local_scope" - local group_name="l3_drops" local dip=FF01:: local mz_pid @@ -511,11 +501,11 @@ ipv6_mc_dip_interface_local_scope_test() "33:33:00:00:00:00" -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "IPv6 multicast destination IP interface-local scope" - devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" + devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101 } __blackhole_route_test() @@ -526,7 +516,6 @@ __blackhole_route_test() local dip=$1; shift local ip_proto=${1:-"icmp"}; shift local trap_name="blackhole_route" - local group_name="l3_drops" local mz_pid RET=0 @@ -542,10 +531,10 @@ __blackhole_route_test() -B $dip -d 1msec -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $rp2 + devlink_trap_drop_test $trap_name $rp2 101 log_test "Blackhole route: IPv$flags" - devlink_trap_drop_cleanup $mz_pid $rp2 $proto + devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101 ip -$flags route del blackhole $subnet } @@ -558,7 +547,6 @@ blackhole_route_test() irif_disabled_test() { local trap_name="irif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid @@ -613,7 +601,6 @@ irif_disabled_test() erif_disabled_test() { local trap_name="erif_disabled" - local group_name="l3_drops" local t0_packets t0_bytes local t1_packets t1_bytes local mz_pid @@ -641,13 +628,9 @@ erif_disabled_test() mz_pid=$! sleep 5 - # In order to see this trap we need a route that points to disabled RIF. - # When ipv6 address is flushed, there is a delay and the routes are - # deleted before the RIF and we cannot get state that we have route - # to disabled RIF. - # Delete IPv6 address first and then check this trap with flushing IPv4. - ip -6 add flush dev br0 - ip -4 add flush dev br0 + # Unlinking the port from the bridge will disable the RIF associated + # with br0 as it is no longer an upper of any mlxsw port. + ip link set dev $rp1 nomaster t1_packets=$(devlink_trap_rx_packets_get $trap_name) t1_bytes=$(devlink_trap_rx_bytes_get $trap_name) @@ -659,7 +642,6 @@ erif_disabled_test() log_test "Egress RIF disabled" kill $mz_pid && wait $mz_pid &> /dev/null - ip link set dev $rp1 nomaster __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 ip link del dev br0 type bridge devlink_trap_action_set $trap_name "drop" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 2bc6df42d597..1fedfc9da434 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -169,7 +169,6 @@ trap_action_check() mtu_value_is_too_small_test() { local trap_name="mtu_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -191,7 +190,7 @@ mtu_value_is_too_small_test() -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "Packets were not received to h1" @@ -208,7 +207,6 @@ __ttl_value_is_too_small_test() { local ttl_val=$1; shift local trap_name="ttl_value_is_too_small" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -227,7 +225,7 @@ __ttl_value_is_too_small_test() -b $rp1mac -B 198.51.100.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "Packets were not received to h1" @@ -271,7 +269,6 @@ __mc_reverse_path_forwarding_test() local proto=$1; shift local flags=${1:-""}; shift local trap_name="mc_reverse_path_forwarding" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -292,7 +289,7 @@ __mc_reverse_path_forwarding_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $rp2 egress" 101 0 check_err $? "Packets were not dropped" @@ -322,7 +319,6 @@ __reject_route_test() local unreachable=$1; shift local flags=${1:-""}; shift local trap_name="reject_route" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -341,7 +337,7 @@ __reject_route_test() -B $dst_ip -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets_hitting "dev $h1 ingress" 101 check_err $? "ICMP packet was not received to h1" @@ -370,7 +366,6 @@ __host_miss_test() local desc=$1; shift local dip=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -405,7 +400,6 @@ __invalid_nexthop_test() local subnet=$1; shift local via_add=$1; shift local trap_name="unresolved_neigh" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -494,7 +488,6 @@ vrf_without_routes_destroy() ipv4_lpm_miss_test() { local trap_name="ipv4_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -511,7 +504,7 @@ ipv4_lpm_miss_test() -B 203.0.113.1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv4" @@ -522,7 +515,6 @@ ipv4_lpm_miss_test() ipv6_lpm_miss_test() { local trap_name="ipv6_lpm_miss" - local group_name="l3_drops" local expected_action="trap" local mz_pid @@ -539,7 +531,7 @@ ipv6_lpm_miss_test() -B 2001:db8::1 -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name log_test "LPM miss: IPv6" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh new file mode 100755 index 000000000000..47edf099a17e --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh @@ -0,0 +1,384 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test devlink-trap policer functionality over mlxsw. + +# +---------------------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/24 | +# | | | +# | | default via 192.0.2.2 | +# +----|----------------------------+ +# | +# +----|----------------------------------------------------------------------+ +# | SW | | +# | + $rp1 | +# | 192.0.2.2/24 | +# | | +# | 198.51.100.2/24 | +# | + $rp2 | +# | | | +# +----|----------------------------------------------------------------------+ +# | +# +----|----------------------------+ +# | | default via 198.51.100.2 | +# | | | +# | | 198.51.100.1/24 | +# | + $h2 | +# | H2 (vrf) | +# +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + rate_limits_test + burst_limits_test + rate_test + burst_test +" +NUM_NETIFS=4 +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 + mtu_set $h1 10000 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + mtu_restore $h1 + simple_if_fini $h1 192.0.2.1/24 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/24 + mtu_set $h2 10000 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 +} + +h2_destroy() +{ + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + mtu_restore $h2 + simple_if_fini $h2 198.51.100.1/24 +} + +router_create() +{ + ip link set dev $rp1 up + ip link set dev $rp2 up + + __addr_add_del $rp1 add 192.0.2.2/24 + __addr_add_del $rp2 add 198.51.100.2/24 + mtu_set $rp1 10000 + mtu_set $rp2 10000 + + ip -4 route add blackhole 198.51.100.100 + + devlink trap set $DEVLINK_DEV trap blackhole_route action trap +} + +router_destroy() +{ + devlink trap set $DEVLINK_DEV trap blackhole_route action drop + + ip -4 route del blackhole 198.51.100.100 + + mtu_restore $rp2 + mtu_restore $rp1 + __addr_add_del $rp2 del 198.51.100.2/24 + __addr_add_del $rp1 del 192.0.2.2/24 + + ip link set dev $rp2 down + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1_mac=$(mac_get $rp1) + + vrf_prepare + + h1_create + h2_create + + router_create +} + +cleanup() +{ + pre_cleanup + + router_destroy + + h2_destroy + h1_destroy + + vrf_cleanup + + # Reload to ensure devlink-trap settings are back to default. + devlink_reload +} + +rate_limits_test() +{ + RET=0 + + devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null + check_fail $? "Policer rate was changed to rate lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 \ + rate 2000000001 &> /dev/null + check_fail $? "Policer rate was changed to rate higher than limit" + + devlink trap policer set $DEVLINK_DEV policer 1 rate 1 + check_err $? "Failed to set policer rate to minimum" + devlink trap policer set $DEVLINK_DEV policer 1 rate 2000000000 + check_err $? "Failed to set policer rate to maximum" + + log_test "Trap policer rate limits" +} + +burst_limits_test() +{ + RET=0 + + devlink trap policer set $DEVLINK_DEV policer 1 burst 0 &> /dev/null + check_fail $? "Policer burst size was changed to 0" + devlink trap policer set $DEVLINK_DEV policer 1 burst 17 &> /dev/null + check_fail $? "Policer burst size was changed to burst size that is not power of 2" + devlink trap policer set $DEVLINK_DEV policer 1 burst 8 &> /dev/null + check_fail $? "Policer burst size was changed to burst size lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 \ + burst $((2**25)) &> /dev/null + check_fail $? "Policer burst size was changed to burst size higher than limit" + + devlink trap policer set $DEVLINK_DEV policer 1 burst 16 + check_err $? "Failed to set policer burst size to minimum" + devlink trap policer set $DEVLINK_DEV policer 1 burst $((2**24)) + check_err $? "Failed to set policer burst size to maximum" + + log_test "Trap policer burst size limits" +} + +trap_rate_get() +{ + local t0 t1 + + t0=$(devlink_trap_rx_packets_get blackhole_route) + sleep 10 + t1=$(devlink_trap_rx_packets_get blackhole_route) + + echo $(((t1 - t0) / 10)) +} + +policer_drop_rate_get() +{ + local id=$1; shift + local t0 t1 + + t0=$(devlink_trap_policer_rx_dropped_get $id) + sleep 10 + t1=$(devlink_trap_policer_rx_dropped_get $id) + + echo $(((t1 - t0) / 10)) +} + +__rate_test() +{ + local rate pct drop_rate + local id=$1; shift + + RET=0 + + devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 16 + devlink trap group set $DEVLINK_DEV group l3_drops policer $id + + # Send packets at highest possible rate and make sure they are dropped + # by the policer. Make sure measured received rate is about 1000 pps + log_info "=== Tx rate: Highest, Policer rate: 1000 pps ===" + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac + + sleep 5 # Take measurements when rate is stable + + rate=$(trap_rate_get) + pct=$((100 * (rate - 1000) / 1000)) + ((-5 <= pct && pct <= 5)) + check_err $? "Expected rate 1000 pps, got $rate pps, which is $pct% off. Required accuracy is +-5%" + log_info "Expected rate 1000 pps, measured rate $rate pps" + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate > 0 )) + check_err $? "Expected non-zero policer drop rate, got 0" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + # Send packets at a rate of 1000 pps and make sure they are not dropped + # by the policer + log_info "=== Tx rate: 1000 pps, Policer rate: 1000 pps ===" + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -d 1msec + + sleep 5 # Take measurements when rate is stable + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate == 0 )) + check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + # Unbind the policer and send packets at highest possible rate. Make + # sure they are not dropped by the policer and that the measured + # received rate is higher than 1000 pps + log_info "=== Tx rate: Highest, Policer rate: No policer ===" + + devlink trap group set $DEVLINK_DEV group l3_drops nopolicer + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac + + rate=$(trap_rate_get) + (( rate > 1000 )) + check_err $? "Expected rate higher than 1000 pps, got $rate pps" + log_info "Measured rate $rate pps" + + drop_rate=$(policer_drop_rate_get $id) + (( drop_rate == 0 )) + check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps" + log_info "Measured policer drop rate of $drop_rate pps" + + stop_traffic + + log_test "Trap policer rate" +} + +rate_test() +{ + local id + + for id in $(devlink_trap_policer_ids_get); do + echo + log_info "Running rate test for policer $id" + __rate_test $id + done +} + +__burst_test() +{ + local t0_rx t0_drop t1_rx t1_drop rx drop + local id=$1; shift + + RET=0 + + devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 32 + devlink trap group set $DEVLINK_DEV group l3_drops policer $id + + # Send a burst of 64 packets and make sure that about 32 are received + # and the rest are dropped by the policer + log_info "=== Tx burst size: 64, Policer burst size: 32 pps ===" + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 64 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + pct=$((100 * (rx - 32) / 32)) + ((-20 <= pct && pct <= 20)) + check_err $? "Expected burst size of 32 packets, got $rx packets, which is $pct% off. Required accuracy is +-20%" + log_info "Expected burst size of 32 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop > 0 )) + check_err $? "Expected non-zero policer drops, got 0" + log_info "Measured policer drops of $drop packets" + + # Send a burst of 16 packets and make sure that 16 are received + # and that none are dropped by the policer + log_info "=== Tx burst size: 16, Policer burst size: 32 pps ===" + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 16 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + (( rx == 16 )) + check_err $? "Expected burst size of 16 packets, got $rx packets" + log_info "Expected burst size of 16 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop == 0 )) + check_err $? "Expected zero policer drops, got $drop" + log_info "Measured policer drops of $drop packets" + + # Unbind the policer and send a burst of 64 packets. Make sure that + # 64 packets are received and that none are dropped by the policer + log_info "=== Tx burst size: 64, Policer burst size: No policer ===" + + devlink trap group set $DEVLINK_DEV group l3_drops nopolicer + + t0_rx=$(devlink_trap_rx_packets_get blackhole_route) + t0_drop=$(devlink_trap_policer_rx_dropped_get $id) + + start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 64 + + t1_rx=$(devlink_trap_rx_packets_get blackhole_route) + t1_drop=$(devlink_trap_policer_rx_dropped_get $id) + + rx=$((t1_rx - t0_rx)) + (( rx == 64 )) + check_err $? "Expected burst size of 64 packets, got $rx packets" + log_info "Expected burst size of 64 packets, measured burst size of $rx packets" + + drop=$((t1_drop - t0_drop)) + (( drop == 0 )) + check_err $? "Expected zero policer drops, got $drop" + log_info "Measured policer drops of $drop packets" + + log_test "Trap policer burst size" +} + +burst_test() +{ + local id + + for id in $(devlink_trap_policer_ids_get); do + echo + log_info "Running burst size test for policer $id" + __burst_test $id + done +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh index 039629bb92a3..8817851da7a9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh @@ -140,7 +140,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -161,7 +160,7 @@ ecn_decap_test() mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -200,7 +199,6 @@ ipip_payload_get() no_matching_tunnel_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local sip=$1; shift local mz_pid @@ -218,7 +216,7 @@ no_matching_tunnel_test() -A $sip -B 192.0.2.65 -t ip len=48,proto=47,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh index fd19161dd4ec..10e0f3dbc930 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -159,7 +159,6 @@ ecn_payload_get() ecn_decap_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local ecn_desc=$1; shift local outer_tos=$1; shift @@ -177,7 +176,7 @@ ecn_decap_test() -t udp sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -228,7 +227,6 @@ short_payload_get() corrupted_packet_test() { local trap_name="decap_error" - local group_name="tunnel_drops" local desc=$1; shift local payload_get=$1; shift local mz_pid @@ -246,7 +244,7 @@ corrupted_packet_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_exception_test $trap_name $group_name + devlink_trap_exception_test $trap_name tc_check_packets "dev $swp1 egress" 101 0 check_err $? "Packets were not dropped" @@ -297,7 +295,6 @@ mc_smac_payload_get() overlay_smac_is_mc_test() { local trap_name="overlay_smac_is_mc" - local group_name="tunnel_drops" local mz_pid RET=0 @@ -314,11 +311,11 @@ overlay_smac_is_mc_test() -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q & mz_pid=$! - devlink_trap_drop_test $trap_name $group_name $swp1 + devlink_trap_drop_test $trap_name $swp1 101 log_test "Overlay source MAC is multicast" - devlink_trap_drop_cleanup $mz_pid $swp1 "ip" + devlink_trap_drop_cleanup $mz_pid $swp1 "ip" 1 101 } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/mlxsw/extack.sh b/tools/testing/selftests/drivers/net/mlxsw/extack.sh index d72d8488a3b2..7a0a99c1d22f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/extack.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/extack.sh @@ -8,7 +8,8 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS=" netdev_pre_up_test vxlan_vlan_add_test - port_vlan_add_test + vxlan_bridge_create_test + bridge_create_test " NUM_NETIFS=2 source $lib_dir/lib.sh @@ -106,32 +107,56 @@ vxlan_vlan_add_test() ip link del dev br1 } -port_vlan_add_test() +vxlan_bridge_create_test() { RET=0 - ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0 - # Unsupported configuration: mlxsw demands VXLAN with "noudpcsum". ip link add name vx1 up type vxlan id 1000 \ local 192.0.2.17 remote 192.0.2.18 \ dstport 4789 tos inherit ttl 100 - ip link set dev $swp1 master br1 - check_err $? - - bridge vlan del dev $swp1 vid 1 + # Test with VLAN-aware bridge. + ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0 ip link set dev vx1 master br1 + + ip link set dev $swp1 master br1 2>&1 > /dev/null \ + | grep -q mlxsw_spectrum check_err $? - bridge vlan add dev $swp1 vid 1 pvid untagged 2>&1 >/dev/null \ + # Test with VLAN-unaware bridge. + ip link set dev br1 type bridge vlan_filtering 0 + + ip link set dev $swp1 master br1 2>&1 > /dev/null \ | grep -q mlxsw_spectrum check_err $? - log_test "extack - map VLAN at port" + log_test "extack - bridge creation with VXLAN" + ip link del dev br1 ip link del dev vx1 +} + +bridge_create_test() +{ + RET=0 + + ip link add name br1 up type bridge vlan_filtering 1 + ip link add name br2 up type bridge vlan_filtering 1 + + ip link set dev $swp1 master br1 + check_err $? + + # Only one VLAN-aware bridge is supported, so this should fail with + # an extack. + ip link set dev $swp2 master br2 2>&1 > /dev/null \ + | grep -q mlxsw_spectrum + check_err $? + + log_test "extack - multiple VLAN-aware bridges creation" + + ip link del dev br2 ip link del dev br1 } diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh new file mode 100644 index 000000000000..cbe50f260a40 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +############################################################################## +# Defines + +if [[ ! -v MLXSW_CHIP ]]; then + MLXSW_CHIP=$(devlink -j dev info $DEVLINK_DEV | jq -r '.[][]["driver"]') + if [ -z "$MLXSW_CHIP" ]; then + echo "SKIP: Device $DEVLINK_DEV doesn't support devlink info command" + exit 1 + fi +fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh index eff6393ce974..71066bc4b886 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh @@ -114,23 +114,12 @@ ping_ipv4() ping_test $h1 192.0.2.2 } -wait_for_packets() -{ - local t0=$1; shift - local prio_observe=$1; shift - - local t1=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) - local delta=$((t1 - t0)) - echo $delta - ((delta >= 10)) -} - __test_defprio() { local prio_install=$1; shift local prio_observe=$1; shift - local delta local key + local t1 local i RET=0 @@ -139,9 +128,10 @@ __test_defprio() local t0=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) mausezahn -q $h1 -d 100m -c 10 -t arp reply - delta=$(busywait "$HIT_TIMEOUT" wait_for_packets $t0 $prio_observe) + t1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((t0 + 10))" \ + ethtool_stats_get $swp1 rx_frames_prio_$prio_observe) - check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $delta." + check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $((t1 - t0))." log_test "Default priority $prio_install/$prio_observe" defprio_uninstall $swp1 $prio_install diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh index c745ce3befee..4cb2aa65278a 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh @@ -31,6 +31,7 @@ ALL_TESTS=" ping_ipv4 test_update test_no_update + test_pedit_norewrite test_dscp_leftover " @@ -56,6 +57,11 @@ zero() echo 0 } +three() +{ + echo 3 +} + h1_create() { simple_if_init $h1 192.0.2.1/28 @@ -103,6 +109,9 @@ switch_create() simple_if_init $swp1 192.0.2.2/28 __simple_if_init $swp2 v$swp1 192.0.2.17/28 + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact + lldptool -T -i $swp1 -V APP $(dscp_map 0) >/dev/null lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null lldpad_app_wait_set $swp1 @@ -115,6 +124,9 @@ switch_destroy() lldptool -T -i $swp1 -V APP -d $(dscp_map 0) >/dev/null lldpad_app_wait_del + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + __simple_if_fini $swp2 192.0.2.17/28 simple_if_fini $swp1 192.0.2.2/28 } @@ -223,18 +235,36 @@ __test_update() test_update() { + echo "Test net.ipv4.ip_forward_update_priority=1" __test_update 1 reprioritize } test_no_update() { + echo "Test net.ipv4.ip_forward_update_priority=0" __test_update 0 echo } +# Test that when DSCP is updated in pedit, the DSCP rewrite is turned off. +test_pedit_norewrite() +{ + echo "Test no DSCP rewrite after DSCP is updated by pedit" + + tc filter add dev $swp1 ingress handle 101 pref 1 prot ip flower \ + action pedit ex munge ip dsfield set $((3 << 2)) retain 0xfc \ + action skbedit priority 3 + + __test_update 0 three + + tc filter del dev $swp1 ingress pref 1 +} + # Test that when the last APP rule is removed, the prio->DSCP map is properly # set to zeroes, and that the last APP rule does not stay active in the ASIC. test_dscp_leftover() { + echo "Test that last removed DSCP rule is deconfigured correctly" + lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null lldpad_app_wait_del diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh index 24dd8ed48580..b025daea062d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh @@ -300,7 +300,7 @@ test_uc_aware() local i for ((i = 0; i < attempts; ++i)); do - if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 0.1; then + if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then ((passes++)) fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh index d231649b4f01..e93878d42596 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh @@ -2,16 +2,15 @@ # SPDX-License-Identifier: GPL-2.0 ROUTER_NUM_NETIFS=4 +: ${TIMEOUT:=20000} # ms router_h1_create() { simple_if_init $h1 192.0.1.1/24 - ip route add 193.0.0.0/8 via 192.0.1.2 dev $h1 } router_h1_destroy() { - ip route del 193.0.0.0/8 via 192.0.1.2 dev $h1 simple_if_fini $h1 192.0.1.1/24 } @@ -64,13 +63,15 @@ router_setup_prepare() router_create } -router_offload_validate() +wait_for_routes() { - local route_count=$1 - local offloaded_count + local t0=$1; shift + local route_count=$1; shift - offloaded_count=$(ip route | grep -o 'offload' | wc -l) - [[ $offloaded_count -ge $route_count ]] + local t1=$(ip route | grep -o 'offload' | wc -l) + local delta=$((t1 - t0)) + echo $delta + [[ $delta -ge $route_count ]] } router_routes_create() @@ -90,8 +91,8 @@ router_routes_create() break 3 fi - echo route add 193.${i}.${j}.${k}/32 via \ - 192.0.2.1 dev $rp2 >> $ROUTE_FILE + echo route add 193.${i}.${j}.${k}/32 dev $rp2 \ + >> $ROUTE_FILE ((count++)) done done @@ -111,45 +112,19 @@ router_test() { local route_count=$1 local should_fail=$2 - local count=0 + local delta RET=0 + local t0=$(ip route | grep -o 'offload' | wc -l) router_routes_create $route_count + delta=$(busywait "$TIMEOUT" wait_for_routes $t0 $route_count) - router_offload_validate $route_count - check_err_fail $should_fail $? "Offload of $route_count routes" + check_err_fail $should_fail $? "Offload routes: Expected $route_count, got $delta." if [[ $RET -ne 0 ]] || [[ $should_fail -eq 1 ]]; then return fi - tc filter add dev $h2 ingress protocol ip pref 1 flower \ - skip_sw dst_ip 193.0.0.0/8 action drop - - for i in {0..255} - do - for j in {0..255} - do - for k in {0..255} - do - if [[ $count -eq $route_count ]]; then - break 3 - fi - - $MZ $h1 -c 1 -p 64 -a $h1mac -b $rp1mac \ - -A 192.0.1.1 -B 193.${i}.${j}.${k} \ - -t ip -q - ((count++)) - done - done - done - - tc_check_packets "dev $h2 ingress" 1 $route_count - check_err $? "Offload mismatch" - - tc filter del dev $h2 ingress protocol ip pref 1 flower \ - skip_sw dst_ip 193.0.0.0/8 action drop - router_routes_destroy } diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index 5c39e5f6a480..f4031002d5e9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -32,6 +32,7 @@ ALL_TESTS=" devlink_reload_test " NUM_NETIFS=2 +: ${TIMEOUT:=20000} # ms source $lib_dir/lib.sh source $lib_dir/devlink_lib.sh @@ -360,20 +361,24 @@ vlan_rif_refcount_test() ip link add link br0 name br0.10 up type vlan id 10 ip -6 address add 2001:db8:1::1/64 dev br0.10 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was not created before adding port to vlan" bridge vlan add vid 10 dev $swp1 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was destroyed after adding port to vlan" bridge vlan del vid 10 dev $swp1 - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 check_err $? "vlan rif was destroyed after removing port from vlan" ip link set dev $swp1 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 | grep -q offload - check_fail $? "vlan rif was not destroyed after unlinking port from bridge" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev br0.10 + check_err $? "vlan rif was not destroyed after unlinking port from bridge" log_test "vlan rif refcount" @@ -401,22 +406,28 @@ subport_rif_refcount_test() ip -6 address add 2001:db8:1::1/64 dev bond1 ip -6 address add 2001:db8:2::1/64 dev bond1.10 - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 check_err $? "subport rif was not created on lag device" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 check_err $? "subport rif was not created on vlan device" ip link set dev $swp1 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 check_err $? "subport rif of lag device was destroyed when should not" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 check_err $? "subport rif of vlan device was destroyed when should not" ip link set dev $swp2 nomaster - ip -6 route get fibmatch 2001:db8:1::2 dev bond1 | grep -q offload - check_fail $? "subport rif of lag device was not destroyed when should" - ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 | grep -q offload - check_fail $? "subport rif of vlan device was not destroyed when should" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:1::2 dev bond1 + check_err $? "subport rif of lag device was not destroyed when should" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10 + check_err $? "subport rif of vlan device was not destroyed when should" log_test "subport rif refcount" @@ -575,7 +586,8 @@ bridge_extern_learn_test() bridge fdb add de:ad:be:ef:13:37 dev $swp1 master extern_learn - bridge fdb show brport $swp1 | grep de:ad:be:ef:13:37 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + bridge fdb show brport $swp1 de:ad:be:ef:13:37 check_err $? "fdb entry not marked as offloaded when should" log_test "externally learned fdb entry" @@ -595,9 +607,11 @@ neigh_offload_test() ip -6 neigh add 2001:db8:1::2 lladdr de:ad:be:ef:13:37 nud perm \ dev $swp1 - ip -4 neigh show dev $swp1 | grep 192.0.2.2 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 neigh show dev $swp1 192.0.2.2 check_err $? "ipv4 neigh entry not marked as offloaded when should" - ip -6 neigh show dev $swp1 | grep 2001:db8:1::2 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 neigh show dev $swp1 2001:db8:1::2 check_err $? "ipv6 neigh entry not marked as offloaded when should" log_test "neighbour offload indication" @@ -623,25 +637,31 @@ nexthop_offload_test() ip -6 route add 2001:db8:2::/64 vrf v$swp1 \ nexthop via 2001:db8:1::2 dev $swp1 - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 check_err $? "ipv4 nexthop not marked as offloaded when should" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 check_err $? "ipv6 nexthop not marked as offloaded when should" ip link set dev $swp2 down sleep 1 - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload - check_fail $? "ipv4 nexthop marked as offloaded when should not" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload - check_fail $? "ipv6 nexthop marked as offloaded when should not" + busywait "$TIMEOUT" not wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 + check_err $? "ipv4 nexthop marked as offloaded when should not" + busywait "$TIMEOUT" not wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 + check_err $? "ipv6 nexthop marked as offloaded when should not" ip link set dev $swp2 up setup_wait - ip -4 route show 198.51.100.0/24 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -4 route show 198.51.100.0/24 vrf v$swp1 check_err $? "ipv4 nexthop not marked as offloaded after neigh add" - ip -6 route show 2001:db8:2::/64 vrf v$swp1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip -6 route show 2001:db8:2::/64 vrf v$swp1 check_err $? "ipv6 nexthop not marked as offloaded after neigh add" log_test "nexthop offload indication" diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh index c9fc4d4885c1..94c37124a840 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh @@ -56,11 +56,19 @@ switch_destroy() } # Callback from sch_ets_tests.sh -get_stats() +collect_stats() { - local band=$1; shift + local -a streams=("$@") + local stream - ethtool_stats_get "$h2" rx_octets_prio_$band + # Wait for qdisc counter update so that we don't get it mid-way through. + busywait_for_counter 1000 +1 \ + qdisc_parent_stats_get $swp2 10:$((${streams[0]} + 1)) .bytes \ + > /dev/null + + for stream in ${streams[@]}; do + qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes + done } bail_on_lldpad diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh new file mode 100644 index 000000000000..0d347d48c112 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh @@ -0,0 +1,533 @@ +# SPDX-License-Identifier: GPL-2.0 + +# This test sends a >1Gbps stream of traffic from H1, to the switch, which +# forwards it to a 1Gbps port. This 1Gbps stream is then looped back to the +# switch and forwarded to the port under test $swp3, which is also 1Gbps. +# +# This way, $swp3 should be 100% filled with traffic without any of it spilling +# to the backlog. Any extra packets sent should almost 1:1 go to backlog. That +# is what H2 is used for--it sends the extra traffic to create backlog. +# +# A RED Qdisc is installed on $swp3. The configuration is such that the minimum +# and maximum size are 1 byte apart, so there is a very clear border under which +# no marking or dropping takes place, and above which everything is marked or +# dropped. +# +# The test uses the buffer build-up behavior to test the installed RED. +# +# In order to test WRED, $swp3 actually contains RED under PRIO, with two +# different configurations. Traffic is prioritized using 802.1p and relies on +# the implicit mlxsw configuration, where packet priority is taken 1:1 from the +# 802.1p marking. +# +# +--------------------------+ +--------------------------+ +# | H1 | | H2 | +# | + $h1.10 | | + $h2.10 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | | $h1.11 + | | | $h2.11 + | +# | | 192.0.2.17/28 | | | | 192.0.2.18/28 | | +# | | | | | | | | +# | \______ ______/ | | \______ ______/ | +# | \ / | | \ / | +# | + $h1 | | + $h2 | +# +-------------|------------+ +-------------|------------+ +# | >1Gbps | +# +-------------|------------------------------------------------|------------+ +# | SW + $swp1 + $swp2 | +# | _______/ \___________ ___________/ \_______ | +# | / \ / \ | +# | +-|-----------------+ | +-|-----------------+ | | +# | | + $swp1.10 | | | + $swp2.10 | | | +# | | | | .-------------+ $swp5.10 | | | +# | | BR1_10 | | | | | | | +# | | | | | | BR2_10 | | | +# | | + $swp2.10 | | | | | | | +# | +-|-----------------+ | | | + $swp3.10 | | | +# | | | | +-|-----------------+ | | +# | | +-----------------|-+ | | +-----------------|-+ | +# | | | $swp1.11 + | | | | $swp2.11 + | | +# | | | | | .-----------------+ $swp5.11 | | +# | | | BR1_11 | | | | | | | +# | | | | | | | | BR2_11 | | +# | | | $swp2.11 + | | | | | | | +# | | +-----------------|-+ | | | | $swp3.11 + | | +# | | | | | | +-----------------|-+ | +# | \_______ ___________/ | | \___________ _______/ | +# | \ / \ / \ / | +# | + $swp4 + $swp5 + $swp3 | +# +-------------|----------------------|-------------------------|------------+ +# | | | 1Gbps +# \________1Gbps_________/ | +# +----------------------------|------------+ +# | H3 + $h3 | +# | _____________________/ \_______ | +# | / \ | +# | | | | +# | + $h3.10 $h3.11 + | +# | 192.0.2.3/28 192.0.2.19/28 | +# +-----------------------------------------+ + +NUM_NETIFS=8 +CHECK_TC="yes" +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh +source qos_lib.sh + +ipaddr() +{ + local host=$1; shift + local vlan=$1; shift + + echo 192.0.2.$((16 * (vlan - 10) + host)) +} + +host_create() +{ + local dev=$1; shift + local host=$1; shift + + simple_if_init $dev + mtu_set $dev 10000 + + vlan_create $dev 10 v$dev $(ipaddr $host 10)/28 + ip link set dev $dev.10 type vlan egress 0:0 + + vlan_create $dev 11 v$dev $(ipaddr $host 11)/28 + ip link set dev $dev.11 type vlan egress 0:1 +} + +host_destroy() +{ + local dev=$1; shift + + vlan_destroy $dev 11 + vlan_destroy $dev 10 + mtu_restore $dev + simple_if_fini $dev +} + +h1_create() +{ + host_create $h1 1 +} + +h1_destroy() +{ + host_destroy $h1 +} + +h2_create() +{ + host_create $h2 2 + + # Some of the tests in this suite use multicast traffic. As this traffic + # enters BR2_10 resp. BR2_11, it is flooded to all other ports. Thus + # e.g. traffic ingressing through $swp2 is flooded to $swp3 (the + # intended destination) and $swp5 (which is intended as ingress for + # another stream of traffic). + # + # This is generally not a problem, but if the $swp5 throughput is lower + # than $swp2 throughput, there will be a build-up at $swp5. That may + # cause packets to fail to queue up at $swp3 due to shared buffer + # quotas, and the test to spuriously fail. + # + # Prevent this by setting the speed of $h2 to 1Gbps. + + ethtool -s $h2 speed 1000 autoneg off +} + +h2_destroy() +{ + ethtool -s $h2 autoneg on + host_destroy $h2 +} + +h3_create() +{ + host_create $h3 3 + ethtool -s $h3 speed 1000 autoneg off +} + +h3_destroy() +{ + ethtool -s $h3 autoneg on + host_destroy $h3 +} + +switch_create() +{ + local intf + local vlan + + ip link add dev br1_10 type bridge + ip link add dev br1_11 type bridge + + ip link add dev br2_10 type bridge + ip link add dev br2_11 type bridge + + for intf in $swp1 $swp2 $swp3 $swp4 $swp5; do + ip link set dev $intf up + mtu_set $intf 10000 + done + + for intf in $swp1 $swp4; do + for vlan in 10 11; do + vlan_create $intf $vlan + ip link set dev $intf.$vlan master br1_$vlan + ip link set dev $intf.$vlan up + done + done + + for intf in $swp2 $swp3 $swp5; do + for vlan in 10 11; do + vlan_create $intf $vlan + ip link set dev $intf.$vlan master br2_$vlan + ip link set dev $intf.$vlan up + done + done + + ip link set dev $swp4.10 type vlan egress 0:0 + ip link set dev $swp4.11 type vlan egress 0:1 + for intf in $swp1 $swp2 $swp5; do + for vlan in 10 11; do + ip link set dev $intf.$vlan type vlan ingress 0:0 1:1 + done + done + + for intf in $swp2 $swp3 $swp4 $swp5; do + ethtool -s $intf speed 1000 autoneg off + done + + ip link set dev br1_10 up + ip link set dev br1_11 up + ip link set dev br2_10 up + ip link set dev br2_11 up + + local size=$(devlink_pool_size_thtype 0 | cut -d' ' -f 1) + devlink_port_pool_th_set $swp3 8 $size +} + +switch_destroy() +{ + local intf + local vlan + + devlink_port_pool_th_restore $swp3 8 + + tc qdisc del dev $swp3 root 2>/dev/null + + ip link set dev br2_11 down + ip link set dev br2_10 down + ip link set dev br1_11 down + ip link set dev br1_10 down + + for intf in $swp5 $swp4 $swp3 $swp2; do + ethtool -s $intf autoneg on + done + + for intf in $swp5 $swp3 $swp2 $swp4 $swp1; do + for vlan in 11 10; do + ip link set dev $intf.$vlan down + ip link set dev $intf.$vlan nomaster + vlan_destroy $intf $vlan + done + + mtu_restore $intf + ip link set dev $intf down + done + + ip link del dev br2_11 + ip link del dev br2_10 + ip link del dev br1_11 + ip link del dev br1_10 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + swp4=${NETIFS[p7]} + swp5=${NETIFS[p8]} + + h3_mac=$(mac_get $h3) + + vrf_prepare + + h1_create + h2_create + h3_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h3_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1.10 $(ipaddr 3 10) " from host 1, vlan 10" + ping_test $h1.11 $(ipaddr 3 11) " from host 1, vlan 11" + ping_test $h2.10 $(ipaddr 3 10) " from host 2, vlan 10" + ping_test $h2.11 $(ipaddr 3 11) " from host 2, vlan 11" +} + +get_tc() +{ + local vlan=$1; shift + + echo $((vlan - 10)) +} + +get_qdisc_handle() +{ + local vlan=$1; shift + + local tc=$(get_tc $vlan) + local band=$((8 - tc)) + + # Handle is 107: for TC1, 108: for TC0. + echo "10$band:" +} + +get_qdisc_backlog() +{ + local vlan=$1; shift + + qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .backlog +} + +get_mc_transmit_queue() +{ + local vlan=$1; shift + + local tc=$(($(get_tc $vlan) + 8)) + ethtool_stats_get $swp3 tc_transmit_queue_tc_$tc +} + +get_nmarked() +{ + local vlan=$1; shift + + ethtool_stats_get $swp3 ecn_marked +} + +get_qdisc_npackets() +{ + local vlan=$1; shift + + busywait_for_counter 1100 +1 \ + qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .packets +} + +# This sends traffic in an attempt to build a backlog of $size. Returns 0 on +# success. After 10 failed attempts it bails out and returns 1. It dumps the +# backlog size to stdout. +build_backlog() +{ + local vlan=$1; shift + local size=$1; shift + local proto=$1; shift + + local tc=$((vlan - 10)) + local band=$((8 - tc)) + local cur=-1 + local i=0 + + while :; do + local cur=$(busywait 1100 until_counter_is "> $cur" \ + get_qdisc_backlog $vlan) + local diff=$((size - cur)) + local pkts=$(((diff + 7999) / 8000)) + + if ((cur >= size)); then + echo $cur + return 0 + elif ((i++ > 10)); then + echo $cur + return 1 + fi + + $MZ $h2.$vlan -p 8000 -a own -b $h3_mac \ + -A $(ipaddr 2 $vlan) -B $(ipaddr 3 $vlan) \ + -t $proto -q -c $pkts "$@" + done +} + +check_marking() +{ + local vlan=$1; shift + local cond=$1; shift + + local npackets_0=$(get_qdisc_npackets $vlan) + local nmarked_0=$(get_nmarked $vlan) + sleep 5 + local npackets_1=$(get_qdisc_npackets $vlan) + local nmarked_1=$(get_nmarked $vlan) + + local nmarked_d=$((nmarked_1 - nmarked_0)) + local npackets_d=$((npackets_1 - npackets_0)) + local pct=$((100 * nmarked_d / npackets_d)) + + echo $pct + ((pct $cond)) +} + +ecn_test_common() +{ + local name=$1; shift + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + # Build the below-the-limit backlog using UDP. We could use TCP just + # fine, but this way we get a proof that UDP is accepted when queue + # length is below the limit. The main stream is using TCP, and if the + # limit is misconfigured, we would see this traffic being ECN marked. + RET=0 + backlog=$(build_backlog $vlan $((2 * limit / 3)) udp) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + log_test "TC $((vlan - 10)): $name backlog < limit" + + # Now push TCP, because non-TCP traffic would be early-dropped after the + # backlog crosses the limit, and we want to make sure that the backlog + # is above the limit. + RET=0 + backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan ">= 95") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95." + log_test "TC $((vlan - 10)): $name backlog > limit" +} + +do_ecn_test() +{ + local vlan=$1; shift + local limit=$1; shift + local name=ECN + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + sleep 1 + + ecn_test_common "$name" $vlan $limit + + # Up there we saw that UDP gets accepted when backlog is below the + # limit. Now that it is above, it should all get dropped, and backlog + # building should fail. + RET=0 + build_backlog $vlan $((2 * limit)) udp >/dev/null + check_fail $? "UDP traffic went into backlog instead of being early-dropped" + log_test "TC $((vlan - 10)): $name backlog > limit: UDP early-dropped" + + stop_traffic + sleep 1 +} + +do_ecn_nodrop_test() +{ + local vlan=$1; shift + local limit=$1; shift + local name="ECN nodrop" + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + sleep 1 + + ecn_test_common "$name" $vlan $limit + + # Up there we saw that UDP gets accepted when backlog is below the + # limit. Now that it is above, in nodrop mode, make sure it goes to + # backlog as well. + RET=0 + build_backlog $vlan $((2 * limit)) udp >/dev/null + check_err $? "UDP traffic was early-dropped instead of getting into backlog" + log_test "TC $((vlan - 10)): $name backlog > limit: UDP not dropped" + + stop_traffic + sleep 1 +} + +do_red_test() +{ + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + # Use ECN-capable TCP to verify there's no marking even though the queue + # is above limit. + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ + $h3_mac tos=0x01 + + # Pushing below the queue limit should work. + RET=0 + backlog=$(build_backlog $vlan $((2 * limit / 3)) tcp tos=0x01) + check_err $? "Could not build the requested backlog" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + log_test "TC $((vlan - 10)): RED backlog < limit" + + # Pushing above should not. + RET=0 + backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01) + check_fail $? "Traffic went into backlog instead of being early-dropped" + pct=$(check_marking $vlan "== 0") + check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." + local diff=$((limit - backlog)) + pct=$((100 * diff / limit)) + ((0 <= pct && pct <= 5)) + check_err $? "backlog $backlog / $limit expected <= 5% distance" + log_test "TC $((vlan - 10)): RED backlog > limit" + + stop_traffic + sleep 1 +} + +do_mc_backlog_test() +{ + local vlan=$1; shift + local limit=$1; shift + local backlog + local pct + + RET=0 + + start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) bc + start_tcp_traffic $h2.$vlan $(ipaddr 2 $vlan) $(ipaddr 3 $vlan) bc + + qbl=$(busywait 5000 until_counter_is ">= 500000" \ + get_qdisc_backlog $vlan) + check_err $? "Could not build MC backlog" + + # Verify that we actually see the backlog on BUM TC. Do a busywait as + # well, performance blips might cause false fail. + local ebl + ebl=$(busywait 5000 until_counter_is ">= 500000" \ + get_mc_transmit_queue $vlan) + check_err $? "MC backlog reported by qdisc not visible in ethtool" + + stop_traffic + stop_traffic + + log_test "TC $((vlan - 10)): Qdisc reports MC backlog" +} diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh new file mode 100755 index 000000000000..1c36c576613b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ecn_test + ecn_nodrop_test + red_test + mc_backlog_test +" +: ${QDISC:=ets} +source sch_red_core.sh + +# do_ecn_test first build 2/3 of the requested backlog and expects no marking, +# and then builds 3/2 of it and does expect marking. The values of $BACKLOG1 and +# $BACKLOG2 are far enough not to overlap, so that we can assume that if we do +# see (do not see) marking, it is actually due to the configuration of that one +# TC, and not due to configuration of the other TC leaking over. +BACKLOG1=200000 +BACKLOG2=500000 + +install_qdisc() +{ + local -a args=("$@") + + tc qdisc add dev $swp3 root handle 10: $QDISC \ + bands 8 priomap 7 6 5 4 3 2 1 0 + tc qdisc add dev $swp3 parent 10:8 handle 108: red \ + limit 1000000 min $BACKLOG1 max $((BACKLOG1 + 1)) \ + probability 1.0 avpkt 8000 burst 38 "${args[@]}" + tc qdisc add dev $swp3 parent 10:7 handle 107: red \ + limit 1000000 min $BACKLOG2 max $((BACKLOG2 + 1)) \ + probability 1.0 avpkt 8000 burst 63 "${args[@]}" + sleep 1 +} + +uninstall_qdisc() +{ + tc qdisc del dev $swp3 parent 10:7 + tc qdisc del dev $swp3 parent 10:8 + tc qdisc del dev $swp3 root +} + +ecn_test() +{ + install_qdisc ecn + + do_ecn_test 10 $BACKLOG1 + do_ecn_test 11 $BACKLOG2 + + uninstall_qdisc +} + +ecn_nodrop_test() +{ + install_qdisc ecn nodrop + + do_ecn_nodrop_test 10 $BACKLOG1 + do_ecn_nodrop_test 11 $BACKLOG2 + + uninstall_qdisc +} + +red_test() +{ + install_qdisc + + do_red_test 10 $BACKLOG1 + do_red_test 11 $BACKLOG2 + + uninstall_qdisc +} + +mc_backlog_test() +{ + install_qdisc + + # Note that the backlog numbers here do not correspond to RED + # configuration, but are arbitrary. + do_mc_backlog_test 10 $BACKLOG1 + do_mc_backlog_test 11 $BACKLOG2 + + uninstall_qdisc +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +bail_on_lldpad +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh new file mode 100755 index 000000000000..76820a0e9a1b --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +QDISC=prio +source sch_red_ets.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh new file mode 100755 index 000000000000..558667ea11ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ecn_test + ecn_nodrop_test + red_test + mc_backlog_test +" +source sch_red_core.sh + +BACKLOG=300000 + +install_qdisc() +{ + local -a args=("$@") + + tc qdisc add dev $swp3 root handle 108: red \ + limit 1000000 min $BACKLOG max $((BACKLOG + 1)) \ + probability 1.0 avpkt 8000 burst 38 "${args[@]}" + sleep 1 +} + +uninstall_qdisc() +{ + tc qdisc del dev $swp3 root +} + +ecn_test() +{ + install_qdisc ecn + do_ecn_test 10 $BACKLOG + uninstall_qdisc +} + +ecn_nodrop_test() +{ + install_qdisc ecn nodrop + do_ecn_nodrop_test 10 $BACKLOG + uninstall_qdisc +} + +red_test() +{ + install_qdisc + do_red_test 10 $BACKLOG + uninstall_qdisc +} + +mc_backlog_test() +{ + install_qdisc + # Note that the backlog value here does not correspond to RED + # configuration, but is arbitrary. + do_mc_backlog_test 10 $BACKLOG + uninstall_qdisc +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +bail_on_lldpad +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh new file mode 100755 index 000000000000..7d9e73a43a49 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + port_pool_test + port_tc_ip_test + port_tc_arp_test +" + +NUM_NETIFS=2 +source ../../../net/forwarding/lib.sh +source ../../../net/forwarding/devlink_lib.sh +source mlxsw_lib.sh + +SB_POOL_ING=0 +SB_POOL_EGR_CPU=10 + +SB_ITC_CPU_IP=2 +SB_ITC_CPU_ARP=2 +SB_ITC=0 + +h1_create() +{ + simple_if_init $h1 192.0.1.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.1.1/24 +} + +h2_create() +{ + simple_if_init $h2 192.0.1.2/24 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.1.2/24 +} + +sb_occ_pool_check() +{ + local dl_port=$1; shift + local pool=$1; shift + local exp_max_occ=$1 + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e ".[][][\"pool\"][\"$pool\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +sb_occ_itc_check() +{ + local dl_port=$1; shift + local itc=$1; shift + local exp_max_occ=$1 + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e ".[][][\"itc\"][\"$itc\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +sb_occ_etc_check() +{ + local dl_port=$1; shift + local etc=$1; shift + local exp_max_occ=$1; shift + local max_occ + local err=0 + + max_occ=$(devlink sb -j occupancy show $dl_port \ + | jq -e ".[][][\"etc\"][\"$etc\"][\"max\"]") + + if [[ "$max_occ" -ne "$exp_max_occ" ]]; then + err=1 + fi + + echo $max_occ + return $err +} + +port_pool_test() +{ + local exp_max_occ=288 + local max_occ + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ + -t ip -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_pool_check $dl_port1 $SB_POOL_ING $exp_max_occ) + check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress pool" + + RET=0 + max_occ=$(sb_occ_pool_check $dl_port2 $SB_POOL_ING $exp_max_occ) + check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress pool" + + RET=0 + max_occ=$(sb_occ_pool_check $cpu_dl_port $SB_POOL_EGR_CPU $exp_max_occ) + check_err $? "Expected ePool($SB_POOL_EGR_CPU) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress pool" +} + +port_tc_ip_test() +{ + local exp_max_occ=288 + local max_occ + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \ + -t ip -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress TC - IP packet" + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress TC - IP packet" + + RET=0 + max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_IP $exp_max_occ) + check_err $? "Expected egress TC($SB_ITC_CPU_IP) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress TC - IP packet" +} + +port_tc_arp_test() +{ + local exp_max_occ=96 + local max_occ + + if [[ $MLXSW_CHIP != "mlxsw_spectrum" ]]; then + exp_max_occ=144 + fi + + devlink sb occupancy clearmax $DEVLINK_DEV + + $MZ $h1 -c 1 -p 160 -a $h1mac -A 192.0.1.1 -t arp -q + + devlink sb occupancy snapshot $DEVLINK_DEV + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h1) ingress TC - ARP packet" + + RET=0 + max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) + check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "physical port's($h2) ingress TC - ARP packet" + + RET=0 + max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_ARP $exp_max_occ) + check_err $? "Expected egress TC($SB_ITC_IP2ME) max occupancy to be $exp_max_occ, but got $max_occ" + log_test "CPU port's egress TC - ARP packet" +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + h2=${NETIFS[p2]} + + h1mac=$(mac_get $h1) + h2mac=$(mac_get $h2) + + dl_port1=$(devlink_port_by_netdev $h1) + dl_port2=$(devlink_port_by_netdev $h2) + + cpu_dl_port=$(devlink_cpu_port_get) + + vrf_prepare + + h1_create + h2_create +} + +cleanup() +{ + pre_cleanup + + h2_destroy + h1_destroy + + vrf_cleanup +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py new file mode 100755 index 000000000000..0d4b9327c9b3 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py @@ -0,0 +1,416 @@ +#!/usr/bin/python +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import json as j +import random + + +class SkipTest(Exception): + pass + + +class RandomValuePicker: + """ + Class for storing shared buffer configuration. Can handle 3 different + objects, pool, tcbind and portpool. Provide an interface to get random + values for a specific object type as the follow: + 1. Pool: + - random size + + 2. TcBind: + - random pool number + - random threshold + + 3. PortPool: + - random threshold + """ + def __init__(self, pools): + self._pools = [] + for pool in pools: + self._pools.append(pool) + + def _cell_size(self): + return self._pools[0]["cell_size"] + + def _get_static_size(self, th): + # For threshold of 16, this works out to be about 12MB on Spectrum-1, + # and about 17MB on Spectrum-2. + return th * 8000 * self._cell_size() + + def _get_size(self): + return self._get_static_size(16) + + def _get_thtype(self): + return "static" + + def _get_th(self, pool): + # Threshold value could be any integer between 3 to 16 + th = random.randint(3, 16) + if pool["thtype"] == "dynamic": + return th + else: + return self._get_static_size(th) + + def _get_pool(self, direction): + ing_pools = [] + egr_pools = [] + for pool in self._pools: + if pool["type"] == "ingress": + ing_pools.append(pool) + else: + egr_pools.append(pool) + if direction == "ingress": + arr = ing_pools + else: + arr = egr_pools + return arr[random.randint(0, len(arr) - 1)] + + def get_value(self, objid): + if isinstance(objid, Pool): + if objid["pool"] in [4, 8, 9, 10]: + # The threshold type of pools 4, 8, 9 and 10 cannot be changed + raise SkipTest() + else: + return (self._get_size(), self._get_thtype()) + if isinstance(objid, TcBind): + if objid["tc"] >= 8: + # Multicast TCs cannot be changed + raise SkipTest() + else: + pool = self._get_pool(objid["type"]) + th = self._get_th(pool) + pool_n = pool["pool"] + return (pool_n, th) + if isinstance(objid, PortPool): + pool_n = objid["pool"] + pool = self._pools[pool_n] + assert pool["pool"] == pool_n + th = self._get_th(pool) + return (th,) + + +class RecordValuePickerException(Exception): + pass + + +class RecordValuePicker: + """ + Class for storing shared buffer configuration. Can handle 2 different + objects, pool and tcbind. Provide an interface to get the stored values per + object type. + """ + def __init__(self, objlist): + self._recs = [] + for item in objlist: + self._recs.append({"objid": item, "value": item.var_tuple()}) + + def get_value(self, objid): + if isinstance(objid, Pool) and objid["pool"] in [4, 8, 9, 10]: + # The threshold type of pools 4, 8, 9 and 10 cannot be changed + raise SkipTest() + if isinstance(objid, TcBind) and objid["tc"] >= 8: + # Multicast TCs cannot be changed + raise SkipTest() + for rec in self._recs: + if rec["objid"].weak_eq(objid): + return rec["value"] + raise RecordValuePickerException() + + +def run_cmd(cmd, json=False): + out = subprocess.check_output(cmd, shell=True) + if json: + return j.loads(out) + return out + + +def run_json_cmd(cmd): + return run_cmd(cmd, json=True) + + +def log_test(test_name, err_msg=None): + if err_msg: + print("\t%s" % err_msg) + print("TEST: %-80s [FAIL]" % test_name) + else: + print("TEST: %-80s [ OK ]" % test_name) + + +class CommonItem(dict): + varitems = [] + + def var_tuple(self): + ret = [] + self.varitems.sort() + for key in self.varitems: + ret.append(self[key]) + return tuple(ret) + + def weak_eq(self, other): + for key in self: + if key in self.varitems: + continue + if self[key] != other[key]: + return False + return True + + +class CommonList(list): + def get_by(self, by_obj): + for item in self: + if item.weak_eq(by_obj): + return item + return None + + def del_by(self, by_obj): + for item in self: + if item.weak_eq(by_obj): + self.remove(item) + + +class Pool(CommonItem): + varitems = ["size", "thtype"] + + def dl_set(self, dlname, size, thtype): + run_cmd("devlink sb pool set {} sb {} pool {} size {} thtype {}".format(dlname, self["sb"], + self["pool"], + size, thtype)) + + +class PoolList(CommonList): + pass + + +def get_pools(dlname, direction=None): + d = run_json_cmd("devlink sb pool show -j") + pools = PoolList() + for pooldict in d["pool"][dlname]: + if not direction or direction == pooldict["type"]: + pools.append(Pool(pooldict)) + return pools + + +def do_check_pools(dlname, pools, vp): + for pool in pools: + pre_pools = get_pools(dlname) + try: + (size, thtype) = vp.get_value(pool) + except SkipTest: + continue + pool.dl_set(dlname, size, thtype) + post_pools = get_pools(dlname) + pool = post_pools.get_by(pool) + + err_msg = None + if pool["size"] != size: + err_msg = "Incorrect pool size (got {}, expected {})".format(pool["size"], size) + if pool["thtype"] != thtype: + err_msg = "Incorrect pool threshold type (got {}, expected {})".format(pool["thtype"], thtype) + + pre_pools.del_by(pool) + post_pools.del_by(pool) + if pre_pools != post_pools: + err_msg = "Other pool setup changed as well" + log_test("pool {} of sb {} set verification".format(pool["pool"], + pool["sb"]), err_msg) + + +def check_pools(dlname, pools): + # Save defaults + record_vp = RecordValuePicker(pools) + + # For each pool, set random size and static threshold type + do_check_pools(dlname, pools, RandomValuePicker(pools)) + + # Restore defaults + do_check_pools(dlname, pools, record_vp) + + +class TcBind(CommonItem): + varitems = ["pool", "threshold"] + + def __init__(self, port, d): + super(TcBind, self).__init__(d) + self["dlportname"] = port.name + + def dl_set(self, pool, th): + run_cmd("devlink sb tc bind set {} sb {} tc {} type {} pool {} th {}".format(self["dlportname"], + self["sb"], + self["tc"], + self["type"], + pool, th)) + + +class TcBindList(CommonList): + pass + + +def get_tcbinds(ports, verify_existence=False): + d = run_json_cmd("devlink sb tc bind show -j -n") + tcbinds = TcBindList() + for port in ports: + err_msg = None + if port.name not in d["tc_bind"] or len(d["tc_bind"][port.name]) == 0: + err_msg = "No tc bind for port" + else: + for tcbinddict in d["tc_bind"][port.name]: + tcbinds.append(TcBind(port, tcbinddict)) + if verify_existence: + log_test("tc bind existence for port {} verification".format(port.name), err_msg) + return tcbinds + + +def do_check_tcbind(ports, tcbinds, vp): + for tcbind in tcbinds: + pre_tcbinds = get_tcbinds(ports) + try: + (pool, th) = vp.get_value(tcbind) + except SkipTest: + continue + tcbind.dl_set(pool, th) + post_tcbinds = get_tcbinds(ports) + tcbind = post_tcbinds.get_by(tcbind) + + err_msg = None + if tcbind["pool"] != pool: + err_msg = "Incorrect pool (got {}, expected {})".format(tcbind["pool"], pool) + if tcbind["threshold"] != th: + err_msg = "Incorrect threshold (got {}, expected {})".format(tcbind["threshold"], th) + + pre_tcbinds.del_by(tcbind) + post_tcbinds.del_by(tcbind) + if pre_tcbinds != post_tcbinds: + err_msg = "Other tc bind setup changed as well" + log_test("tc bind {}-{} of sb {} set verification".format(tcbind["dlportname"], + tcbind["tc"], + tcbind["sb"]), err_msg) + + +def check_tcbind(dlname, ports, pools): + tcbinds = get_tcbinds(ports, verify_existence=True) + + # Save defaults + record_vp = RecordValuePicker(tcbinds) + + # Bind each port and unicast TC (TCs < 8) to a random pool and a random + # threshold + do_check_tcbind(ports, tcbinds, RandomValuePicker(pools)) + + # Restore defaults + do_check_tcbind(ports, tcbinds, record_vp) + + +class PortPool(CommonItem): + varitems = ["threshold"] + + def __init__(self, port, d): + super(PortPool, self).__init__(d) + self["dlportname"] = port.name + + def dl_set(self, th): + run_cmd("devlink sb port pool set {} sb {} pool {} th {}".format(self["dlportname"], + self["sb"], + self["pool"], th)) + + +class PortPoolList(CommonList): + pass + + +def get_portpools(ports, verify_existence=False): + d = run_json_cmd("devlink sb port pool -j -n") + portpools = PortPoolList() + for port in ports: + err_msg = None + if port.name not in d["port_pool"] or len(d["port_pool"][port.name]) == 0: + err_msg = "No port pool for port" + else: + for portpooldict in d["port_pool"][port.name]: + portpools.append(PortPool(port, portpooldict)) + if verify_existence: + log_test("port pool existence for port {} verification".format(port.name), err_msg) + return portpools + + +def do_check_portpool(ports, portpools, vp): + for portpool in portpools: + pre_portpools = get_portpools(ports) + (th,) = vp.get_value(portpool) + portpool.dl_set(th) + post_portpools = get_portpools(ports) + portpool = post_portpools.get_by(portpool) + + err_msg = None + if portpool["threshold"] != th: + err_msg = "Incorrect threshold (got {}, expected {})".format(portpool["threshold"], th) + + pre_portpools.del_by(portpool) + post_portpools.del_by(portpool) + if pre_portpools != post_portpools: + err_msg = "Other port pool setup changed as well" + log_test("port pool {}-{} of sb {} set verification".format(portpool["dlportname"], + portpool["pool"], + portpool["sb"]), err_msg) + + +def check_portpool(dlname, ports, pools): + portpools = get_portpools(ports, verify_existence=True) + + # Save defaults + record_vp = RecordValuePicker(portpools) + + # For each port pool, set a random threshold + do_check_portpool(ports, portpools, RandomValuePicker(pools)) + + # Restore defaults + do_check_portpool(ports, portpools, record_vp) + + +class Port: + def __init__(self, name): + self.name = name + + +class PortList(list): + pass + + +def get_ports(dlname): + d = run_json_cmd("devlink port show -j") + ports = PortList() + for name in d["port"]: + if name.find(dlname) == 0 and d["port"][name]["flavour"] == "physical": + ports.append(Port(name)) + return ports + + +def get_device(): + devices_info = run_json_cmd("devlink -j dev info")["info"] + for d in devices_info: + if "mlxsw_spectrum" in devices_info[d]["driver"]: + return d + return None + + +class UnavailableDevlinkNameException(Exception): + pass + + +def test_sb_configuration(): + # Use static seed + random.seed(0) + + dlname = get_device() + if not dlname: + raise UnavailableDevlinkNameException() + + ports = get_ports(dlname) + pools = get_pools(dlname) + + check_pools(dlname, pools) + check_tcbind(dlname, ports, pools) + check_portpool(dlname, ports, pools) + + +test_sb_configuration() diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 7b2acba82a49..fd583a171db7 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -8,8 +8,9 @@ source $lib_dir/lib.sh source $lib_dir/tc_common.sh source $lib_dir/devlink_lib.sh -if [ "$DEVLINK_VIDDID" != "15b3:cf6c" ]; then - echo "SKIP: test is tailored for Mellanox Spectrum-2" +if [[ "$DEVLINK_VIDDID" != "15b3:cf6c" && \ + "$DEVLINK_VIDDID" != "15b3:cf70" ]]; then + echo "SKIP: test is tailored for Mellanox Spectrum-2 and Spectrum-3" exit 1 fi diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh index a0795227216e..efd798a85931 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh @@ -8,9 +8,9 @@ tc_flower_get_target() # The driver associates a counter with each tc filter, which means the # number of supported filters is bounded by the number of available # counters. - # Currently, the driver supports 12K (12,288) flow counters and six of + # Currently, the driver supports 30K (30,720) flow counters and six of # these are used for multicast routing. - local target=12282 + local target=30714 if ((! should_fail)); then echo $target diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh new file mode 100755 index 000000000000..20ed98fe5a60 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + default_hw_stats_test + immediate_hw_stats_test + delayed_hw_stats_test + disabled_hw_stats_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/24 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/24 +} + +switch_create() +{ + simple_if_init $swp1 192.0.2.2/24 + tc qdisc add dev $swp1 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp1 clsact + simple_if_fini $swp1 192.0.2.2/24 +} + +hw_stats_test() +{ + RET=0 + + local name=$1 + local action_hw_stats=$2 + local occ_delta=$3 + local expected_packet_count=$4 + + local orig_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop $action_hw_stats + check_err $? "Failed to add rule with $name hw_stats" + + local new_occ=$(devlink_resource_get "counters" "flow" | jq '.["occ"]') + local expected_occ=$((orig_occ + occ_delta)) + [ "$new_occ" == "$expected_occ" ] + check_err $? "Expected occupancy of $expected_occ, got $new_occ" + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $swp1mac -A 192.0.2.1 -B 192.0.2.2 \ + -t ip -q + + tc_check_packets "dev $swp1 ingress" 101 $expected_packet_count + check_err $? "Did not match incoming packet" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + log_test "$name hw_stats" +} + +default_hw_stats_test() +{ + hw_stats_test "default" "" 2 1 +} + +immediate_hw_stats_test() +{ + hw_stats_test "immediate" "hw_stats immediate" 2 1 +} + +delayed_hw_stats_test() +{ + RET=0 + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop hw_stats delayed + check_fail $? "Unexpected success in adding rule with delayed hw_stats" + + log_test "delayed hw_stats" +} + +disabled_hw_stats_test() +{ + hw_stats_test "disabled" "hw_stats disabled" 0 0 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + h1mac=$(mac_get $h1) + swp1mac=$(mac_get $swp1) + + vrf_prepare + + h1_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h1_destroy + + vrf_cleanup +} + +check_tc_action_hw_stats_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh index a6d733d2a4b4..cc0f07e72cf2 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh @@ -2,9 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 # Test for resource limit of offloaded flower rules. The test adds a given -# number of flower matches for different IPv6 addresses, then generates traffic, -# and ensures each was hit exactly once. This file contains functions to set up -# a testing topology and run the test, and is meant to be sourced from a test +# number of flower matches for different IPv6 addresses, then check the offload +# indication for all of the tc flower rules. This file contains functions to set +# up a testing topology and run the test, and is meant to be sourced from a test # script that calls the testing routine with a given number of rules. TC_FLOWER_NUM_NETIFS=2 @@ -94,22 +94,15 @@ __tc_flower_test() tc_flower_rules_create $count $should_fail - for ((i = 0; i < count; ++i)); do - $MZ $h1 -q -c 1 -t ip -p 20 -b bc -6 \ - -A 2001:db8:2::1 \ - -B $(tc_flower_addr $i) - done - - MISMATCHES=$( - tc -j -s filter show dev $h2 ingress | - jq -r '[ .[] | select(.kind == "flower") | .options | - values as $rule | .actions[].stats.packets | - select(. != 1) | "\(.) on \($rule.keys.dst_ip)" ] | - join(", ")' - ) - - test -z "$MISMATCHES" - check_err $? "Expected to capture 1 packet for each IP, but got $MISMATCHES" + offload_count=$(tc -j -s filter show dev $h2 ingress | + jq -r '[ .[] | select(.kind == "flower") | + .options | .in_hw ]' | jq .[] | wc -l) + [[ $((offload_count - 1)) -eq $count ]] + if [[ $should_fail -eq 0 ]]; then + check_err $? "Offload mismatch" + else + check_err_fail $should_fail $? "Offload more than expacted" + fi } tc_flower_test() diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh new file mode 100755 index 000000000000..9241250c5921 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -0,0 +1,318 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + shared_block_drop_test + egress_redirect_test + multi_mirror_test + matchall_sample_egress_test + matchall_mirror_behind_flower_ingress_test + matchall_sample_behind_flower_ingress_test + matchall_mirror_behind_flower_egress_test +" +NUM_NETIFS=2 + +source $lib_dir/tc_common.sh +source $lib_dir/lib.sh + +switch_create() +{ + simple_if_init $swp1 192.0.2.1/24 + simple_if_init $swp2 192.0.2.2/24 +} + +switch_destroy() +{ + simple_if_fini $swp2 192.0.2.2/24 + simple_if_fini $swp1 192.0.2.1/24 +} + +shared_block_drop_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mixed-bound + # shared block with a drop rule. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? "Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? "Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker drop rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add drop rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add drop rule to egress bound shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +egress_redirect_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have mirred redirect on + # egress-bound block. + + tc qdisc add dev $swp1 ingress_block 22 clsact + check_err $? "Failed to create clsact with ingress block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_err $? "Failed to add redirect rule to ingress bound block" + + tc qdisc add dev $swp2 ingress_block 22 clsact + check_err $? "Failed to create another clsact with ingress shared block" + + tc qdisc del dev $swp2 clsact + + tc qdisc add dev $swp2 egress_block 22 clsact + check_fail $? "Incorrect success to create another clsact with egress shared block" + + tc filter del block 22 protocol ip pref 1 handle 101 flower + + tc qdisc add dev $swp2 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block after blocker redirect rule removed" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to mixed bound block" + + tc qdisc del dev $swp1 clsact + + tc qdisc add dev $swp1 egress_block 22 clsact + check_err $? "Failed to create another clsact with egress shared block" + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound shared block" + + tc qdisc del dev $swp2 clsact + + tc filter add block 22 protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress redirect dev $swp2 + check_fail $? "Incorrect success to add redirect rule to egress bound block" + + tc qdisc del dev $swp1 clsact + + log_test "shared block drop" +} + +multi_mirror_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have multiple mirror + # actions in a single rule. + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 + check_err $? "Failed to add rule with single mirror action" + + tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 \ + action mirred egress mirror dev $swp2 \ + action mirred egress mirror dev $swp1 + check_fail $? "Incorrect success to add rule with two mirror actions" + + tc qdisc del dev $swp1 clsact + + log_test "multi mirror" +} + +matchall_sample_egress_test() +{ + RET=0 + + # It is forbidden in mlxsw driver to have matchall with sample action + # bound on egress + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_err $? "Failed to add rule with sample action on ingress" + + tc filter del dev $swp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $swp1 egress protocol all pref 1 handle 101 \ + matchall skip_sw action sample rate 100 group 1 + check_fail $? "Incorrect success to add rule with sample action on egress" + + tc qdisc del dev $swp1 clsact + + log_test "matchall sample egress" +} + +matchall_behind_flower_ingress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On ingress, all matchall-mirror and matchall-sample + # rules have to be in front of the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 ingress protocol all pref 9 handle 102 matchall + + tc filter add dev $swp1 ingress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule behind a flower rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 ingress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add flower rule behind a matchall rule" + + tc filter del dev $swp1 ingress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 ingress protocol ip pref 8 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add flower rule in front of a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower ingress" +} + +matchall_mirror_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "mirror" "mirred egress mirror dev $swp2" +} + +matchall_sample_behind_flower_ingress_test() +{ + matchall_behind_flower_ingress_test "sample" "sample rate 100 group 1" +} + +matchall_behind_flower_egress_test() +{ + local action=$1 + local action_args=$2 + + RET=0 + + # On egress, all matchall-mirror rules have to be behind the flower rules + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + check_err $? "Failed to add matchall rule in front of a flower rule" + + tc filter del dev $swp1 egress protocol all pref 11 handle 102 matchall + + tc filter add dev $swp1 egress protocol all pref 9 handle 102 \ + matchall skip_sw action $action_args + check_fail $? "Incorrect success to add matchall rule behind a flower rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol all pref 11 handle 102 \ + matchall skip_sw action $action_args + + tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_err $? "Failed to add flower rule behind a matchall rule" + + tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower + + tc filter add dev $swp1 egress protocol ip pref 12 handle 101 flower \ + skip_sw dst_ip 192.0.2.2 action drop + check_fail $? "Incorrect success to add flower rule in front of a matchall rule" + + tc qdisc del dev $swp1 clsact + + log_test "matchall $action flower egress" +} + +matchall_mirror_behind_flower_egress_test() +{ + matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" +} + +setup_prepare() +{ + swp1=${NETIFS[p1]} + swp2=${NETIFS[p2]} + + vrf_prepare + + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + + vrf_cleanup +} + +check_tc_shblock_support + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh index 4632f51af7ab..729a86cc4ede 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh @@ -9,6 +9,7 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS="sanitization_test offload_indication_test \ sanitization_vlan_aware_test offload_indication_vlan_aware_test" NUM_NETIFS=2 +: ${TIMEOUT:=20000} # ms source $lib_dir/lib.sh setup_prepare() @@ -470,8 +471,8 @@ offload_indication_fdb_flood_test() bridge fdb append 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep 00:00:00:00:00:00 \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb 00:00:00:00:00:00 \ + bridge fdb show brport vxlan0 check_err $? bridge fdb del 00:00:00:00:00:00 dev vxlan0 self @@ -486,11 +487,11 @@ offload_indication_fdb_bridge_test() bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self master static \ dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? log_test "vxlan entry offload indication - initial state" @@ -500,9 +501,9 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan0 master - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 + check_err $? log_test "vxlan entry offload indication - after removal from bridge" @@ -511,11 +512,11 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan0 master static - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? log_test "vxlan entry offload indication - after re-add to bridge" @@ -525,9 +526,9 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan0 self - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 + check_err $? log_test "vxlan entry offload indication - after removal from vxlan" @@ -536,11 +537,11 @@ offload_indication_fdb_bridge_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self dst 198.51.100.2 - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan0 check_err $? - bridge fdb show brport vxlan0 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0 check_err $? log_test "vxlan entry offload indication - after re-add to vxlan" @@ -558,27 +559,32 @@ offload_indication_decap_route_test() { RET=0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan0 down - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan1 down - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - vxlan device down" RET=0 ip link set dev vxlan1 up - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev vxlan0 up - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - vxlan device up" @@ -586,11 +592,13 @@ offload_indication_decap_route_test() RET=0 ip address delete 198.51.100.1/32 dev lo - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? ip address add 198.51.100.1/32 dev lo - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - add local route" @@ -598,16 +606,19 @@ offload_indication_decap_route_test() RET=0 ip link set dev $swp1 nomaster - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link set dev $swp2 nomaster - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? ip link set dev $swp1 master br0 ip link set dev $swp2 master br1 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - local ports enslavement" @@ -615,12 +626,14 @@ offload_indication_decap_route_test() RET=0 ip link del dev br0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev br1 - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - bridge device deletion" @@ -632,16 +645,19 @@ offload_indication_decap_route_test() ip link set dev $swp2 master br1 ip link set dev vxlan0 master br0 ip link set dev vxlan1 master br1 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev vxlan0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? ip link del dev vxlan1 - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? log_test "vxlan decap route - vxlan device deletion" @@ -656,12 +672,15 @@ check_fdb_offloaded() local mac=00:11:22:33:44:55 local zmac=00:00:00:00:00:00 - bridge fdb show dev vxlan0 | grep $mac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac self \ + bridge fdb show dev vxlan0 check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep master | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac master \ + bridge fdb show dev vxlan0 check_err $? - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 check_err $? } @@ -672,13 +691,15 @@ check_vxlan_fdb_not_offloaded() bridge fdb show dev vxlan0 | grep $mac | grep -q self check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac self \ + bridge fdb show dev vxlan0 + check_err $? bridge fdb show dev vxlan0 | grep $zmac | grep -q self check_err $? - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 + check_err $? } check_bridge_fdb_not_offloaded() @@ -688,8 +709,9 @@ check_bridge_fdb_not_offloaded() bridge fdb show dev vxlan0 | grep $mac | grep -q master check_err $? - bridge fdb show dev vxlan0 | grep $mac | grep master | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac master \ + bridge fdb show dev vxlan0 + check_err $? } __offload_indication_join_vxlan_first() @@ -771,12 +793,14 @@ __offload_indication_join_vxlan_last() ip link set dev $swp1 master br0 - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 + check_err $? ip link set dev vxlan0 master br0 - bridge fdb show dev vxlan0 | grep $zmac | grep self | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show dev vxlan0 check_err $? log_test "offload indication - attach vxlan last" @@ -854,20 +878,26 @@ sanitization_vlan_aware_test() bridge vlan del vid 10 dev vxlan20 bridge vlan add vid 20 dev vxlan20 pvid untagged - # Test that offloading of an unsupported tunnel fails when it is - # triggered by addition of VLAN to a local port - RET=0 + # Test that when two VXLAN tunnels with conflicting configurations + # (i.e., different TTL) are enslaved to the same VLAN-aware bridge, + # then the enslavement of a port to the bridge is denied. - # TOS must be set to inherit - ip link set dev vxlan10 type vxlan tos 42 + # Use the offload indication of the local route to ensure the VXLAN + # configuration was correctly rollbacked. + ip address add 198.51.100.1/32 dev lo - ip link set dev $swp1 master br0 - bridge vlan add vid 10 dev $swp1 &> /dev/null + ip link set dev vxlan10 type vxlan ttl 10 + ip link set dev $swp1 master br0 &> /dev/null check_fail $? - log_test "vlan-aware - failed vlan addition to a local port" + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? + + log_test "vlan-aware - failed enslavement to bridge due to conflict" - ip link set dev vxlan10 type vxlan tos inherit + ip link set dev vxlan10 type vxlan ttl 20 + ip address del 198.51.100.1/32 dev lo ip link del dev vxlan20 ip link del dev vxlan10 @@ -924,11 +954,11 @@ offload_indication_vlan_aware_fdb_test() bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self master static \ dst 198.51.100.2 vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - initial state" @@ -938,9 +968,9 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan10 master vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 + check_err $? log_test "vxlan entry offload indication - after removal from bridge" @@ -949,11 +979,11 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan10 master static vlan 10 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - after re-add to bridge" @@ -963,9 +993,9 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb del de:ad:be:ef:13:37 dev vxlan10 self - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 + check_err $? log_test "vxlan entry offload indication - after removal from vxlan" @@ -974,11 +1004,11 @@ offload_indication_vlan_aware_fdb_test() RET=0 bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self dst 198.51.100.2 - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self bridge fdb show brport vxlan10 check_err $? - bridge fdb show brport vxlan10 | grep de:ad:be:ef:13:37 | grep -v self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \ + de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10 check_err $? log_test "vxlan entry offload indication - after re-add to vxlan" @@ -990,28 +1020,31 @@ offload_indication_vlan_aware_decap_route_test() { RET=0 - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? # Toggle PVID flag on one VxLAN device and make sure route is still # marked as offloaded bridge vlan add vid 10 dev vxlan10 untagged - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload \ + ip route show table local 198.51.100.1 check_err $? # Toggle PVID flag on second VxLAN device and make sure route is no # longer marked as offloaded bridge vlan add vid 20 dev vxlan20 untagged - ip route show table local | grep 198.51.100.1 | grep -q offload - check_fail $? + busywait "$TIMEOUT" not wait_for_offload \ + ip route show table local 198.51.100.1 + check_err $? # Toggle PVID flag back and make sure route is marked as offloaded bridge vlan add vid 10 dev vxlan10 pvid untagged bridge vlan add vid 20 dev vxlan20 pvid untagged - ip route show table local | grep 198.51.100.1 | grep -q offload + busywait "$TIMEOUT" wait_for_offload ip route show table local 198.51.100.1 check_err $? log_test "vxlan decap route - vni map/unmap" @@ -1064,35 +1097,33 @@ offload_indication_vlan_aware_l3vni_test() ip link set dev vxlan0 master br0 bridge vlan add dev vxlan0 vid 10 pvid untagged - # No local port or router port is member in the VLAN, so tunnel should - # not be offloaded - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? "vxlan tunnel offloaded when should not" + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel not offloaded when should" # Configure a VLAN interface and make sure tunnel is offloaded ip link add link br0 name br10 up type vlan id 10 sysctl_set net.ipv6.conf.br10.disable_ipv6 0 ip -6 address add 2001:db8:1::1/64 dev br10 - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 check_err $? "vxlan tunnel not offloaded when should" # Unlink the VXLAN device, make sure tunnel is no longer offloaded, # then add it back to the bridge and make sure it is offloaded ip link set dev vxlan0 nomaster - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? "vxlan tunnel offloaded after unlinked from bridge" + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel offloaded after unlinked from bridge" ip link set dev vxlan0 master br0 - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload - check_fail $? "vxlan tunnel offloaded despite no matching vid" + busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 + check_err $? "vxlan tunnel offloaded despite no matching vid" bridge vlan add dev vxlan0 vid 10 pvid untagged - bridge fdb show brport vxlan0 | grep $zmac | grep self \ - | grep -q offload + busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \ + bridge fdb show brport vxlan0 check_err $? "vxlan tunnel not offloaded after adding vid" log_test "vxlan - l3 vni" diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 025a84c2ab5a..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -141,6 +141,44 @@ regions_test() check_region_snapshot_count dummy post-first-delete 2 + devlink region new $DL_HANDLE/dummy snapshot 25 + check_err $? "Failed to create a new snapshot with id 25" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? "Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + + devlink region del $DL_HANDLE/dummy snapshot 25 + check_err $? "Failed to delete snapshot with id 25" + + check_region_snapshot_count dummy post-second-delete 2 + + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $? "Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } @@ -367,6 +405,11 @@ dummy_reporter_test() { RET=0 + check_reporter_info dummy healthy 0 0 0 true + + devlink health set $DL_HANDLE reporter dummy auto_recover false + check_err $? "Failed to dummy reporter auto_recover option" + check_reporter_info dummy healthy 0 0 0 false local BREAK_MSG="foo bar" diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh index f101ab9441e2..da49ad2761b5 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh @@ -16,6 +16,8 @@ ALL_TESTS=" trap_group_action_test bad_trap_group_test trap_group_stats_test + trap_policer_test + trap_policer_bind_test port_del_test dev_del_test " @@ -23,6 +25,7 @@ NETDEVSIM_PATH=/sys/bus/netdevsim/ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} DEVLINK_DEV=netdevsim/${DEV} +DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV/ SLEEP_TIME=1 NETDEV="" NUM_NETIFS=0 @@ -103,6 +106,11 @@ trap_metadata_test() for trap_name in $(devlink_traps_get); do devlink_trap_metadata_test $trap_name "input_port" check_err $? "Input port not reported as metadata of trap $trap_name" + if [ $trap_name == "ingress_flow_action_drop" ] || + [ $trap_name == "egress_flow_action_drop" ]; then + devlink_trap_metadata_test $trap_name "flow_action_cookie" + check_err $? "Flow action cookie not reported as metadata of trap $trap_name" + fi done log_test "Trap metadata" @@ -251,6 +259,123 @@ trap_group_stats_test() log_test "Trap group statistics" } +trap_policer_test() +{ + local packets_t0 + local packets_t1 + + RET=0 + + if [ $(devlink_trap_policers_num_get) -eq 0 ]; then + check_err 1 "Failed to dump policers" + fi + + devlink trap policer set $DEVLINK_DEV policer 1337 &> /dev/null + check_fail $? "Did not get an error for setting a non-existing policer" + devlink trap policer show $DEVLINK_DEV policer 1337 &> /dev/null + check_fail $? "Did not get an error for getting a non-existing policer" + + devlink trap policer set $DEVLINK_DEV policer 1 rate 2000 burst 16 + check_err $? "Failed to set valid parameters for a valid policer" + if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then + check_err 1 "Policer rate was not changed" + fi + if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then + check_err 1 "Policer burst size was not changed" + fi + + devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null + check_fail $? "Policer rate was changed to rate lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 rate 9000 &> /dev/null + check_fail $? "Policer rate was changed to rate higher than limit" + devlink trap policer set $DEVLINK_DEV policer 1 burst 2 &> /dev/null + check_fail $? "Policer burst size was changed to burst size lower than limit" + devlink trap policer set $DEVLINK_DEV policer 1 rate 65537 &> /dev/null + check_fail $? "Policer burst size was changed to burst size higher than limit" + echo "y" > $DEBUGFS_DIR/fail_trap_policer_set + devlink trap policer set $DEVLINK_DEV policer 1 rate 3000 &> /dev/null + check_fail $? "Managed to set policer rate when should not" + echo "n" > $DEBUGFS_DIR/fail_trap_policer_set + if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then + check_err 1 "Policer rate was changed to an invalid value" + fi + if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then + check_err 1 "Policer burst size was changed to an invalid value" + fi + + packets_t0=$(devlink_trap_policer_rx_dropped_get 1) + sleep .5 + packets_t1=$(devlink_trap_policer_rx_dropped_get 1) + if [ ! $packets_t1 -gt $packets_t0 ]; then + check_err 1 "Policer drop counter was not incremented" + fi + + echo "y"> $DEBUGFS_DIR/fail_trap_policer_counter_get + devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null + check_fail $? "Managed to read policer drop counter when should not" + echo "n"> $DEBUGFS_DIR/fail_trap_policer_counter_get + devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null + check_err $? "Did not manage to read policer drop counter when should" + + log_test "Trap policer" +} + +trap_group_check_policer() +{ + local group_name=$1; shift + + devlink -j -p trap group show $DEVLINK_DEV group $group_name \ + | jq -e '.[][][]["policer"]' &> /dev/null +} + +trap_policer_bind_test() +{ + RET=0 + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? "Failed to bind a valid policer" + if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then + check_err 1 "Bound policer was not changed" + fi + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1337 \ + &> /dev/null + check_fail $? "Did not get an error for binding a non-existing policer" + if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then + check_err 1 "Bound policer was changed when should not" + fi + + devlink trap group set $DEVLINK_DEV group l2_drops policer 0 + check_err $? "Failed to unbind a policer when using ID 0" + trap_group_check_policer "l2_drops" + check_fail $? "Trap group has a policer after unbinding with ID 0" + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? "Failed to bind a valid policer" + + devlink trap group set $DEVLINK_DEV group l2_drops nopolicer + check_err $? "Failed to unbind a policer when using 'nopolicer' keyword" + trap_group_check_policer "l2_drops" + check_fail $? "Trap group has a policer after unbinding with 'nopolicer' keyword" + + devlink trap group set $DEVLINK_DEV group l2_drops policer 1 + check_err $? "Failed to bind a valid policer" + + echo "y"> $DEBUGFS_DIR/fail_trap_group_set + devlink trap group set $DEVLINK_DEV group l2_drops policer 2 \ + &> /dev/null + check_fail $? "Managed to bind a policer when should not" + echo "n"> $DEBUGFS_DIR/fail_trap_group_set + devlink trap group set $DEVLINK_DEV group l2_drops policer 2 + check_err $? "Did not manage to bind a policer when should" + + devlink trap group set $DEVLINK_DEV group l2_drops action drop \ + policer 1337 &> /dev/null + check_fail $? "Did not get an error for partially modified trap group" + + log_test "Trap policer binding" +} + port_del_test() { local group_name diff --git a/tools/testing/selftests/efivarfs/.gitignore b/tools/testing/selftests/efivarfs/.gitignore index 33618493562b..807407f7f58b 100644 --- a/tools/testing/selftests/efivarfs/.gitignore +++ b/tools/testing/selftests/efivarfs/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only create-read open-unlink diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index b02279da6fa1..94b02a18f230 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only subdir* script* execveat @@ -8,3 +9,4 @@ execveat.ephemeral execveat.denatured /recursion-depth xxxxxxxx* +pipe diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 33339e31e365..4453b8f8def3 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -3,8 +3,9 @@ CFLAGS = -Wall CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE +TEST_PROGS := binfmt_script TEST_GEN_PROGS := execveat -TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir +TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile diff --git a/tools/testing/selftests/exec/binfmt_script b/tools/testing/selftests/exec/binfmt_script new file mode 100755 index 000000000000..05f94a741c7a --- /dev/null +++ b/tools/testing/selftests/exec/binfmt_script @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Test that truncation of bprm->buf doesn't cause unexpected execs paths, along +# with various other pathological cases. +import os, subprocess + +# Relevant commits +# +# b5372fe5dc84 ("exec: load_script: Do not exec truncated interpreter path") +# 6eb3c3d0a52d ("exec: increase BINPRM_BUF_SIZE to 256") + +# BINPRM_BUF_SIZE +SIZE=256 + +NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."])) + +test_num=0 + +code='''#!/usr/bin/perl +print "Executed interpreter! Args:\n"; +print "0 : '$0'\n"; +$counter = 1; +foreach my $a (@ARGV) { + print "$counter : '$a'\n"; + $counter++; +} +''' + +## +# test - produce a binfmt_script hashbang line for testing +# +# @size: bytes for bprm->buf line, including hashbang but not newline +# @good: whether this script is expected to execute correctly +# @hashbang: the special 2 bytes for running binfmt_script +# @leading: any leading whitespace before the executable path +# @root: start of executable pathname +# @target: end of executable pathname +# @arg: bytes following the executable pathname +# @fill: character to fill between @root and @target to reach @size bytes +# @newline: character to use as newline, not counted towards @size +# ... +def test(name, size, good=True, leading="", root="./", target="/perl", + fill="A", arg="", newline="\n", hashbang="#!"): + global test_num, tests, NAME_MAX + test_num += 1 + if test_num > tests: + raise ValueError("more binfmt_script tests than expected! (want %d, expected %d)" + % (test_num, tests)) + + middle = "" + remaining = size - len(hashbang) - len(leading) - len(root) - len(target) - len(arg) + # The middle of the pathname must not exceed NAME_MAX + while remaining >= NAME_MAX: + middle += fill * (NAME_MAX - 1) + middle += '/' + remaining -= NAME_MAX + middle += fill * remaining + + dirpath = root + middle + binary = dirpath + target + if len(target): + os.makedirs(dirpath, mode=0o755, exist_ok=True) + open(binary, "w").write(code) + os.chmod(binary, 0o755) + + buf=hashbang + leading + root + middle + target + arg + newline + if len(newline) > 0: + buf += 'echo this is not really perl\n' + + script = "binfmt_script-%s" % (name) + open(script, "w").write(buf) + os.chmod(script, 0o755) + + proc = subprocess.Popen(["./%s" % (script)], shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout = proc.communicate()[0] + + if proc.returncode == 0 and b'Executed interpreter' in stdout: + if good: + print("ok %d - binfmt_script %s (successful good exec)" + % (test_num, name)) + else: + print("not ok %d - binfmt_script %s succeeded when it should have failed" + % (test_num, name)) + else: + if good: + print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)" + % (test_num, name, proc.returncode)) + else: + print("ok %d - binfmt_script %s (correctly failed bad exec)" + % (test_num, name)) + + # Clean up crazy binaries + os.unlink(script) + if len(target): + elements = binary.split('/') + os.unlink(binary) + elements.pop() + while len(elements) > 1: + os.rmdir("/".join(elements)) + elements.pop() + +tests=27 +print("TAP version 1.3") +print("1..%d" % (tests)) + +### FAIL (8 tests) + +# Entire path is well past the BINFMT_BUF_SIZE. +test(name="too-big", size=SIZE+80, good=False) +# Path is right at max size, making it impossible to tell if it was truncated. +test(name="exact", size=SIZE, good=False) +# Same as above, but with leading whitespace. +test(name="exact-space", size=SIZE, good=False, leading=" ") +# Huge buffer of only whitespace. +test(name="whitespace-too-big", size=SIZE+71, good=False, root="", + fill=" ", target="") +# A good path, but it gets truncated due to leading whitespace. +test(name="truncated", size=SIZE+17, good=False, leading=" " * 19) +# Entirely empty except for #! +test(name="empty", size=2, good=False, root="", + fill="", target="", newline="") +# Within size, but entirely spaces +test(name="spaces", size=SIZE-1, good=False, root="", fill=" ", + target="", newline="") +# Newline before binary. +test(name="newline-prefix", size=SIZE-1, good=False, leading="\n", + root="", fill=" ", target="") + +### ok (19 tests) + +# The original test case that was broken by commit: +# 8099b047ecc4 ("exec: load_script: don't blindly truncate shebang string") +test(name="test.pl", size=439, leading=" ", + root="./nix/store/bwav8kz8b3y471wjsybgzw84mrh4js9-perl-5.28.1/bin", + arg=" -I/nix/store/x6yyav38jgr924nkna62q3pkp0dgmzlx-perl5.28.1-File-Slurp-9999.25/lib/perl5/site_perl -I/nix/store/ha8v67sl8dac92r9z07vzr4gv1y9nwqz-perl5.28.1-Net-DBus-1.1.0/lib/perl5/site_perl -I/nix/store/dcrkvnjmwh69ljsvpbdjjdnqgwx90a9d-perl5.28.1-XML-Parser-2.44/lib/perl5/site_perl -I/nix/store/rmji88k2zz7h4zg97385bygcydrf2q8h-perl5.28.1-XML-Twig-3.52/lib/perl5/site_perl") +# One byte under size, leaving newline visible. +test(name="one-under", size=SIZE-1) +# Two bytes under size, leaving newline visible. +test(name="two-under", size=SIZE-2) +# Exact size, but trailing whitespace visible instead of newline +test(name="exact-trunc-whitespace", size=SIZE, arg=" ") +# Exact size, but trailing space and first arg char visible instead of newline. +test(name="exact-trunc-arg", size=SIZE, arg=" f") +# One bute under, with confirmed non-truncated arg since newline now visible. +test(name="one-under-full-arg", size=SIZE-1, arg=" f") +# Short read buffer by one byte. +test(name="one-under-no-nl", size=SIZE-1, newline="") +# Short read buffer by half buffer size. +test(name="half-under-no-nl", size=int(SIZE/2), newline="") +# One byte under with whitespace arg. leaving wenline visible. +test(name="one-under-trunc-arg", size=SIZE-1, arg=" ") +# One byte under with whitespace leading. leaving wenline visible. +test(name="one-under-leading", size=SIZE-1, leading=" ") +# One byte under with whitespace leading and as arg. leaving newline visible. +test(name="one-under-leading-trunc-arg", size=SIZE-1, leading=" ", arg=" ") +# Same as above, but with 2 bytes under +test(name="two-under-no-nl", size=SIZE-2, newline="") +test(name="two-under-trunc-arg", size=SIZE-2, arg=" ") +test(name="two-under-leading", size=SIZE-2, leading=" ") +test(name="two-under-leading-trunc-arg", size=SIZE-2, leading=" ", arg=" ") +# Same as above, but with buffer half filled +test(name="two-under-no-nl", size=int(SIZE/2), newline="") +test(name="two-under-trunc-arg", size=int(SIZE/2), arg=" ") +test(name="two-under-leading", size=int(SIZE/2), leading=" ") +test(name="two-under-lead-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ") + +if test_num != tests: + raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d" + % (test_num, tests)) diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index cbb6efbdb786..67bf7254a48f 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -5,7 +5,9 @@ * Selftests for execveat(2). */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ +#endif #include <sys/sendfile.h> #include <sys/stat.h> #include <sys/syscall.h> @@ -311,6 +313,10 @@ static int run_tests(void) fail += check_execveat_fail(AT_FDCWD, fullname_symlink, AT_SYMLINK_NOFOLLOW, ELOOP); + /* Non-regular file failure */ + fail += check_execveat_fail(dot_dfd, "pipe", 0, EACCES); + unlink("pipe"); + /* Shell script wrapping executable file: */ /* dfd + path */ fail += check_execveat(subdir_dfd, "../script", 0); @@ -384,6 +390,8 @@ static void prerequisites(void) fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); write(fd, script, strlen(script)); close(fd); + + mkfifo("pipe", 0755); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index 8449cf6716ce..f0c0ff20d6cf 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts diff --git a/tools/testing/selftests/filesystems/binderfs/.gitignore b/tools/testing/selftests/filesystems/binderfs/.gitignore index 8a5d9bf63dd4..8e5cf9084894 100644 --- a/tools/testing/selftests/filesystems/binderfs/.gitignore +++ b/tools/testing/selftests/filesystems/binderfs/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only binderfs_test diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile index 58cb659b56b4..8af25ae96049 100644 --- a/tools/testing/selftests/filesystems/binderfs/Makefile +++ b/tools/testing/selftests/filesystems/binderfs/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../../usr/include/ +CFLAGS += -I../../../../../usr/include/ -pthread TEST_GEN_PROGS := binderfs_test +binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h + include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c index 8c2ed962e1c7..8a6b507e34a8 100644 --- a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c +++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c @@ -3,114 +3,47 @@ #define _GNU_SOURCE #include <errno.h> #include <fcntl.h> +#include <pthread.h> #include <sched.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/fsuid.h> #include <sys/ioctl.h> #include <sys/mount.h> +#include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysinfo.h> #include <sys/types.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/android/binder.h> #include <linux/android/binderfs.h> -#include "../../kselftest.h" - -static ssize_t write_nointr(int fd, const void *buf, size_t count) -{ - ssize_t ret; -again: - ret = write(fd, buf, count); - if (ret < 0 && errno == EINTR) - goto again; - - return ret; -} -static void write_to_file(const char *filename, const void *buf, size_t count, - int allowed_errno) -{ - int fd, saved_errno; - ssize_t ret; +#include "../../kselftest.h" +#include "../../kselftest_harness.h" - fd = open(filename, O_WRONLY | O_CLOEXEC); - if (fd < 0) - ksft_exit_fail_msg("%s - Failed to open file %s\n", - strerror(errno), filename); +#define DEFAULT_THREADS 4 - ret = write_nointr(fd, buf, count); - if (ret < 0) { - if (allowed_errno && (errno == allowed_errno)) { - close(fd); - return; - } +#define PTR_TO_INT(p) ((int)((intptr_t)(p))) +#define INT_TO_PTR(u) ((void *)((intptr_t)(u))) - goto on_error; +#define close_prot_errno_disarm(fd) \ + if (fd >= 0) { \ + int _e_ = errno; \ + close(fd); \ + errno = _e_; \ + fd = -EBADF; \ } - if ((size_t)ret != count) - goto on_error; - - close(fd); - return; - -on_error: - saved_errno = errno; - close(fd); - errno = saved_errno; - - if (ret < 0) - ksft_exit_fail_msg("%s - Failed to write to file %s\n", - strerror(errno), filename); - - ksft_exit_fail_msg("Failed to write to file %s\n", filename); -} - -static void change_to_userns(void) -{ - int ret; - uid_t uid; - gid_t gid; - /* {g,u}id_map files only allow a max of 4096 bytes written to them */ - char idmap[4096]; - - uid = getuid(); - gid = getgid(); - - ret = unshare(CLONE_NEWUSER); - if (ret < 0) - ksft_exit_fail_msg("%s - Failed to unshare user namespace\n", - strerror(errno)); - - write_to_file("/proc/self/setgroups", "deny", strlen("deny"), ENOENT); - - ret = snprintf(idmap, sizeof(idmap), "0 %d 1", uid); - if (ret < 0 || (size_t)ret >= sizeof(idmap)) - ksft_exit_fail_msg("%s - Failed to prepare uid mapping\n", - strerror(errno)); - - write_to_file("/proc/self/uid_map", idmap, strlen(idmap), 0); - - ret = snprintf(idmap, sizeof(idmap), "0 %d 1", gid); - if (ret < 0 || (size_t)ret >= sizeof(idmap)) - ksft_exit_fail_msg("%s - Failed to prepare uid mapping\n", - strerror(errno)); - - write_to_file("/proc/self/gid_map", idmap, strlen(idmap), 0); - - ret = setgid(0); - if (ret) - ksft_exit_fail_msg("%s - Failed to setgid(0)\n", - strerror(errno)); +#define log_exit(format, ...) \ + ({ \ + fprintf(stderr, format "\n", ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ + }) - ret = setuid(0); - if (ret) - ksft_exit_fail_msg("%s - Failed to setgid(0)\n", - strerror(errno)); -} - -static void change_to_mountns(void) +static void change_mountns(void) { int ret; @@ -132,36 +65,31 @@ static void rmdir_protect_errno(const char *dir) errno = saved_errno; } -static void __do_binderfs_test(void) +static int __do_binderfs_test(void) { int fd, ret, saved_errno; size_t len; ssize_t wret; - bool keep = false; struct binderfs_device device = { 0 }; struct binder_version version = { 0 }; + char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX", + device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME]; - change_to_mountns(); - - ret = mkdir("/dev/binderfs", 0755); - if (ret < 0) { - if (errno != EEXIST) - ksft_exit_fail_msg( - "%s - Failed to create binderfs mountpoint\n", - strerror(errno)); + change_mountns(); - keep = true; - } + if (!mkdtemp(binderfs_mntpt)) + ksft_exit_fail_msg( + "%s - Failed to create binderfs mountpoint\n", + strerror(errno)); - ret = mount(NULL, "/dev/binderfs", "binder", 0, 0); + ret = mount(NULL, binderfs_mntpt, "binder", 0, 0); if (ret < 0) { if (errno != ENODEV) ksft_exit_fail_msg("%s - Failed to mount binderfs\n", strerror(errno)); - keep ? : rmdir_protect_errno("/dev/binderfs"); - ksft_exit_skip( - "The Android binderfs filesystem is not available\n"); + rmdir_protect_errno(binderfs_mntpt); + return 1; } /* binderfs mount test passed */ @@ -169,7 +97,8 @@ static void __do_binderfs_test(void) memcpy(device.name, "my-binder", strlen("my-binder")); - fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC); + snprintf(device_path, sizeof(device_path), "%s/binder-control", binderfs_mntpt); + fd = open(device_path, O_RDONLY | O_CLOEXEC); if (fd < 0) ksft_exit_fail_msg( "%s - Failed to open binder-control device\n", @@ -180,7 +109,7 @@ static void __do_binderfs_test(void) close(fd); errno = saved_errno; if (ret < 0) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg( "%s - Failed to allocate new binder device\n", strerror(errno)); @@ -193,9 +122,10 @@ static void __do_binderfs_test(void) /* binder device allocation test passed */ ksft_inc_pass_cnt(); - fd = open("/dev/binderfs/my-binder", O_CLOEXEC | O_RDONLY); + snprintf(device_path, sizeof(device_path), "%s/my-binder", binderfs_mntpt); + fd = open(device_path, O_CLOEXEC | O_RDONLY); if (fd < 0) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg("%s - Failed to open my-binder device\n", strerror(errno)); } @@ -205,7 +135,7 @@ static void __do_binderfs_test(void) close(fd); errno = saved_errno; if (ret < 0) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg( "%s - Failed to open perform BINDER_VERSION request\n", strerror(errno)); @@ -217,9 +147,9 @@ static void __do_binderfs_test(void) /* binder transaction with binderfs binder device passed */ ksft_inc_pass_cnt(); - ret = unlink("/dev/binderfs/my-binder"); + ret = unlink(device_path); if (ret < 0) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg("%s - Failed to delete binder device\n", strerror(errno)); } @@ -227,12 +157,13 @@ static void __do_binderfs_test(void) /* binder device removal passed */ ksft_inc_pass_cnt(); - ret = unlink("/dev/binderfs/binder-control"); + snprintf(device_path, sizeof(device_path), "%s/binder-control", binderfs_mntpt); + ret = unlink(device_path); if (!ret) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg("Managed to delete binder-control device\n"); } else if (errno != EPERM) { - keep ? : rmdir_protect_errno("/dev/binderfs"); + rmdir_protect_errno(binderfs_mntpt); ksft_exit_fail_msg( "%s - Failed to delete binder-control device but exited with unexpected error code\n", strerror(errno)); @@ -242,34 +173,341 @@ static void __do_binderfs_test(void) ksft_inc_xfail_cnt(); on_error: - ret = umount2("/dev/binderfs", MNT_DETACH); - keep ?: rmdir_protect_errno("/dev/binderfs"); + ret = umount2(binderfs_mntpt, MNT_DETACH); + rmdir_protect_errno(binderfs_mntpt); if (ret < 0) ksft_exit_fail_msg("%s - Failed to unmount binderfs\n", strerror(errno)); /* binderfs unmount test passed */ ksft_inc_pass_cnt(); + return 0; } -static void binderfs_test_privileged() +static int wait_for_pid(pid_t pid) { - if (geteuid() != 0) - ksft_print_msg( - "Tests are not run as root. Skipping privileged tests\n"); - else - __do_binderfs_test(); + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int setid_userns_root(void) +{ + if (setuid(0)) + return -1; + if (setgid(0)) + return -1; + + setfsuid(0); + setfsgid(0); + + return 0; +} + +enum idmap_type { + UID_MAP, + GID_MAP, +}; + +static ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; +again: + ret = read(fd, buf, count); + if (ret < 0 && errno == EINTR) + goto again; + + return ret; +} + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; +again: + ret = write(fd, buf, count); + if (ret < 0 && errno == EINTR) + goto again; + + return ret; +} + +static int write_id_mapping(enum idmap_type type, pid_t pid, const char *buf, + size_t buf_size) +{ + int fd; + int ret; + char path[4096]; + + if (type == GID_MAP) { + int setgroups_fd; + + snprintf(path, sizeof(path), "/proc/%d/setgroups", pid); + setgroups_fd = open(path, O_WRONLY | O_CLOEXEC | O_NOFOLLOW); + if (setgroups_fd < 0 && errno != ENOENT) + return -1; + + if (setgroups_fd >= 0) { + ret = write_nointr(setgroups_fd, "deny", sizeof("deny") - 1); + close_prot_errno_disarm(setgroups_fd); + if (ret != sizeof("deny") - 1) + return -1; + } + } + + switch (type) { + case UID_MAP: + ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid); + break; + case GID_MAP: + ret = snprintf(path, sizeof(path), "/proc/%d/gid_map", pid); + break; + default: + return -1; + } + if (ret < 0 || ret >= sizeof(path)) + return -E2BIG; + + fd = open(path, O_WRONLY | O_CLOEXEC | O_NOFOLLOW); + if (fd < 0) + return -1; + + ret = write_nointr(fd, buf, buf_size); + close_prot_errno_disarm(fd); + if (ret != buf_size) + return -1; + + return 0; +} + +static void change_userns(int syncfds[2]) +{ + int ret; + char buf; + + close_prot_errno_disarm(syncfds[1]); + + ret = unshare(CLONE_NEWUSER); + if (ret < 0) + ksft_exit_fail_msg("%s - Failed to unshare user namespace\n", + strerror(errno)); + + ret = write_nointr(syncfds[0], "1", 1); + if (ret != 1) + ksft_exit_fail_msg("write_nointr() failed\n"); + + ret = read_nointr(syncfds[0], &buf, 1); + if (ret != 1) + ksft_exit_fail_msg("read_nointr() failed\n"); + + close_prot_errno_disarm(syncfds[0]); + + if (setid_userns_root()) + ksft_exit_fail_msg("setid_userns_root() failed"); +} + +static void change_idmaps(int syncfds[2], pid_t pid) +{ + int ret; + char buf; + char id_map[4096]; + + close_prot_errno_disarm(syncfds[0]); + + ret = read_nointr(syncfds[1], &buf, 1); + if (ret != 1) + ksft_exit_fail_msg("read_nointr() failed\n"); + + snprintf(id_map, sizeof(id_map), "0 %d 1\n", getuid()); + ret = write_id_mapping(UID_MAP, pid, id_map, strlen(id_map)); + if (ret) + ksft_exit_fail_msg("write_id_mapping(UID_MAP) failed"); + + snprintf(id_map, sizeof(id_map), "0 %d 1\n", getgid()); + ret = write_id_mapping(GID_MAP, pid, id_map, strlen(id_map)); + if (ret) + ksft_exit_fail_msg("write_id_mapping(GID_MAP) failed"); + + ret = write_nointr(syncfds[1], "1", 1); + if (ret != 1) + ksft_exit_fail_msg("write_nointr() failed"); + + close_prot_errno_disarm(syncfds[1]); } -static void binderfs_test_unprivileged() +static void *binder_version_thread(void *data) { - change_to_userns(); - __do_binderfs_test(); + int fd = PTR_TO_INT(data); + struct binder_version version = { 0 }; + int ret; + + ret = ioctl(fd, BINDER_VERSION, &version); + if (ret < 0) + ksft_print_msg("%s - Failed to open perform BINDER_VERSION request\n", strerror(errno)); + + pthread_exit(data); } -int main(int argc, char *argv[]) +/* + * Regression test: + * 2669b8b0c798 ("binder: prevent UAF for binderfs devices") + * f0fe2c0f050d ("binder: prevent UAF for binderfs devices II") + * 211b64e4b5b6 ("binderfs: use refcount for binder control devices too") + */ +TEST(binderfs_stress) { - binderfs_test_privileged(); - binderfs_test_unprivileged(); - ksft_exit_pass(); + int fds[1000]; + int syncfds[2]; + pid_t pid; + int fd, ret; + size_t len; + struct binderfs_device device = { 0 }; + char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX", + device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME]; + + ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, syncfds); + if (ret < 0) + ksft_exit_fail_msg("%s - Failed to create socket pair", strerror(errno)); + + pid = fork(); + if (pid < 0) { + close_prot_errno_disarm(syncfds[0]); + close_prot_errno_disarm(syncfds[1]); + ksft_exit_fail_msg("%s - Failed to fork", strerror(errno)); + } + + if (pid == 0) { + int i, j, k, nthreads; + pthread_attr_t attr; + pthread_t threads[DEFAULT_THREADS]; + change_userns(syncfds); + change_mountns(); + + if (!mkdtemp(binderfs_mntpt)) + log_exit("%s - Failed to create binderfs mountpoint\n", + strerror(errno)); + + ret = mount(NULL, binderfs_mntpt, "binder", 0, 0); + if (ret < 0) + log_exit("%s - Failed to mount binderfs\n", strerror(errno)); + + for (int i = 0; i < ARRAY_SIZE(fds); i++) { + + snprintf(device_path, sizeof(device_path), + "%s/binder-control", binderfs_mntpt); + fd = open(device_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + log_exit("%s - Failed to open binder-control device\n", strerror(errno)); + + memset(&device, 0, sizeof(device)); + snprintf(device.name, sizeof(device.name), "%d", i); + ret = ioctl(fd, BINDER_CTL_ADD, &device); + close_prot_errno_disarm(fd); + if (ret < 0) + log_exit("%s - Failed to allocate new binder device\n", strerror(errno)); + + snprintf(device_path, sizeof(device_path), "%s/%d", + binderfs_mntpt, i); + fds[i] = open(device_path, O_RDONLY | O_CLOEXEC); + if (fds[i] < 0) + log_exit("%s - Failed to open binder device\n", strerror(errno)); + } + + ret = umount2(binderfs_mntpt, MNT_DETACH); + rmdir_protect_errno(binderfs_mntpt); + if (ret < 0) + log_exit("%s - Failed to unmount binderfs\n", strerror(errno)); + + nthreads = get_nprocs_conf(); + if (nthreads > DEFAULT_THREADS) + nthreads = DEFAULT_THREADS; + + pthread_attr_init(&attr); + for (k = 0; k < ARRAY_SIZE(fds); k++) { + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&threads[i], &attr, binder_version_thread, INT_TO_PTR(fds[k])); + if (ret) { + ksft_print_msg("%s - Failed to create thread %d\n", strerror(errno), i); + break; + } + } + + for (j = 0; j < i; j++) { + void *fdptr = NULL; + + ret = pthread_join(threads[j], &fdptr); + if (ret) + ksft_print_msg("%s - Failed to join thread %d for fd %d\n", strerror(errno), j, PTR_TO_INT(fdptr)); + } + } + pthread_attr_destroy(&attr); + + for (k = 0; k < ARRAY_SIZE(fds); k++) + close(fds[k]); + + exit(EXIT_SUCCESS); + } + + change_idmaps(syncfds, pid); + + ret = wait_for_pid(pid); + if (ret) + ksft_exit_fail_msg("wait_for_pid() failed"); } + +TEST(binderfs_test_privileged) +{ + if (geteuid() != 0) + XFAIL(return, "Tests are not run as root. Skipping privileged tests"); + + if (__do_binderfs_test() == 1) + XFAIL(return, "The Android binderfs filesystem is not available"); +} + +TEST(binderfs_test_unprivileged) +{ + int ret; + int syncfds[2]; + pid_t pid; + + ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, syncfds); + if (ret < 0) + ksft_exit_fail_msg("%s - Failed to create socket pair", strerror(errno)); + + pid = fork(); + if (pid < 0) { + close_prot_errno_disarm(syncfds[0]); + close_prot_errno_disarm(syncfds[1]); + ksft_exit_fail_msg("%s - Failed to fork", strerror(errno)); + } + + if (pid == 0) { + change_userns(syncfds); + if (__do_binderfs_test() == 1) + exit(2); + exit(EXIT_SUCCESS); + } + + change_idmaps(syncfds, pid); + + ret = wait_for_pid(pid); + if (ret) { + if (ret == 2) + XFAIL(return, "The Android binderfs filesystem is not available"); + else + ksft_exit_fail_msg("wait_for_pid() failed"); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/epoll/.gitignore b/tools/testing/selftests/filesystems/epoll/.gitignore index 9ae8db44ec14..9090157258b1 100644 --- a/tools/testing/selftests/filesystems/epoll/.gitignore +++ b/tools/testing/selftests/filesystems/epoll/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only epoll_wakeup_test diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 37a04dab56f0..d979ff14775a 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3,17 +3,19 @@ #define _GNU_SOURCE #include <poll.h> #include <unistd.h> +#include <assert.h> #include <signal.h> #include <pthread.h> #include <sys/epoll.h> #include <sys/socket.h> +#include <sys/eventfd.h> #include "../../kselftest_harness.h" struct epoll_mtcontext { int efd[3]; int sfd[4]; - int count; + volatile int count; pthread_t main; pthread_t waiter; @@ -3071,4 +3073,213 @@ TEST(epoll58) close(ctx.sfd[3]); } +static void *epoll59_thread(void *ctx_) +{ + struct epoll_mtcontext *ctx = ctx_; + struct epoll_event e; + int i; + + for (i = 0; i < 100000; i++) { + while (ctx->count == 0) + ; + + e.events = EPOLLIN | EPOLLERR | EPOLLET; + epoll_ctl(ctx->efd[0], EPOLL_CTL_MOD, ctx->sfd[0], &e); + ctx->count = 0; + } + + return NULL; +} + +/* + * t0 + * (p) \ + * e0 + * (et) / + * e0 + * + * Based on https://bugzilla.kernel.org/show_bug.cgi?id=205933 + */ +TEST(epoll59) +{ + pthread_t emitter; + struct pollfd pfd; + struct epoll_event e; + struct epoll_mtcontext ctx = { 0 }; + int i, ret; + + signal(SIGUSR1, signal_handler); + + ctx.efd[0] = epoll_create1(0); + ASSERT_GE(ctx.efd[0], 0); + + ctx.sfd[0] = eventfd(1, 0); + ASSERT_GE(ctx.sfd[0], 0); + + e.events = EPOLLIN | EPOLLERR | EPOLLET; + ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0); + + ASSERT_EQ(pthread_create(&emitter, NULL, epoll59_thread, &ctx), 0); + + for (i = 0; i < 100000; i++) { + ret = epoll_wait(ctx.efd[0], &e, 1, 1000); + ASSERT_GT(ret, 0); + + while (ctx.count != 0) + ; + ctx.count = 1; + } + if (pthread_tryjoin_np(emitter, NULL) < 0) { + pthread_kill(emitter, SIGUSR1); + pthread_join(emitter, NULL); + } + close(ctx.efd[0]); + close(ctx.sfd[0]); +} + +enum { + EPOLL60_EVENTS_NR = 10, +}; + +struct epoll60_ctx { + volatile int stopped; + int ready; + int waiters; + int epfd; + int evfd[EPOLL60_EVENTS_NR]; +}; + +static void *epoll60_wait_thread(void *ctx_) +{ + struct epoll60_ctx *ctx = ctx_; + struct epoll_event e; + sigset_t sigmask; + uint64_t v; + int ret; + + /* Block SIGUSR1 */ + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGUSR1); + sigprocmask(SIG_SETMASK, &sigmask, NULL); + + /* Prepare empty mask for epoll_pwait() */ + sigemptyset(&sigmask); + + while (!ctx->stopped) { + /* Mark we are ready */ + __atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE); + + /* Start when all are ready */ + while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) && + !ctx->stopped); + + /* Account this waiter */ + __atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE); + + ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask); + if (ret != 1) { + /* We expect only signal delivery on stop */ + assert(ret < 0 && errno == EINTR && "Lost wakeup!\n"); + assert(ctx->stopped); + break; + } + + ret = read(e.data.fd, &v, sizeof(v)); + /* Since we are on ET mode, thus each thread gets its own fd. */ + assert(ret == sizeof(v)); + + __atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE); + } + + return NULL; +} + +static inline unsigned long long msecs(void) +{ + struct timespec ts; + unsigned long long msecs; + + clock_gettime(CLOCK_REALTIME, &ts); + msecs = ts.tv_sec * 1000ull; + msecs += ts.tv_nsec / 1000000ull; + + return msecs; +} + +static inline int count_waiters(struct epoll60_ctx *ctx) +{ + return __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE); +} + +TEST(epoll60) +{ + struct epoll60_ctx ctx = { 0 }; + pthread_t waiters[ARRAY_SIZE(ctx.evfd)]; + struct epoll_event e; + int i, n, ret; + + signal(SIGUSR1, signal_handler); + + ctx.epfd = epoll_create1(0); + ASSERT_GE(ctx.epfd, 0); + + /* Create event fds */ + for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) { + ctx.evfd[i] = eventfd(0, EFD_NONBLOCK); + ASSERT_GE(ctx.evfd[i], 0); + + e.events = EPOLLIN | EPOLLET; + e.data.fd = ctx.evfd[i]; + ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0); + } + + /* Create waiter threads */ + for (i = 0; i < ARRAY_SIZE(waiters); i++) + ASSERT_EQ(pthread_create(&waiters[i], NULL, + epoll60_wait_thread, &ctx), 0); + + for (i = 0; i < 300; i++) { + uint64_t v = 1, ms; + + /* Wait for all to be ready */ + while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) != + ARRAY_SIZE(ctx.evfd)) + ; + + /* Steady, go */ + __atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd), + __ATOMIC_ACQUIRE); + + /* Wait all have gone to kernel */ + while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd)) + ; + + /* 1ms should be enough to schedule away */ + usleep(1000); + + /* Quickly signal all handles at once */ + for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) { + ret = write(ctx.evfd[n], &v, sizeof(v)); + ASSERT_EQ(ret, sizeof(v)); + } + + /* Busy loop for 1s and wait for all waiters to wake up */ + ms = msecs(); + while (count_waiters(&ctx) && msecs() < ms + 1000) + ; + + ASSERT_EQ(count_waiters(&ctx), 0); + } + ctx.stopped = 1; + /* Stop waiters */ + for (i = 0; i < ARRAY_SIZE(waiters); i++) + ret = pthread_kill(waiters[i], SIGUSR1); + for (i = 0; i < ARRAY_SIZE(waiters); i++) + pthread_join(waiters[i], NULL); + + for (i = 0; i < ARRAY_SIZE(waiters); i++) + close(ctx.evfd[i]); + close(ctx.epfd); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/firmware/Makefile b/tools/testing/selftests/firmware/Makefile index 012b2cf69c11..40211cd8f0e6 100644 --- a/tools/testing/selftests/firmware/Makefile +++ b/tools/testing/selftests/firmware/Makefile @@ -1,13 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only # Makefile for firmware loading selftests - -# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" -all: +CFLAGS = -Wall \ + -O2 TEST_PROGS := fw_run_tests.sh TEST_FILES := fw_fallback.sh fw_filesystem.sh fw_lib.sh +TEST_GEN_FILES := fw_namespace include ../lib.mk - -# Nothing to clean up. -clean: diff --git a/tools/testing/selftests/firmware/fw_filesystem.sh b/tools/testing/selftests/firmware/fw_filesystem.sh index 56894477c8bd..fcc281373b4d 100755 --- a/tools/testing/selftests/firmware/fw_filesystem.sh +++ b/tools/testing/selftests/firmware/fw_filesystem.sh @@ -86,6 +86,29 @@ else fi fi +# Try platform (EFI embedded fw) loading too +if [ ! -e "$DIR"/trigger_request_platform ]; then + echo "$0: firmware loading: platform trigger not present, ignoring test" >&2 +else + if printf '\000' >"$DIR"/trigger_request_platform 2> /dev/null; then + echo "$0: empty filename should not succeed (platform)" >&2 + exit 1 + fi + + # Note we echo a non-existing name, since files on the file-system + # are preferred over firmware embedded inside the platform's firmware + # The test adds a fake entry with the requested name to the platform's + # fw list, so the name does not matter as long as it does not exist + if ! echo -n "nope-$NAME" >"$DIR"/trigger_request_platform ; then + echo "$0: could not trigger request platform" >&2 + exit 1 + fi + + # The test verifies itself that the loaded firmware contents matches + # the contents for the fake platform fw entry it added. + echo "$0: platform loading works" +fi + ### Batched requests tests test_config_present() { diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c new file mode 100644 index 000000000000..5ebc1aec7923 --- /dev/null +++ b/tools/testing/selftests/firmware/fw_namespace.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test triggering of loading of firmware from different mount + * namespaces. Expect firmware to be always loaded from the mount + * namespace of PID 1. */ +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 +#endif + +static char *fw_path = NULL; + +static void die(char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + if (fw_path) + unlink(fw_path); + umount("/lib/firmware"); + exit(EXIT_FAILURE); +} + +static void trigger_fw(const char *fw_name, const char *sys_path) +{ + int fd; + + fd = open(sys_path, O_WRONLY); + if (fd < 0) + die("open failed: %s\n", + strerror(errno)); + if (write(fd, fw_name, strlen(fw_name)) != strlen(fw_name)) + exit(EXIT_FAILURE); + close(fd); +} + +static void setup_fw(const char *fw_path) +{ + int fd; + const char fw[] = "ABCD0123"; + + fd = open(fw_path, O_WRONLY | O_CREAT, 0600); + if (fd < 0) + die("open failed: %s\n", + strerror(errno)); + if (write(fd, fw, sizeof(fw) -1) != sizeof(fw) -1) + die("write failed: %s\n", + strerror(errno)); + close(fd); +} + +static bool test_fw_in_ns(const char *fw_name, const char *sys_path, bool block_fw_in_parent_ns) +{ + pid_t child; + + if (block_fw_in_parent_ns) + if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1) + die("blocking firmware in parent ns failed\n"); + + child = fork(); + if (child == -1) { + die("fork failed: %s\n", + strerror(errno)); + } + if (child != 0) { /* parent */ + pid_t pid; + int status; + + pid = waitpid(child, &status, 0); + if (pid == -1) { + die("waitpid failed: %s\n", + strerror(errno)); + } + if (pid != child) { + die("waited for %d got %d\n", + child, pid); + } + if (!WIFEXITED(status)) { + die("child did not terminate cleanly\n"); + } + if (block_fw_in_parent_ns) + umount("/lib/firmware"); + return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; + } + + if (unshare(CLONE_NEWNS) != 0) { + die("unshare(CLONE_NEWNS) failed: %s\n", + strerror(errno)); + } + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) == -1) + die("remount root in child ns failed\n"); + + if (!block_fw_in_parent_ns) { + if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1) + die("blocking firmware in child ns failed\n"); + } else + umount("/lib/firmware"); + + trigger_fw(fw_name, sys_path); + + exit(EXIT_SUCCESS); +} + +int main(int argc, char **argv) +{ + const char *fw_name = "test-firmware.bin"; + char *sys_path; + if (argc != 2) + die("usage: %s sys_path\n", argv[0]); + + /* Mount tmpfs to /lib/firmware so we don't have to assume + that it is writable for us.*/ + if (mount("test", "/lib/firmware", "tmpfs", 0, NULL) == -1) + die("mounting tmpfs to /lib/firmware failed\n"); + + sys_path = argv[1]; + asprintf(&fw_path, "/lib/firmware/%s", fw_name); + + setup_fw(fw_path); + + setvbuf(stdout, NULL, _IONBF, 0); + /* Positive case: firmware in PID1 mount namespace */ + printf("Testing with firmware in parent namespace (assumed to be same file system as PID1)\n"); + if (!test_fw_in_ns(fw_name, sys_path, false)) + die("error: failed to access firmware\n"); + + /* Negative case: firmware in child mount namespace, expected to fail */ + printf("Testing with firmware in child namespace\n"); + if (test_fw_in_ns(fw_name, sys_path, true)) + die("error: firmware access did not fail\n"); + + unlink(fw_path); + free(fw_path); + umount("/lib/firmware"); + exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/firmware/fw_run_tests.sh b/tools/testing/selftests/firmware/fw_run_tests.sh index 8e14d555c197..777377078d5e 100755 --- a/tools/testing/selftests/firmware/fw_run_tests.sh +++ b/tools/testing/selftests/firmware/fw_run_tests.sh @@ -61,6 +61,10 @@ run_test_config_0003() check_mods check_setup +echo "Running namespace test: " +$TEST_DIR/fw_namespace $DIR/trigger_request +echo "OK" + if [ -f $FW_FORCE_SYSFS_FALLBACK ]; then run_test_config_0001 run_test_config_0002 diff --git a/tools/testing/selftests/ftrace/.gitignore b/tools/testing/selftests/ftrace/.gitignore index 98d8a5a63049..2659417cb2c7 100644 --- a/tools/testing/selftests/ftrace/.gitignore +++ b/tools/testing/selftests/ftrace/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only logs diff --git a/tools/testing/selftests/ftrace/config b/tools/testing/selftests/ftrace/config index c2c8de4fafff..e59d985eeff0 100644 --- a/tools/testing/selftests/ftrace/config +++ b/tools/testing/selftests/ftrace/config @@ -11,5 +11,6 @@ CONFIG_PREEMPTIRQ_DELAY_TEST=m CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_SAMPLES=y +CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_SAMPLE_TRACE_PRINTK=m CONFIG_KALLSYMS_ALL=y diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 063ecb290a5a..a4605b5ee66d 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -17,6 +17,7 @@ echo " -v|--verbose Increase verbosity of test messages" echo " -vv Alias of -v -v (Show all results in stdout)" echo " -vvv Alias of -v -v -v (Show all commands immediately)" echo " --fail-unsupported Treat UNSUPPORTED as a failure" +echo " --fail-unresolved Treat UNRESOLVED as a failure" echo " -d|--debug Debug mode (trace all shell commands)" echo " -l|--logdir <dir> Save logs on the <dir>" echo " If <dir> is -, all logs output in console only" @@ -29,8 +30,25 @@ err_ret=1 # kselftest skip code is 4 err_skip=4 +# cgroup RT scheduling prevents chrt commands from succeeding, which +# induces failures in test wakeup tests. Disable for the duration of +# the tests. + +readonly sched_rt_runtime=/proc/sys/kernel/sched_rt_runtime_us + +sched_rt_runtime_orig=$(cat $sched_rt_runtime) + +setup() { + echo -1 > $sched_rt_runtime +} + +cleanup() { + echo $sched_rt_runtime_orig > $sched_rt_runtime +} + errexit() { # message echo "Error: $1" 1>&2 + cleanup exit $err_ret } @@ -39,6 +57,8 @@ if [ `id -u` -ne 0 ]; then errexit "this must be run by root user" fi +setup + # Utilities absdir() { # file_path (cd `dirname $1`; pwd) @@ -93,6 +113,10 @@ parse_opts() { # opts UNSUPPORTED_RESULT=1 shift 1 ;; + --fail-unresolved) + UNRESOLVED_RESULT=1 + shift 1 + ;; --logdir|-l) LOG_DIR=$2 shift 2 @@ -157,6 +181,7 @@ KEEP_LOG=0 DEBUG=0 VERBOSE=0 UNSUPPORTED_RESULT=0 +UNRESOLVED_RESULT=0 STOP_FAILURE=0 # Parse command-line options parse_opts $* @@ -235,6 +260,7 @@ TOTAL_RESULT=0 INSTANCE= CASENO=0 + testcase() { # testfile CASENO=$((CASENO+1)) desc=`grep "^#[ \t]*description:" $1 | cut -f2 -d:` @@ -260,7 +286,7 @@ eval_result() { # sigval $UNRESOLVED) prlog " [${color_blue}UNRESOLVED${color_reset}]" UNRESOLVED_CASES="$UNRESOLVED_CASES $CASENO" - return 1 # this is a kind of bug.. something happened. + return $UNRESOLVED_RESULT # depends on use case ;; $UNTESTED) prlog " [${color_blue}UNTESTED${color_reset}]" @@ -273,7 +299,7 @@ eval_result() { # sigval return $UNSUPPORTED_RESULT # depends on use case ;; $XFAIL) - prlog " [${color_red}XFAIL${color_reset}]" + prlog " [${color_green}XFAIL${color_reset}]" XFAILED_CASES="$XFAILED_CASES $CASENO" return 0 ;; @@ -406,5 +432,7 @@ prlog "# of unsupported: " `echo $UNSUPPORTED_CASES | wc -w` prlog "# of xfailed: " `echo $XFAILED_CASES | wc -w` prlog "# of undefined(test bug): " `echo $UNDEFINED_CASES | wc -w` +cleanup + # if no error, return 0 exit $TOTAL_RESULT diff --git a/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc new file mode 100644 index 000000000000..f0f366f18d0c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc @@ -0,0 +1,125 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event tracing - restricts events based on pid notrace filtering +# flags: instance + +do_reset() { + echo > set_event + echo > set_event_pid + echo > set_event_notrace_pid + echo 0 > options/event-fork + echo 0 > events/enable + clear_trace + echo 1 > tracing_on +} + +fail() { #msg + cat trace + do_reset + echo $1 + exit_fail +} + +count_pid() { + pid=$@ + cat trace | grep -v '^#' | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep $pid | wc -l +} + +count_no_pid() { + pid=$1 + cat trace | grep -v '^#' | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep -v $pid | wc -l +} + +enable_system() { + system=$1 + + if [ -d events/$system ]; then + echo 1 > events/$system/enable + fi +} + +enable_events() { + echo 0 > tracing_on + # Enable common groups of events, as all events can allow for + # events to be traced via scheduling that we don't care to test. + enable_system syscalls + enable_system rcu + enable_system block + enable_system exceptions + enable_system irq + enable_system net + enable_system power + enable_system signal + enable_system sock + enable_system timer + enable_system thermal + echo 1 > tracing_on +} + +if [ ! -f set_event -o ! -d events/sched ]; then + echo "event tracing is not supported" + exit_unsupported +fi + +if [ ! -f set_event_pid -o ! -f set_event_notrace_pid ]; then + echo "event pid notrace filtering is not supported" + exit_unsupported +fi + +echo 0 > options/event-fork + +do_reset + +read mypid rest < /proc/self/stat + +echo $mypid > set_event_notrace_pid +grep -q $mypid set_event_notrace_pid + +enable_events + +yield + +echo 0 > tracing_on + +cnt=`count_pid $mypid` +if [ $cnt -ne 0 ]; then + fail "Filtered out task has events" +fi + +cnt=`count_no_pid $mypid` +if [ $cnt -eq 0 ]; then + fail "No other events were recorded" +fi + +do_reset + +echo $mypid > set_event_notrace_pid +echo 1 > options/event-fork + +enable_events + +yield & +child=$! +echo "child = $child" +wait $child + +echo 0 > tracing_on + +cnt=`count_pid $mypid` +if [ $cnt -ne 0 ]; then + fail "Filtered out task has events" +fi + +cnt=`count_pid $child` +if [ $cnt -ne 0 ]; then + fail "Child of filtered out taskhas events" +fi + +cnt=`count_no_pid $mypid` +if [ $cnt -eq 0 ]; then + fail "No other events were recorded" +fi + +do_reset + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc index aefab0c66d54..f59853857ad2 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc @@ -10,10 +10,7 @@ if ! grep -q function_graph available_tracers; then exit_unsupported fi -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is dynamic ftrace not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter do_reset() { if [ -e /proc/sys/kernel/stack_tracer_enabled ]; then diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc index c8a5209f2119..d610f47edd90 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc @@ -9,6 +9,8 @@ if ! grep -q function_graph available_tracers; then exit_unsupported fi +check_filter_file set_ftrace_filter + fail() { # msg echo $1 exit_fail diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc index f4e92afab14b..28936f434ee5 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc @@ -9,6 +9,8 @@ if ! grep -q function available_tracers; then exit_unsupported fi +check_filter_file set_ftrace_filter + disable_tracing clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc new file mode 100644 index 000000000000..71db68a7975f --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc @@ -0,0 +1,105 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: ftrace - function pid notrace filters +# flags: instance + +# Make sure that function pid matching filter with notrace works. + +if ! grep -q function available_tracers; then + echo "no function tracer configured" + exit_unsupported +fi + +if [ ! -f set_ftrace_notrace_pid ]; then + echo "set_ftrace_notrace_pid not found? Is function tracer not set?" + exit_unsupported +fi + +check_filter_file set_ftrace_filter + +do_function_fork=1 + +if [ ! -f options/function-fork ]; then + do_function_fork=0 + echo "no option for function-fork found. Option will not be tested." +fi + +read PID _ < /proc/self/stat + +if [ $do_function_fork -eq 1 ]; then + # default value of function-fork option + orig_value=`grep function-fork trace_options` +fi + +do_reset() { + if [ $do_function_fork -eq 0 ]; then + return + fi + + echo > set_ftrace_notrace_pid + echo $orig_value > trace_options +} + +fail() { # msg + do_reset + echo $1 + exit_fail +} + +do_test() { + disable_tracing + + echo do_execve* > set_ftrace_filter + echo *do_fork >> set_ftrace_filter + + echo $PID > set_ftrace_notrace_pid + echo function > current_tracer + + if [ $do_function_fork -eq 1 ]; then + # don't allow children to be traced + echo nofunction-fork > trace_options + fi + + enable_tracing + yield + + count_pid=`cat trace | grep -v ^# | grep $PID | wc -l` + count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l` + + # count_pid should be 0 + if [ $count_pid -ne 0 -o $count_other -eq 0 ]; then + fail "PID filtering not working? traced task = $count_pid; other tasks = $count_other " + fi + + disable_tracing + clear_trace + + if [ $do_function_fork -eq 0 ]; then + return + fi + + # allow children to be traced + echo function-fork > trace_options + + # With pid in both set_ftrace_pid and set_ftrace_notrace_pid + # there should not be any tasks traced. + + echo $PID > set_ftrace_pid + + enable_tracing + yield + + count_pid=`cat trace | grep -v ^# | grep $PID | wc -l` + count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l` + + # both should be zero + if [ $count_pid -ne 0 -o $count_other -ne 0 ]; then + fail "PID filtering not following fork? traced task = $count_pid; other tasks = $count_other " + fi +} + +do_test + +do_reset + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc index f2ee1e889e13..d58403c4b7cd 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc @@ -16,10 +16,7 @@ if [ ! -f set_ftrace_pid ]; then exit_unsupported fi -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is function tracer not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter do_function_fork=1 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc index 1a52f2883fe0..b2aff786c1a2 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc @@ -3,7 +3,7 @@ # description: ftrace - stacktrace filter command # flags: instance -[ ! -f set_ftrace_filter ] && exit_unsupported +check_filter_file set_ftrace_filter echo _do_fork:stacktrace >> set_ftrace_filter diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc index ca2ffd7957f9..e9b1fd534e96 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc @@ -11,10 +11,7 @@ # # The triggers are set within the set_ftrace_filter file -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is dynamic ftrace not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter do_reset() { reset_ftrace_filter diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc index 9330c873f9fe..1a4b4a442d33 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 # description: ftrace - function trace on module -[ ! -f set_ftrace_filter ] && exit_unsupported +check_filter_file set_ftrace_filter : "mod: allows to filter a non exist function" echo 'non_exist_func:mod:non_exist_module' > set_ftrace_filter diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc index dfbae637c60c..a3dadb6b93b4 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc @@ -18,10 +18,7 @@ if ! grep -q function_graph available_tracers; then exit_unsupported; fi -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is dynamic ftrace not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter if [ ! -f function_profile_enabled ]; then echo "function_profile_enabled not found, function profiling enabled?" diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc index 51f6e6146bd9..70bad441fa7d 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc @@ -10,10 +10,7 @@ # # The triggers are set within the set_ftrace_filter file -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is dynamic ftrace not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter fail() { # mesg echo $1 diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc index b414f0e3c646..51e9e80bc0e6 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc @@ -8,6 +8,8 @@ if [ ! -f stack_trace ]; then exit_unsupported fi +check_filter_file stack_trace_filter + echo > stack_trace_filter echo 0 > stack_max_size echo 1 > /proc/sys/kernel/stack_tracer_enabled diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc index 0c04282d33dd..3ed173f2944a 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc @@ -11,10 +11,7 @@ # # The triggers are set within the set_ftrace_filter file -if [ ! -f set_ftrace_filter ]; then - echo "set_ftrace_filter not found? Is dynamic ftrace not set?" - exit_unsupported -fi +check_filter_file set_ftrace_filter fail() { # mesg echo $1 @@ -41,7 +38,7 @@ fi echo '** ENABLE EVENTS' -echo 1 > events/enable +echo 1 > events/sched/enable echo '** ENABLE TRACING' enable_tracing diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc index 021c03fd885d..23465823532b 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc @@ -14,6 +14,8 @@ if [ ! -f set_event ]; then exit_unsupported fi +[ -f error_log ] || exit_unsupported + ftrace_errlog_check 'event filter parse error' '((sig >= 10 && sig < 15) || dsig ^== 17) && comm != bash' 'events/signal/signal_generate/filter' exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 5d4550591ff9..697c77ef2e2b 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -1,3 +1,9 @@ +check_filter_file() { # check filter file introduced by dynamic ftrace + if [ ! -f "$1" ]; then + echo "$1 not found? Is dynamic ftrace not set?" + exit_unsupported + fi +} clear_trace() { # reset trace output echo > trace @@ -113,12 +119,14 @@ yield() { ping $LOCALHOST -c 1 || sleep .001 || usleep 1 || sleep 1 } +# Since probe event command may include backslash, explicitly use printf "%s" +# to NOT interpret it. ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file - pos=$(echo -n "${2%^*}" | wc -c) # error position - command=$(echo "$2" | tr -d ^) + pos=$(printf "%s" "${2%^*}" | wc -c) # error position + command=$(printf "%s" "$2" | tr -d ^) echo "Test command: $command" echo > error_log - (! echo "$command" >> "$3" ) 2> /dev/null + (! printf "%s" "$command" >> "$3" ) 2> /dev/null grep "$1: error:" -A 3 error_log N=$(tail -n 1 error_log | wc -c) # " Command: " and "^\n" => 13 diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc index 1bcb67dcae26..81490ecaaa92 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc @@ -38,7 +38,7 @@ for width in 64 32 16 8; do echo 0 > events/kprobes/testprobe/enable : "Confirm the arguments is recorded in given types correctly" - ARGS=`grep "testprobe" trace | sed -e 's/.* arg1=\(.*\) arg2=\(.*\) arg3=\(.*\) arg4=\(.*\)/\1 \2 \3 \4/'` + ARGS=`grep "testprobe" trace | head -n 1 | sed -e 's/.* arg1=\(.*\) arg2=\(.*\) arg3=\(.*\) arg4=\(.*\)/\1 \2 \3 \4/'` check_types $ARGS $width : "Clear event for next loop" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc index 7650a82db3f5..df5072815b87 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc @@ -5,6 +5,8 @@ [ -f kprobe_events ] || exit_unsupported # this is configurable grep "function" available_tracers || exit_unsupported # this is configurable +check_filter_file set_ftrace_filter + # prepare echo nop > current_tracer echo _do_fork > set_ftrace_filter diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index ef1e9bafb098..eb0f4ab4e070 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -91,7 +91,9 @@ esac if grep -q "Create/append/" README && grep -q "imm-value" README; then echo 'p:kprobes/testevent _do_fork' > kprobe_events check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE -echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events + +# Explicitly use printf "%s" to not interpret \1 +printf "%s" 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE diff --git a/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc b/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc index cbd174334a48..2b82c80edf69 100644 --- a/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc +++ b/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc @@ -17,7 +17,14 @@ unsup() { #msg exit_unsupported } -modprobe $MOD || unsup "$MOD module not available" +unres() { #msg + reset_tracer + rmmod $MOD || true + echo $1 + exit_unresolved +} + +modprobe $MOD || unres "$MOD module not available" rmmod $MOD grep -q "preemptoff" available_tracers || unsup "preemptoff tracer not enabled" diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc index 77be6e1f6e7b..e232059a8ab2 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test field variable support" echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc index f3eb8aacec0e..07cfcb8157b6 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'waking_latency u64 lat pid_t pid' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc index d281f056f980..73e413c2ca26 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test multiple actions on hist trigger" echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events TRIGGER1=events/sched/sched_wakeup/trigger diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc index a708f0e7858a..ebe0ad827f9f 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc index dfce6932d8be..2a2ef767249e 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc index 0035995c2194..98d73bfb0296 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc index f546c1b66a9b..01b01b9c4e07 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc @@ -12,6 +12,11 @@ if [ ! -f set_event ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + if [ ! -f snapshot ]; then echo "snapshot is not supported" exit_unsupported diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc index 8021d60aafec..c3baa486aeb4 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + grep -q "trace(<synthetic_event>" README || exit_unsupported # version issue echo "Test create synthetic event" diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc index 18fdaab9f570..68ff3f45c720 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc @@ -23,7 +23,7 @@ if [ ! -f events/sched/sched_process_fork/hist ]; then exit_unsupported fi -echo "Test histogram multiple tiggers" +echo "Test histogram multiple triggers" echo 'hist:keys=parent_pid:vals=child_pid' > events/sched/sched_process_fork/trigger echo 'hist:keys=parent_comm:vals=child_pid' >> events/sched/sched_process_fork/trigger diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index a09f57061902..0efcd494daab 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only futex_requeue_pi futex_requeue_pi_mismatched_ops futex_requeue_pi_signal_restart diff --git a/tools/testing/selftests/gen_kselftest_tar.sh b/tools/testing/selftests/gen_kselftest_tar.sh index 8b2b6088540d..4a974bc03385 100755 --- a/tools/testing/selftests/gen_kselftest_tar.sh +++ b/tools/testing/selftests/gen_kselftest_tar.sh @@ -49,6 +49,11 @@ main() # directory ./kselftest_install.sh "$install_dir" (cd "$install_work"; tar $copts "$dest"/kselftest${ext} $install_name) + + # Don't put the message at the actual end as people may be parsing the + # "archive created" line in their scripts. + echo -e "\nConsider using 'make gen_tar' instead of this script\n" + echo "Kselftest archive kselftest${ext} created!" # clean up top-level install work directory diff --git a/tools/testing/selftests/gpio/.gitignore b/tools/testing/selftests/gpio/.gitignore index 7d14f743d1a4..4c69408f3e84 100644 --- a/tools/testing/selftests/gpio/.gitignore +++ b/tools/testing/selftests/gpio/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only gpio-mockup-chardev diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 0bb80619db58..32bdc978a711 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -1,13 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 -MOUNT_CFLAGS := $(shell pkg-config --cflags mount 2>/dev/null) -MOUNT_LDLIBS := $(shell pkg-config --libs mount 2>/dev/null) -ifeq ($(MOUNT_LDLIBS),) -MOUNT_LDLIBS := -lmount -I/usr/include/libmount +VAR_CFLAGS := $(shell pkg-config --cflags mount 2>/dev/null) +VAR_LDLIBS := $(shell pkg-config --libs mount 2>/dev/null) +ifeq ($(VAR_LDLIBS),) +VAR_LDLIBS := -lmount -I/usr/include/libmount endif -CFLAGS += -O2 -g -std=gnu99 -Wall -I../../../../usr/include/ $(MOUNT_CFLAGS) -LDLIBS += $(MOUNT_LDLIBS) +CFLAGS += -O2 -g -std=gnu99 -Wall -I../../../../usr/include/ $(VAR_CFLAGS) +LDLIBS += $(VAR_LDLIBS) TEST_PROGS := gpio-mockup.sh TEST_FILES := gpio-mockup-sysfs.sh diff --git a/tools/testing/selftests/ia64/.gitignore b/tools/testing/selftests/ia64/.gitignore index ab806edc8732..e962fb2a08d5 100644 --- a/tools/testing/selftests/ia64/.gitignore +++ b/tools/testing/selftests/ia64/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only aliasing-test diff --git a/tools/testing/selftests/intel_pstate/.gitignore b/tools/testing/selftests/intel_pstate/.gitignore index 3bfcbae5fa13..862de222a3f3 100644 --- a/tools/testing/selftests/intel_pstate/.gitignore +++ b/tools/testing/selftests/intel_pstate/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only aperf msr diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile index 7340fd6a9a9f..39f0fa2a8fd6 100644 --- a/tools/testing/selftests/intel_pstate/Makefile +++ b/tools/testing/selftests/intel_pstate/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE -LDLIBS := $(LDLIBS) -lm +LDLIBS += -lm uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) diff --git a/tools/testing/selftests/ipc/.gitignore b/tools/testing/selftests/ipc/.gitignore index 9af04c9353c0..9ed280e4c704 100644 --- a/tools/testing/selftests/ipc/.gitignore +++ b/tools/testing/selftests/ipc/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only msgque_test msgque diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index 4c156aeab6b8..5ec4d9e18806 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -137,7 +137,7 @@ int dump_queue(struct msgque_data *msgque) for (kern_id = 0; kern_id < 256; kern_id++) { ret = msgctl(kern_id, MSG_STAT, &ds); if (ret < 0) { - if (errno == -EINVAL) + if (errno == EINVAL) continue; printf("Failed to get stats for IPC queue with id %d\n", kern_id); diff --git a/tools/testing/selftests/ir/.gitignore b/tools/testing/selftests/ir/.gitignore index 070ea0c75fb8..0bbada8c1811 100644 --- a/tools/testing/selftests/ir/.gitignore +++ b/tools/testing/selftests/ir/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only ir_loopback diff --git a/tools/testing/selftests/kcmp/.gitignore b/tools/testing/selftests/kcmp/.gitignore index 5a9b3732b2de..38ccdfe80ef7 100644 --- a/tools/testing/selftests/kcmp/.gitignore +++ b/tools/testing/selftests/kcmp/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only kcmp_test kcmp-test-file diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 8b944cf042f6..3702dbcc90a7 100755 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -61,6 +61,8 @@ ALL_TESTS="$ALL_TESTS 0006:10:1" ALL_TESTS="$ALL_TESTS 0007:5:1" ALL_TESTS="$ALL_TESTS 0008:150:1" ALL_TESTS="$ALL_TESTS 0009:150:1" +ALL_TESTS="$ALL_TESTS 0010:1:1" +ALL_TESTS="$ALL_TESTS 0011:1:1" # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -149,6 +151,7 @@ function load_req_mod() test_finish() { + echo "$MODPROBE" > /proc/sys/kernel/modprobe echo "Test completed" } @@ -443,6 +446,30 @@ kmod_test_0009() config_expect_result ${FUNCNAME[0]} SUCCESS } +kmod_test_0010() +{ + kmod_defaults_driver + config_num_threads 1 + echo "/KMOD_TEST_NONEXISTENT" > /proc/sys/kernel/modprobe + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -ENOENT + echo "$MODPROBE" > /proc/sys/kernel/modprobe +} + +kmod_test_0011() +{ + kmod_defaults_driver + config_num_threads 1 + # This causes the kernel to not even try executing modprobe. The error + # code is still -ENOENT like when modprobe doesn't exist, so we can't + # easily test for the exact difference. But this still is a useful test + # since there was a bug where request_module() returned 0 in this case. + echo > /proc/sys/kernel/modprobe + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -ENOENT + echo "$MODPROBE" > /proc/sys/kernel/modprobe +} + list_tests() { echo "Test ID list:" @@ -460,6 +487,8 @@ list_tests() echo "0007 x $(get_test_count 0007) - multithreaded tests with default setup test request_module() and get_fs_type()" echo "0008 x $(get_test_count 0008) - multithreaded - push kmod_concurrent over max_modprobes for request_module()" echo "0009 x $(get_test_count 0009) - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" + echo "0010 x $(get_test_count 0010) - test nonexistent modprobe path" + echo "0011 x $(get_test_count 0011) - test completely disabling module autoloading" } usage() @@ -505,18 +534,23 @@ function test_num() fi } -function get_test_count() +function get_test_data() { test_num $1 - TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + local field_num=$(echo $1 | sed 's/^0*//') + echo $ALL_TESTS | awk '{print $'$field_num'}' +} + +function get_test_count() +{ + TEST_DATA=$(get_test_data $1) LAST_TWO=${TEST_DATA#*:*} echo ${LAST_TWO%:*} } function get_test_enabled() { - test_num $1 - TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + TEST_DATA=$(get_test_data $1) echo ${TEST_DATA#*:*:} } @@ -611,6 +645,7 @@ test_reqs allow_user_defaults load_req_mod +MODPROBE=$(</proc/sys/kernel/modprobe) trap "test_finish" EXIT parse_args $@ diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index e84d901f8567..676b3a8b114d 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -33,7 +33,7 @@ tap_timeout() { # Make sure tests will time out if utility is available. if [ -x /usr/bin/timeout ] ; then - /usr/bin/timeout "$kselftest_timeout" "$1" + /usr/bin/timeout --foreground "$kselftest_timeout" "$1" else "$1" fi diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh new file mode 100755 index 000000000000..bbc04646346b --- /dev/null +++ b/tools/testing/selftests/kselftest_deps.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# kselftest_deps.sh +# +# Checks for kselftest build dependencies on the build system. +# Copyright (c) 2020 Shuah Khan <skhan@linuxfoundation.org> +# +# + +usage() +{ + +echo -e "Usage: $0 -[p] <compiler> [test_name]\n" +echo -e "\tkselftest_deps.sh [-p] gcc" +echo -e "\tkselftest_deps.sh [-p] gcc vm" +echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc" +echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n" +echo "- Should be run in selftests directory in the kernel repo." +echo "- Checks if Kselftests can be built/cross-built on a system." +echo "- Parses all test/sub-test Makefile to find library dependencies." +echo "- Runs compile test on a trivial C file with LDLIBS specified" +echo " in the test Makefiles to identify missing library dependencies." +echo "- Prints suggested target list for a system filtering out tests" +echo " failed the build dependency check from the TARGETS in Selftests" +echo " main Makefile when optional -p is specified." +echo "- Prints pass/fail dependency check for each tests/sub-test." +echo "- Prints pass/fail targets and libraries." +echo "- Default: runs dependency checks on all tests." +echo "- Optional test name can be specified to check dependencies for it." +exit 1 + +} + +# Start main() +main() +{ + +base_dir=`pwd` +# Make sure we're in the selftests top-level directory. +if [ $(basename "$base_dir") != "selftests" ]; then + echo -e "\tPlease run $0 in" + echo -e "\ttools/testing/selftests directory ..." + exit 1 +fi + +print_targets=0 + +while getopts "p" arg; do + case $arg in + p) + print_targets=1 + shift;; + esac +done + +if [ $# -eq 0 ] +then + usage +fi + +# Compiler +CC=$1 + +tmp_file=$(mktemp).c +trap "rm -f $tmp_file.o $tmp_file $tmp_file.bin" EXIT +#echo $tmp_file + +pass=$(mktemp).out +trap "rm -f $pass" EXIT +#echo $pass + +fail=$(mktemp).out +trap "rm -f $fail" EXIT +#echo $fail + +# Generate tmp source fire for compile test +cat << "EOF" > $tmp_file +int main() +{ +} +EOF + +# Save results +total_cnt=0 +fail_trgts=() +fail_libs=() +fail_cnt=0 +pass_trgts=() +pass_libs=() +pass_cnt=0 + +# Get all TARGETS from selftests Makefile +targets=$(egrep "^TARGETS +|^TARGETS =" Makefile | cut -d "=" -f2) + +# Single test case +if [ $# -eq 2 ] +then + test=$2/Makefile + + l1_test $test + l2_test $test + l3_test $test + + print_results $1 $2 + exit $? +fi + +# Level 1: LDLIBS set static. +# +# Find all LDLIBS set statically for all executables built by a Makefile +# and filter out VAR_LDLIBS to discard the following: +# gpio/Makefile:LDLIBS += $(VAR_LDLIBS) +# Append space at the end of the list to append more tests. + +l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ + grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + +# Level 2: LDLIBS set dynamically. +# +# Level 2 +# Some tests have multiple valid LDLIBS lines for individual sub-tests +# that need dependency checks. Find them and append them to the tests +# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread +# Filter out VAR_LDLIBS to discard the following: +# memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) +# Append space at the end of the list to append more tests. + +l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \ + grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + +# Level 3 +# gpio, memfd and others use pkg-config to find mount and fuse libs +# respectively and save it in VAR_LDLIBS. If pkg-config doesn't find +# any, VAR_LDLIBS set to default. +# Use the default value and filter out pkg-config for dependency check. +# e.g: +# gpio/Makefile +# VAR_LDLIBS := $(shell pkg-config --libs mount) 2>/dev/null) +# memfd/Makefile +# VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null) + +l3_tests=$(grep -r --include=Makefile "^VAR_LDLIBS" | \ + grep -v "pkg-config" | awk -F: '{print $1}') + +#echo $l1_tests +#echo $l2_1_tests +#echo $l3_tests + +all_tests +print_results $1 $2 + +exit $? +} +# end main() + +all_tests() +{ + for test in $l1_tests; do + l1_test $test + done + + for test in $l2_tests; do + l2_test $test + done + + for test in $l3_tests; do + l3_test $test + done +} + +# Use same parsing used for l1_tests and pick libraries this time. +l1_test() +{ + test_libs=$(grep --include=Makefile "^LDLIBS" $test | \ + grep -v "VAR_LDLIBS" | \ + sed -e 's/\:/ /' | \ + sed -e 's/+/ /' | cut -d "=" -f 2) + + check_libs $test $test_libs +} + +# Use same parsing used for l2__tests and pick libraries this time. +l2_test() +{ + test_libs=$(grep --include=Makefile ": LDLIBS" $test | \ + grep -v "VAR_LDLIBS" | \ + sed -e 's/\:/ /' | sed -e 's/+/ /' | \ + cut -d "=" -f 2) + + check_libs $test $test_libs +} + +l3_test() +{ + test_libs=$(grep --include=Makefile "^VAR_LDLIBS" $test | \ + grep -v "pkg-config" | sed -e 's/\:/ /' | + sed -e 's/+/ /' | cut -d "=" -f 2) + + check_libs $test $test_libs +} + +check_libs() +{ + +if [[ ! -z "${test_libs// }" ]] +then + + #echo $test_libs + + for lib in $test_libs; do + + let total_cnt+=1 + $CC -o $tmp_file.bin $lib $tmp_file > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "FAIL: $test dependency check: $lib" >> $fail + let fail_cnt+=1 + fail_libs+="$lib " + fail_target=$(echo "$test" | cut -d "/" -f1) + fail_trgts+="$fail_target " + targets=$(echo "$targets" | grep -v "$fail_target") + else + echo "PASS: $test dependency check passed $lib" >> $pass + let pass_cnt+=1 + pass_libs+="$lib " + pass_trgts+="$(echo "$test" | cut -d "/" -f1) " + fi + + done +fi +} + +print_results() +{ + echo -e "========================================================"; + echo -e "Kselftest Dependency Check for [$0 $1 $2] results..." + + if [ $print_targets -ne 0 ] + then + echo -e "Suggested Selftest Targets for your configuration:" + echo -e "$targets"; + fi + + echo -e "========================================================"; + echo -e "Checked tests defining LDLIBS dependencies" + echo -e "--------------------------------------------------------"; + echo -e "Total tests with Dependencies:" + echo -e "$total_cnt Pass: $pass_cnt Fail: $fail_cnt"; + + if [ $pass_cnt -ne 0 ]; then + echo -e "--------------------------------------------------------"; + cat $pass + echo -e "--------------------------------------------------------"; + echo -e "Targets passed build dependency check on system:" + echo -e "$(echo "$pass_trgts" | xargs -n1 | sort -u | xargs)" + fi + + if [ $fail_cnt -ne 0 ]; then + echo -e "--------------------------------------------------------"; + cat $fail + echo -e "--------------------------------------------------------"; + echo -e "Targets failed build dependency check on system:" + echo -e "$(echo "$fail_trgts" | xargs -n1 | sort -u | xargs)" + echo -e "--------------------------------------------------------"; + echo -e "Missing libraries system" + echo -e "$(echo "$fail_libs" | xargs -n1 | sort -u | xargs)" + fi + + echo -e "--------------------------------------------------------"; + echo -e "========================================================"; +} + +main "$@" diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 5336b26506ab..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,17 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &wrapper_##test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +220,13 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). */ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -241,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. @@ -264,6 +278,59 @@ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) /** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_ADD(datatype name) { + * .property1 = val1; + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant. + */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + +/** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases * @@ -293,24 +360,27 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." #test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -321,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -631,28 +703,84 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + +struct __test_metadata; +struct __fixture_variant_metadata; + +/* Contains all the information about a fixture. */ +struct __fixture_metadata { + const char *name; + struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __LIST_APPEND(__fixture_list, f); +} + +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); + pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ - int timeout; + int timeout; /* seconds to wait for test timeout */ + bool timed_out; /* did this test timeout instead of exiting? */ __u8 step; bool no_print; /* manual trigger when TH_LOG_STREAM is not available */ struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; -static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -664,25 +792,7 @@ static int __constructor_order; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; - /* Circular linked list where only prev is circular. */ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -695,84 +805,160 @@ static inline int __bail(int for_realz, bool no_print, __u8 step) return 0; } -void __run_test(struct __test_metadata *t) +struct __test_metadata *__active_test; +static void __timeout_handler(int sig, siginfo_t *info, void *ucontext) +{ + struct __test_metadata *t = __active_test; + + /* Sanity check handler execution environment. */ + if (!t) { + fprintf(TH_LOG_STREAM, + "no active test in SIGALRM handler!?\n"); + abort(); + } + if (sig != SIGALRM || sig != info->si_signo) { + fprintf(TH_LOG_STREAM, + "%s: SIGALRM handler caught signal %d!?\n", + t->name, sig != SIGALRM ? sig : info->si_signo); + abort(); + } + + t->timed_out = true; + kill(t->pid, SIGKILL); +} + +void __wait_for_test(struct __test_metadata *t) { - pid_t child_pid; + struct sigaction action = { + .sa_sigaction = __timeout_handler, + .sa_flags = SA_SIGINFO, + }; + struct sigaction saved_action; int status; + if (sigaction(SIGALRM, &action, &saved_action)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to install SIGALRM handler\n", + t->name); + return; + } + __active_test = t; + t->timed_out = false; + alarm(t->timeout); + waitpid(t->pid, &status, 0); + alarm(0); + if (sigaction(SIGALRM, &saved_action, NULL)) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: unable to uninstall SIGALRM handler\n", + t->name); + return; + } + __active_test = NULL; + + if (t->timed_out) { + t->passed = 0; + fprintf(TH_LOG_STREAM, + "%s: Test terminated by timeout\n", t->name); + } else if (WIFEXITED(status)) { + t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; + if (t->termsig != -1) { + fprintf(TH_LOG_STREAM, + "%s: Test exited normally " + "instead of by signal (code: %d)\n", + t->name, + WEXITSTATUS(status)); + } else if (!t->passed) { + fprintf(TH_LOG_STREAM, + "%s: Test failed at step #%d\n", + t->name, + WEXITSTATUS(status)); + } + } else if (WIFSIGNALED(status)) { + t->passed = 0; + if (WTERMSIG(status) == SIGABRT) { + fprintf(TH_LOG_STREAM, + "%s: Test terminated by assertion\n", + t->name); + } else if (WTERMSIG(status) == t->termsig) { + t->passed = 1; + } else { + fprintf(TH_LOG_STREAM, + "%s: Test terminated unexpectedly " + "by signal %d\n", + t->name, + WTERMSIG(status)); + } + } else { + fprintf(TH_LOG_STREAM, + "%s: Test ended in some other way [%u]\n", + t->name, + status); + } +} + +void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, + struct __test_metadata *t) +{ + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); - alarm(t->timeout); - child_pid = fork(); - if (child_pid < 0) { + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); + t->pid = fork(); + if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; - } else if (child_pid == 0) { - t->fn(t); + } else if (t->pid == 0) { + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { - /* TODO(wad) add timeout support. */ - waitpid(child_pid, &status, 0); - if (WIFEXITED(status)) { - t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0; - if (t->termsig != -1) { - fprintf(TH_LOG_STREAM, - "%s: Test exited normally " - "instead of by signal (code: %d)\n", - t->name, - WEXITSTATUS(status)); - } else if (!t->passed) { - fprintf(TH_LOG_STREAM, - "%s: Test failed at step #%d\n", - t->name, - WEXITSTATUS(status)); - } - } else if (WIFSIGNALED(status)) { - t->passed = 0; - if (WTERMSIG(status) == SIGABRT) { - fprintf(TH_LOG_STREAM, - "%s: Test terminated by assertion\n", - t->name); - } else if (WTERMSIG(status) == t->termsig) { - t->passed = 1; - } else { - fprintf(TH_LOG_STREAM, - "%s: Test terminated unexpectedly " - "by signal %d\n", - t->name, - WTERMSIG(status)); - } - } else { - fprintf(TH_LOG_STREAM, - "%s: Test ended in some other way [%u]\n", - t->name, - status); - } + __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); - alarm(0); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." : "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. */ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t); - if (t->passed) - pass_count++; - else - ret = 1; + test_count, case_count); + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 30072c3f52fb..f159718f90c0 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,5 +1,7 @@ -/s390x/sync_regs_test +# SPDX-License-Identifier: GPL-2.0-only /s390x/memop +/s390x/resets +/s390x/sync_regs_test /x86_64/cr4_cpuid_sync_test /x86_64/evmcs_test /x86_64/hyperv_cpuid @@ -8,6 +10,8 @@ /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test +/x86_64/vmx_preemption_timer_test +/x86_64/svm_vmcall_test /x86_64/sync_regs_test /x86_64/vmx_close_while_nested_test /x86_64/vmx_dirty_log_test @@ -15,5 +19,8 @@ /x86_64/vmx_tsc_adjust_test /x86_64/xss_msr_test /clear_dirty_log_test +/demand_paging_test /dirty_log_test /kvm_create_max_vcpus +/set_memory_region_test +/steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d91c53b726e6..b4ff112e5c7e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -5,9 +5,35 @@ all: top_srcdir = ../../../.. KSFT_KHDR_INSTALL := 1 + +# For cross-builds to work, UNAME_M has to map to ARCH and arch specific +# directories and targets in this Makefile. "uname -m" doesn't map to +# arch specific sub-directory names. +# +# UNAME_M variable to used to run the compiles pointing to the right arch +# directories and build the right targets for these supported architectures. +# +# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable. +# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable. +# +# x86_64 targets are named to include x86_64 as a suffix and directories +# for includes are in x86_64 sub-directory. s390x and aarch64 follow the +# same convention. "uname -m" doesn't result in the correct mapping for +# s390x and aarch64. +# +# No change necessary for x86_64 UNAME_M := $(shell uname -m) -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c +# Set UNAME_M for arm64 compile/install to work +ifeq ($(ARCH),arm64) + UNAME_M := aarch64 +endif +# Set UNAME_M s390x compile/install to work +ifeq ($(ARCH),s390) + UNAME_M := s390x +endif + +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c @@ -20,26 +46,36 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test -TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/debug_regs TEST_GEN_PROGS_x86_64 += clear_dirty_log_test +TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += set_memory_region_test +TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += clear_dirty_log_test +TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += set_memory_region_test +TEST_GEN_PROGS_aarch64 += steal_time TEST_GEN_PROGS_s390x = s390x/memop -TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += s390x/resets +TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) @@ -47,7 +83,7 @@ LIBKVM += $(LIBKVM_$(UNAME_M)) INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ @@ -78,6 +114,7 @@ $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) $(AR) crs $@ $^ +x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) all: $(STATIC_LIBS) $(TEST_GEN_PROGS): $(STATIC_LIBS) diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c index 749336937d37..11672ec6f74e 100644 --- a/tools/testing/selftests/kvm/clear_dirty_log_test.c +++ b/tools/testing/selftests/kvm/clear_dirty_log_test.c @@ -1,2 +1,6 @@ #define USE_CLEAR_DIRTY_LOG +#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) +#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) #include "dirty_log_test.c" diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c new file mode 100644 index 000000000000..360cd3ea4cd6 --- /dev/null +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM demand paging test + * Adapted from dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019, Google, Inc. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <asm/unistd.h> +#include <time.h> +#include <poll.h> +#include <pthread.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/userfaultfd.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#ifdef __NR_userfaultfd + +/* The memory slot index demand page */ +#define TEST_MEM_SLOT_INDEX 1 + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */ + +#ifdef PRINT_PER_PAGE_UPDATES +#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +#ifdef PRINT_PER_VCPU_UPDATES +#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +#define MAX_VCPUS 512 + +/* + * Guest/Host shared variables. Ensure addr_gva2hva() and/or + * sync_global_to/from_guest() are used when accessing from + * the host. READ/WRITE_ONCE() should also be used with anything + * that may change. + */ +static uint64_t host_page_size; +static uint64_t guest_page_size; + +static char *guest_data_prototype; + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +struct vcpu_args { + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + int vcpu_id; + struct kvm_vm *vm; +}; + +static struct vcpu_args vcpu_args[MAX_VCPUS]; + +/* + * Continuously write to the first 8 bytes of each page in the demand paging + * memory region. + */ +static void guest_code(uint32_t vcpu_id) +{ + uint64_t gva; + uint64_t pages; + int i; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id); + + gva = vcpu_args[vcpu_id].gva; + pages = vcpu_args[vcpu_id].pages; + + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * guest_page_size); + + addr &= ~(host_page_size - 1); + *(uint64_t *)addr = 0x0123456789ABCDEF; + } + + GUEST_SYNC(1); +} + +static void *vcpu_worker(void *data) +{ + int ret; + struct vcpu_args *args = (struct vcpu_args *)data; + struct kvm_vm *vm = args->vm; + int vcpu_id = args->vcpu_id; + struct kvm_run *run; + struct timespec start, end, ts_diff; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + clock_gettime(CLOCK_MONOTONIC, &start); + + /* Let the guest access its memory */ + ret = _vcpu_run(vm, vcpu_id); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) { + TEST_ASSERT(false, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } + + clock_gettime(CLOCK_MONOTONIC, &end); + ts_diff = timespec_sub(end, start); + PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, + ts_diff.tv_sec, ts_diff.tv_nsec); + + return NULL; +} + +#define PAGE_SHIFT_4K 12 +#define PTES_PER_4K_PT 512 + +static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes) +{ + struct kvm_vm *vm; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES; + + /* Account for a few pages per-vCPU for stacks */ + pages += DEFAULT_STACK_PGS * vcpus; + + /* + * Reserve twice the ammount of memory needed to map the test region and + * the page table / stacks region, at 4k, for page tables. Do the + * calculation with 4K page size: the smallest of all archs. (e.g., 64K + * page size guest will need even less memory for page tables). + */ + pages += (2 * pages) / PTES_PER_4K_PT; + pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / + PTES_PER_4K_PT; + pages = vm_adjust_num_guest_pages(mode, pages); + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + vm = _vm_create(mode, pages, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + return vm; +} + +static int handle_uffd_page_request(int uffd, uint64_t addr) +{ + pid_t tid; + struct timespec start; + struct timespec end; + struct uffdio_copy copy; + int r; + + tid = syscall(__NR_gettid); + + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = host_page_size; + copy.mode = 0; + + clock_gettime(CLOCK_MONOTONIC, &start); + + r = ioctl(uffd, UFFDIO_COPY, ©); + if (r == -1) { + pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + + clock_gettime(CLOCK_MONOTONIC, &end); + + PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, + timespec_to_ns(timespec_sub(end, start))); + PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", + host_page_size, addr, tid); + + return 0; +} + +bool quit_uffd_thread; + +struct uffd_handler_args { + int uffd; + int pipefd; + useconds_t delay; +}; + +static void *uffd_handler_thread_fn(void *arg) +{ + struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg; + int uffd = uffd_args->uffd; + int pipefd = uffd_args->pipefd; + useconds_t delay = uffd_args->delay; + int64_t pages = 0; + struct timespec start, end, ts_diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (!quit_uffd_thread) { + struct uffd_msg msg; + struct pollfd pollfd[2]; + char tmp_chr; + int r; + uint64_t addr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd; + pollfd[1].events = POLLIN; + + r = poll(pollfd, 2, -1); + switch (r) { + case -1: + pr_info("poll err"); + continue; + case 0: + continue; + case 1: + break; + default: + pr_info("Polling uffd returned %d", r); + return NULL; + } + + if (pollfd[0].revents & POLLERR) { + pr_info("uffd revents has POLLERR"); + return NULL; + } + + if (pollfd[1].revents & POLLIN) { + r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(r == 1, + "Error reading pipefd in UFFD thread\n"); + return NULL; + } + + if (!pollfd[0].revents & POLLIN) + continue; + + r = read(uffd, &msg, sizeof(msg)); + if (r == -1) { + if (errno == EAGAIN) + continue; + pr_info("Read of uffd gor errno %d", errno); + return NULL; + } + + if (r != sizeof(msg)) { + pr_info("Read on uffd returned unexpected size: %d bytes", r); + return NULL; + } + + if (!(msg.event & UFFD_EVENT_PAGEFAULT)) + continue; + + if (delay) + usleep(delay); + addr = msg.arg.pagefault.address; + r = handle_uffd_page_request(uffd, addr); + if (r < 0) + return NULL; + pages++; + } + + clock_gettime(CLOCK_MONOTONIC, &end); + ts_diff = timespec_sub(end, start); + PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", + pages, ts_diff.tv_sec, ts_diff.tv_nsec, + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + return NULL; +} + +static int setup_demand_paging(struct kvm_vm *vm, + pthread_t *uffd_handler_thread, int pipefd, + useconds_t uffd_delay, + struct uffd_handler_args *uffd_args, + void *hva, uint64_t len) +{ + int uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + pr_info("uffd creation failed\n"); + return -1; + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + pr_info("ioctl uffdio_api failed\n"); + return -1; + } + + uffdio_register.range.start = (uint64_t)hva; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + pr_info("ioctl uffdio_register failed\n"); + return -1; + } + + if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != + UFFD_API_RANGE_IOCTLS) { + pr_info("unexpected userfaultfd ioctl set\n"); + return -1; + } + + uffd_args->uffd = uffd; + uffd_args->pipefd = pipefd; + uffd_args->delay = uffd_delay; + pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn, + uffd_args); + + PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", + hva, hva + len); + + return 0; +} + +static void run_test(enum vm_guest_mode mode, bool use_uffd, + useconds_t uffd_delay, int vcpus, + uint64_t vcpu_memory_bytes) +{ + pthread_t *vcpu_threads; + pthread_t *uffd_handler_threads = NULL; + struct uffd_handler_args *uffd_args = NULL; + struct timespec start, end, ts_diff; + int *pipefds = NULL; + struct kvm_vm *vm; + uint64_t guest_num_pages; + int vcpu_id; + int r; + + vm = create_vm(mode, vcpus, vcpu_memory_bytes); + + guest_page_size = vm_get_page_size(vm); + + TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + + guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + "Requested more guest memory than address space allows.\n" + " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + guest_num_pages, vm_get_max_gfn(vm), vcpus, + vcpu_memory_bytes); + + host_page_size = getpagesize(); + TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0, + "Guest memory size is not host page size aligned."); + + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + guest_page_size; + guest_test_phys_mem &= ~(host_page_size - 1); + +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + /* Add an extra memory slot for testing demand paging */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, + guest_num_pages, 0); + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + ucall_init(vm, NULL); + + guest_data_prototype = malloc(host_page_size); + TEST_ASSERT(guest_data_prototype, + "Failed to allocate buffer for guest data pattern"); + memset(guest_data_prototype, 0xAB, host_page_size); + + vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + if (use_uffd) { + uffd_handler_threads = + malloc(vcpus * sizeof(*uffd_handler_threads)); + TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); + + uffd_args = malloc(vcpus * sizeof(*uffd_args)); + TEST_ASSERT(uffd_args, "Memory allocation failed"); + + pipefds = malloc(sizeof(int) * vcpus * 2); + TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); + } + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vm_paddr_t vcpu_gpa; + void *vcpu_hva; + + vm_vcpu_add_default(vm, vcpu_id, guest_code); + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + + /* Cache the HVA pointer of the region */ + vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + + if (use_uffd) { + /* + * Set up user fault fd to handle demand paging + * requests. + */ + r = pipe2(&pipefds[vcpu_id * 2], + O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!r, "Failed to set up pipefd"); + + r = setup_demand_paging(vm, + &uffd_handler_threads[vcpu_id], + pipefds[vcpu_id * 2], + uffd_delay, &uffd_args[vcpu_id], + vcpu_hva, vcpu_memory_bytes); + if (r < 0) + exit(-r); + } + +#ifdef __x86_64__ + vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); +#endif + + vcpu_args[vcpu_id].vm = vm; + vcpu_args[vcpu_id].vcpu_id = vcpu_id; + vcpu_args[vcpu_id].gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size; + } + + /* Export the shared variables to the guest */ + sync_global_to_guest(vm, host_page_size); + sync_global_to_guest(vm, guest_page_size); + sync_global_to_guest(vm, vcpu_args); + + pr_info("Finished creating vCPUs and starting uffd threads\n"); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &vcpu_args[vcpu_id]); + } + + pr_info("Started all vCPUs\n"); + + /* Wait for the vcpu threads to quit */ + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + pthread_join(vcpu_threads[vcpu_id], NULL); + PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); + } + + pr_info("All vCPU threads joined\n"); + + clock_gettime(CLOCK_MONOTONIC, &end); + + if (use_uffd) { + char c; + + /* Tell the user fault fd handler threads to quit */ + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + r = write(pipefds[vcpu_id * 2 + 1], &c, 1); + TEST_ASSERT(r == 1, "Unable to write to pipefd"); + + pthread_join(uffd_handler_threads[vcpu_id], NULL); + } + } + + ts_diff = timespec_sub(end, start); + pr_info("Total guest execution time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + pr_info("Overall demand paging rate: %f pgs/sec\n", + guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + ucall_uninit(vm); + kvm_vm_free(vm); + + free(guest_data_prototype); + free(vcpu_threads); + if (use_uffd) { + free(uffd_handler_threads); + free(uffd_args); + free(pipefds); + } +} + +struct guest_mode { + bool supported; + bool enabled; +}; +static struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +static void help(char *name) +{ + int i; + + puts(""); + printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" + " [-b memory] [-v vcpus]\n", name); + printf(" -m: specify the guest mode ID to test\n" + " (default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } + printf(" -u: use User Fault FD to handle vCPU page\n" + " faults.\n"); + printf(" -d: add a delay in usec to the User Fault\n" + " FD handler to simulate demand paging\n" + " overheads. Ignored without -u.\n"); + printf(" -b: specify the size of the memory region which should be\n" + " demand paged by each vCPU. e.g. 10M or 3G.\n" + " Default: 1G\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + bool mode_selected = false; + uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE; + int vcpus = 1; + unsigned int mode; + int opt, i; + bool use_uffd = false; + useconds_t uffd_delay = 0; + +#ifdef __x86_64__ + guest_mode_init(VM_MODE_PXXV48_4K, true, true); +#endif +#ifdef __aarch64__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } + } +#endif +#ifdef __s390x__ + guest_mode_init(VM_MODE_P40V48_4K, true, true); +#endif + + while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) { + switch (opt) { + case 'm': + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, + "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; + break; + case 'u': + use_uffd = true; + break; + case 'd': + uffd_delay = strtoul(optarg, NULL, 0); + TEST_ASSERT(uffd_delay >= 0, + "A negative UFFD delay is not supported."); + break; + case 'b': + vcpu_memory_bytes = parse_size(optarg); + break; + case 'v': + vcpus = atoi(optarg); + TEST_ASSERT(vcpus > 0, + "Must have a positive number of vCPUs"); + TEST_ASSERT(vcpus <= MAX_VCPUS, + "This test does not currently support\n" + "more than %d vCPUs.", MAX_VCPUS); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes); + } + + return 0; +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + print_skip("__NR_userfaultfd must be present for userfaultfd test"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 5614222a6628..752ec158ac59 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -166,24 +166,22 @@ static void *vcpu_worker(void *data) pages_count += TEST_PAGES_PER_LOOP; generate_random_array(guest_array, TEST_PAGES_PER_LOOP); } else { - TEST_ASSERT(false, - "Invalid guest sync status: " - "exit_reason=%s\n", - exit_reason_str(run->exit_reason)); + TEST_FAIL("Invalid guest sync status: " + "exit_reason=%s\n", + exit_reason_str(run->exit_reason)); } } - DEBUG("Dirtied %"PRIu64" pages\n", pages_count); + pr_info("Dirtied %"PRIu64" pages\n", pages_count); return NULL; } -static void vm_dirty_log_verify(unsigned long *bmap) +static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) { + uint64_t step = vm_num_host_pages(mode, 1); uint64_t page; uint64_t *value_ptr; - uint64_t step = host_page_size >= guest_page_size ? 1 : - guest_page_size / host_page_size; for (page = 0; page < host_num_pages; page += step) { value_ptr = host_test_mem + page * host_page_size; @@ -252,6 +250,8 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, struct kvm_vm *vm; uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); #ifdef __x86_64__ @@ -264,6 +264,10 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 +#ifdef USE_CLEAR_DIRTY_LOG +static u64 dirty_log_manual_caps; +#endif + static void run_test(enum vm_guest_mode mode, unsigned long iterations, unsigned long interval, uint64_t phys_offset) { @@ -289,14 +293,11 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, * case where the size is not aligned to 64 pages. */ guest_num_pages = (1ul << (DIRTY_MEM_BITS - - vm_get_page_shift(vm))) + 16; -#ifdef __s390x__ - /* Round up to multiple of 1M (segment size) */ - guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL; -#endif + vm_get_page_shift(vm))) + 3; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_page_size = getpagesize(); - host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + - !!((guest_num_pages * guest_page_size) % host_page_size); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); if (!phys_offset) { guest_test_phys_mem = (vm_get_max_gfn(vm) - @@ -311,7 +312,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, guest_test_phys_mem &= ~((1 << 20) - 1); #endif - DEBUG("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); bmap = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages); @@ -320,7 +321,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, struct kvm_enable_cap cap = {}; cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = 1; + cap.args[0] = dirty_log_manual_caps; vm_enable_cap(vm, &cap); #endif @@ -332,8 +333,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, KVM_MEM_LOG_DIRTY_PAGES); /* Do mapping for the dirty track memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, - guest_num_pages * guest_page_size, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); @@ -341,9 +341,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, #ifdef __x86_64__ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); #endif -#ifdef __aarch64__ ucall_init(vm, NULL); -#endif /* Export the shared variables to the guest */ sync_global_to_guest(vm, host_page_size); @@ -369,7 +367,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, host_num_pages); #endif - vm_dirty_log_verify(bmap); + vm_dirty_log_verify(mode, bmap); iteration++; sync_global_to_guest(vm, iteration); } @@ -378,9 +376,9 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, host_quit = true; pthread_join(vcpu_thread, NULL); - DEBUG("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " - "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, - host_track_next_count); + pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " + "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, + host_track_next_count); free(bmap); free(host_bmap_track); @@ -388,15 +386,14 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, kvm_vm_free(vm); } -struct vm_guest_mode_params { +struct guest_mode { bool supported; bool enabled; }; -struct vm_guest_mode_params vm_guest_mode_params[NUM_VM_MODES]; +static struct guest_mode guest_modes[NUM_VM_MODES]; -#define vm_guest_mode_params_init(mode, supported, enabled) \ -({ \ - vm_guest_mode_params[mode] = (struct vm_guest_mode_params){ supported, enabled }; \ +#define guest_mode_init(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ }) static void help(char *name) @@ -419,7 +416,7 @@ static void help(char *name) " Guest mode IDs:\n"); for (i = 0; i < NUM_VM_MODES; ++i) { printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - vm_guest_mode_params[i].supported ? " (supported)" : ""); + guest_modes[i].supported ? " (supported)" : ""); } puts(""); exit(0); @@ -433,34 +430,38 @@ int main(int argc, char *argv[]) uint64_t phys_offset = 0; unsigned int mode; int opt, i; -#ifdef __aarch64__ - unsigned int host_ipa_limit; -#endif #ifdef USE_CLEAR_DIRTY_LOG - if (!kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2)) { - fprintf(stderr, "KVM_CLEAR_DIRTY_LOG not available, skipping tests\n"); + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + if (!dirty_log_manual_caps) { + print_skip("KVM_CLEAR_DIRTY_LOG not available"); exit(KSFT_SKIP); } + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); #endif #ifdef __x86_64__ - vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true); + guest_mode_init(VM_MODE_PXXV48_4K, true, true); #endif #ifdef __aarch64__ - vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); - vm_guest_mode_params_init(VM_MODE_P40V48_64K, true, true); - - host_ipa_limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - if (host_ipa_limit >= 52) - vm_guest_mode_params_init(VM_MODE_P52V48_64K, true, true); - if (host_ipa_limit >= 48) { - vm_guest_mode_params_init(VM_MODE_P48V48_4K, true, true); - vm_guest_mode_params_init(VM_MODE_P48V48_64K, true, true); + guest_mode_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_64K, true, true); + + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + if (limit >= 52) + guest_mode_init(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_init(VM_MODE_P48V48_4K, true, true); + guest_mode_init(VM_MODE_P48V48_64K, true, true); + } } #endif #ifdef __s390x__ - vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); + guest_mode_init(VM_MODE_P40V48_4K, true, true); #endif while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { @@ -477,13 +478,13 @@ int main(int argc, char *argv[]) case 'm': if (!mode_selected) { for (i = 0; i < NUM_VM_MODES; ++i) - vm_guest_mode_params[i].enabled = false; + guest_modes[i].enabled = false; mode_selected = true; } mode = strtoul(optarg, NULL, 10); TEST_ASSERT(mode < NUM_VM_MODES, "Guest mode ID %d too big", mode); - vm_guest_mode_params[mode].enabled = true; + guest_modes[mode].enabled = true; break; case 'h': default: @@ -495,15 +496,15 @@ int main(int argc, char *argv[]) TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); TEST_ASSERT(interval > 0, "Interval must be greater than zero"); - DEBUG("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", - iterations, interval); + pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", + iterations, interval); srandom(time(0)); for (i = 0; i < NUM_VM_MODES; ++i) { - if (!vm_guest_mode_params[i].enabled) + if (!guest_modes[i].enabled) continue; - TEST_ASSERT(vm_guest_mode_params[i].supported, + TEST_ASSERT(guest_modes[i].supported, "Guest mode ID %d (%s) not supported.", i, vm_guest_mode_string(i)); run_test(i, iterations, interval, phys_offset); diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h index 4912d23844bc..a034438b6266 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -16,6 +16,8 @@ #define u32 uint32_t #define u64 uint64_t +#define EVMCS_VERSION 1 + extern bool enable_evmcs; struct hv_vp_assist_page { @@ -217,8 +219,8 @@ struct hv_enlightened_vmcs { #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) -struct hv_enlightened_vmcs *current_evmcs; -struct hv_vp_assist_page *current_vp_assist; +extern struct hv_enlightened_vmcs *current_evmcs; +extern struct hv_vp_assist_page *current_vp_assist; int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index ae0d14c2540a..919e161dd289 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -10,13 +10,15 @@ #include "test_util.h" #include "asm/kvm.h" +#include "linux/list.h" #include "linux/kvm.h" #include <sys/ioctl.h> #include "sparsebit.h" -/* Callers of kvm_util only have an incomplete/opaque description of the +/* + * Callers of kvm_util only have an incomplete/opaque description of the * structure kvm_util is using to maintain the state of a VM. */ struct kvm_vm; @@ -24,12 +26,6 @@ struct kvm_vm; typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ -#ifndef NDEBUG -#define DEBUG(...) printf(__VA_ARGS__); -#else -#define DEBUG(...) -#endif - /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 @@ -84,6 +80,23 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, uint32_t data_memslot, uint32_t pgd_memslot); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +/* + * VM VCPU Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * vcpuid - VCPU ID + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the current state of the VCPU specified by @vcpuid, within the VM + * given by @vm, to the FILE stream given by @stream. + */ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent); @@ -100,25 +113,65 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint32_t data_memslot, uint32_t pgd_memslot); void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - size_t size, uint32_t pgd_memslot); + unsigned int npages, uint32_t pgd_memslot); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. + */ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); +void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug); void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_mp_state *mp_state); void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); + +/* + * VM VCPU Args Set + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * num - number of arguments + * ... - arguments, each of type uint64_t + * + * Output Args: None + * + * Return: None + * + * Sets the first @num function input registers of the VCPU with @vcpuid, + * per the C calling convention of the architecture, to the values given + * as variable args. Each of the variable args is expected to be of type + * uint64_t. The maximum @num can be is specific to the architecture. + */ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); + void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs); void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, @@ -147,15 +200,57 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, const char *exit_reason_str(unsigned int exit_reason); void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot); + uint32_t memslot); + vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, uint32_t memslot); vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, vm_paddr_t paddr_min, uint32_t memslot); -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_size, +/* + * Create a VM with reasonable defaults + * + * Input Args: + * vcpuid - The id of the single VCPU to add to the VM. + * extra_mem_pages - The number of extra pages to add (this will + * decide how much extra space we will need to + * setup the page tables using memslot 0) + * guest_code - The vCPU's entry point + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + */ +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, void *guest_code); + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpuid - The id of the VCPU to add to the VM. + * guest_code - The vCPU's entry point + */ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); bool vm_is_unrestricted_guest(struct kvm_vm *vm); @@ -163,6 +258,22 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); unsigned int vm_get_max_gfn(struct kvm_vm *vm); +int vm_get_fd(struct kvm_vm *vm); + +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); +unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); +unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); +static inline unsigned int +vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + unsigned int n; + n = vm_num_guest_pages(mode, vm_num_host_pages(mode, num_guest_pages)); +#ifdef __s390x__ + /* s390 requires 1M aligned guest sizes */ + n = (n + 255) & ~255; +#endif + return n; +} struct kvm_userspace_memory_region * kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, @@ -203,13 +314,30 @@ void ucall_uninit(struct kvm_vm *vm); void ucall(uint64_t cmd, int nargs, ...); uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) #define GUEST_DONE() ucall(UCALL_DONE, 0) -#define GUEST_ASSERT(_condition) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2, \ - "Failed guest assert: " \ - #_condition, __LINE__); \ +#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ + if (!(_condition)) \ + ucall(UCALL_ABORT, 2 + _nargs, \ + "Failed guest assert: " \ + #_condition, __LINE__, _args); \ } while (0) +#define GUEST_ASSERT(_condition) \ + __GUEST_ASSERT((_condition), 0, 0) + +#define GUEST_ASSERT_1(_condition, arg1) \ + __GUEST_ASSERT((_condition), 1, (arg1)) + +#define GUEST_ASSERT_2(_condition, arg1, arg2) \ + __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) + +#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ + __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) + +#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ + __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) + #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index a41db6fb7e24..5eb01bf51b86 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -19,12 +19,28 @@ #include <fcntl.h> #include "kselftest.h" +static inline int _no_printf(const char *format, ...) { return 0; } + +#ifdef DEBUG +#define pr_debug(...) printf(__VA_ARGS__) +#else +#define pr_debug(...) _no_printf(__VA_ARGS__) +#endif +#ifndef QUIET +#define pr_info(...) printf(__VA_ARGS__) +#else +#define pr_info(...) _no_printf(__VA_ARGS__) +#endif + +void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2))); + ssize_t test_write(int fd, const void *buf, size_t count); ssize_t test_read(int fd, void *buf, size_t count); int test_seq_read(const char *path, char **bufp, size_t *sizep); void test_assert(bool exp, const char *exp_str, - const char *file, unsigned int line, const char *fmt, ...); + const char *file, unsigned int line, const char *fmt, ...) + __attribute__((format(printf, 5, 6))); #define TEST_ASSERT(e, fmt, ...) \ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__) @@ -39,4 +55,14 @@ void test_assert(bool exp, const char *exp_str, #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ } while (0) +#define TEST_FAIL(fmt, ...) \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__) + +size_t parse_size(const char *size); + +int64_t timespec_to_ns(struct timespec ts); +struct timespec timespec_add_ns(struct timespec ts, int64_t ns); +struct timespec timespec_add(struct timespec ts1, struct timespec ts2); +struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 7428513a4c68..82b7fe16a824 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 *desc) static inline uint64_t rdtsc(void) { uint32_t eax, edx; - + uint64_t tsc_val; /* * The lfence is to wait (on Intel CPUs) until all previous - * instructions have been executed. + * instructions have been executed. If software requires RDTSC to be + * executed prior to execution of any subsequent instruction, it can + * execute LFENCE immediately after RDTSC */ - __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx)); - return ((uint64_t)edx) << 32 | eax; + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx)); + tsc_val = ((uint64_t)edx) << 32 | eax; + return tsc_val; } static inline uint64_t rdtscp(uint32_t *aux) diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index cd037917fece..674151d24fcf 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -35,4 +35,14 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); void nested_svm_check_supported(void); +static inline bool cpu_has_svm(void) +{ + u32 eax = 0x80000001, ecx; + + asm("cpuid" : + "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx"); + + return ecx & CPUID_SVM; +} + #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 3d27069b9ed9..ccff3e6e2704 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -575,6 +575,33 @@ struct vmx_pages { void *eptp; }; +union vmx_basic { + u64 val; + struct { + u32 revision; + u32 size:13, + reserved1:3, + width:1, + dual:1, + type:4, + insouts:1, + ctrl:1, + vm_entry_exception_ctrl:1, + reserved2:7; + }; +}; + +union vmx_ctrl_msr { + u64 val; + struct { + u32 set, clr; + }; +}; + +union vmx_basic basic; +union vmx_ctrl_msr ctrl_pin_rev; +union vmx_ctrl_msr ctrl_exit_rev; + struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 6f38c3dc0d56..0299cd81b8ba 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -24,8 +24,8 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) struct kvm_vm *vm; int i; - printf("Testing creating %d vCPUs, with IDs %d...%d.\n", - num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); + pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", + num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); @@ -41,8 +41,8 @@ int main(int argc, char *argv[]) int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); - printf("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); - printf("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); + pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); + pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); /* * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 86036a59a668..2afa6618b396 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -130,7 +130,7 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; break; default: - TEST_ASSERT(false, "Page table levels must be 2, 3, or 4"); + TEST_FAIL("Page table levels must be 2, 3, or 4"); } *ptep = paddr | 3; @@ -173,20 +173,19 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) goto unmapped_gva; break; default: - TEST_ASSERT(false, "Page table levels must be 2, 3, or 4"); + TEST_FAIL("Page table levels must be 2, 3, or 4"); } return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); unmapped_gva: - TEST_ASSERT(false, "No mapping for vm virtual address, " - "gva: 0x%lx", gva); + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(1); } static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) { -#ifdef DEBUG_VM +#ifdef DEBUG static const char * const type[] = { "", "pud", "pmd", "pte" }; uint64_t pte, *ptep; @@ -197,7 +196,7 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p ptep = addr_gpa2hva(vm, pte); if (!*ptep) continue; - printf("%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); } #endif @@ -215,7 +214,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; - printf("%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); } } @@ -262,11 +261,11 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini switch (vm->mode) { case VM_MODE_P52V48_4K: - TEST_ASSERT(false, "AArch64 does not support 4K sized pages " - "with 52-bit physical address ranges"); + TEST_FAIL("AArch64 does not support 4K sized pages " + "with 52-bit physical address ranges"); case VM_MODE_PXXV48_4K: - TEST_ASSERT(false, "AArch64 does not support 4K sized pages " - "with ANY-bit physical address ranges"); + TEST_FAIL("AArch64 does not support 4K sized pages " + "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ @@ -288,7 +287,7 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; @@ -333,3 +332,21 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); } + +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u\n", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); + } + + va_end(ap); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 6cd91970fbad..c8e0ec20d3bf 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -62,7 +62,7 @@ void ucall_init(struct kvm_vm *vm, void *arg) if (ucall_mmio_init(vm, start + offset)) return; } - TEST_ASSERT(false, "Can't find a ucall mmio address"); + TEST_FAIL("Can't find a ucall mmio address"); } void ucall_uninit(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index d1cf9f6e0e6b..5ebbd0d6b472 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -82,8 +82,10 @@ test_assert(bool exp, const char *exp_str, } va_end(ap); - if (errno == EACCES) - ksft_exit_skip("Access denied - Exiting.\n"); + if (errno == EACCES) { + print_skip("Access denied - Exiting"); + exit(KSFT_SKIP); + } exit(254); } diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c index eaf351cc7e7f..fedb2a741f0b 100644 --- a/tools/testing/selftests/kvm/lib/io.c +++ b/tools/testing/selftests/kvm/lib/io.c @@ -61,9 +61,9 @@ ssize_t test_write(int fd, const void *buf, size_t count) continue; case 0: - TEST_ASSERT(false, "Unexpected EOF,\n" - " rc: %zi num_written: %zi num_left: %zu", - rc, num_written, num_left); + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_written: %zi num_left: %zu", + rc, num_written, num_left); break; default: @@ -138,9 +138,9 @@ ssize_t test_read(int fd, void *buf, size_t count) break; case 0: - TEST_ASSERT(false, "Unexpected EOF,\n" - " rc: %zi num_read: %zi num_left: %zu", - rc, num_read, num_left); + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_read: %zi num_left: %zu", + rc, num_read, num_left); break; default: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a6dd0401eb50..c9cede5c7d0d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -92,7 +92,7 @@ static void vm_open(struct kvm_vm *vm, int perm) exit(KSFT_SKIP); if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { - fprintf(stderr, "immediate_exit not available, skipping test\n"); + print_skip("immediate_exit not available"); exit(KSFT_SKIP); } @@ -113,6 +113,25 @@ const char * const vm_guest_mode_string[] = { _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; + +static const struct vm_guest_mode_params vm_guest_mode_params[] = { + { 52, 48, 0x1000, 12 }, + { 52, 48, 0x10000, 16 }, + { 48, 48, 0x1000, 12 }, + { 48, 48, 0x10000, 16 }, + { 40, 48, 0x1000, 12 }, + { 40, 48, 0x10000, 16 }, + { 0, 0, 0x1000, 12 }, +}; +_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, + "Missing new mode params?"); + /* * VM Create * @@ -136,75 +155,57 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) { struct kvm_vm *vm; - DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__, + vm_guest_mode_string(mode), phy_pages, perm); vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); + INIT_LIST_HEAD(&vm->vcpus); + INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->mode = mode; vm->type = 0; + vm->pa_bits = vm_guest_mode_params[mode].pa_bits; + vm->va_bits = vm_guest_mode_params[mode].va_bits; + vm->page_size = vm_guest_mode_params[mode].page_size; + vm->page_shift = vm_guest_mode_params[mode].page_shift; + /* Setup mode specific traits. */ switch (vm->mode) { case VM_MODE_P52V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 52; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P52V48_64K: vm->pgtable_levels = 3; - vm->pa_bits = 52; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_P48V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 48; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P48V48_64K: vm->pgtable_levels = 3; - vm->pa_bits = 48; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_P40V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 40; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P40V48_64K: vm->pgtable_levels = 3; - vm->pa_bits = 40; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_PXXV48_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); TEST_ASSERT(vm->va_bits == 48, "Linear address width " "(%d bits) not supported", vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", + vm->pa_bits); vm->pgtable_levels = 4; - vm->page_size = 0x1000; - vm->page_shift = 12; - DEBUG("Guest physical address width detected: %d\n", - vm->pa_bits); #else - TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on " - "non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } #ifdef __aarch64__ @@ -260,13 +261,12 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) if (vmp->has_irqchip) vm_create_irqchip(vmp); - for (region = vmp->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vmp->userspace_mem_regions, list) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx", + " guest_phys_addr: 0x%llx size: 0x%llx", ret, errno, region->region.slot, region->region.flags, region->region.guest_phys_addr, @@ -281,7 +281,7 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", - strerror(-ret)); + __func__, strerror(-ret)); } void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, @@ -294,7 +294,7 @@ void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", - strerror(-ret)); + __func__, strerror(-ret)); } /* @@ -321,8 +321,7 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; @@ -380,11 +379,11 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, */ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) { - struct vcpu *vcpup; + struct vcpu *vcpu; - for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) { - if (vcpup->id == vcpuid) - return vcpup; + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpuid) + return vcpu; } return NULL; @@ -394,18 +393,16 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) * VM VCPU Remove * * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID + * vcpu - VCPU to remove * * Output Args: None * * Return: None, TEST_ASSERT failures for all error conditions * - * Within the VM specified by vm, removes the VCPU given by vcpuid. + * Removes a vCPU from a VM and frees its resources. */ -static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) +static void vm_vcpu_rm(struct vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; ret = munmap(vcpu->state, sizeof(*vcpu->state)); @@ -415,21 +412,17 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); - if (vcpu->next) - vcpu->next->prev = vcpu->prev; - if (vcpu->prev) - vcpu->prev->next = vcpu->next; - else - vm->vcpu_head = vcpu->next; + list_del(&vcpu->list); free(vcpu); } void kvm_vm_release(struct kvm_vm *vmp) { + struct vcpu *vcpu, *tmp; int ret; - while (vmp->vcpu_head) - vm_vcpu_rm(vmp, vmp->vcpu_head->id); + list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) + vm_vcpu_rm(vcpu); ret = close(vmp->fd); TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" @@ -440,35 +433,38 @@ void kvm_vm_release(struct kvm_vm *vmp) " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); } +static void __vm_mem_region_delete(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + int ret; + + list_del(®ion->list); + + region->region.memory_size = 0; + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " + "rc: %i errno: %i", ret, errno); + + sparsebit_free(®ion->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + + free(region); +} + /* * Destroys and frees the VM pointed to by vmp. */ void kvm_vm_free(struct kvm_vm *vmp) { - int ret; + struct userspace_mem_region *region, *tmp; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - while (vmp->userspace_mem_region_head) { - struct userspace_mem_region *region - = vmp->userspace_mem_region_head; - - region->region.memory_size = 0; - ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, - ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " - "rc: %i errno: %i", ret, errno); - - vmp->userspace_mem_region_head = region->next; - sparsebit_free(®ion->unused_phy_pages); - ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", - ret, errno); - - free(region); - } + list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) + __vm_mem_region_delete(vmp, region); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -582,6 +578,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; size_t alignment; + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, + "Number of guest pages is not compatible with the host. " + "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); + TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" " guest_paddr: 0x%lx vm->page_size: 0x%x", @@ -600,7 +600,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region = (struct userspace_mem_region *) userspace_mem_region_find( vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); if (region != NULL) - TEST_ASSERT(false, "overlapping userspace_mem_region already " + TEST_FAIL("overlapping userspace_mem_region already " "exists\n" " requested guest_paddr: 0x%lx npages: 0x%lx " "page_size: 0x%x\n" @@ -610,13 +610,11 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if (region->region.slot == slot) - break; - } - if (region != NULL) - TEST_ASSERT(false, "A mem region with the requested slot " + list_for_each_entry(region, &vm->userspace_mem_regions, list) { + if (region->region.slot != slot) + continue; + + TEST_FAIL("A mem region with the requested slot " "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", @@ -624,6 +622,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); + } /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); @@ -684,10 +683,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, guest_paddr, (uint64_t) region->region.memory_size); /* Add to linked-list of memory regions. */ - if (vm->userspace_mem_region_head) - vm->userspace_mem_region_head->prev = region; - region->next = vm->userspace_mem_region_head; - vm->userspace_mem_region_head = region; + list_add(®ion->list, &vm->userspace_mem_regions); } /* @@ -710,20 +706,17 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if (region->region.slot == memslot) - break; - } - if (region == NULL) { - fprintf(stderr, "No mem region with the requested slot found,\n" - " requested slot: %u\n", memslot); - fputs("---- vm dump ----\n", stderr); - vm_dump(stderr, vm, 2); - TEST_ASSERT(false, "Mem region not found"); + return region; } - return region; + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_FAIL("Mem region not found"); + return NULL; } /* @@ -757,6 +750,54 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) } /* + * VM Memory Region Move + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to move + * new_gpa - Starting guest physical address + * + * Output Args: None + * + * Return: None + * + * Change the gpa of a memory region. + */ +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) +{ + struct userspace_mem_region *region; + int ret; + + region = memslot2region(vm, slot); + + region->region.guest_phys_addr = new_gpa; + + ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + "ret: %i errno: %i slot: %u new_gpa: 0x%lx", + ret, errno, slot, new_gpa); +} + +/* + * VM Memory Region Delete + * + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to delete + * + * Output Args: None + * + * Return: None + * + * Delete a memory region. + */ +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) +{ + __vm_mem_region_delete(vm, memslot2region(vm, slot)); +} + +/* * VCPU mmap Size * * Input Args: None @@ -808,7 +849,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) /* Confirm a vcpu with the specified id doesn't already exist. */ vcpu = vcpu_find(vm, vcpuid); if (vcpu != NULL) - TEST_ASSERT(false, "vcpu with the specified id " + TEST_FAIL("vcpu with the specified id " "already exists,\n" " requested vcpuid: %u\n" " existing vcpuid: %u state: %p", @@ -831,10 +872,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) "vcpu id: %u errno: %i", vcpuid, errno); /* Add to linked-list of VCPUs. */ - if (vm->vcpu_head) - vm->vcpu_head->prev = vcpu; - vcpu->next = vm->vcpu_head; - vm->vcpu_head = vcpu; + list_add(&vcpu->list, &vm->vcpus); } /* @@ -901,8 +939,7 @@ static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, } while (pgidx_start != 0); no_va_found: - TEST_ASSERT(false, "No vaddr of specified pages available, " - "pages: 0x%lx", pages); + TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages); /* NOT REACHED */ return -1; @@ -982,21 +1019,21 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, * vm - Virtual Machine * vaddr - Virtuall address to map * paddr - VM Physical Address - * size - The size of the range to map + * npages - The number of pages to map * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * * Return: None * - * Within the VM given by vm, creates a virtual translation for the - * page range starting at vaddr to the page range starting at paddr. + * Within the VM given by @vm, creates a virtual translation for + * @npages starting at @vaddr to the page range starting at @paddr. */ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - size_t size, uint32_t pgd_memslot) + unsigned int npages, uint32_t pgd_memslot) { size_t page_size = vm->page_size; - size_t npages = size / page_size; + size_t size = npages * page_size; TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); @@ -1028,8 +1065,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((gpa >= region->region.guest_phys_addr) && (gpa <= (region->region.guest_phys_addr + region->region.memory_size - 1))) @@ -1037,7 +1074,7 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) + (gpa - region->region.guest_phys_addr)); } - TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa); + TEST_FAIL("No vm physical memory at 0x%lx", gpa); return NULL; } @@ -1061,8 +1098,8 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + + list_for_each_entry(region, &vm->userspace_mem_regions, list) { if ((hva >= region->host_mem) && (hva <= (region->host_mem + region->region.memory_size - 1))) @@ -1071,8 +1108,7 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) + (hva - (uintptr_t) region->host_mem)); } - TEST_ASSERT(false, "No mapping to a guest physical address, " - "hva: %p", hva); + TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); return -1; } @@ -1171,6 +1207,15 @@ void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) ret, errno); } +void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_guest_debug *debug) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug); + + TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret); +} + /* * VM VCPU Set MP State * @@ -1490,8 +1535,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + list_for_each_entry(region, &vm->userspace_mem_regions, list) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -1510,7 +1554,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump(stream, vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); - for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next) + list_for_each_entry(vcpu, &vm->vcpus, list) vcpu_dump(stream, vm, vcpu->id, indent + 2); } @@ -1703,3 +1747,48 @@ unsigned int vm_get_max_gfn(struct kvm_vm *vm) { return vm->max_gfn; } + +int vm_get_fd(struct kvm_vm *vm) +{ + return vm->fd; +} + +static unsigned int vm_calc_num_pages(unsigned int num_pages, + unsigned int page_shift, + unsigned int new_page_shift, + bool ceil) +{ + unsigned int n = 1 << (new_page_shift - page_shift); + + if (page_shift >= new_page_shift) + return num_pages * (1 << (page_shift - new_page_shift)); + + return num_pages / n + !!(ceil && num_pages % n); +} + +static inline int getpageshift(void) +{ + return __builtin_ffs(getpagesize()) - 1; +} + +unsigned int +vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + return vm_calc_num_pages(num_guest_pages, + vm_guest_mode_params[mode].page_shift, + getpageshift(), true); +} + +unsigned int +vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages) +{ + return vm_calc_num_pages(num_host_pages, getpageshift(), + vm_guest_mode_params[mode].page_shift, false); +} + +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) +{ + unsigned int n; + n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); + return vm_adjust_num_guest_pages(mode, n); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index ac50c42750cf..2ef446520748 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -12,19 +12,7 @@ #define KVM_DEV_PATH "/dev/kvm" -#ifndef BITS_PER_BYTE -#define BITS_PER_BYTE 8 -#endif - -#ifndef BITS_PER_LONG -#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) -#endif - -#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) - struct userspace_mem_region { - struct userspace_mem_region *next, *prev; struct kvm_userspace_memory_region region; struct sparsebit *unused_phy_pages; int fd; @@ -32,10 +20,11 @@ struct userspace_mem_region { void *host_mem; void *mmap_start; size_t mmap_size; + struct list_head list; }; struct vcpu { - struct vcpu *next, *prev; + struct list_head list; uint32_t id; int fd; struct kvm_run *state; @@ -52,8 +41,8 @@ struct kvm_vm { unsigned int pa_bits; unsigned int va_bits; uint64_t max_gfn; - struct vcpu *vcpu_head; - struct userspace_mem_region *userspace_mem_region_head; + struct list_head vcpus; + struct list_head userspace_mem_regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; @@ -64,8 +53,56 @@ struct kvm_vm { }; struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. + */ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +/* + * Register Dump + * + * Input Args: + * stream - Output FILE stream + * regs - Registers + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the registers given by @regs, to the FILE stream + * given by @stream. + */ void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); + +/* + * System Register Dump + * + * Input Args: + * stream - Output FILE stream + * sregs - System registers + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps the state of the system registers given by @sregs, to the FILE stream + * given by @stream. + */ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); struct userspace_mem_region * diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 32a02360b1eb..a88c5d665725 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -51,22 +51,6 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * gva - VM Virtual Address - * gpa - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a virtual translation for the page - * starting at vaddr to the page starting at paddr. - */ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, uint32_t memslot) { @@ -107,26 +91,6 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, entry[idx] = gpa; } -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gpa - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Translates the VM virtual address given by gva to a VM physical - * address and then locates the memory region containing the VM - * physical address, within the VM given by vm. When found, the host - * virtual address providing the memory to the vm physical address is - * returned. - * A TEST_ASSERT failure occurs if no region containing translated - * VM virtual address exists. - */ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int ri, idx; @@ -196,21 +160,6 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump_region(stream, vm, indent, vm->pgd); } -/* - * Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * extra_mem_pages - The size of extra memories to add (this will - * decide how much extra space we will need to - * setup the page tables using mem slot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - */ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, void *guest_code) { @@ -231,13 +180,6 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, return vm; } -/* - * Adds a vCPU with reasonable defaults (i.e. a stack and initial PSW) - * - * Input Args: - * vcpuid - The id of the VCPU to add to the VM. - * guest_code - The vCPU's entry point - */ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); @@ -269,9 +211,32 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) run->psw_addr = (uintptr_t)guest_code; } +void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +{ + va_list ap; + struct kvm_regs regs; + int i; + + TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vm, vcpuid, ®s); + + for (i = 0; i < num; i++) + regs.gprs[i + 2] = va_arg(ap, uint64_t); + + vcpu_regs_set(vm, vcpuid, ®s); + va_end(ap); +} + void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { - struct vcpu *vcpu = vm->vcpu_head; + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + if (!vcpu) + return; fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c new file mode 100644 index 000000000000..689e97c27ee2 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/test_util.c + * + * Copyright (C) 2020, Google LLC. + */ +#include <stdlib.h> +#include <ctype.h> +#include <limits.h> +#include <assert.h> +#include "test_util.h" + +/* + * Parses "[0-9]+[kmgt]?". + */ +size_t parse_size(const char *size) +{ + size_t base; + char *scale; + int shift = 0; + + TEST_ASSERT(size && isdigit(size[0]), "Need at least one digit in '%s'", size); + + base = strtoull(size, &scale, 0); + + TEST_ASSERT(base != ULLONG_MAX, "Overflow parsing size!"); + + switch (tolower(*scale)) { + case 't': + shift = 40; + break; + case 'g': + shift = 30; + break; + case 'm': + shift = 20; + break; + case 'k': + shift = 10; + break; + case 'b': + case '\0': + shift = 0; + break; + default: + TEST_ASSERT(false, "Unknown size letter %c", *scale); + } + + TEST_ASSERT((base << shift) >> shift == base, "Overflow scaling size!"); + + return base << shift; +} + +int64_t timespec_to_ns(struct timespec ts) +{ + return (int64_t)ts.tv_nsec + 1000000000LL * (int64_t)ts.tv_sec; +} + +struct timespec timespec_add_ns(struct timespec ts, int64_t ns) +{ + struct timespec res; + + res.tv_nsec = ts.tv_nsec + ns; + res.tv_sec = ts.tv_sec + res.tv_nsec / 1000000000LL; + res.tv_nsec %= 1000000000LL; + + return res; +} + +struct timespec timespec_add(struct timespec ts1, struct timespec ts2) +{ + int64_t ns1 = timespec_to_ns(ts1); + int64_t ns2 = timespec_to_ns(ts2); + return timespec_add_ns((struct timespec){0}, ns1 + ns2); +} + +struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) +{ + int64_t ns1 = timespec_to_ns(ts1); + int64_t ns2 = timespec_to_ns(ts2); + return timespec_add_ns((struct timespec){0}, ns1 - ns2); +} + +void print_skip(const char *fmt, ...) +{ + va_list ap; + + assert(fmt); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + puts(", skipping test"); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 683d3bdb8f6a..f6eb34eaa0d2 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -77,20 +77,6 @@ struct pageTableEntry { uint64_t execute_disable:1; }; -/* Register Dump - * - * Input Args: - * indent - Left margin indent amount - * regs - register - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the registers given by regs, to the FILE stream - * given by steam. - */ void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { @@ -115,19 +101,20 @@ void regs_dump(FILE *stream, struct kvm_regs *regs, regs->rip, regs->rflags); } -/* Segment Dump +/* + * Segment Dump * * Input Args: - * indent - Left margin indent amount + * stream - Output FILE stream * segment - KVM segment + * indent - Left margin indent amount * - * Output Args: - * stream - Output FILE stream + * Output Args: None * * Return: None * - * Dumps the state of the KVM segment given by segment, to the FILE stream - * given by steam. + * Dumps the state of the KVM segment given by @segment, to the FILE stream + * given by @stream. */ static void segment_dump(FILE *stream, struct kvm_segment *segment, uint8_t indent) @@ -146,19 +133,20 @@ static void segment_dump(FILE *stream, struct kvm_segment *segment, segment->unusable, segment->padding); } -/* dtable Dump +/* + * dtable Dump * * Input Args: - * indent - Left margin indent amount + * stream - Output FILE stream * dtable - KVM dtable + * indent - Left margin indent amount * - * Output Args: - * stream - Output FILE stream + * Output Args: None * * Return: None * - * Dumps the state of the KVM dtable given by dtable, to the FILE stream - * given by steam. + * Dumps the state of the KVM dtable given by @dtable, to the FILE stream + * given by @stream. */ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, uint8_t indent) @@ -169,20 +157,6 @@ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, dtable->padding[0], dtable->padding[1], dtable->padding[2]); } -/* System Register Dump - * - * Input Args: - * indent - Left margin indent amount - * sregs - System registers - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the system registers given by sregs, to the FILE stream - * given by steam. - */ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) { @@ -240,21 +214,6 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) } } -/* VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * pgd_memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a virtual translation for the page - * starting at vaddr to the page starting at paddr. - */ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, uint32_t pgd_memslot) { @@ -326,20 +285,6 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, pte[index[0]].present = 1; } -/* Virtual Translation Tables Dump - * - * Input Args: - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps to the FILE stream given by stream, the contents of all the - * virtual translation tables for the VM given by vm. - */ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { struct pageMapL4Entry *pml4e, *pml4e_start; @@ -421,7 +366,8 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } -/* Set Unusable Segment +/* + * Set Unusable Segment * * Input Args: None * @@ -430,7 +376,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) * * Return: None * - * Sets the segment register pointed to by segp to an unusable state. + * Sets the segment register pointed to by @segp to an unusable state. */ static void kvm_seg_set_unusable(struct kvm_segment *segp) { @@ -460,7 +406,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) } -/* Set Long Mode Flat Kernel Code Segment +/* + * Set Long Mode Flat Kernel Code Segment * * Input Args: * vm - VM whose GDT is being filled, or NULL to only write segp @@ -471,8 +418,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) * * Return: None * - * Sets up the KVM segment pointed to by segp, to be a code segment - * with the selector value given by selector. + * Sets up the KVM segment pointed to by @segp, to be a code segment + * with the selector value given by @selector. */ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) @@ -491,7 +438,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, kvm_seg_fill_gdt_64bit(vm, segp); } -/* Set Long Mode Flat Kernel Data Segment +/* + * Set Long Mode Flat Kernel Data Segment * * Input Args: * vm - VM whose GDT is being filled, or NULL to only write segp @@ -502,8 +450,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, * * Return: None * - * Sets up the KVM segment pointed to by segp, to be a data segment - * with the selector value given by selector. + * Sets up the KVM segment pointed to by @segp, to be a data segment + * with the selector value given by @selector. */ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) @@ -521,24 +469,6 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, kvm_seg_fill_gdt_64bit(vm, segp); } -/* Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gpa - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Translates the VM virtual address given by gva to a VM physical - * address and then locates the memory region containing the VM - * physical address, within the VM given by vm. When found, the host - * virtual address providing the memory to the vm physical address is returned. - * A TEST_ASSERT failure occurs if no region containing translated - * VM virtual address exists. - */ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; @@ -576,8 +506,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); unmapped_gva: - TEST_ASSERT(false, "No mapping for vm virtual address, " - "gva: 0x%lx", gva); + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(EXIT_FAILURE); } @@ -634,18 +563,13 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } sregs.cr3 = vm->pgd; vcpu_sregs_set(vm, vcpuid, &sregs); } -/* Adds a vCPU with reasonable defaults (i.e., a stack) - * - * Input Args: - * vcpuid - The id of the VCPU to add to the VM. - * guest_code - The vCPU's entry point - */ + void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) { struct kvm_mp_state mp_state; @@ -670,7 +594,8 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) vcpu_set_mp_state(vm, vcpuid, &mp_state); } -/* Allocate an instance of struct kvm_cpuid2 +/* + * Allocate an instance of struct kvm_cpuid2 * * Input Args: None * @@ -703,7 +628,8 @@ static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) return cpuid; } -/* KVM Supported CPUID Get +/* + * KVM Supported CPUID Get * * Input Args: None * @@ -735,11 +661,12 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; } -/* Locate a cpuid entry. +/* + * Locate a cpuid entry. * * Input Args: - * cpuid: The cpuid. * function: The function of the cpuid entry to find. + * index: The index of the cpuid entry. * * Output Args: None * @@ -766,7 +693,8 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) return entry; } -/* VM VCPU CPUID Set +/* + * VM VCPU CPUID Set * * Input Args: * vm - Virtual Machine @@ -793,20 +721,6 @@ void vcpu_set_cpuid(struct kvm_vm *vm, } -/* Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * extra_mem_pages - The size of extra memories to add (this will - * decide how much extra space we will need to - * setup the page tables using mem slot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - */ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, void *guest_code) { @@ -837,7 +751,8 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, return vm; } -/* VCPU Get MSR +/* + * VCPU Get MSR * * Input Args: * vm - Virtual Machine @@ -869,7 +784,8 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) return buffer.entry.data; } -/* _VCPU Set MSR +/* + * _VCPU Set MSR * * Input Args: * vm - Virtual Machine @@ -902,7 +818,8 @@ int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, return r; } -/* VCPU Set MSR +/* + * VCPU Set MSR * * Input Args: * vm - Virtual Machine @@ -926,22 +843,6 @@ void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, " rc: %i errno: %i", r, errno); } -/* VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first num function input arguments to the values - * given as variable args. Each of the variable args is expected to - * be of type uint64_t. - */ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) { va_list ap; @@ -976,22 +877,6 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) va_end(ap); } -/* - * VM VCPU Dump - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the current state of the VCPU specified by vcpuid, within the VM - * given by vm, to the FILE stream given by stream. - */ void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) { struct kvm_regs regs; diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 6e05a8fc3fe0..c42401068373 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -154,7 +154,7 @@ void nested_svm_check_supported(void) kvm_get_supported_cpuid_entry(0x80000001); if (!(entry->ecx & CPUID_SVM)) { - fprintf(stderr, "nested SVM not enabled, skipping test\n"); + print_skip("nested SVM not enabled"); exit(KSFT_SKIP); } } diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 7aaa99ca4dbc..4ae104f6ce69 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -17,6 +17,9 @@ bool enable_evmcs; +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + struct eptPageTableEntry { uint64_t readable:1; uint64_t writable:1; @@ -191,7 +194,7 @@ bool load_vmcs(struct vmx_pages *vmx) if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, vmx->enlightened_vmcs)) return false; - current_evmcs->revision_id = vmcs_revision(); + current_evmcs->revision_id = EVMCS_VERSION; } return true; @@ -381,7 +384,7 @@ void nested_vmx_check_supported(void) struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); if (!(entry->ecx & CPUID_VMX)) { - fprintf(stderr, "nested VMX not enabled, skipping test\n"); + print_skip("nested VMX not enabled"); exit(KSFT_SKIP); } } diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 9edaa9a134ce..9f49ead380ab 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -40,7 +40,7 @@ int main(int argc, char *argv[]) maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP); if (!maxsize) { - fprintf(stderr, "CAP_S390_MEM_OP not supported -> skip test\n"); + print_skip("CAP_S390_MEM_OP not supported"); exit(KSFT_SKIP); } if (maxsize > sizeof(mem1)) diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index 1485bc6c8999..b143db6d8693 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -20,29 +20,42 @@ struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS]; struct kvm_vm *vm; struct kvm_run *run; -struct kvm_sync_regs *regs; -static uint64_t regs_null[16]; - -static uint64_t crs[16] = { 0x40000ULL, - 0x42000ULL, - 0, 0, 0, 0, 0, - 0x43000ULL, - 0, 0, 0, 0, 0, - 0x44000ULL, - 0, 0 -}; +struct kvm_sync_regs *sync_regs; +static uint8_t regs_null[512]; static void guest_code_initial(void) { - /* Round toward 0 */ - uint32_t fpc = 0x11; + /* set several CRs to "safe" value */ + unsigned long cr2_59 = 0x10; /* enable guarded storage */ + unsigned long cr8_63 = 0x1; /* monitor mask = 1 */ + unsigned long cr10 = 1; /* PER START */ + unsigned long cr11 = -1; /* PER END */ + /* Dirty registers */ asm volatile ( - " lctlg 0,15,%0\n" - " sfpc %1\n" - : : "Q" (crs), "d" (fpc)); - GUEST_SYNC(0); + " lghi 2,0x11\n" /* Round toward 0 */ + " sfpc 2\n" /* set fpc to !=0 */ + " lctlg 2,2,%0\n" + " lctlg 8,8,%1\n" + " lctlg 10,10,%2\n" + " lctlg 11,11,%3\n" + /* now clobber some general purpose regs */ + " llihh 0,0xffff\n" + " llihl 1,0x5555\n" + " llilh 2,0xaaaa\n" + " llill 3,0x0000\n" + /* now clobber a floating point reg */ + " lghi 4,0x1\n" + " cdgbr 0,4\n" + /* now clobber an access reg */ + " sar 9,4\n" + /* We embed diag 501 here to control register content */ + " diag 0,0,0x501\n" + : + : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11) + /* no clobber list as this should not return */ + ); } static void test_one_reg(uint64_t id, uint64_t value) @@ -53,7 +66,7 @@ static void test_one_reg(uint64_t id, uint64_t value) reg.addr = (uintptr_t)&eval_reg; reg.id = id; vcpu_get_reg(vm, VCPU_ID, ®); - TEST_ASSERT(eval_reg == value, "value == %s", value); + TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } static void assert_noirq(void) @@ -87,6 +100,31 @@ static void assert_clear(void) vcpu_fpu_get(vm, VCPU_ID, &fpu); TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); + + /* sync regs */ + TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)), + "gprs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)), + "acrs0-15 == 0 (sync_regs)"); + + TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)), + "vrs0-15 == 0 (sync_regs)"); +} + +static void assert_initial_noclear(void) +{ + TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, + "gpr0 == 0xffff000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, + "gpr1 == 0x0000555500000000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL, + "gpr2 == 0x00000000aaaa0000 (sync_regs)"); + TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL, + "gpr3 == 0x0000000000000000 (sync_regs)"); + TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL, + "fpr0 == 0f1 (sync_regs)"); + TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); } static void assert_initial(void) @@ -94,12 +132,32 @@ static void assert_initial(void) struct kvm_sregs sregs; struct kvm_fpu fpu; + /* KVM_GET_SREGS */ vcpu_sregs_get(vm, VCPU_ID, &sregs); - TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0"); - TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000"); + TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (KVM_GET_SREGS)"); TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12), - "cr1-13 == 0"); - TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0"); + "cr1-13 == 0 (KVM_GET_SREGS)"); + TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)"); + + /* sync regs */ + TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL, + "cr14 == 0xC2000000 (sync_regs)"); + TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null, 8 * 12), + "cr1-13 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)"); + TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); + + /* kvm_run */ + TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); + TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); vcpu_fpu_get(vm, VCPU_ID, &fpu); TEST_ASSERT(!fpu.fpc, "fpc == 0"); @@ -111,9 +169,19 @@ static void assert_initial(void) test_one_reg(KVM_REG_S390_CLOCK_COMP, 0); } +static void assert_normal_noclear(void) +{ + TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); + TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); +} + static void assert_normal(void) { test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID, + "pft == 0xff..... (sync_regs)"); assert_noirq(); } @@ -134,53 +202,67 @@ static void inject_irq(int cpu_id) static void test_normal(void) { - printf("Testing normal reset\n"); + pr_info("Testing normal reset\n"); /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); - regs = &run->s.regs; + sync_regs = &run->s.regs; vcpu_run(vm, VCPU_ID); inject_irq(VCPU_ID); vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); + + /* must clears */ assert_normal(); + /* must not clears */ + assert_normal_noclear(); + assert_initial_noclear(); + kvm_vm_free(vm); } static void test_initial(void) { - printf("Testing initial reset\n"); + pr_info("Testing initial reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); - regs = &run->s.regs; + sync_regs = &run->s.regs; vcpu_run(vm, VCPU_ID); inject_irq(VCPU_ID); vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); + + /* must clears */ assert_normal(); assert_initial(); + /* must not clears */ + assert_initial_noclear(); + kvm_vm_free(vm); } static void test_clear(void) { - printf("Testing clear reset\n"); + pr_info("Testing clear reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); - regs = &run->s.regs; + sync_regs = &run->s.regs; vcpu_run(vm, VCPU_ID); inject_irq(VCPU_ID); vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); + + /* must clears */ assert_normal(); assert_initial(); assert_clear(); + kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index b705637ca14b..5731ccf34917 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -42,6 +42,13 @@ static void guest_code(void) " values did not match: 0x%llx, 0x%llx\n", \ left->reg, right->reg) +#define REG_COMPARE32(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%x, 0x%x\n", \ + left->reg, right->reg) + + static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right) { int i; @@ -55,7 +62,7 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) int i; for (i = 0; i < 16; i++) - REG_COMPARE(acrs[i]); + REG_COMPARE32(acrs[i]); for (i = 0; i < 16; i++) REG_COMPARE(crs[i]); @@ -79,7 +86,7 @@ int main(int argc, char *argv[]) cap = kvm_check_cap(KVM_CAP_SYNC_REGS); if (!cap) { - fprintf(stderr, "CAP_SYNC_REGS not supported, skipping test\n"); + print_skip("CAP_SYNC_REGS not supported"); exit(KSFT_SKIP); } @@ -155,7 +162,7 @@ int main(int argc, char *argv[]) "r11 sync regs value incorrect 0x%llx.", run->s.regs.gprs[11]); TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, - "acr0 sync regs value incorrect 0x%llx.", + "acr0 sync regs value incorrect 0x%x.", run->s.regs.acrs[0]); vcpu_regs_get(vm, VCPU_ID, ®s); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c new file mode 100644 index 000000000000..b3ece55a2da6 --- /dev/null +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/compiler.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +/* + * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a + * 2MB sized and aligned region so that the initial region corresponds to + * exactly one large page. + */ +#define MEM_REGION_SIZE 0x200000 + +#ifdef __x86_64__ +/* + * Somewhat arbitrary location and slot, intended to not overlap anything. + */ +#define MEM_REGION_GPA 0xc0000000 +#define MEM_REGION_SLOT 10 + +static const uint64_t MMIO_VAL = 0xbeefull; + +extern const uint64_t final_rip_start; +extern const uint64_t final_rip_end; + +static sem_t vcpu_ready; + +static inline uint64_t guest_spin_on_val(uint64_t spin_val) +{ + uint64_t val; + + do { + val = READ_ONCE(*((uint64_t *)MEM_REGION_GPA)); + } while (val == spin_val); + + GUEST_SYNC(0); + return val; +} + +static void *vcpu_worker(void *data) +{ + struct kvm_vm *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + /* + * Loop until the guest is done. Re-enter the guest on all MMIO exits, + * which will occur if the guest attempts to access a memslot after it + * has been deleted or while it is being moved . + */ + run = vcpu_state(vm, VCPU_ID); + + while (1) { + vcpu_run(vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + + TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + memcpy(run->mmio.data, &MMIO_VAL, 8); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); + + /* Wait for the vCPU thread to reenter the guest. */ + usleep(100000); +} + +static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) +{ + struct kvm_vm *vm; + uint64_t *hva; + uint64_t gpa; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + + /* + * Allocate and map two pages so that the GPA accessed by guest_code() + * stays valid across the memslot move. + */ + gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); + + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + + /* Ditto for the host mapping so that both pages can be zeroed. */ + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, 2 * 4096); + + pthread_create(vcpu_thread, NULL, vcpu_worker, vm); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); + + return vm; +} + + +static void guest_code_move_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* + * Spin until the memory region is moved to a misaligned address. This + * may or may not trigger MMIO, as the window where the memslot is + * invalid is quite small. + */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the memory region is realigned. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1, val); + + GUEST_DONE(); +} + +static void test_move_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_vm *vm; + uint64_t *hva; + + vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region); + + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + + /* + * Shift the region's base GPA. The guest should not see "2" as the + * hva->gpa translation is misaligned, i.e. the guest is accessing a + * different host pfn. + */ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096); + WRITE_ONCE(*hva, 2); + + /* + * The guest _might_ see an invalid memslot and trigger MMIO, but it's + * a tiny window. Spin and defer the sync until the memslot is + * restored and guest behavior is once again deterministic. + */ + usleep(100000); + + /* + * Note, value in memory needs to be changed *before* restoring the + * memslot, else the guest could race the update and see "2". + */ + WRITE_ONCE(*hva, 1); + + /* Restore the original base, the guest should see "1". */ + vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA); + wait_for_vcpu(); + /* Defered sync from when the memslot was misaligned (above). */ + wait_for_vcpu(); + + pthread_join(vcpu_thread, NULL); + + kvm_vm_free(vm); +} + +static void guest_code_delete_memory_region(void) +{ + uint64_t val; + + GUEST_SYNC(0); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + /* Spin until the memory region is recreated. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 0, val); + + /* Spin until the memory region is deleted. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == MMIO_VAL, val); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_start\n\t" + "final_rip_start: .quad 1b\n\t" + ".popsection"); + + /* Spin indefinitely (until the code memslot is deleted). */ + guest_spin_on_val(MMIO_VAL); + + asm("1:\n\t" + ".pushsection .rodata\n\t" + ".global final_rip_end\n\t" + "final_rip_end: .quad 1b\n\t" + ".popsection"); + + GUEST_ASSERT_1(0, 0); +} + +static void test_delete_memory_region(void) +{ + pthread_t vcpu_thread; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + + vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region); + + /* Delete the memory region, the guest should not die. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* Recreate the memory region. The guest should see "0". */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / getpagesize(), 0); + wait_for_vcpu(); + + /* Delete the region again so that there's only one memslot left. */ + vm_mem_region_delete(vm, MEM_REGION_SLOT); + wait_for_vcpu(); + + /* + * Delete the primary memslot. This should cause an emulation error or + * shutdown due to the page tables getting nuked. + */ + vm_mem_region_delete(vm, 0); + + pthread_join(vcpu_thread, NULL); + + run = vcpu_state(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN || + run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit reason = %d", run->exit_reason); + + vcpu_regs_get(vm, VCPU_ID, ®s); + + /* + * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, + * so the instruction pointer would point to the reset vector. + */ + if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) + TEST_ASSERT(regs.rip >= final_rip_start && + regs.rip < final_rip_end, + "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n", + final_rip_start, final_rip_end, regs.rip); + + kvm_vm_free(vm); +} + +static void test_zero_memory_regions(void) +{ + struct kvm_run *run; + struct kvm_vm *vm; + + pr_info("Testing KVM_RUN with zero added memory regions\n"); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + + TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), + "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno); + vcpu_run(vm, VCPU_ID); + + run = vcpu_state(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Unexpected exit_reason = %u\n", run->exit_reason); + + kvm_vm_free(vm); +} +#endif /* __x86_64__ */ + +/* + * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any + * tentative to add further slots should fail. + */ +static void test_add_max_memory_regions(void) +{ + int ret; + struct kvm_vm *vm; + uint32_t max_mem_slots; + uint32_t slot; + uint64_t guest_addr = 0x0; + uint64_t mem_reg_npages; + void *mem; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 0, + "KVM_CAP_NR_MEMSLOTS should be greater than 0"); + pr_info("Allowed number of memory slots: %i\n", max_mem_slots); + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + + mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE); + + /* Check it can be added memory slots up to the maximum allowed */ + pr_info("Adding slots 0..%i, each memory region with %dK size\n", + (max_mem_slots - 1), MEM_REGION_SIZE >> 10); + for (slot = 0; slot < max_mem_slots; slot++) { + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, mem_reg_npages, + 0); + guest_addr += MEM_REGION_SIZE; + } + + /* Check it cannot be added memory slots beyond the limit */ + mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + + ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, + &(struct kvm_userspace_memory_region) {slot, 0, guest_addr, + MEM_REGION_SIZE, (uint64_t) mem}); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Adding one more memory slot should fail with EINVAL"); + + munmap(mem, MEM_REGION_SIZE); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ +#ifdef __x86_64__ + int i, loops; +#endif + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + +#ifdef __x86_64__ + /* + * FIXME: the zero-memslot test fails on aarch64 and s390x because + * KVM_RUN fails with ENOEXEC or EFAULT. + */ + test_zero_memory_regions(); +#endif + + test_add_max_memory_regions(); + +#ifdef __x86_64__ + if (argc > 1) + loops = atoi(argv[1]); + else + loops = 10; + + pr_info("Testing MOVE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_move_memory_region(); + + pr_info("Testing DELETE of in-use region, %d loops\n", loops); + for (i = 0; i < loops; i++) + test_delete_memory_region(); +#endif + + return 0; +} diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c new file mode 100644 index 000000000000..fcc840088c91 --- /dev/null +++ b/tools/testing/selftests/kvm/steal_time.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * steal/stolen time test + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#define _GNU_SOURCE +#include <stdio.h> +#include <time.h> +#include <sched.h> +#include <pthread.h> +#include <linux/kernel.h> +#include <sys/syscall.h> +#include <asm/kvm.h> +#include <asm/kvm_para.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define NR_VCPUS 4 +#define ST_GPA_BASE (1 << 30) +#define MIN_RUN_DELAY_NS 200000UL + +static void *st_gva[NR_VCPUS]; +static uint64_t guest_stolen_time[NR_VCPUS]; + +#if defined(__x86_64__) + +/* steal_time must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct kvm_steal_time) + 63) & ~63) + +static void check_status(struct kvm_steal_time *st) +{ + GUEST_ASSERT(!(READ_ONCE(st->version) & 1)); + GUEST_ASSERT(READ_ONCE(st->flags) == 0); + GUEST_ASSERT(READ_ONCE(st->preempted) == 0); +} + +static void guest_code(int cpu) +{ + struct kvm_steal_time *st = st_gva[cpu]; + uint32_t version; + + GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED)); + + memset(st, 0, sizeof(*st)); + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + version = READ_ONCE(st->version); + check_status(st); + GUEST_SYNC(1); + + check_status(st); + GUEST_ASSERT(version < READ_ONCE(st->version)); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + check_status(st); + GUEST_DONE(); +} + +static void steal_time_init(struct kvm_vm *vm) +{ + int i; + + if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax & + KVM_FEATURE_STEAL_TIME)) { + print_skip("steal-time not supported"); + exit(KSFT_SKIP); + } + + for (i = 0; i < NR_VCPUS; ++i) { + int ret; + + vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid()); + + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); + + ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); + TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); + + vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); + } +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + int i; + + pr_info("VCPU%d:\n", vcpuid); + pr_info(" steal: %lld\n", st->steal); + pr_info(" version: %d\n", st->version); + pr_info(" flags: %d\n", st->flags); + pr_info(" preempted: %d\n", st->preempted); + pr_info(" u8_pad: "); + for (i = 0; i < 3; ++i) + pr_info("%d", st->u8_pad[i]); + pr_info("\n pad: "); + for (i = 0; i < 11; ++i) + pr_info("%d", st->pad[i]); + pr_info("\n"); +} + +#elif defined(__aarch64__) + +/* PV_TIME_ST must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) + +#define SMCCC_ARCH_FEATURES 0x80000001 +#define PV_TIME_FEATURES 0xc5000020 +#define PV_TIME_ST 0xc5000021 + +struct st_time { + uint32_t rev; + uint32_t attr; + uint64_t st_time; +}; + +static int64_t smccc(uint32_t func, uint32_t arg) +{ + unsigned long ret; + + asm volatile( + "mov x0, %1\n" + "mov x1, %2\n" + "hvc #0\n" + "mov %0, x0\n" + : "=r" (ret) : "r" (func), "r" (arg) : + "x0", "x1", "x2", "x3"); + + return ret; +} + +static void check_status(struct st_time *st) +{ + GUEST_ASSERT(READ_ONCE(st->rev) == 0); + GUEST_ASSERT(READ_ONCE(st->attr) == 0); +} + +static void guest_code(int cpu) +{ + struct st_time *st; + int64_t status; + + status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES); + GUEST_ASSERT(status == 0); + status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES); + GUEST_ASSERT(status == 0); + status = smccc(PV_TIME_FEATURES, PV_TIME_ST); + GUEST_ASSERT(status == 0); + + status = smccc(PV_TIME_ST, 0); + GUEST_ASSERT(status != -1); + GUEST_ASSERT(status == (ulong)st_gva[cpu]); + + st = (struct st_time *)status; + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->st_time); + GUEST_SYNC(1); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->st_time); + GUEST_DONE(); +} + +static void steal_time_init(struct kvm_vm *vm) +{ + struct kvm_device_attr dev = { + .group = KVM_ARM_VCPU_PVTIME_CTRL, + .attr = KVM_ARM_VCPU_PVTIME_IPA, + }; + int i, ret; + + ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); + if (ret != 0 && errno == ENXIO) { + print_skip("steal-time not supported"); + exit(KSFT_SKIP); + } + + for (i = 0; i < NR_VCPUS; ++i) { + uint64_t st_ipa; + + vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev); + + dev.addr = (uint64_t)&st_ipa; + + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); + + st_ipa = (ulong)st_gva[i] | 1; + ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); + + st_ipa = (ulong)st_gva[i]; + vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + + ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); + + } +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + + pr_info("VCPU%d:\n", vcpuid); + pr_info(" rev: %d\n", st->rev); + pr_info(" attr: %d\n", st->attr); + pr_info(" st_time: %ld\n", st->st_time); +} + +#endif + +static long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + fscanf(fp, "%ld %ld ", &val[0], &val[1]); + fclose(fp); + + return val[1]; +} + +static void *do_steal_time(void *arg) +{ + struct timespec ts, stop; + + clock_gettime(CLOCK_MONOTONIC, &ts); + stop = timespec_add_ns(ts, MIN_RUN_DELAY_NS); + + while (1) { + clock_gettime(CLOCK_MONOTONIC, &ts); + if (timespec_to_ns(timespec_sub(ts, stop)) >= 0) + break; + } + + return NULL; +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct ucall uc; + + vcpu_args_set(vm, vcpuid, 1, vcpuid); + + vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +int main(int ac, char **av) +{ + struct kvm_vm *vm; + pthread_attr_t attr; + pthread_t thread; + cpu_set_t cpuset; + unsigned int gpages; + long stolen_time; + long run_delay; + bool verbose; + int i; + + verbose = ac > 1 && (!strncmp(av[1], "-v", 3) || !strncmp(av[1], "--verbose", 10)); + + /* Set CPU affinity so we can force preemption of the VCPU */ + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_attr_init(&attr); + pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + + /* Create a one VCPU guest and an identity mapped memslot for the steal time structure */ + vm = vm_create_default(0, 0, guest_code); + gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0); + ucall_init(vm, NULL); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vm_vcpu_add_default(vm, i, guest_code); + + steal_time_init(vm); + + /* Run test on each VCPU */ + for (i = 0; i < NR_VCPUS; ++i) { + /* First VCPU run initializes steal-time */ + run_vcpu(vm, i); + + /* Second VCPU run, expect guest stolen time to be <= run_delay */ + run_vcpu(vm, i); + sync_global_from_guest(vm, guest_stolen_time[i]); + stolen_time = guest_stolen_time[i]; + run_delay = get_run_delay(); + TEST_ASSERT(stolen_time <= run_delay, + "Expected stolen time <= %ld, got %ld", + run_delay, stolen_time); + + /* Steal time from the VCPU. The steal time thread has the same CPU affinity as the VCPUs. */ + run_delay = get_run_delay(); + pthread_create(&thread, &attr, do_steal_time, NULL); + do + pthread_yield(); + while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS); + pthread_join(thread, NULL); + run_delay = get_run_delay() - run_delay; + TEST_ASSERT(run_delay >= MIN_RUN_DELAY_NS, + "Expected run_delay >= %ld, got %ld", + MIN_RUN_DELAY_NS, run_delay); + + /* Run VCPU again to confirm stolen time is consistent with run_delay */ + run_vcpu(vm, i); + sync_global_from_guest(vm, guest_stolen_time[i]); + stolen_time = guest_stolen_time[i] - stolen_time; + TEST_ASSERT(stolen_time >= run_delay, + "Expected stolen time >= %ld, got %ld", + run_delay, stolen_time); + + if (verbose) { + pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, + guest_stolen_time[i], stolen_time); + if (stolen_time == run_delay) + pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); + pr_info("\n"); + steal_time_dump(vm, i); + } + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 63cc9c3f5ab6..140e91901582 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) entry = kvm_get_supported_cpuid_entry(1); if (!(entry->ecx & X86_FEATURE_XSAVE)) { - printf("XSAVE feature not supported, skipping test\n"); + print_skip("XSAVE feature not supported"); return 0; } @@ -101,12 +101,12 @@ int main(int argc, char *argv[]) vcpu_sregs_set(vm, VCPU_ID, &sregs); break; case UCALL_ABORT: - TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); + TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); break; case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } } diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c new file mode 100644 index 000000000000..8162c58a1234 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest debug register tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <stdio.h> +#include <string.h> +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +#define DR6_BD (1 << 13) +#define DR7_GD (1 << 13) + +/* For testing data access debug BP */ +uint32_t guest_value; + +extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; + +static void guest_code(void) +{ + /* + * Software BP tests. + * + * NOTE: sw_bp need to be before the cmd here, because int3 is an + * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we + * capture it using the vcpu exception bitmap). + */ + asm volatile("sw_bp: int3"); + + /* Hardware instruction BP test */ + asm volatile("hw_bp: nop"); + + /* Hardware data BP test */ + asm volatile("mov $1234,%%rax;\n\t" + "mov %%rax,%0;\n\t write_data:" + : "=m" (guest_value) : : "rax"); + + /* Single step test, covers 2 basic instructions and 2 emulated */ + asm volatile("ss_start: " + "xor %%rax,%%rax\n\t" + "cpuid\n\t" + "movl $0x1a0,%%ecx\n\t" + "rdmsr\n\t" + : : : "rax", "ecx"); + + /* DR6.BD test */ + asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); + GUEST_DONE(); +} + +#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug)) +#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug) +#define CAST_TO_RIP(v) ((unsigned long long)&(v)) +#define SET_RIP(v) do { \ + vcpu_regs_get(vm, VCPU_ID, ®s); \ + regs.rip = (v); \ + vcpu_regs_set(vm, VCPU_ID, ®s); \ + } while (0) +#define MOVE_RIP(v) SET_RIP(regs.rip + (v)); + +int main(void) +{ + struct kvm_guest_debug debug; + unsigned long long target_dr6, target_rip; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + uint64_t cmd; + int i; + /* Instruction lengths starting at ss_start */ + int ss_size[4] = { + 3, /* xor */ + 2, /* cpuid */ + 5, /* mov */ + 2, /* rdmsr */ + }; + + if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { + print_skip("KVM_CAP_SET_GUEST_DEBUG not supported"); + return 0; + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + /* Test software BPs - int3 */ + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == BP_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(sw_bp), + "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(sw_bp)); + MOVE_RIP(1); + + /* Test instruction HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); + debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(hw_bp) && + run->debug.arch.dr6 == target_dr6, + "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(hw_bp), + run->debug.arch.dr6, target_dr6); + } + /* Skip "nop" */ + MOVE_RIP(1); + + /* Test data access HW BP over DR[0-3] */ + for (i = 0; i < 4; i++) { + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); + debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | + (0x000d0000UL << (4*i)); + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | (1UL << i); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(write_data) && + run->debug.arch.dr6 == target_dr6, + "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(write_data), + run->debug.arch.dr6, target_dr6); + /* Rollback the 4-bytes "mov" */ + MOVE_RIP(-7); + } + /* Skip the 4-bytes "mov" */ + MOVE_RIP(7); + + /* Test single step */ + target_rip = CAST_TO_RIP(ss_start); + target_dr6 = 0xffff4ff0ULL; + vcpu_regs_get(vm, VCPU_ID, ®s); + for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { + target_rip += ss_size[i]; + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + debug.arch.debugreg[7] = 0x00000400; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == target_rip && + run->debug.arch.dr6 == target_dr6, + "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + i, run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + } + + /* Finally test global disable */ + CLEAR_DEBUG(); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; + debug.arch.debugreg[7] = 0x400 | DR7_GD; + APPLY_DEBUG(); + vcpu_run(vm, VCPU_ID); + target_dr6 = 0xffff0ff0 | DR6_BD; + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(bd_start) && + run->debug.arch.dr6 == target_dr6, + "DR7.GD: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, target_rip, run->debug.arch.dr6, + target_dr6); + + /* Disable all debug controls, run to the end */ + CLEAR_DEBUG(); + APPLY_DEBUG(); + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO"); + cmd = get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index 92915e6408e7..e6e62e5e75b2 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -21,10 +21,10 @@ void l2_guest_code(void) { - GUEST_SYNC(6); - GUEST_SYNC(7); + GUEST_SYNC(8); + /* Done, exit to L1 and never come back. */ vmcall(); } @@ -50,12 +50,17 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(5); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); + current_evmcs->revision_id = -1u; + GUEST_ASSERT(vmlaunch()); + current_evmcs->revision_id = EVMCS_VERSION; + GUEST_SYNC(6); + GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - GUEST_SYNC(8); + GUEST_SYNC(9); GUEST_ASSERT(!vmresume()); GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(9); + GUEST_SYNC(10); } void guest_code(struct vmx_pages *vmx_pages) @@ -67,6 +72,10 @@ void guest_code(struct vmx_pages *vmx_pages) l1_guest_code(vmx_pages); GUEST_DONE(); + + /* Try enlightened vmptrld with an incorrect GPA */ + evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); + GUEST_ASSERT(vmlaunch()); } int main(int argc, char *argv[]) @@ -87,7 +96,7 @@ int main(int argc, char *argv[]) if (!kvm_check_cap(KVM_CAP_NESTED_STATE) || !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - printf("capabilities not available, skipping test\n"); + print_skip("capabilities not available"); exit(KSFT_SKIP); } @@ -109,20 +118,20 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], - __FILE__, uc.args[1]); + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); /* NOT REACHED */ case UCALL_SYNC: break; case UCALL_DONE: - goto done; + goto part1_done; default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } /* UCALL_SYNC is handled here. */ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx", + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); state = vcpu_save_state(vm, VCPU_ID); @@ -147,6 +156,10 @@ int main(int argc, char *argv[]) (ulong) regs2.rdi, (ulong) regs2.rsi); } -done: +part1_done: + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, + "Unexpected successful VMEnter with invalid eVMCS pointer!"); + kvm_vm_free(vm); } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 443a2b54645b..4a7967cca281 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -26,18 +26,18 @@ static void guest_code(void) { } -static int smt_possible(void) +static bool smt_possible(void) { char buf[16]; FILE *f; - bool res = 1; + bool res = true; f = fopen("/sys/devices/system/cpu/smt/control", "r"); if (f) { if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { if (!strncmp(buf, "forceoff", 8) || !strncmp(buf, "notsupported", 12)) - res = 0; + res = false; } fclose(f); } @@ -46,29 +46,31 @@ static int smt_possible(void) } static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, - int evmcs_enabled) + bool evmcs_enabled) { int i; + int nent = 9; + u32 test_val; - if (!evmcs_enabled) - TEST_ASSERT(hv_cpuid_entries->nent == 6, - "KVM_GET_SUPPORTED_HV_CPUID should return 6 entries" - " when Enlightened VMCS is disabled (returned %d)", - hv_cpuid_entries->nent); - else - TEST_ASSERT(hv_cpuid_entries->nent == 7, - "KVM_GET_SUPPORTED_HV_CPUID should return 7 entries" - " when Enlightened VMCS is enabled (returned %d)", - hv_cpuid_entries->nent); + if (evmcs_enabled) + nent += 1; /* 0x4000000A */ + + TEST_ASSERT(hv_cpuid_entries->nent == nent, + "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" + " with evmcs=%d (returned %d)", + nent, evmcs_enabled, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; TEST_ASSERT((entry->function >= 0x40000000) && - (entry->function <= 0x4000000A), - "function %lx is our of supported range", + (entry->function <= 0x40000082), + "function %x is our of supported range", entry->function); + TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A), + "0x4000000A leaf should not be reported"); + TEST_ASSERT(entry->index == 0, ".index field should be zero"); @@ -78,12 +80,23 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && !entry->padding[2], "padding should be zero"); - if (entry->function == 0x40000004) { - int nononarchcs = !!(entry->eax & (1UL << 18)); + switch (entry->function) { + case 0x40000000: + test_val = 0x40000082; - TEST_ASSERT(nononarchcs == !smt_possible(), + TEST_ASSERT(entry->eax == test_val, + "Wrong max leaf report in 0x40000000.EAX: %x" + " (evmcs=%d)", + entry->eax, evmcs_enabled + ); + break; + case 0x40000004: + test_val = entry->eax & (1UL << 18); + + TEST_ASSERT(!!test_val == !smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); + break; } /* @@ -133,50 +146,44 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm) int main(int argc, char *argv[]) { struct kvm_vm *vm; - int rv; + int rv, stage; struct kvm_cpuid2 *hv_cpuid_entries; + bool evmcs_enabled; /* Tell stdout not to buffer its content */ setbuf(stdout, NULL); rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID); if (!rv) { - fprintf(stderr, - "KVM_CAP_HYPERV_CPUID not supported, skip test\n"); + print_skip("KVM_CAP_HYPERV_CPUID not supported"); exit(KSFT_SKIP); } - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - test_hv_cpuid_e2big(vm); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); - if (!hv_cpuid_entries) - return 1; - - test_hv_cpuid(hv_cpuid_entries, 0); - - free(hv_cpuid_entries); + for (stage = 0; stage < 3; stage++) { + evmcs_enabled = false; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + switch (stage) { + case 0: + test_hv_cpuid_e2big(vm); + continue; + case 1: + break; + case 2: + if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + continue; + } + vcpu_enable_evmcs(vm, VCPU_ID); + evmcs_enabled = true; + break; + } - if (!kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - fprintf(stderr, - "Enlightened VMCS is unsupported, skip related test\n"); - goto vm_free; + hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); + test_hv_cpuid(hv_cpuid_entries, evmcs_enabled); + free(hv_cpuid_entries); + kvm_vm_free(vm); } - vcpu_enable_evmcs(vm, VCPU_ID); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); - if (!hv_cpuid_entries) - return 1; - - test_hv_cpuid(hv_cpuid_entries, 1); - - free(hv_cpuid_entries); - -vm_free: - kvm_vm_free(vm); - return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c index 00bb97d76000..e6480fd5c4bd 100644 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c @@ -44,7 +44,7 @@ void *thr(void *arg) struct kvm_run *run = tc->run; res = ioctl(kvmcpu, KVM_RUN, 0); - printf("ret1=%d exit_reason=%d suberror=%d\n", + pr_info("ret1=%d exit_reason=%d suberror=%d\n", res, run->exit_reason, run->internal.suberror); return 0; @@ -93,12 +93,12 @@ int main(void) int warnings_before, warnings_after; if (!is_intel_cpu()) { - printf("Must be run on an Intel CPU, skipping test\n"); + print_skip("Must be run on an Intel CPU"); exit(KSFT_SKIP); } if (vm_is_unrestricted_guest(NULL)) { - printf("Unrestricted guest must be disabled, skipping test\n"); + print_skip("Unrestricted guest must be disabled"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index f9334bd3cce9..1e89688cbbbf 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -58,8 +58,7 @@ static void test_msr_platform_info_enabled(struct kvm_vm *vm) exit_reason_str(run->exit_reason)); get_ucall(vm, VCPU_ID, &uc); TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %u\n", - ucall); + "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == MSR_PLATFORM_INFO_MAX_TURBO_RATIO, "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", @@ -89,8 +88,7 @@ int main(int argc, char *argv[]) rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); if (!rv) { - fprintf(stderr, - "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n"); + print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index 8c063646f2a0..6f8f478b3ceb 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -17,6 +17,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #define VCPU_ID 1 @@ -58,7 +59,7 @@ void self_smi(void) APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); } -void guest_code(struct vmx_pages *vmx_pages) +void guest_code(void *arg) { uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); @@ -72,8 +73,11 @@ void guest_code(struct vmx_pages *vmx_pages) sync_with_host(4); - if (vmx_pages) { - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + if (arg) { + if (cpu_has_svm()) + generic_svm_setup(arg, NULL, NULL); + else + GUEST_ASSERT(prepare_for_vmx_operation(arg)); sync_with_host(5); @@ -87,7 +91,7 @@ void guest_code(struct vmx_pages *vmx_pages) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva = 0; + vm_vaddr_t nested_gva = 0; struct kvm_regs regs; struct kvm_vm *vm; @@ -114,10 +118,13 @@ int main(int argc, char *argv[]) vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + if (kvm_get_supported_cpuid_entry(0x80000001)->ecx & CPUID_SVM) + vcpu_alloc_svm(vm, &nested_gva); + else + vcpu_alloc_vmx(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); } else { - printf("will skip SMM test with VMX enabled\n"); + pr_info("will skip SMM test with VMX enabled\n"); vcpu_args_set(vm, VCPU_ID, 1, 0); } diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 3ab5ec3da9f4..d43b6f99b66c 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -18,14 +18,46 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #define VCPU_ID 5 +#define L2_GUEST_STACK_SIZE 256 -void l2_guest_code(void) +void svm_l2_guest_code(void) { + GUEST_SYNC(4); + /* Exit to L1 */ + vmcall(); GUEST_SYNC(6); + /* Done, exit to L1 and never come back. */ + vmcall(); +} - /* Exit to L1 */ +static void svm_l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. */ + generic_svm_setup(svm, svm_l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(3); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(5); + vmcb->save.rip += 3; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(7); +} + +void vmx_l2_guest_code(void) +{ + GUEST_SYNC(6); + + /* Exit to L1 */ vmcall(); /* L1 has now set up a shadow VMCS for us. */ @@ -42,10 +74,9 @@ void l2_guest_code(void) vmcall(); } -void l1_guest_code(struct vmx_pages *vmx_pages) +static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -56,7 +87,7 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_SYNC(4); GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); - prepare_vmcs(vmx_pages, l2_guest_code, + prepare_vmcs(vmx_pages, vmx_l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(5); @@ -106,20 +137,24 @@ void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmresume()); } -void guest_code(struct vmx_pages *vmx_pages) +static void __attribute__((__flatten__)) guest_code(void *arg) { GUEST_SYNC(1); GUEST_SYNC(2); - if (vmx_pages) - l1_guest_code(vmx_pages); + if (arg) { + if (cpu_has_svm()) + svm_l1_guest_code(arg); + else + vmx_l1_guest_code(arg); + } GUEST_DONE(); } int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva = 0; + vm_vaddr_t nested_gva = 0; struct kvm_regs regs1, regs2; struct kvm_vm *vm; @@ -136,10 +171,13 @@ int main(int argc, char *argv[]) vcpu_regs_get(vm, VCPU_ID, ®s1); if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + if (kvm_get_supported_cpuid_entry(0x80000001)->ecx & CPUID_SVM) + vcpu_alloc_svm(vm, &nested_gva); + else + vcpu_alloc_vmx(vm, &nested_gva); + vcpu_args_set(vm, VCPU_ID, 1, nested_gva); } else { - printf("will skip nested state checks\n"); + pr_info("will skip nested state checks\n"); vcpu_args_set(vm, VCPU_ID, 1, 0); } @@ -152,20 +190,20 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], - __FILE__, uc.args[1]); + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); /* NOT REACHED */ case UCALL_SYNC: break; case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } /* UCALL_SYNC is handled here. */ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Unexpected register values vmexit #%lx, got %lx", + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); state = vcpu_save_state(vm, VCPU_ID); diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index e280f68f6365..0e1adb4e3199 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -61,16 +61,14 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s", - (const char *)uc.args[0]); + TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ case UCALL_SYNC: break; case UCALL_DONE: goto done; default: - TEST_ASSERT(false, - "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); } } done: diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index 5c8224256294..d672f0a473f8 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -91,11 +91,11 @@ int main(int argc, char *argv[]) cap = kvm_check_cap(KVM_CAP_SYNC_REGS); if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) { - fprintf(stderr, "KVM_CAP_SYNC_REGS not supported, skipping test\n"); + print_skip("KVM_CAP_SYNC_REGS not supported"); exit(KSFT_SKIP); } if ((cap & INVALID_SYNC_FIELD) != 0) { - fprintf(stderr, "The \"invalid\" field is not invalid, skipping test\n"); + print_skip("The \"invalid\" field is not invalid"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index 5dfb53546a26..fe40ade06a49 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -78,10 +78,10 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s", (const char *)uc.args[0]); + TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } } } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index a223a6401258..e894a638a155 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -21,7 +21,7 @@ /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 -#define TEST_MEM_SIZE 3 +#define TEST_MEM_PAGES 3 /* L1 guest test virtual memory offset */ #define GUEST_TEST_MEM 0xc0000000 @@ -91,15 +91,14 @@ int main(int argc, char *argv[]) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, GUEST_TEST_MEM, TEST_MEM_SLOT_INDEX, - TEST_MEM_SIZE, + TEST_MEM_PAGES, KVM_MEM_LOG_DIRTY_PAGES); /* * Add an identity map for GVA range [0xc0000000, 0xc0002000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, - TEST_MEM_SIZE * 4096, 0); + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0); /* * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to @@ -113,11 +112,11 @@ int main(int argc, char *argv[]) nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); - bmap = bitmap_alloc(TEST_MEM_SIZE); + bmap = bitmap_alloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_SIZE * 4096); + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "Unexpected exit reason: %u (%s),\n", @@ -126,8 +125,8 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], - __FILE__, uc.args[1]); + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); /* NOT REACHED */ case UCALL_SYNC: /* @@ -152,7 +151,7 @@ int main(int argc, char *argv[]) done = true; break; default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } } } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c new file mode 100644 index 000000000000..cc72b6188ca7 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VMX-preemption timer test + * + * Copyright (C) 2020, Google, LLC. + * + * Test to ensure the VM-Enter after migration doesn't + * incorrectly restarts the timer with the full timer + * value instead of partially decayed timer value + * + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define VCPU_ID 5 +#define PREEMPTION_TIMER_VALUE 100000000ull +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull + +u32 vmx_pt_rate; +bool l2_save_restore_done; +static u64 l2_vmx_pt_start; +volatile u64 l2_vmx_pt_finish; + +void l2_guest_code(void) +{ + u64 vmx_pt_delta; + + vmcall(); + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + /* + * Wait until the 1st threshold has passed + */ + do { + l2_vmx_pt_finish = rdtsc(); + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >> + vmx_pt_rate; + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1); + + /* + * Force L2 through Save and Restore cycle + */ + GUEST_SYNC(1); + + l2_save_restore_done = 1; + + /* + * Now wait for the preemption timer to fire and + * exit to L1 + */ + while ((l2_vmx_pt_finish = rdtsc())) + ; +} + +void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 l1_vmx_pt_start; + u64 l1_vmx_pt_finish; + u64 l1_tsc_deadline, l2_tsc_deadline; + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Check for Preemption timer support + */ + basic.val = rdmsr(MSR_IA32_VMX_BASIC); + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS + : MSR_IA32_VMX_PINBASED_CTLS); + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS + : MSR_IA32_VMX_EXIT_CTLS); + + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + return; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); + + /* + * Turn on PIN control and resume the guest + */ + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL, + vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_VMX_PREEMPTION_TIMER)); + + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE, + PREEMPTION_TIMER_VALUE)); + + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; + + l2_save_restore_done = 0; + + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; + + GUEST_ASSERT(!vmresume()); + + l1_vmx_pt_finish = rdtsc(); + + /* + * Ensure exit from L2 happens after L2 goes through + * save and restore + */ + GUEST_ASSERT(l2_save_restore_done); + + /* + * Ensure the exit from L2 is due to preemption timer expiry + */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER); + + l1_tsc_deadline = l1_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + l2_tsc_deadline = l2_vmx_pt_start + + (PREEMPTION_TIMER_VALUE << vmx_pt_rate); + + /* + * Sync with the host and pass the l1|l2 pt_expiry_finish times and + * tsc deadlines so that host can verify they are as expected + */ + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline, + l2_vmx_pt_finish, l2_tsc_deadline); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + if (vmx_pages) + l1_guest_code(vmx_pages); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + + struct kvm_regs regs1, regs2; + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + /* + * AMD currently does not implement any VMX features, so for now we + * just early out. + */ + nested_vmx_check_supported(); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + vcpu_regs_get(vm, VCPU_ID, ®s1); + + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + } else { + pr_info("will skip vmx preemption timer checks\n"); + goto done; + } + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + /* + * If this stage 2 then we should verify the vmx pt expiry + * is as expected. + * From L1's perspective verify Preemption timer hasn't + * expired too early. + * From L2's perspective verify Preemption timer hasn't + * expired too late. + */ + if (stage == 2) { + + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n", + stage, uc.args[2], uc.args[3]); + + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n", + stage, uc.args[4], uc.args[5]); + + TEST_ASSERT(uc.args[2] >= uc.args[3], + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)", + stage, uc.args[2], uc.args[3]); + + TEST_ASSERT(uc.args[4] < uc.args[5], + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)", + stage, uc.args[4], uc.args[5]); + } + + state = vcpu_save_state(vm, VCPU_ID); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vm, VCPU_ID, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + kvm_vm_restart(vm, O_RDWR); + vm_vcpu_add(vm, VCPU_ID); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_load_state(vm, VCPU_ID, state); + run = vcpu_state(vm, VCPU_ID); + free(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vm, VCPU_ID, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index 9ef7fab39d48..54cdefdfb49d 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -212,7 +212,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) test_nested_state(vm, state); vcpu_nested_state_get(vm, VCPU_ID, state); TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, - "Size must be between %d and %d. The size returned was %d.", + "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); @@ -228,7 +228,7 @@ int main(int argc, char *argv[]) have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { - printf("KVM_CAP_NESTED_STATE not available, skipping test\n"); + print_skip("KVM_CAP_NESTED_STATE not available"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 69e482a95c47..fbe8417cbc2c 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -121,8 +121,8 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) static void report(int64_t val) { - printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", - val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE); + pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n", + val, val / TSC_ADJUST_VALUE, val % TSC_ADJUST_VALUE); } int main(int argc, char *argv[]) @@ -150,7 +150,7 @@ int main(int argc, char *argv[]) switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_ABORT: - TEST_ASSERT(false, "%s", (const char *)uc.args[0]); + TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ case UCALL_SYNC: report(uc.args[1]); @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) case UCALL_DONE: goto done; default: - TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); + TEST_FAIL("Unknown ucall %lu", uc.cmd); } } diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 851ea81b9d9f..3529376747c2 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES); } if (!xss_supported) { - printf("IA32_XSS is not supported by the vCPU.\n"); + print_skip("IA32_XSS is not supported by the vCPU"); exit(KSFT_SKIP); } diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 3ed0134a764d..b0556c752443 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -137,7 +137,8 @@ endif # Selftest makefiles can override those targets by setting # OVERRIDE_TARGETS = 1. ifeq ($(OVERRIDE_TARGETS),) -$(OUTPUT)/%:%.c +LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h +$(OUTPUT)/%:%.c $(LOCAL_HDRS) $(LINK.c) $^ $(LDLIBS) -o $@ $(OUTPUT)/%.o:%.S diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lkdtm/run.sh b/tools/testing/selftests/lkdtm/run.sh index dadf819148a4..ee64ff8df8f4 100755 --- a/tools/testing/selftests/lkdtm/run.sh +++ b/tools/testing/selftests/lkdtm/run.sh @@ -25,13 +25,13 @@ fi # Figure out which test to run from our script name. test=$(basename $0 .sh) # Look up details about the test from master list of LKDTM tests. -line=$(egrep '^#?'"$test"'\b' tests.txt) +line=$(grep -E '^#?'"$test"'\b' tests.txt) if [ -z "$line" ]; then echo "Skipped: missing test '$test' in tests.txt" exit $KSELFTEST_SKIP_TEST fi # Check that the test is known to LKDTM. -if ! egrep -q '^'"$test"'$' "$TRIGGER" ; then +if ! grep -E -q '^'"$test"'$' "$TRIGGER" ; then echo "Skipped: test '$test' missing in $TRIGGER!" exit $KSELFTEST_SKIP_TEST fi @@ -59,30 +59,32 @@ if [ -z "$expect" ]; then expect="call trace:" fi -# Clear out dmesg for output reporting -dmesg -c >/dev/null - # Prepare log for report checking -LOG=$(mktemp --tmpdir -t lkdtm-XXXXXX) +LOG=$(mktemp --tmpdir -t lkdtm-log-XXXXXX) +DMESG=$(mktemp --tmpdir -t lkdtm-dmesg-XXXXXX) cleanup() { - rm -f "$LOG" + rm -f "$LOG" "$DMESG" } trap cleanup EXIT +# Save existing dmesg so we can detect new content below +dmesg > "$DMESG" + # Most shells yell about signals and we're expecting the "cat" process # to usually be killed by the kernel. So we have to run it in a sub-shell # and silence errors. ($SHELL -c 'cat <(echo '"$test"') >'"$TRIGGER" 2>/dev/null) || true # Record and dump the results -dmesg -c >"$LOG" +dmesg | diff --changed-group-format='%>' --unchanged-group-format='' "$DMESG" - > "$LOG" || true + cat "$LOG" # Check for expected output -if egrep -qi "$expect" "$LOG" ; then +if grep -E -qi "$expect" "$LOG" ; then echo "$test: saw '$expect': ok" exit 0 else - if egrep -qi XFAIL: "$LOG" ; then + if grep -E -qi XFAIL: "$LOG" ; then echo "$test: saw 'XFAIL': [SKIP]" exit $KSELFTEST_SKIP_TEST else diff --git a/tools/testing/selftests/media_tests/.gitignore b/tools/testing/selftests/media_tests/.gitignore index 8745eba39012..da438e780ffe 100644 --- a/tools/testing/selftests/media_tests/.gitignore +++ b/tools/testing/selftests/media_tests/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only media_device_test media_device_open video_device_test diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore index f2f7ec0a99b4..f2fbba178601 100644 --- a/tools/testing/selftests/membarrier/.gitignore +++ b/tools/testing/selftests/membarrier/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only membarrier_test_multi_thread membarrier_test_single_thread diff --git a/tools/testing/selftests/memfd/.gitignore b/tools/testing/selftests/memfd/.gitignore index afe87c40ac80..dd9a051f608e 100644 --- a/tools/testing/selftests/memfd/.gitignore +++ b/tools/testing/selftests/memfd/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only fuse_mnt fuse_test memfd_test diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 53a848109f7b..4da8b565fa32 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -6,15 +6,25 @@ CFLAGS += -I../../../../usr/include/ TEST_GEN_PROGS := memfd_test TEST_PROGS := run_fuse_test.sh run_hugetlbfs_test.sh -TEST_GEN_FILES := fuse_mnt fuse_test +TEST_GEN_FILES := fuse_test fuse_mnt -fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) +VAR_CFLAGS := $(shell pkg-config fuse --cflags 2>/dev/null) +ifeq ($(VAR_CFLAGS),) +VAR_CFLAGS := -D_FILE_OFFSET_BITS=64 -I/usr/include/fuse +endif + +VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null) +ifeq ($(VAR_LDLIBS),) +VAR_LDLIBS := -lfuse -pthread +endif + +fuse_mnt.o: CFLAGS += $(VAR_CFLAGS) include ../lib.mk -$(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs) +$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS) -$(OUTPUT)/memfd_test: memfd_test.c common.o -$(OUTPUT)/fuse_test: fuse_test.c common.o +$(OUTPUT)/memfd_test: memfd_test.c common.c +$(OUTPUT)/fuse_test: fuse_test.c common.c -EXTRA_CLEAN = common.o +EXTRA_CLEAN = $(OUTPUT)/common.o diff --git a/tools/testing/selftests/mount/.gitignore b/tools/testing/selftests/mount/.gitignore index 856ad4107eb3..0bc64a6d4c18 100644 --- a/tools/testing/selftests/mount/.gitignore +++ b/tools/testing/selftests/mount/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only unprivileged-remount-test diff --git a/tools/testing/selftests/mqueue/.gitignore b/tools/testing/selftests/mqueue/.gitignore index d8d42377205a..72ad8ca691c9 100644 --- a/tools/testing/selftests/mqueue/.gitignore +++ b/tools/testing/selftests/mqueue/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only mq_open_tests mq_perf_tests diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index ecc52d4c034d..742c499328b2 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only msg_zerocopy socket psock_fanout @@ -23,3 +24,8 @@ so_txtime tcp_fastopen_backup_key nettest fin_ack_lat +reuseaddr_ports_exhausted +hwtstamp_config +rxtimestamp +timestamping +txtimestamp diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 4c1bd03ffa1c..895ec992b2f1 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -14,6 +14,9 @@ TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh +TEST_PROGS += reuseaddr_ports_exhausted.sh +TEST_PROGS += txtimestamp.sh +TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any @@ -22,6 +25,8 @@ TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr TEST_GEN_FILES += tcp_fastopen_backup_key TEST_GEN_FILES += fin_ack_lat +TEST_GEN_FILES += reuseaddr_ports_exhausted +TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b8503a8119b0..3b42c06b5985 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -12,6 +12,7 @@ CONFIG_IPV6_VTI=y CONFIG_DUMMY=y CONFIG_BRIDGE=y CONFIG_VLAN_8021Q=y +CONFIG_IFB=y CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m @@ -27,5 +28,6 @@ CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV4=m CONFIG_NET_SCH_FQ=m CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_NETEM=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_KALLSYMS=y diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 796670ebc65b..dee567f7576a 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -146,35 +146,36 @@ setup() create_ns remote IP="ip -netns me" + BRIDGE="bridge -netns me" set -e $IP li add veth1 type veth peer name veth2 $IP li set veth1 up $IP addr add 172.16.1.1/24 dev veth1 - $IP -6 addr add 2001:db8:91::1/64 dev veth1 + $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad $IP li add veth3 type veth peer name veth4 $IP li set veth3 up $IP addr add 172.16.2.1/24 dev veth3 - $IP -6 addr add 2001:db8:92::1/64 dev veth3 + $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad $IP li set veth2 netns peer up ip -netns peer addr add 172.16.1.2/24 dev veth2 - ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 + ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad $IP li set veth4 netns peer up ip -netns peer addr add 172.16.2.2/24 dev veth4 - ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 + ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad ip -netns remote li add veth5 type veth peer name veth6 ip -netns remote li set veth5 up ip -netns remote addr add dev veth5 172.16.101.1/24 - ip -netns remote addr add dev veth5 2001:db8:101::1/64 + ip -netns remote -6 addr add dev veth5 2001:db8:101::1/64 nodad ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2 ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2 ip -netns remote li set veth6 netns peer up ip -netns peer addr add dev veth6 172.16.101.2/24 - ip -netns peer addr add dev veth6 2001:db8:101::2/64 + ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad set +e } @@ -248,11 +249,247 @@ check_route6() local expected="$2" local out - out=$($IP -6 route ls match ${pfx} 2>/dev/null) + out=$($IP -6 route ls match ${pfx} 2>/dev/null | sed -e 's/pref medium//') check_output "${out}" "${expected}" } +check_large_grp() +{ + local ipv=$1 + local ecmp=$2 + local grpnum=100 + local nhidstart=100 + local grpidstart=1000 + local iter=0 + local nhidstr="" + local grpidstr="" + local grpstr="" + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1." + else + ipstr="2001:db8:91::" + fi + + # + # Create $grpnum groups with specified $ecmp and dump them + # + + # create nexthops with different gateways + iter=2 + while [ $iter -le $(($ecmp + 1)) ] + do + nhidstr="$(($nhidstart + $iter))" + run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1" + check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link" + + if [ $iter -le $ecmp ]; then + grpstr+="$nhidstr/" + else + grpstr+="$nhidstr" + fi + ((iter++)) + done + + # create duplicate large ecmp groups + iter=0 + while [ $iter -le $grpnum ] + do + grpidstr="$(($grpidstart + $iter))" + run_cmd "$IP nexthop add id $grpidstr group $grpstr" + check_nexthop "id $grpidstr" "id $grpidstr group $grpstr" + ((iter++)) + done + + # dump large groups + run_cmd "$IP nexthop list" + log_test $? 0 "Dump large (x$ecmp) ecmp groups" +} + +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? + rm -rf $tmpfile + + return $rc +} + +check_nexthop_fdb_support() +{ + $IP nexthop help 2>&1 | grep -q fdb + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing fdb nexthop support" + return $ksft_skip + fi +} + +ipv6_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv6 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb" + run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb" + run_cmd "$IP nexthop add id 102 group 61/62 fdb" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + ## get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 61/62 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 63 via 2001:db8:91::4" + run_cmd "$IP nexthop add id 64 via 2001:db8:91::5" + run_cmd "$IP nexthop add id 103 group 63/64 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb" + run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb" + run_cmd "$IP nexthop add id 104 group 65/66" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 67 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + ## fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + +ipv4_fdb_grp_fcnal() +{ + local rc + + echo + echo "IPv4 fdb groups functional" + echo "--------------------------" + + check_nexthop_fdb_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # create group with multiple nexthops + run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 102 group 12/13 fdb" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Fdb Nexthop group with multiple nexthops" + + # get nexthop group + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" "id 102 group 12/13 fdb" + log_test $? 0 "Get Fdb nexthop group by id" + + # fdb nexthop group can only contain fdb nexthops + run_cmd "$IP nexthop add id 14 via 172.16.1.2" + run_cmd "$IP nexthop add id 15 via 172.16.1.3" + run_cmd "$IP nexthop add id 103 group 14/15 fdb" + log_test $? 2 "Fdb Nexthop group with non-fdb nexthops" + + # Non fdb nexthop group can not contain fdb nexthops + run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb" + run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb" + run_cmd "$IP nexthop add id 104 group 14/15" + log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops" + + # fdb nexthop cannot have blackhole + run_cmd "$IP nexthop add id 18 blackhole fdb" + log_test $? 2 "Fdb Nexthop with blackhole" + + # fdb nexthop with oif + run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with oif" + + # fdb nexthop with onlink + run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb" + log_test $? 2 "Fdb Nexthop with onlink" + + # fdb nexthop with encap + run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb" + log_test $? 2 "Fdb Nexthop with encap" + + run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100" + run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self" + log_test $? 0 "Fdb mac add with nexthop group" + + # fdb nexthops can only reference nexthop groups and not nexthops + run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self" + log_test $? 255 "Fdb mac add with nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 15" + log_test $? 2 "Route add with fdb nexthop" + + run_cmd "$IP ro add 172.16.0.0/22 nhid 103" + log_test $? 2 "Route add with fdb nexthop group" + + run_cmd "$IP nexthop del id 102" + log_test $? 0 "Fdb nexthop delete" + + $IP link del dev vx10 +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -423,8 +660,6 @@ ipv6_fcnal_runtime() echo "IPv6 functional runtime" echo "-----------------------" - sleep 5 - # # IPv6 - the basics # @@ -481,12 +716,12 @@ ipv6_fcnal_runtime() run_cmd "$IP -6 nexthop add id 85 dev veth1" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85" log_test $? 0 "IPv6 route with device only nexthop" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024" run_cmd "$IP nexthop add id 123 group 81/85" run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123" log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw" - check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium" + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1" # # IPv6 route with v4 nexthop - not allowed @@ -519,6 +754,75 @@ ipv6_fcnal_runtime() # route with src address and using nexthop - not allowed } +ipv6_large_grp() +{ + local ecmp=32 + + echo + echo "IPv6 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 6 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + +ipv6_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 2001:db8:91::2 dev veth1 + done >/dev/null 2>&1 +} + +ipv6_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv6_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv6 torture test" +} + + ipv4_fcnal() { local rc @@ -749,6 +1053,29 @@ ipv4_fcnal_runtime() run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" log_test $? 0 "Ping - multipath" + run_cmd "$IP ro delete 172.16.101.1/32 nhid 122" + + # + # multiple default routes + # - tests fib_select_default + run_cmd "$IP nexthop add id 501 via 172.16.1.2 dev veth1" + run_cmd "$IP ro add default nhid 501" + run_cmd "$IP ro add default via 172.16.1.3 dev veth1 metric 20" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "Ping - multiple default routes, nh first" + + # flip the order + run_cmd "$IP ro del default nhid 501" + run_cmd "$IP ro del default via 172.16.1.3 dev veth1 metric 20" + run_cmd "$IP ro add default via 172.16.1.2 dev veth1 metric 20" + run_cmd "$IP nexthop replace id 501 via 172.16.1.3 dev veth1" + run_cmd "$IP ro add default nhid 501 metric 20" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "Ping - multiple default routes, nh second" + + run_cmd "$IP nexthop delete nhid 501" + run_cmd "$IP ro del default" + # # IPv4 with blackhole nexthops # @@ -843,6 +1170,11 @@ ipv4_fcnal_runtime() $IP neigh sh | grep 'dev veth1' fi + run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1" + run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1" + run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1" + log_test $? 0 "IPv4 default route with IPv6 gateway" + # # MPLS as an example of LWT encap # @@ -857,6 +1189,241 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +ipv4_large_grp() +{ + local ecmp=32 + + echo + echo "IPv4 large groups (x$ecmp)" + echo "---------------------" + + check_large_grp 4 $ecmp + + $IP nexthop flush >/dev/null 2>&1 +} + +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + +ipv4_del_add_loop1() +{ + while :; do + $IP nexthop del id 100 + $IP nexthop add id 100 via 172.16.1.2 dev veth1 + done >/dev/null 2>&1 +} + +ipv4_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 + done >/dev/null 2>&1 +} + +ipv4_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime torture" + echo "--------------------" + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + + # if we did not crash, success + log_test 0 0 "IPv4 torture test" +} + basic() { echo diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index b7616704b55e..84205c3a55eb 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -618,16 +618,22 @@ fib_nexthop_test() fib_suppress_test() { + echo + echo "FIB rule with suppress_prefixlength" + setup + $IP link add dummy1 type dummy $IP link set dummy1 up $IP -6 route add default dev dummy1 $IP -6 rule add table main suppress_prefixlength 0 - ping -f -c 1000 -W 1 1234::1 || true + ping -f -c 1000 -W 1 1234::1 >/dev/null 2>&1 $IP -6 rule del table main suppress_prefixlength 0 $IP link del dummy1 # If we got here without crashing, we're good. - return 0 + log_test 0 0 "FIB rule suppress test" + + cleanup } ################################################################################ diff --git a/tools/testing/selftests/net/forwarding/.gitignore b/tools/testing/selftests/net/forwarding/.gitignore index a793eef5b876..2dea317f12e7 100644 --- a/tools/testing/selftests/net/forwarding/.gitignore +++ b/tools/testing/selftests/net/forwarding/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only forwarding.config diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 40b076983239..f0e6be4c09e9 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -35,6 +35,12 @@ if [ $? -ne 0 ]; then exit 1 fi +devlink dev help 2>&1 | grep info &> /dev/null +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing devlink dev info support" + exit 1 +fi + ############################################################################## # Devlink helpers @@ -359,7 +365,9 @@ devlink_trap_group_stats_idle_test() devlink_trap_exception_test() { local trap_name=$1; shift - local group_name=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) devlink_trap_stats_idle_test $trap_name check_fail $? "Trap stats idle when packets should have been trapped" @@ -371,8 +379,11 @@ devlink_trap_exception_test() devlink_trap_drop_test() { local trap_name=$1; shift - local group_name=$1; shift local dev=$1; shift + local handle=$1; shift + local group_name + + group_name=$(devlink_trap_group_get $trap_name) # This is the common part of all the tests. It checks that stats are # initially idle, then non-idle after changing the trap action and @@ -383,7 +394,6 @@ devlink_trap_drop_test() devlink_trap_group_stats_idle_test $group_name check_err $? "Trap group stats not idle with initial drop action" - devlink_trap_action_set $trap_name "trap" devlink_trap_stats_idle_test $trap_name check_fail $? "Trap stats idle after setting action to trap" @@ -397,7 +407,7 @@ devlink_trap_drop_test() devlink_trap_group_stats_idle_test $group_name check_err $? "Trap group stats not idle after setting action to drop" - tc_check_packets "dev $dev egress" 101 0 + tc_check_packets "dev $dev egress" $handle 0 check_err $? "Packets were not dropped" } @@ -406,7 +416,91 @@ devlink_trap_drop_cleanup() local mz_pid=$1; shift local dev=$1; shift local proto=$1; shift + local pref=$1; shift + local handle=$1; shift kill $mz_pid && wait $mz_pid &> /dev/null - tc filter del dev $dev egress protocol $proto pref 1 handle 101 flower + tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower +} + +devlink_trap_stats_test() +{ + local test_name=$1; shift + local trap_name=$1; shift + local send_one="$@" + local t0_packets + local t1_packets + + RET=0 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + + $send_one && sleep 1 + + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t1_packets -eq $t0_packets ]]; then + check_err 1 "Trap stats did not increase" + fi + + log_test "$test_name" +} + +devlink_trap_policers_num_get() +{ + devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length' +} + +devlink_trap_policer_rate_get() +{ + local policer_id=$1; shift + + devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["rate"]' +} + +devlink_trap_policer_burst_get() +{ + local policer_id=$1; shift + + devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["burst"]' +} + +devlink_trap_policer_rx_dropped_get() +{ + local policer_id=$1; shift + + devlink -j -p -s trap policer show $DEVLINK_DEV policer $policer_id \ + | jq '.[][][]["stats"]["rx"]["dropped"]' +} + +devlink_trap_group_policer_get() +{ + local group_name=$1; shift + + devlink -j -p trap group show $DEVLINK_DEV group $group_name \ + | jq '.[][][]["policer"]' +} + +devlink_trap_policer_ids_get() +{ + devlink -j -p trap policer show \ + | jq '.[]["'$DEVLINK_DEV'"][]["policer"]' +} + +devlink_port_by_netdev() +{ + local if_name=$1 + + devlink -j port show $if_name | jq -e '.[] | keys' | jq -r '.[]' +} + +devlink_cpu_port_get() +{ + local cpu_dl_port_num=$(devlink port list | grep "$DEVLINK_DEV" | + grep cpu | cut -d/ -f3 | cut -d: -f1 | + sed -n '1p') + + echo "$DEVLINK_DEV/$cpu_dl_port_num" } diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 2f5da414aaa7..977fc2b326a2 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -60,6 +60,15 @@ check_tc_chain_support() fi } +check_tc_action_hw_stats_support() +{ + tc actions help 2>&1 | grep -q hw_stats + if [[ $? -ne 0 ]]; then + echo "SKIP: iproute2 too old; tc is missing action hw_stats support" + exit 1 + fi +} + if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" exit 0 @@ -248,13 +257,40 @@ busywait() done } +not() +{ + "$@" + [[ $? != 0 ]] +} + +grep_bridge_fdb() +{ + local addr=$1; shift + local word + local flag + + if [ "$1" == "self" ] || [ "$1" == "master" ]; then + word=$1; shift + if [ "$1" == "-v" ]; then + flag=$1; shift + fi + fi + + $@ | grep $addr | grep $flag "$word" +} + +wait_for_offload() +{ + "$@" | grep -q offload +} + until_counter_is() { - local value=$1; shift + local expr=$1; shift local current=$("$@") echo $((current)) - ((current >= value)) + ((current $expr)) } busywait_for_counter() @@ -263,7 +299,7 @@ busywait_for_counter() local delta=$1; shift local base=$("$@") - busywait "$timeout" until_counter_is $((base + delta)) "$@" + busywait "$timeout" until_counter_is ">= $((base + delta))" "$@" } setup_wait_dev() @@ -599,6 +635,17 @@ tc_rule_stats_get() | jq ".[1].options.actions[].stats$selector" } +tc_rule_handle_stats_get() +{ + local id=$1; shift + local handle=$1; shift + local selector=${1:-.packets}; shift + + tc -j -s filter show $id \ + | jq ".[] | select(.options.handle == $handle) | \ + .options.actions[0].stats$selector" +} + ethtool_stats_get() { local dev=$1; shift @@ -607,6 +654,26 @@ ethtool_stats_get() ethtool -S $dev | grep "^ *$stat:" | head -n 1 | cut -d: -f2 } +qdisc_stats_get() +{ + local dev=$1; shift + local handle=$1; shift + local selector=$1; shift + + tc -j -s qdisc show dev "$dev" \ + | jq '.[] | select(.handle == "'"$handle"'") | '"$selector" +} + +qdisc_parent_stats_get() +{ + local dev=$1; shift + local parent=$1; shift + local selector=$1; shift + + tc -j -s qdisc show dev "$dev" invisible \ + | jq '.[] | select(.parent == "'"$parent"'") | '"$selector" +} + humanize() { local speed=$1; shift @@ -1132,18 +1199,29 @@ flood_test() flood_multicast_test $br_port $host1_if $host2_if } -start_traffic() +__start_traffic() { + local proto=$1; shift local h_in=$1; shift # Where the traffic egresses the host local sip=$1; shift local dip=$1; shift local dmac=$1; shift $MZ $h_in -p 8000 -A $sip -B $dip -c 0 \ - -a own -b $dmac -t udp -q & + -a own -b $dmac -t "$proto" -q "$@" & sleep 1 } +start_traffic() +{ + __start_traffic udp "$@" +} + +start_tcp_traffic() +{ + __start_traffic tcp "$@" +} + stop_traffic() { # Suppress noise from killing mausezahn. diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 00797597fcf5..c33bfd7ba214 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -29,11 +29,9 @@ mirror_test() local pref=$1; shift local expect=$1; shift - local ping_timeout=$((PING_TIMEOUT * 5)) local t0=$(tc_rule_stats_get $dev $pref) - ip vrf exec $vrf_name \ - ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.5 -w $ping_timeout \ - &> /dev/null + $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ + -c 10 -d 100ms -t icmp type=8 sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh new file mode 100755 index 000000000000..55eeacf59241 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh @@ -0,0 +1,309 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on +# egress of $swp2, the traffic is acted upon by a pedit action. An ingress +# filter installed on $h2 verifies that the packet looks like expected. +# +# +----------------------+ +----------------------+ +# | H1 | | H2 | +# | + $h1 | | $h2 + | +# | | 192.0.2.1/28 | | 192.0.2.2/28 | | +# +----|-----------------+ +----------------|-----+ +# | | +# +----|----------------------------------------------------------------|-----+ +# | SW | | | +# | +-|----------------------------------------------------------------|-+ | +# | | + $swp1 BR $swp2 + | | +# | +--------------------------------------------------------------------+ | +# +---------------------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + test_ip_dsfield + test_ip_dscp + test_ip_ecn + test_ip_dscp_ecn + test_ip6_dsfield + test_ip6_dscp + test_ip6_ecn +" + +NUM_NETIFS=4 +source lib.sh +source tc_common.sh + +: ${HIT_TIMEOUT:=2000} # ms + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64 + tc qdisc add dev $h2 clsact +} + +h2_destroy() +{ + tc qdisc del dev $h2 clsact + simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64 +} + +switch_create() +{ + ip link add name br1 up type bridge vlan_filtering 1 + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact +} + +switch_destroy() +{ + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 nomaster + ip link set dev $swp1 nomaster + ip link del dev br1 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h2mac=$(mac_get $h2) + + vrf_prepare + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h2_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:1::2 +} + +do_test_pedit_dsfield_common() +{ + local pedit_locus=$1; shift + local pedit_action=$1; shift + local mz_flags=$1; shift + + RET=0 + + # TOS 125: DSCP 31, ECN 1. Used for testing that the relevant part is + # overwritten when zero is selected. + $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \ + -a own -b $h2mac -q -t tcp tos=0x7d,sp=54321,dp=12345 + + local pkts + pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \ + tc_rule_handle_stats_get "dev $h2 ingress" 101) + check_err $? "Expected to get 10 packets on test probe, but got $pkts." + + pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101) + ((pkts >= 10)) + check_err $? "Expected to get 10 packets on pedit rule, but got $pkts." + + log_test "$pedit_locus pedit $pedit_action" +} + +do_test_pedit_dsfield() +{ + local pedit_locus=$1; shift + local pedit_action=$1; shift + local match_prot=$1; shift + local match_flower=$1; shift + local mz_flags=$1; shift + local saddr=$1; shift + local daddr=$1; shift + + tc filter add $pedit_locus handle 101 pref 1 \ + flower action pedit ex munge $pedit_action + tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \ + flower skip_hw $match_flower action pass + + do_test_pedit_dsfield_common "$pedit_locus" "$pedit_action" "$mz_flags" + + tc filter del dev $h2 ingress pref 1 + tc filter del $pedit_locus pref 1 +} + +do_test_ip_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $dsfield" \ + ip "ip_tos $dsfield" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_dsfield() +{ + do_test_ip_dsfield "dev $swp1 ingress" + do_test_ip_dsfield "dev $swp2 egress" +} + +do_test_ip_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $((dscp << 2)) retain 0xfc" \ + ip "ip_tos $(((dscp << 2) | 1))" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_dscp() +{ + do_test_ip_dscp "dev $swp1 ingress" + do_test_ip_dscp "dev $swp2 egress" +} + +do_test_ip_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip dsfield set $ecn retain 0x03" \ + ip "ip_tos $((124 | $ecn))" \ + "-A 192.0.2.1 -B 192.0.2.2" + done +} + +test_ip_ecn() +{ + do_test_ip_ecn "dev $swp1 ingress" + do_test_ip_ecn "dev $swp2 egress" +} + +do_test_ip_dscp_ecn() +{ + local locus=$1; shift + + tc filter add $locus handle 101 pref 1 \ + flower action pedit ex munge ip dsfield set 124 retain 0xfc \ + action pedit ex munge ip dsfield set 1 retain 0x03 + tc filter add dev $h2 ingress handle 101 pref 1 prot ip \ + flower skip_hw ip_tos 125 action pass + + do_test_pedit_dsfield_common "$locus" "set DSCP + set ECN" \ + "-A 192.0.2.1 -B 192.0.2.2" + + tc filter del dev $h2 ingress pref 1 + tc filter del $locus pref 1 +} + +test_ip_dscp_ecn() +{ + do_test_ip_dscp_ecn "dev $swp1 ingress" + do_test_ip_dscp_ecn "dev $swp2 egress" +} + +do_test_ip6_dsfield() +{ + local locus=$1; shift + local dsfield + + for dsfield in 0 1 2 3 128 252 253 254 255; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $dsfield" \ + ipv6 "ip_tos $dsfield" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dsfield() +{ + do_test_ip6_dsfield "dev $swp1 ingress" + do_test_ip6_dsfield "dev $swp2 egress" +} + +do_test_ip6_dscp() +{ + local locus=$1; shift + local dscp + + for dscp in 0 1 2 3 32 61 62 63; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \ + ipv6 "ip_tos $(((dscp << 2) | 1))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_dscp() +{ + do_test_ip6_dscp "dev $swp1 ingress" + do_test_ip6_dscp "dev $swp2 egress" +} + +do_test_ip6_ecn() +{ + local locus=$1; shift + local ecn + + for ecn in 0 1 2 3; do + do_test_pedit_dsfield "$locus" \ + "ip6 traffic_class set $ecn retain 0x3" \ + ipv6 "ip_tos $((124 | $ecn))" \ + "-6 -A 2001:db8:1::1 -B 2001:db8:1::2" + done +} + +test_ip6_ecn() +{ + do_test_ip6_ecn "dev $swp1 ingress" + do_test_ip6_ecn "dev $swp2 egress" +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh index 40e0ad1bc4f2..e60c8b4818cc 100755 --- a/tools/testing/selftests/net/forwarding/sch_ets.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets.sh @@ -34,11 +34,14 @@ switch_destroy() } # Callback from sch_ets_tests.sh -get_stats() +collect_stats() { - local stream=$1; shift + local -a streams=("$@") + local stream - link_stats_get $h2.1$stream rx bytes + for stream in ${streams[@]}; do + qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes + done } ets_run diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh index 3c3b204d47e8..cdf689e99458 100644 --- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh +++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh @@ -2,7 +2,7 @@ # Global interface: # $put -- port under test (e.g. $swp2) -# get_stats($band) -- A function to collect stats for band +# collect_stats($streams...) -- A function to get stats for individual streams # ets_start_traffic($band) -- Start traffic for this band # ets_change_qdisc($op, $dev, $nstrict, $quanta...) -- Add or change qdisc @@ -94,15 +94,11 @@ __ets_dwrr_test() sleep 10 - t0=($(for stream in ${streams[@]}; do - get_stats $stream - done)) + t0=($(collect_stats "${streams[@]}")) sleep 10 - t1=($(for stream in ${streams[@]}; do - get_stats $stream - done)) + t1=($(collect_stats "${streams[@]}")) d=($(for ((i = 0; i < ${#streams[@]}; i++)); do echo $((${t1[$i]} - ${t0[$i]})) done)) diff --git a/tools/testing/selftests/net/forwarding/skbedit_priority.sh b/tools/testing/selftests/net/forwarding/skbedit_priority.sh new file mode 100755 index 000000000000..e3bd8a6bb8b4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/skbedit_priority.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on +# egress of $swp2, the traffic is acted upon by an action skbedit priority. The +# new priority should be taken into account when classifying traffic on the PRIO +# qdisc at $swp2. The test verifies that for different priority values, the +# traffic ends up in expected PRIO band. +# +# +----------------------+ +----------------------+ +# | H1 | | H2 | +# | + $h1 | | $h2 + | +# | | 192.0.2.1/28 | | 192.0.2.2/28 | | +# +----|-----------------+ +----------------|-----+ +# | | +# +----|----------------------------------------------------------------|-----+ +# | SW | | | +# | +-|----------------------------------------------------------------|-+ | +# | | + $swp1 BR $swp2 + | | +# | | PRIO | | +# | +--------------------------------------------------------------------+ | +# +---------------------------------------------------------------------------+ + +ALL_TESTS=" + ping_ipv4 + test_ingress + test_egress +" + +NUM_NETIFS=4 +source lib.sh + +: ${HIT_TIMEOUT:=2000} # ms + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 +} + +h1_destroy() +{ + simple_if_fini $h1 192.0.2.1/28 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.2/28 +} + +h2_destroy() +{ + simple_if_fini $h2 192.0.2.2/28 +} + +switch_create() +{ + ip link add name br1 up type bridge vlan_filtering 1 + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + ip link set dev $swp2 master br1 + ip link set dev $swp2 up + + tc qdisc add dev $swp1 clsact + tc qdisc add dev $swp2 clsact + tc qdisc add dev $swp2 root handle 10: \ + prio bands 8 priomap 7 6 5 4 3 2 1 0 +} + +switch_destroy() +{ + tc qdisc del dev $swp2 root + tc qdisc del dev $swp2 clsact + tc qdisc del dev $swp1 clsact + + ip link set dev $swp2 nomaster + ip link set dev $swp1 nomaster + ip link del dev br1 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + h2mac=$(mac_get $h2) + + vrf_prepare + h1_create + h2_create + switch_create +} + +cleanup() +{ + pre_cleanup + + switch_destroy + h2_destroy + h1_destroy + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.2 +} + +test_skbedit_priority_one() +{ + local locus=$1; shift + local prio=$1; shift + local classid=$1; shift + + RET=0 + + tc filter add $locus handle 101 pref 1 \ + flower action skbedit priority $prio + + local pkt0=$(qdisc_parent_stats_get $swp2 $classid .packets) + local pkt2=$(tc_rule_handle_stats_get "$locus" 101) + $MZ $h1 -t udp "sp=54321,dp=12345" -c 10 -d 20msec -p 100 \ + -a own -b $h2mac -A 192.0.2.1 -B 192.0.2.2 -q + + local pkt1 + pkt1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((pkt0 + 10))" \ + qdisc_parent_stats_get $swp2 $classid .packets) + check_err $? "Expected to get 10 packets on class $classid, but got $((pkt1 - pkt0))." + + local pkt3=$(tc_rule_handle_stats_get "$locus" 101) + ((pkt3 >= pkt2 + 10)) + check_err $? "Expected to get 10 packets on skbedit rule but got $((pkt3 - pkt2))." + + log_test "$locus skbedit priority $prio -> classid $classid" + + tc filter del $locus pref 1 +} + +test_ingress() +{ + local prio + + for prio in {0..7}; do + test_skbedit_priority_one "dev $swp1 ingress" \ + $prio 10:$((8 - prio)) + done +} + +test_egress() +{ + local prio + + for prio in {0..7}; do + test_skbedit_priority_one "dev $swp2 egress" \ + $prio 10:$((8 - prio)) + done +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 813d02d1939d..d9eca227136b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ - mirred_egress_mirror_test gact_trap_test" + mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ + gact_trap_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -50,6 +51,9 @@ switch_destroy() mirred_egress_test() { local action=$1 + local protocol=$2 + local classifier=$3 + local classifier_args=$4 RET=0 @@ -62,9 +66,9 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched without redirect rule inserted" - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - $tcflags dst_ip 192.0.2.2 action mirred egress $action \ - dev $swp2 + tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier $tcflags $classifier_args \ + action mirred egress $action dev $swp2 $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -72,10 +76,11 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_err $? "Did not match incoming $action packet" - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower - log_test "mirred egress $action ($tcflags)" + log_test "mirred egress $classifier $action ($tcflags)" } gact_drop_and_ok_test() @@ -187,12 +192,17 @@ cleanup() mirred_egress_redirect_test() { - mirred_egress_test "redirect" + mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2" } mirred_egress_mirror_test() { - mirred_egress_test "mirror" + mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2" +} + +matchall_mirred_egress_mirror_test() +{ + mirred_egress_test "mirror" "all" "matchall" "" } trap cleanup EXIT diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index 64f652633585..0e18e8be6e2a 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -6,39 +6,14 @@ CHECK_TC="yes" # Can be overridden by the configuration file. See lib.sh TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms -__tc_check_packets() -{ - local id=$1 - local handle=$2 - local count=$3 - local operator=$4 - - start_time="$(date -u +%s%3N)" - while true - do - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets $operator $count)" \ - &> /dev/null - ret=$? - if [[ $ret -eq 0 ]]; then - return $ret - fi - current_time="$(date -u +%s%3N)" - diff=$(expr $current_time - $start_time) - if [ "$diff" -gt "$TC_HIT_TIMEOUT" ]; then - return 1 - fi - done -} - tc_check_packets() { local id=$1 local handle=$2 local count=$3 - __tc_check_packets "$id" "$handle" "$count" "==" + busywait "$TC_HIT_TIMEOUT" until_counter_is "== $count" \ + tc_rule_handle_stats_get "$id" "$handle" > /dev/null } tc_check_packets_hitting() @@ -46,5 +21,6 @@ tc_check_packets_hitting() local id=$1 local handle=$2 - __tc_check_packets "$id" "$handle" 0 ">" + busywait "$TC_HIT_TIMEOUT" until_counter_is "> 0" \ + tc_rule_handle_stats_get "$id" "$handle" > /dev/null } diff --git a/tools/testing/selftests/networking/timestamping/hwtstamp_config.c b/tools/testing/selftests/net/hwtstamp_config.c index e1fdee841021..e1fdee841021 100644 --- a/tools/testing/selftests/networking/timestamping/hwtstamp_config.c +++ b/tools/testing/selftests/net/hwtstamp_config.c diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c index c0c9ecb891e1..f9ed749fd8c7 100644 --- a/tools/testing/selftests/net/ip_defrag.c +++ b/tools/testing/selftests/net/ip_defrag.c @@ -192,9 +192,9 @@ static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen, } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "send_fragment"); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "send_fragment: %d vs %d", res, frag_len); frag_counter++; @@ -313,9 +313,9 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr, iphdr->ip_len = htons(frag_len); } res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen); - if (res < 0) + if (res < 0 && errno != EPERM) error(1, errno, "sendto overlap: %d", frag_len); - if (res != frag_len) + if (res >= 0 && res != frag_len) error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len); frag_counter++; } diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore index d72f07642738..260336d5f0b1 100644 --- a/tools/testing/selftests/net/mptcp/.gitignore +++ b/tools/testing/selftests/net/mptcp/.gitignore @@ -1,2 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only mptcp_connect +pm_nl_ctl *.pcap diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index ba450e62dc5b..f50976ee7d44 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 top_srcdir = ../../../../.. +KSFT_KHDR_INSTALL := 1 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include -TEST_PROGS := mptcp_connect.sh +TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh -TEST_GEN_FILES = mptcp_connect +TEST_GEN_FILES = mptcp_connect pm_nl_ctl TEST_FILES := settings diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 99579c0223c1..cedee5b952ba 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -34,8 +34,8 @@ extern int optind; #define TCP_ULP 31 #endif +static int poll_timeout = 10 * 1000; static bool listen_mode; -static int poll_timeout; enum cfg_mode { CFG_MODE_POLL, @@ -50,11 +50,21 @@ static int cfg_sock_proto = IPPROTO_MPTCP; static bool tcpulp_audit; static int pf = AF_INET; static int cfg_sndbuf; +static int cfg_rcvbuf; +static bool cfg_join; static void die_usage(void) { - fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] -m mode]" - "[ -l ] [ -t timeout ] connect_address\n"); + fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode]" + "[-l] connect_address\n"); + fprintf(stderr, "\t-6 use ipv6\n"); + fprintf(stderr, "\t-t num -- set poll timeout to num\n"); + fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n"); + fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n"); + fprintf(stderr, "\t-p num -- use port num\n"); + fprintf(stderr, "\t-m [MPTCP|TCP] -- use tcp or mptcp sockets\n"); + fprintf(stderr, "\t-s [mmap|poll] -- use poll (default) or mmap\n"); + fprintf(stderr, "\t-u -- check mptcp ulp\n"); exit(1); } @@ -97,6 +107,17 @@ static void xgetaddrinfo(const char *node, const char *service, } } +static void set_rcvbuf(int fd, unsigned int size) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)); + if (err) { + perror("set SO_RCVBUF"); + exit(1); + } +} + static void set_sndbuf(int fd, unsigned int size) { int err; @@ -230,6 +251,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, static size_t do_rnd_write(const int fd, char *buf, const size_t len) { + static bool first = true; unsigned int do_w; ssize_t bw; @@ -237,10 +259,19 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (do_w == 0 || do_w > len) do_w = len; + if (cfg_join && first && do_w > 100) + do_w = 100; + bw = write(fd, buf, do_w); if (bw < 0) perror("write"); + /* let the join handshake complete, before going on */ + if (cfg_join && first) { + usleep(200000); + first = false; + } + return bw; } @@ -365,8 +396,11 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) break; /* ... but we still receive. - * Close our write side. + * Close our write side, ev. give some time + * for address notification */ + if (cfg_join) + usleep(400000); shutdown(peerfd, SHUT_WR); } else { if (errno == EINTR) @@ -383,6 +417,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) } } + /* leave some time for late join/announce */ + if (cfg_join) + usleep(400000); + close(peerfd); return 0; } @@ -638,7 +676,7 @@ static void maybe_close(int fd) { unsigned int r = rand(); - if (r & 1) + if (!cfg_join && (r & 1)) close(fd); } @@ -704,6 +742,8 @@ int main_loop(void) check_getpeername_connect(fd); + if (cfg_rcvbuf) + set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); @@ -745,7 +785,7 @@ int parse_mode(const char *mode) return 0; } -int parse_sndbuf(const char *size) +static int parse_int(const char *size) { unsigned long s; @@ -765,17 +805,19 @@ int parse_sndbuf(const char *size) die_usage(); } - cfg_sndbuf = s; - - return 0; + return (int)s; } static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6lp:s:hut:m:b:")) != -1) { + while ((c = getopt(argc, argv, "6jlp:s:hut:m:S:R:")) != -1) { switch (c) { + case 'j': + cfg_join = true; + cfg_mode = CFG_MODE_POLL; + break; case 'l': listen_mode = true; break; @@ -802,8 +844,11 @@ static void parse_opts(int argc, char **argv) case 'm': cfg_mode = parse_mode(optarg); break; - case 'b': - cfg_sndbuf = parse_sndbuf(optarg); + case 'S': + cfg_sndbuf = parse_int(optarg); + break; + case 'R': + cfg_rcvbuf = parse_int(optarg); break; } } @@ -831,6 +876,8 @@ int main(int argc, char *argv[]) if (fd < 0) return 1; + if (cfg_rcvbuf) + set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index d573a0feb98d..acf02e156d20 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -3,7 +3,7 @@ time_start=$(date +%s) -optstring="b:d:e:l:r:h4cm:" +optstring="S:R:d:e:l:r:h4cm:" ret=0 sin="" sout="" @@ -19,6 +19,7 @@ tc_loss=$((RANDOM%101)) tc_reorder="" testmode="" sndbuf=0 +rcvbuf=0 options_log=true if [ $tc_loss -eq 100 ];then @@ -39,7 +40,8 @@ usage() { echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)" echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" - echo -e "\t-b: set sndbuf value (default: use kernel default)" + echo -e "\t-S: set sndbuf value (default: use kernel default)" + echo -e "\t-R: set rcvbuf value (default: use kernel default)" echo -e "\t-m: test mode (poll, sendfile; default: poll)" } @@ -73,11 +75,19 @@ while getopts "$optstring" option;do "c") capture=true ;; - "b") + "S") if [ $OPTARG -ge 0 ];then sndbuf="$OPTARG" else - echo "-s requires numeric argument, got \"$OPTARG\"" 1>&2 + echo "-S requires numeric argument, got \"$OPTARG\"" 1>&2 + exit 1 + fi + ;; + "R") + if [ $OPTARG -ge 0 ];then + rcvbuf="$OPTARG" + else + echo "-R requires numeric argument, got \"$OPTARG\"" 1>&2 exit 1 fi ;; @@ -342,8 +352,12 @@ do_transfer() port=$((10000+$TEST_COUNT)) TEST_COUNT=$((TEST_COUNT+1)) + if [ "$rcvbuf" -gt 0 ]; then + extra_args="$extra_args -R $rcvbuf" + fi + if [ "$sndbuf" -gt 0 ]; then - extra_args="$extra_args -b $sndbuf" + extra_args="$extra_args -S $sndbuf" fi if [ -n "$testmode" ]; then diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh new file mode 100755 index 000000000000..dd42c2f692d0 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -0,0 +1,357 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout=30 +capture=0 + +TEST_COUNT=0 + +init() +{ + capout=$(mktemp) + + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + # ns1 ns2 + # ns1eth1 ns2eth1 + # ns1eth2 ns2eth2 + # ns1eth3 ns2eth3 + # ns1eth4 ns2eth4 + + for i in `seq 1 4`; do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + done +} + +cleanup_partial() +{ + rm -f "$capout" + + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done +} + +cleanup() +{ + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" + cleanup_partial +} + +reset() +{ + cleanup_partial + init +} + +for arg in "$@"; do + if [ "$arg" = "-c" ]; then + capture=1 + fi +done + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + + return 1 + fi + + return 0 +} + +do_ping() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + + ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + fi +} + +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=$((10000+$TEST_COUNT)) + TEST_COUNT=$((TEST_COUNT+1)) + + :> "$cout" + :> "$sout" + :> "$capout" + + if [ $capture -eq 1 ]; then + if [ -z $SUDO_USER ] ; then + capuser="" + else + capuser="-Z $SUDO_USER" + fi + + capfile="mp_join-${listener_ns}.pcap" + + echo "Capturing traffic for test $TEST_COUNT into $capfile" + ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & + cappid=$! + + sleep 1 + fi + + ip netns exec ${listener_ns} ./mptcp_connect -j -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" & + spid=$! + + sleep 1 + + ip netns exec ${connector_ns} ./mptcp_connect -j -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? + + if [ $capture -eq 1 ]; then + sleep 1 + kill $cappid + fi + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo "\nnetns ${listener_ns} socket stat for $port:" 1>&2 + ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + echo "\nnetns ${connector_ns} socket stat for $port:" 1>&2 + ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + + cat "$capout" + return 1 + fi + + check_transfer $sin $cout "file received by client" + retc=$? + check_transfer $cin $sout "file received by server" + rets=$? + + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + cat "$capout" + return 0 + fi + + cat "$capout" + return 1 +} + +make_file() +{ + name=$1 + who=$2 + + SIZE=1 + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $SIZE KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + lret=$? + if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} + +chk_join_nr() +{ + local msg="$1" + local syn_nr=$2 + local syn_ack_nr=$3 + local ack_nr=$4 + local count + local dump_stats + + printf "%-36s %s" "$msg" "syn" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_nr" ]; then + echo "[fail] got $count JOIN[s] syn expected $syn_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - synack" + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_ack_nr" ]; then + echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - ack" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$ack_nr" ]; then + echo "[fail] got $count JOIN[s] ack expected $ack_nr" + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp + fi +} + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +init +make_file "$cin" "client" +make_file "$sin" "server" +trap cleanup EXIT + +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "no JOIN" "0" "0" "0" + +# subflow limted by client +reset +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by client" 0 0 0 + +# subflow limted by server +reset +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by server" 1 1 0 + +# subflow +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow" 1 1 1 + +# multiple subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows" 2 2 2 + +# multiple subflows limited by serverf +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows, limited by server" 2 2 1 + +# add_address, unused +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "unused signal address" 0 0 0 + +# accept and use add_addr +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "signal address" 1 1 1 + +# accept and use add_addr with an additional subflow +# note: signal address in server ns and local addresses in client ns must +# belong to different subnets or one of the listed local address could be +# used for 'add_addr' subflow +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 1 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "subflow and signal" 2 2 2 + +# accept and use add_addr with additional subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows and signal" 3 3 3 + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh new file mode 100755 index 000000000000..15f4f46ca3a9 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ksft_skip=4 +ret=0 + +usage() { + echo "Usage: $0 [ -h ]" +} + + +while getopts "$optstring" option;do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "?") + usage $0 + exit 1 + ;; + esac +done + +sec=$(date +%s) +rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) +ns1="ns1-$rndh" +err=$(mktemp) +ret=0 + +cleanup() +{ + rm -f $err + ip netns del $ns1 +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns1 || exit $ksft_skip +ip -net $ns1 link set lo up +ip netns exec $ns1 sysctl -q net.mptcp.enabled=1 + +check() +{ + local cmd="$1" + local expected="$2" + local msg="$3" + local out=`$cmd 2>$err` + local cmd_ret=$? + + printf "%-50s %s" "$msg" + if [ $cmd_ret -ne 0 ]; then + echo "[FAIL] command execution '$cmd' stderr " + cat $err + ret=1 + elif [ "$out" = "$expected" ]; then + echo "[ OK ]" + else + echo -n "[FAIL] " + echo "expected '$expected' got '$out'" + ret=1 + fi +} + +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "defaults limits" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup +check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr" + +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 2 flags subflow dev lo 10.0.1.2 +id 3 flags signal,backup 10.0.1.3" "dump addrs" + +ip netns exec $ns1 ./pm_nl_ctl del 2 +check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3" "dump addrs after del" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment" + +for i in `seq 5 9`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 +done +check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" +check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" + +for i in `seq 9 256`; do + ip netns exec $ns1 ./pm_nl_ctl del $i + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 +done +check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 +id 4 flags signal 10.0.1.4 +id 5 flags signal 10.0.1.5 +id 6 flags signal 10.0.1.6 +id 7 flags signal 10.0.1.7 +id 8 flags signal 10.0.1.8" "id limit" + +ip netns exec $ns1 ./pm_nl_ctl flush +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" + +ip netns exec $ns1 ./pm_nl_ctl limits 9 1 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "rcv addrs above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 1 9 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "subflows above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 8 8 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 +subflows 8" "set limits" + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c new file mode 100644 index 000000000000..b24a2f17d415 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <errno.h> +#include <error.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/socket.h> +#include <sys/types.h> + +#include <arpa/inet.h> +#include <net/if.h> + +#include <linux/rtnetlink.h> +#include <linux/genetlink.h> + +#include "linux/mptcp.h" + +#ifndef MPTCP_PM_NAME +#define MPTCP_PM_NAME "mptcp_pm" +#endif + +static void syntax(char *argv[]) +{ + fprintf(stderr, "%s add|get|del|flush|dump|accept [<args>]\n", argv[0]); + fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); + fprintf(stderr, "\tdel <id>\n"); + fprintf(stderr, "\tget <id>\n"); + fprintf(stderr, "\tflush\n"); + fprintf(stderr, "\tdump\n"); + fprintf(stderr, "\tlimits [<rcv addr max> <subflow max>]\n"); + exit(0); +} + +static int init_genl_req(char *data, int family, int cmd, int version) +{ + struct nlmsghdr *nh = (void *)data; + struct genlmsghdr *gh; + int off = 0; + + nh->nlmsg_type = family; + nh->nlmsg_flags = NLM_F_REQUEST; + nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + off += NLMSG_ALIGN(sizeof(*nh)); + + gh = (void *)(data + off); + gh->cmd = cmd; + gh->version = version; + off += NLMSG_ALIGN(sizeof(*gh)); + return off; +} + +static void nl_error(struct nlmsghdr *nh) +{ + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh); + int len = nh->nlmsg_len - sizeof(*nh); + uint32_t off; + + if (len < sizeof(struct nlmsgerr)) + error(1, 0, "netlink error message truncated %d min %ld", len, + sizeof(struct nlmsgerr)); + + if (!err->error) { + /* check messages from kernel */ + struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh); + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == NLMSGERR_ATTR_MSG) + fprintf(stderr, "netlink ext ack msg: %s\n", + (char *)RTA_DATA(attrs)); + if (attrs->rta_type == NLMSGERR_ATTR_OFFS) { + memcpy(&off, RTA_DATA(attrs), 4); + fprintf(stderr, "netlink err off %d\n", + (int)off); + } + attrs = RTA_NEXT(attrs, len); + } + } else { + fprintf(stderr, "netlink error %d", err->error); + } +} + +/* do a netlink command and, if max > 0, fetch the reply */ +static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max) +{ + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; + socklen_t addr_len; + void *data = nh; + int rem, ret; + int err = 0; + + nh->nlmsg_len = len; + ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr)); + if (ret != len) + error(1, errno, "send netlink: %uB != %uB\n", ret, len); + if (max == 0) + return 0; + + addr_len = sizeof(nladdr); + rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len); + if (ret < 0) + error(1, errno, "recv netlink: %uB\n", ret); + + /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */ + for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) { + if (nh->nlmsg_type == NLMSG_ERROR) { + nl_error(nh); + err = 1; + } + } + if (err) + error(1, 0, "bailing out due to netlink error[s]"); + return ret; +} + +static int genl_parse_getfamily(struct nlmsghdr *nlh) +{ + struct genlmsghdr *ghdr = NLMSG_DATA(nlh); + int len = nlh->nlmsg_len; + struct rtattr *attrs; + + if (nlh->nlmsg_type != GENL_ID_CTRL) + error(1, errno, "Not a controller message, len=%d type=0x%x\n", + nlh->nlmsg_len, nlh->nlmsg_type); + + len -= NLMSG_LENGTH(GENL_HDRLEN); + + if (len < 0) + error(1, errno, "wrong controller message len %d\n", len); + + if (ghdr->cmd != CTRL_CMD_NEWFAMILY) + error(1, errno, "Unknown controller command %d\n", ghdr->cmd); + + attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == CTRL_ATTR_FAMILY_ID) + return *(__u16 *)RTA_DATA(attrs); + attrs = RTA_NEXT(attrs, len); + } + + error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr"); + return -1; +} + +static int resolve_mptcp_pm_netlink(int fd) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + struct rtattr *rta; + int namelen; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0); + + rta = (void *)(data + off); + namelen = strlen(MPTCP_PM_NAME) + 1; + rta->rta_type = CTRL_ATTR_FAMILY_NAME; + rta->rta_len = RTA_LENGTH(namelen); + memcpy(RTA_DATA(rta), MPTCP_PM_NAME, namelen); + off += NLMSG_ALIGN(rta->rta_len); + + do_nl_req(fd, nh, off, sizeof(data)); + return genl_parse_getfamily((void *)data); +} + +int add_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + u_int16_t family; + u_int32_t flags; + int nest_start; + u_int8_t id; + int off = 0; + int arg; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_VER); + + if (argc < 3) + syntax(argv); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else + error(1, errno, "can't parse ip %s", argv[2]); + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + + for (arg = 3; arg < argc; arg++) { + if (!strcmp(argv[arg], "flags")) { + char *tok, *str; + + /* flags */ + flags = 0; + if (++arg >= argc) + error(1, 0, " missing flags value"); + + /* do not support flag list yet */ + for (str = argv[arg]; (tok = strtok(str, ",")); + str = NULL) { + if (!strcmp(tok, "subflow")) + flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + else if (!strcmp(tok, "signal")) + flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; + else if (!strcmp(tok, "backup")) + flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + error(1, errno, + "unknown flag %s", argv[arg]); + } + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &flags, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "id")) { + if (++arg >= argc) + error(1, 0, " missing id value"); + + id = atoi(argv[arg]); + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + } else if (!strcmp(argv[arg], "dev")) { + int32_t ifindex; + + if (++arg >= argc) + error(1, 0, " missing dev name"); + + ifindex = if_nametoindex(argv[arg]); + if (!ifindex) + error(1, errno, "unknown device %s", argv[arg]); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &ifindex, 4); + off += NLMSG_ALIGN(rta->rta_len); + } else + error(1, 0, "unknown keyword %s", argv[arg]); + } + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +int del_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_addr(struct rtattr *attrs, int len) +{ + uint16_t family = 0; + char str[1024]; + uint32_t flags; + uint8_t id; + + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY) + memcpy(&family, RTA_DATA(attrs), 2); + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) { + if (family != AF_INET) + error(1, errno, "wrong IP (v4) for family %d", + family); + inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str)); + printf("%s", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) { + if (family != AF_INET6) + error(1, errno, "wrong IP (v6) for family %d", + family); + inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str)); + printf("%s", str); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) { + memcpy(&id, RTA_DATA(attrs), 1); + printf("id %d ", id); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) { + memcpy(&flags, RTA_DATA(attrs), 4); + + printf("flags "); + if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + printf("signal"); + flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + printf("subflow"); + flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW; + if (flags) + printf(","); + } + + if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) { + printf("backup"); + flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + if (flags) + printf(","); + } + + /* bump unknown flags, if any */ + if (flags) + printf("0x%x", flags); + printf(" "); + } + if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) { + char name[IF_NAMESIZE], *ret; + int32_t ifindex; + + memcpy(&ifindex, RTA_DATA(attrs), 4); + ret = if_indextoname(ifindex, name); + if (ret) + printf("dev %s ", ret); + else + printf("dev unknown/%d", ifindex); + } + + attrs = RTA_NEXT(attrs, len); + } + printf("\n"); +} + +static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + if (attrs->rta_type == + (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED)) + print_addr((void *)RTA_DATA(attrs), + attrs->rta_len); + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_addr(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct rtattr *rta, *nest; + struct nlmsghdr *nh; + int nest_start; + u_int8_t id; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + + /* the only argument is the address id */ + if (argc != 3) + syntax(argv); + + id = atoi(argv[2]); + + nest_start = off; + nest = (void *)(data + off); + nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR; + nest->rta_len = RTA_LENGTH(0); + off += NLMSG_ALIGN(nest->rta_len); + + /* build a dummy addr with only the ID set */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_ID; + rta->rta_len = RTA_LENGTH(1); + memcpy(RTA_DATA(rta), &id, 1); + off += NLMSG_ALIGN(rta->rta_len); + nest->rta_len = off - nest_start; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int dump_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + pid_t pid = getpid(); + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_VER); + nh->nlmsg_flags |= NLM_F_DUMP; + nh->nlmsg_seq = 1; + nh->nlmsg_pid = pid; + nh->nlmsg_len = off; + + print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data))); + return 0; +} + +int flush_addrs(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + struct nlmsghdr *nh; + int off = 0; + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_VER); + + do_nl_req(fd, nh, off, 0); + return 0; +} + +static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len) +{ + struct rtattr *attrs; + uint32_t max; + + for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) { + int len = nh->nlmsg_len; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + if (nh->nlmsg_type == NLMSG_ERROR) + nl_error(nh); + if (nh->nlmsg_type != pm_family) + continue; + + len -= NLMSG_LENGTH(GENL_HDRLEN); + attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) + + GENL_HDRLEN); + while (RTA_OK(attrs, len)) { + int type = attrs->rta_type; + + if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS && + type != MPTCP_PM_ATTR_SUBFLOWS) + goto next; + + memcpy(&max, RTA_DATA(attrs), 4); + printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ? + "subflows" : "accept", max); + +next: + attrs = RTA_NEXT(attrs, len); + } + } +} + +int get_set_limits(int fd, int pm_family, int argc, char *argv[]) +{ + char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) + + NLMSG_ALIGN(sizeof(struct genlmsghdr)) + + 1024]; + uint32_t rcv_addr = 0, subflows = 0; + int cmd, len = sizeof(data); + struct nlmsghdr *nh; + int off = 0; + + /* limit */ + if (argc == 4) { + rcv_addr = atoi(argv[2]); + subflows = atoi(argv[3]); + cmd = MPTCP_PM_CMD_SET_LIMITS; + } else { + cmd = MPTCP_PM_CMD_GET_LIMITS; + } + + memset(data, 0, sizeof(data)); + nh = (void *)data; + off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER); + + /* limit */ + if (cmd == MPTCP_PM_CMD_SET_LIMITS) { + struct rtattr *rta = (void *)(data + off); + + rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &rcv_addr, 4); + off += NLMSG_ALIGN(rta->rta_len); + + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS; + rta->rta_len = RTA_LENGTH(4); + memcpy(RTA_DATA(rta), &subflows, 4); + off += NLMSG_ALIGN(rta->rta_len); + + /* do not expect a reply */ + len = 0; + } + + len = do_nl_req(fd, nh, off, len); + if (cmd == MPTCP_PM_CMD_GET_LIMITS) + print_limits(nh, pm_family, len); + return 0; +} + +int main(int argc, char *argv[]) +{ + int fd, pm_family; + + if (argc < 2) + syntax(argv); + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd == -1) + error(1, errno, "socket netlink"); + + pm_family = resolve_mptcp_pm_netlink(fd); + + if (!strcmp(argv[1], "add")) + return add_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "del")) + return del_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "flush")) + return flush_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "get")) + return get_addr(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "dump")) + return dump_addrs(fd, pm_family, argc, argv); + else if (!strcmp(argv[1], "limits")) + return get_set_limits(fd, pm_family, argc, argv); + + fprintf(stderr, "unknown sub-command: %s", argv[1]); + syntax(argv); + return 0; +} diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 71a62e7e35b1..77c09cd339c3 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -67,6 +67,10 @@ # Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6 # encapsulation (GUE) over IPv4/IPv6, instead of VXLAN # +# - pmtu_ipv{4,6}_ipv{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using a IPv4/IPv6 tunnel over IPv4/IPv6, +# instead of VXLAN +# # - pmtu_vti4_exception # Set up vti tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. Check that route exception is not @@ -151,6 +155,10 @@ tests=" pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1 pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1 pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1 + pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1 + pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1 + pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1 + pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1 pmtu_vti6_exception vti6: PMTU exceptions 0 pmtu_vti4_exception vti4: PMTU exceptions 0 pmtu_vti4_default_mtu vti4: default MTU assignment 0 @@ -363,6 +371,62 @@ setup_gue66() { setup_fou_or_gue 6 6 gue } +setup_ipvX_over_ipvY() { + inner=${1} + outer=${2} + + if [ "${outer}" -eq 4 ]; then + a_addr="${prefix4}.${a_r1}.1" + b_addr="${prefix4}.${b_r1}.1" + if [ "${inner}" -eq 4 ]; then + type="ipip" + mode="ipip" + else + type="sit" + mode="ip6ip" + fi + else + a_addr="${prefix6}:${a_r1}::1" + b_addr="${prefix6}:${b_r1}::1" + type="ip6tnl" + if [ "${inner}" -eq 4 ]; then + mode="ipip6" + else + mode="ip6ip6" + fi + fi + + run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2 + run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode} + + run_cmd ${ns_a} ip link set ip_a up + run_cmd ${ns_b} ip link set ip_b up + + if [ "${inner}" = "4" ]; then + run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ip_b + else + run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ip_a + run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ip_b + fi +} + +setup_ip4ip4() { + setup_ipvX_over_ipvY 4 4 +} + +setup_ip6ip4() { + setup_ipvX_over_ipvY 6 4 +} + +setup_ip4ip6() { + setup_ipvX_over_ipvY 4 6 +} + +setup_ip6ip6() { + setup_ipvX_over_ipvY 6 6 +} + setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do ip netns add ${n} || return 1 @@ -908,6 +972,64 @@ test_pmtu_ipv6_gue6_exception() { test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue } +test_pmtu_ipvX_over_ipvY_exception() { + inner=${1} + outer=${2} + ll_mtu=4000 + + setup namespaces routing ip${inner}ip${outer} || return 2 + + trace "${ns_a}" ip_a "${ns_b}" ip_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${inner} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + if [ ${outer} -eq 4 ]; then + # IPv4 header + exp_mtu=$((${ll_mtu} - 20)) + else + # IPv6 header Option 4 + exp_mtu=$((${ll_mtu} - 40 - 8)) + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return + mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface" +} + +test_pmtu_ipv4_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 4 +} + +test_pmtu_ipv6_ipv4_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 4 +} + +test_pmtu_ipv4_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 4 6 +} + +test_pmtu_ipv6_ipv6_exception() { + test_pmtu_ipvX_over_ipvY_exception 6 6 +} + test_pmtu_vti4_exception() { setup namespaces veth vti4 xfrm4 || return 2 trace "${ns_a}" veth_a "${ns_b}" veth_b \ diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c new file mode 100644 index 000000000000..7b01b7c2ec10 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Check if we can fully utilize 4-tuples for connect(). + * + * Rules to bind sockets to the same port when all ephemeral ports are + * exhausted. + * + * 1. if there are TCP_LISTEN sockets on the port, fail to bind. + * 2. if there are sockets without SO_REUSEADDR, fail to bind. + * 3. if SO_REUSEADDR is disabled, fail to bind. + * 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled, + * succeed to bind. + * 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and + * there is no socket having the both options and the same EUID, + * succeed to bind. + * 6. fail to bind. + * + * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + */ +#include <arpa/inet.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <unistd.h> +#include "../kselftest_harness.h" + +struct reuse_opts { + int reuseaddr[2]; + int reuseport[2]; +}; + +struct reuse_opts unreusable_opts[12] = { + {0, 0, 0, 0}, + {0, 0, 0, 1}, + {0, 0, 1, 0}, + {0, 0, 1, 1}, + {0, 1, 0, 0}, + {0, 1, 0, 1}, + {0, 1, 1, 0}, + {0, 1, 1, 1}, + {1, 0, 0, 0}, + {1, 0, 0, 1}, + {1, 0, 1, 0}, + {1, 0, 1, 1}, +}; + +struct reuse_opts reusable_opts[4] = { + {1, 1, 0, 0}, + {1, 1, 0, 1}, + {1, 1, 1, 0}, + {1, 1, 1, 1}, +}; + +int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) +{ + struct sockaddr_in local_addr; + int len = sizeof(local_addr); + int fd, ret; + + fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_NE(-1, fd) TH_LOG("failed to open socket."); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int)); + ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR."); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int)); + ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT."); + + local_addr.sin_family = AF_INET; + local_addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + local_addr.sin_port = 0; + + if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) { + close(fd); + return -1; + } + + return fd; +} + +TEST(reuseaddr_ports_exhausted_unreusable) +{ + struct reuse_opts *opts; + int i, j, fd[2]; + + for (i = 0; i < 12; i++) { + opts = &unreusable_opts[i]; + + for (j = 0; j < 2; j++) + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind."); + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST(reuseaddr_ports_exhausted_reusable_same_euid) +{ + struct reuse_opts *opts; + int i, j, fd[2]; + + for (i = 0; i < 4; i++) { + opts = &reusable_opts[i]; + + for (j = 0; j < 2; j++) + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + + if (opts->reuseport[0] && opts->reuseport[1]) { + EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened."); + } else { + EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations."); + } + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST(reuseaddr_ports_exhausted_reusable_different_euid) +{ + struct reuse_opts *opts; + int i, j, ret, fd[2]; + uid_t euid[2] = {10, 20}; + + for (i = 0; i < 4; i++) { + opts = &reusable_opts[i]; + + for (j = 0; j < 2; j++) { + ret = seteuid(euid[j]); + ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]); + + fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]); + + ret = seteuid(0); + ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0."); + } + + ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); + EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid."); + + if (fd[1] != -1) { + ret = listen(fd[0], 5); + ASSERT_EQ(0, ret) TH_LOG("failed to listen."); + + ret = listen(fd[1], 5); + EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN."); + } + + for (j = 0; j < 2; j++) + if (fd[j] != -1) + close(fd[j]); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh new file mode 100755 index 000000000000..20e3a2913d06 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run tests when all ephemeral ports are exhausted. +# +# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp> + +set +x +set -e + +readonly NETNS="ns-$(mktemp -u XXXXXX)" + +setup() { + ip netns add "${NETNS}" + ip -netns "${NETNS}" link set lo up + ip netns exec "${NETNS}" \ + sysctl -w net.ipv4.ip_local_port_range="32768 32768" \ + > /dev/null 2>&1 + ip netns exec "${NETNS}" \ + sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null 2>&1 +} + +cleanup() { + ip netns del "${NETNS}" +} + +trap cleanup EXIT +setup + +do_test() { + ip netns exec "${NETNS}" ./reuseaddr_ports_exhausted +} + +do_test +echo "tests done" diff --git a/tools/testing/selftests/networking/timestamping/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 6dee9e636a95..6dee9e636a95 100644 --- a/tools/testing/selftests/networking/timestamping/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c index 35505b31e5cc..4555f88252ba 100644 --- a/tools/testing/selftests/net/tcp_mmap.c +++ b/tools/testing/selftests/net/tcp_mmap.c @@ -165,9 +165,10 @@ void *child_thread(void *arg) socklen_t zc_len = sizeof(zc); int res; + memset(&zc, 0, sizeof(zc)); zc.address = (__u64)((unsigned long)addr); zc.length = chunk_size; - zc.recv_skip_hint = 0; + res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); if (res == -1) @@ -281,12 +282,14 @@ static void setup_sockaddr(int domain, const char *str_addr, static void do_accept(int fdlisten) { pthread_attr_t attr; + int rcvlowat; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rcvlowat = chunk_size; if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, - &chunk_size, sizeof(chunk_size)) == -1) { + &rcvlowat, sizeof(rcvlowat)) == -1) { perror("setsockopt SO_RCVLOWAT"); } diff --git a/tools/testing/selftests/networking/timestamping/timestamping.c b/tools/testing/selftests/net/timestamping.c index aca3491174a1..aca3491174a1 100644 --- a/tools/testing/selftests/networking/timestamping/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0ea44d975b6c..c5282e62df75 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -101,6 +101,21 @@ FIXTURE(tls) bool notls; }; +FIXTURE_VARIANT(tls) +{ + unsigned int tls_version; +}; + +FIXTURE_VARIANT_ADD(tls, 12) +{ + .tls_version = TLS_1_2_VERSION, +}; + +FIXTURE_VARIANT_ADD(tls, 13) +{ + .tls_version = TLS_1_3_VERSION, +}; + FIXTURE_SETUP(tls) { struct tls12_crypto_info_aes_gcm_128 tls12; @@ -112,7 +127,7 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; addr.sin_family = AF_INET; @@ -733,7 +748,7 @@ TEST_F(tls, bidir) struct tls12_crypto_info_aes_gcm_128 tls12; memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, @@ -1258,78 +1273,4 @@ TEST(keysizes) { close(cfd); } -TEST(tls12) { - int fd, cfd; - bool notls; - - struct tls12_crypto_info_aes_gcm_128 tls12; - struct sockaddr_in addr; - socklen_t len; - int sfd, ret; - - notls = false; - len = sizeof(addr); - - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_2_VERSION; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; - - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(INADDR_ANY); - addr.sin_port = 0; - - fd = socket(AF_INET, SOCK_STREAM, 0); - sfd = socket(AF_INET, SOCK_STREAM, 0); - - ret = bind(sfd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - ret = listen(sfd, 10); - ASSERT_EQ(ret, 0); - - ret = getsockname(sfd, &addr, &len); - ASSERT_EQ(ret, 0); - - ret = connect(fd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - - ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); - if (ret != 0) { - notls = true; - printf("Failure setting TCP_ULP, testing without tls\n"); - } - - if (!notls) { - ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - cfd = accept(sfd, &addr, &len); - ASSERT_GE(cfd, 0); - - if (!notls) { - ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", - sizeof("tls")); - ASSERT_EQ(ret, 0); - - ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - close(sfd); - - char const *test_str = "test_read"; - int send_len = 10; - char buf[10]; - - send_len = strlen(test_str) + 1; - EXPECT_EQ(send(fd, test_str, send_len, 0), send_len); - EXPECT_NE(recv(cfd, buf, send_len, 0), -1); - EXPECT_EQ(memcmp(buf, test_str, send_len), 0); - - close(fd); - close(cfd); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index 7e386be47120..011b0da6b033 100644 --- a/tools/testing/selftests/networking/timestamping/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -41,6 +41,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/epoll.h> #include <sys/ioctl.h> #include <sys/select.h> #include <sys/socket.h> @@ -49,6 +50,10 @@ #include <time.h> #include <unistd.h> +#define NSEC_PER_USEC 1000L +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000LL + /* command line parameters */ static int cfg_proto = SOCK_STREAM; static int cfg_ipproto = IPPROTO_TCP; @@ -61,12 +66,16 @@ static int cfg_delay_snd; static int cfg_delay_ack; static bool cfg_show_payload; static bool cfg_do_pktinfo; +static bool cfg_busy_poll; +static int cfg_sleep_usec = 50 * 1000; static bool cfg_loop_nodata; -static bool cfg_no_delay; static bool cfg_use_cmsg; static bool cfg_use_pf_packet; +static bool cfg_use_epoll; +static bool cfg_epollet; static bool cfg_do_listen; static uint16_t dest_port = 9000; +static bool cfg_print_nsec; static struct sockaddr_in daddr; static struct sockaddr_in6 daddr6; @@ -75,11 +84,48 @@ static struct timespec ts_usr; static int saved_tskey = -1; static int saved_tskey_type = -1; +struct timing_event { + int64_t min; + int64_t max; + int64_t total; + int count; +}; + +static struct timing_event usr_enq; +static struct timing_event usr_snd; +static struct timing_event usr_ack; + static bool test_failed; +static int64_t timespec_to_ns64(struct timespec *ts) +{ + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + static int64_t timespec_to_us64(struct timespec *ts) { - return ts->tv_sec * 1000 * 1000 + ts->tv_nsec / 1000; + return ts->tv_sec * USEC_PER_SEC + ts->tv_nsec / NSEC_PER_USEC; +} + +static void init_timing_event(struct timing_event *te) +{ + te->min = INT64_MAX; + te->max = 0; + te->total = 0; + te->count = 0; +} + +static void add_timing_event(struct timing_event *te, + struct timespec *t_start, struct timespec *t_end) +{ + int64_t ts_delta = timespec_to_ns64(t_end) - timespec_to_ns64(t_start); + + te->count++; + if (ts_delta < te->min) + te->min = ts_delta; + if (ts_delta > te->max) + te->max = ts_delta; + te->total += ts_delta; } static void validate_key(int tskey, int tstype) @@ -113,25 +159,43 @@ static void validate_timestamp(struct timespec *cur, int min_delay) start64 = timespec_to_us64(&ts_usr); if (cur64 < start64 + min_delay || cur64 > start64 + max_delay) { - fprintf(stderr, "ERROR: delay %lu expected between %d and %d\n", + fprintf(stderr, "ERROR: %lu us expected between %d and %d\n", cur64 - start64, min_delay, max_delay); test_failed = true; } } +static void __print_ts_delta_formatted(int64_t ts_delta) +{ + if (cfg_print_nsec) + fprintf(stderr, "%lu ns", ts_delta); + else + fprintf(stderr, "%lu us", ts_delta / NSEC_PER_USEC); +} + static void __print_timestamp(const char *name, struct timespec *cur, uint32_t key, int payload_len) { + int64_t ts_delta; + if (!(cur->tv_sec | cur->tv_nsec)) return; - fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", - name, cur->tv_sec, cur->tv_nsec / 1000, - key, payload_len); - - if (cur != &ts_usr) - fprintf(stderr, " (USR %+" PRId64 " us)", - timespec_to_us64(cur) - timespec_to_us64(&ts_usr)); + if (cfg_print_nsec) + fprintf(stderr, " %s: %lu s %lu ns (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec, + key, payload_len); + else + fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec / NSEC_PER_USEC, + key, payload_len); + + if (cur != &ts_usr) { + ts_delta = timespec_to_ns64(cur) - timespec_to_ns64(&ts_usr); + fprintf(stderr, " (USR +"); + __print_ts_delta_formatted(ts_delta); + fprintf(stderr, ")"); + } fprintf(stderr, "\n"); } @@ -155,14 +219,17 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, case SCM_TSTAMP_SCHED: tsname = " ENQ"; validate_timestamp(&tss->ts[0], 0); + add_timing_event(&usr_enq, &ts_usr, &tss->ts[0]); break; case SCM_TSTAMP_SND: tsname = " SND"; validate_timestamp(&tss->ts[0], cfg_delay_snd); + add_timing_event(&usr_snd, &ts_usr, &tss->ts[0]); break; case SCM_TSTAMP_ACK: tsname = " ACK"; validate_timestamp(&tss->ts[0], cfg_delay_ack); + add_timing_event(&usr_ack, &ts_usr, &tss->ts[0]); break; default: error(1, 0, "unknown timestamp type: %u", @@ -171,6 +238,21 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, __print_timestamp(tsname, &tss->ts[0], tskey, payload_len); } +static void print_timing_event(char *name, struct timing_event *te) +{ + if (!te->count) + return; + + fprintf(stderr, " %s: count=%d", name, te->count); + fprintf(stderr, ", avg="); + __print_ts_delta_formatted((int64_t)(te->total / te->count)); + fprintf(stderr, ", min="); + __print_ts_delta_formatted(te->min); + fprintf(stderr, ", max="); + __print_ts_delta_formatted(te->max); + fprintf(stderr, "\n"); +} + /* TODO: convert to check_and_print payload once API is stable */ static void print_payload(char *data, int len) { @@ -198,6 +280,17 @@ static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr) daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown"); } +static void __epoll(int epfd) +{ + struct epoll_event events; + int ret; + + memset(&events, 0, sizeof(events)); + ret = epoll_wait(epfd, &events, 1, cfg_poll_timeout); + if (ret != 1) + error(1, errno, "epoll_wait"); +} + static void __poll(int fd) { struct pollfd pollfd; @@ -391,7 +484,11 @@ static void do_test(int family, unsigned int report_opt) struct msghdr msg; struct iovec iov; char *buf; - int fd, i, val = 1, total_len; + int fd, i, val = 1, total_len, epfd = 0; + + init_timing_event(&usr_enq); + init_timing_event(&usr_snd); + init_timing_event(&usr_ack); total_len = cfg_payload_len; if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) { @@ -418,6 +515,20 @@ static void do_test(int family, unsigned int report_opt) if (fd < 0) error(1, errno, "socket"); + if (cfg_use_epoll) { + struct epoll_event ev; + + memset(&ev, 0, sizeof(ev)); + ev.data.fd = fd; + if (cfg_epollet) + ev.events |= EPOLLET; + epfd = epoll_create(1); + if (epfd <= 0) + error(1, errno, "epoll_create"); + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev)) + error(1, errno, "epoll_ctl"); + } + /* reset expected key on each new socket */ saved_tskey = -1; @@ -525,19 +636,28 @@ static void do_test(int family, unsigned int report_opt) error(1, errno, "send"); /* wait for all errors to be queued, else ACKs arrive OOO */ - if (!cfg_no_delay) - usleep(50 * 1000); + if (cfg_sleep_usec) + usleep(cfg_sleep_usec); - __poll(fd); + if (!cfg_busy_poll) { + if (cfg_use_epoll) + __epoll(epfd); + else + __poll(fd); + } while (!recv_errmsg(fd)) {} } + print_timing_event("USR-ENQ", &usr_enq); + print_timing_event("USR-SND", &usr_snd); + print_timing_event("USR-ACK", &usr_ack); + if (close(fd)) error(1, errno, "close"); free(buf); - usleep(100 * 1000); + usleep(100 * NSEC_PER_USEC); } static void __attribute__((noreturn)) usage(const char *filepath) @@ -547,18 +667,22 @@ static void __attribute__((noreturn)) usage(const char *filepath) " -4: only IPv4\n" " -6: only IPv6\n" " -h: show this message\n" + " -b: busy poll to read from error queue\n" " -c N: number of packets for each test\n" " -C: use cmsg to set tstamp recording options\n" - " -D: no delay between packets\n" - " -F: poll() waits forever for an event\n" + " -e: use level-triggered epoll() instead of poll()\n" + " -E: use event-triggered epoll() instead of poll()\n" + " -F: poll()/epoll() waits forever for an event\n" " -I: request PKTINFO\n" " -l N: send N bytes at a time\n" " -L listen on hostname and port\n" " -n: set no-payload option\n" + " -N: print timestamps and durations in nsec (instead of usec)\n" " -p N: connect to port N\n" " -P: use PF_PACKET\n" " -r: use raw\n" " -R: use raw (IP_HDRINCL)\n" + " -S N: usec to sleep before reading error queue\n" " -u: use udp\n" " -v: validate SND delay (usec)\n" " -V: validate ACK delay (usec)\n" @@ -572,7 +696,8 @@ static void parse_opt(int argc, char **argv) int proto_count = 0; int c; - while ((c = getopt(argc, argv, "46c:CDFhIl:Lnp:PrRuv:V:x")) != -1) { + while ((c = getopt(argc, argv, + "46bc:CeEFhIl:LnNp:PrRS:uv:V:x")) != -1) { switch (c) { case '4': do_ipv6 = 0; @@ -580,15 +705,21 @@ static void parse_opt(int argc, char **argv) case '6': do_ipv4 = 0; break; + case 'b': + cfg_busy_poll = true; + break; case 'c': cfg_num_pkts = strtoul(optarg, NULL, 10); break; case 'C': cfg_use_cmsg = true; break; - case 'D': - cfg_no_delay = true; + case 'e': + cfg_use_epoll = true; break; + case 'E': + cfg_use_epoll = true; + cfg_epollet = true; case 'F': cfg_poll_timeout = -1; break; @@ -604,6 +735,9 @@ static void parse_opt(int argc, char **argv) case 'n': cfg_loop_nodata = true; break; + case 'N': + cfg_print_nsec = true; + break; case 'p': dest_port = strtoul(optarg, NULL, 10); break; @@ -623,6 +757,9 @@ static void parse_opt(int argc, char **argv) cfg_proto = SOCK_RAW; cfg_ipproto = IPPROTO_RAW; break; + case 'S': + cfg_sleep_usec = strtoul(optarg, NULL, 10); + break; case 'u': proto_count++; cfg_proto = SOCK_DGRAM; @@ -653,6 +790,8 @@ static void parse_opt(int argc, char **argv) error(1, 0, "pass -P, -r, -R or -u, not multiple"); if (cfg_do_pktinfo && cfg_use_pf_packet) error(1, 0, "cannot ask for pktinfo over pf_packet"); + if (cfg_busy_poll && cfg_use_epoll) + error(1, 0, "pass epoll or busy_poll, not both"); if (optind != argc - 1) error(1, 0, "missing required hostname argument"); diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh index df0d86ca72b7..eea6f5193693 100755 --- a/tools/testing/selftests/networking/timestamping/txtimestamp.sh +++ b/tools/testing/selftests/net/txtimestamp.sh @@ -43,15 +43,40 @@ run_test_tcpudpraw() { } run_test_all() { + setup run_test_tcpudpraw # setsockopt run_test_tcpudpraw -C # cmsg run_test_tcpudpraw -n # timestamp w/o data + echo "OK. All tests passed" +} + +run_test_one() { + setup + ./txtimestamp $@ +} + +usage() { + echo "Usage: $0 [ -r | --run ] <txtimestamp args> | [ -h | --help ]" + echo " (no args) Run all tests" + echo " -r|--run Run an individual test with arguments" + echo " -h|--help Help" +} + +main() { + if [[ $# -eq 0 ]]; then + run_test_all + else + if [[ "$1" = "-r" || "$1" == "--run" ]]; then + shift + run_test_one $@ + else + usage + fi + fi } if [[ "$(ip netns identify)" == "root" ]]; then - ../../net/in_netns.sh $0 $@ + ./in_netns.sh $0 $@ else - setup - run_test_all - echo "OK. All tests passed" + main $@ fi diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh new file mode 100755 index 000000000000..184da81f554f --- /dev/null +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Various combinations of VRF with xfrms and qdisc. + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +PAUSE_ON_FAIL=no +VERBOSE=0 +ret=0 + +HOST1_4=192.168.1.1 +HOST2_4=192.168.1.2 +HOST1_6=2001:db8:1::1 +HOST2_6=2001:db8:1::2 + +XFRM1_4=10.0.1.1 +XFRM2_4=10.0.1.2 +XFRM1_6=fc00:1000::1 +XFRM2_6=fc00:1000::2 +IF_ID=123 + +VRF=red +TABLE=300 + +AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508 +AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21 +ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62 +ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff +SPI_1=0x02122b77 +SPI_2=0x2b770212 + +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +################################################################################ +# +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd_host1() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + printf " COMMAND: $cmd\n" + fi + + out=$(eval ip netns exec host1 $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ]; then + if [ -n "$out" ]; then + echo + echo " $out" + fi + echo + fi + + return $rc +} + +################################################################################ +# create namespaces for hosts and sws + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + if [ -n "${ns}" ]; then + ns="-netns ${ns}" + fi + + ip ${ns} link add ${vrf} type vrf table ${table} + ip ${ns} link set ${vrf} up + ip ${ns} route add vrf ${vrf} unreachable default metric 8192 + ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192 + + ip ${ns} addr add 127.0.0.1/8 dev ${vrf} + ip ${ns} -6 addr add ::1 dev ${vrf} nodad + + ip ${ns} ru del pref 0 + ip ${ns} ru add pref 32765 from all lookup local + ip ${ns} -6 ru del pref 0 + ip ${ns} -6 ru add pref 32765 from all lookup local +} + +create_ns() +{ + local ns=$1 + local addr=$2 + local addr6=$3 + + [ -z "${addr}" ] && addr="-" + [ -z "${addr6}" ] && addr6="-" + + ip netns add ${ns} + + ip -netns ${ns} link set lo up + if [ "${addr}" != "-" ]; then + ip -netns ${ns} addr add dev lo ${addr} + fi + if [ "${addr6}" != "-" ]; then + ip -netns ${ns} -6 addr add dev lo ${addr6} + fi + + ip -netns ${ns} ro add unreachable default metric 8192 + ip -netns ${ns} -6 ro add unreachable default metric 8192 + + ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1 + ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0 +} + +# create veth pair to connect namespaces and apply addresses. +connect_ns() +{ + local ns1=$1 + local ns1_dev=$2 + local ns1_addr=$3 + local ns1_addr6=$4 + local ns2=$5 + local ns2_dev=$6 + local ns2_addr=$7 + local ns2_addr6=$8 + local ns1arg + local ns2arg + + if [ -n "${ns1}" ]; then + ns1arg="-netns ${ns1}" + fi + if [ -n "${ns2}" ]; then + ns2arg="-netns ${ns2}" + fi + + ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp + ip ${ns1arg} li set ${ns1_dev} up + ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev} + ip ${ns2arg} li set ${ns2_dev} up + + if [ "${ns1_addr}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr} + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr} + fi + + if [ "${ns1_addr6}" != "-" ]; then + ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad + ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad + fi +} + +################################################################################ + +cleanup() +{ + ip netns del host1 + ip netns del host2 +} + +setup() +{ + create_ns "host1" + create_ns "host2" + + connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \ + "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64 + + create_vrf "host1" ${VRF} ${TABLE} + ip -netns host1 link set dev eth0 master ${VRF} +} + +cleanup_xfrm() +{ + for ns in host1 host2 + do + for x in state policy + do + ip -netns ${ns} xfrm ${x} flush + ip -6 -netns ${ns} xfrm ${x} flush + done + done +} + +setup_xfrm() +{ + local h1_4=$1 + local h2_4=$2 + local h1_6=$3 + local h2_6=$4 + local devarg="$5" + + # + # policy + # + + # host1 - IPv4 out + ip -netns host1 xfrm policy add \ + src ${h1_4} dst ${h2_4} ${devarg} dir out \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host2 - IPv4 in + ip -netns host2 xfrm policy add \ + src ${h1_4} dst ${h2_4} dir in \ + tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel + + # host1 - IPv4 in + ip -netns host1 xfrm policy add \ + src ${h2_4} dst ${h1_4} ${devarg} dir in \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + # host2 - IPv4 out + ip -netns host2 xfrm policy add \ + src ${h2_4} dst ${h1_4} dir out \ + tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel + + + # host1 - IPv6 out + ip -6 -netns host1 xfrm policy add \ + src ${h1_6} dst ${h2_6} ${devarg} dir out \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host2 - IPv6 in + ip -6 -netns host2 xfrm policy add \ + src ${h1_6} dst ${h2_6} dir in \ + tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel + + # host1 - IPv6 in + ip -6 -netns host1 xfrm policy add \ + src ${h2_6} dst ${h1_6} ${devarg} dir in \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # host2 - IPv6 out + ip -6 -netns host2 xfrm policy add \ + src ${h2_6} dst ${h1_6} dir out \ + tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel + + # + # state + # + ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_4} dst ${h2_4} + + + ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} ${devarg} + + ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_4} dst ${h1_4} + + + ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ + proto esp spi ${SPI_1} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ + enc 'cbc(des3_ede)' ${ENC_1} \ + sel src ${h1_6} dst ${h2_6} + + + ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} ${devarg} + + ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ + proto esp spi ${SPI_2} reqid 0 mode tunnel \ + replay-window 4 replay-oseq 0x4 \ + auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ + enc 'cbc(des3_ede)' ${ENC_2} \ + sel src ${h2_6} dst ${h1_6} +} + +cleanup_xfrm_dev() +{ + ip -netns host1 li del xfrm0 + ip -netns host2 addr del ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr del ${XFRM2_6}/64 dev eth0 +} + +setup_xfrm_dev() +{ + local vrfarg="vrf ${VRF}" + + ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID} + ip -netns host1 li set xfrm0 ${vrfarg} up + ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0 + ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0 + + ip -netns host2 addr add ${XFRM2_4}/24 dev eth0 + ip -netns host2 addr add ${XFRM2_6}/64 dev eth0 + + setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}" +} + +run_tests() +{ + cleanup_xfrm + + # no IPsec + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 no xfrm policy" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 no xfrm policy" + + # xfrm without VRF in sel + setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + log_test $? 0 "IPv4 xfrm policy based on address" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy based on address" + cleanup_xfrm + + # xfrm with VRF in sel + # Known failure: ipv4 resets the flow oif after the lookup. Fix is + # not straightforward. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with VRF in selector" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + log_test $? 0 "IPv6 xfrm policy with VRF in selector" + cleanup_xfrm + + # xfrm with enslaved device in sel + # Known failures: combined with the above, __xfrm{4,6}_selector_match + # needs to consider both l3mdev and enslaved device index. + # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0" + # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4} + # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector" + # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6} + # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector" + # cleanup_xfrm + + # xfrm device + setup_xfrm_dev + run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4} + log_test $? 0 "IPv4 xfrm policy with xfrm device" + run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6} + log_test $? 0 "IPv6 xfrm policy with xfrm device" + cleanup_xfrm_dev +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -p Pause on fail + -v verbose mode (show commands and output) + +done +EOF +} + +################################################################################ +# main + +while getopts :pv o +do + case $o in + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +cleanup 2>/dev/null +setup + +echo +echo "No qdisc on VRF device" +run_tests + +run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms +echo +echo "netem qdisc on VRF device" +run_tests + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret diff --git a/tools/testing/selftests/networking/timestamping/.gitignore b/tools/testing/selftests/networking/timestamping/.gitignore deleted file mode 100644 index d9355035e746..000000000000 --- a/tools/testing/selftests/networking/timestamping/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -timestamping -rxtimestamp -txtimestamp -hwtstamp_config diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile deleted file mode 100644 index 1de8bd8ccf5d..000000000000 --- a/tools/testing/selftests/networking/timestamping/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../../usr/include - -TEST_GEN_FILES := hwtstamp_config rxtimestamp timestamping txtimestamp -TEST_PROGS := txtimestamp.sh - -all: $(TEST_PROGS) - -top_srcdir = ../../../../.. -KSFT_KHDR_INSTALL := 1 -include ../../lib.mk diff --git a/tools/testing/selftests/networking/timestamping/config b/tools/testing/selftests/networking/timestamping/config deleted file mode 100644 index a13e3169b0a4..000000000000 --- a/tools/testing/selftests/networking/timestamping/config +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_IFB=y -CONFIG_NET_SCH_NETEM=y diff --git a/tools/testing/selftests/nsfs/.gitignore b/tools/testing/selftests/nsfs/.gitignore index 2ab2c824ce86..ed79ebdf286e 100644 --- a/tools/testing/selftests/nsfs/.gitignore +++ b/tools/testing/selftests/nsfs/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only owner pidns diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/nsfs/pidns.c index e0d86e1668c0..e3c772c6a7c7 100644 --- a/tools/testing/selftests/nsfs/pidns.c +++ b/tools/testing/selftests/nsfs/pidns.c @@ -27,7 +27,7 @@ #define __stack_aligned__ __attribute__((aligned(16))) struct cr_clone_arg { char stack[128] __stack_aligned__; - char stack_ptr[0]; + char stack_ptr[]; }; static int child(void *args) diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh index 9c60337317c6..020137b61407 100755 --- a/tools/testing/selftests/ntb/ntb_test.sh +++ b/tools/testing/selftests/ntb/ntb_test.sh @@ -241,7 +241,7 @@ function get_files_count() split_remote $LOC if [[ "$REMOTE" == "" ]]; then - echo $(ls -1 "$LOC"/${NAME}* 2>/dev/null | wc -l) + echo $(ls -1 "$VPATH"/${NAME}* 2>/dev/null | wc -l) else echo $(ssh "$REMOTE" "ls -1 \"$VPATH\"/${NAME}* | \ wc -l" 2> /dev/null) diff --git a/tools/testing/selftests/openat2/.gitignore b/tools/testing/selftests/openat2/.gitignore index bd68f6c3fd07..82a4846cbc4b 100644 --- a/tools/testing/selftests/openat2/.gitignore +++ b/tools/testing/selftests/openat2/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only /*_test diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore new file mode 100644 index 000000000000..93ab9d7e5b7e --- /dev/null +++ b/tools/testing/selftests/pid_namespace/.gitignore @@ -0,0 +1 @@ +regression_enomem diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile new file mode 100644 index 000000000000..dcaefa224ca0 --- /dev/null +++ b/tools/testing/selftests/pid_namespace/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -g -I../../../../usr/include/ + +TEST_GEN_PROGS := regression_enomem + +include ../lib.mk + +$(OUTPUT)/regression_enomem: regression_enomem.c ../pidfd/pidfd.h diff --git a/tools/testing/selftests/pid_namespace/config b/tools/testing/selftests/pid_namespace/config new file mode 100644 index 000000000000..26cdb27e7dbb --- /dev/null +++ b/tools/testing/selftests/pid_namespace/config @@ -0,0 +1,2 @@ +CONFIG_PID_NS=y +CONFIG_USER_NS=y diff --git a/tools/testing/selftests/pid_namespace/regression_enomem.c b/tools/testing/selftests/pid_namespace/regression_enomem.c new file mode 100644 index 000000000000..73d532556d17 --- /dev/null +++ b/tools/testing/selftests/pid_namespace/regression_enomem.c @@ -0,0 +1,45 @@ +#define _GNU_SOURCE +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/wait.h> + +#include "../kselftest.h" +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +/* + * Regression test for: + * 35f71bc0a09a ("fork: report pid reservation failure properly") + * b26ebfe12f34 ("pid: Fix error return value in some cases") + */ +TEST(regression_enomem) +{ + pid_t pid; + + if (geteuid()) + EXPECT_EQ(0, unshare(CLONE_NEWUSER)); + + EXPECT_EQ(0, unshare(CLONE_NEWPID)); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(EXIT_SUCCESS); + + EXPECT_EQ(0, wait_for_pid(pid)); + + pid = fork(); + ASSERT_LT(pid, 0); + ASSERT_EQ(errno, ENOMEM); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 39559d723c41..973198a3ec3d 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only pidfd_open_test pidfd_poll_test pidfd_test pidfd_wait pidfd_fdinfo_test pidfd_getfd_test +pidfd_setns_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 75a545861375..f4a2f28f926b 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ -pthread -TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test +TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ + pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config new file mode 100644 index 000000000000..bb11de90c0c9 --- /dev/null +++ b/tools/testing/selftests/pidfd/config @@ -0,0 +1,6 @@ +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index d482515604db..c1921a53dbed 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -13,6 +13,8 @@ #include <string.h> #include <syscall.h> #include <sys/mount.h> +#include <sys/types.h> +#include <sys/wait.h> #include "../kselftest.h" diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c new file mode 100644 index 000000000000..133ec5b6cda8 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <linux/kcmp.h> + +#include "pidfd.h" +#include "../clone3/clone3_selftests.h" +#include "../kselftest.h" +#include "../kselftest_harness.h" + +enum { + PIDFD_NS_USER, + PIDFD_NS_MNT, + PIDFD_NS_PID, + PIDFD_NS_UTS, + PIDFD_NS_IPC, + PIDFD_NS_NET, + PIDFD_NS_CGROUP, + PIDFD_NS_PIDCLD, + PIDFD_NS_MAX +}; + +const struct ns_info { + const char *name; + int flag; +} ns_info[] = { + [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, }, + [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, }, + [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, }, + [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, }, + [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, }, + [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, + [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, + [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, +}; + +FIXTURE(current_nsset) +{ + pid_t pid; + int pidfd; + int nsfds[PIDFD_NS_MAX]; + + pid_t child_pid_exited; + int child_pidfd_exited; + + pid_t child_pid1; + int child_pidfd1; + int child_nsfds1[PIDFD_NS_MAX]; + + pid_t child_pid2; + int child_pidfd2; + int child_nsfds2[PIDFD_NS_MAX]; +}; + +static int sys_waitid(int which, pid_t pid, int options) +{ + return syscall(__NR_waitid, which, pid, NULL, options, NULL); +} + +pid_t create_child(int *pidfd, unsigned flags) +{ + struct clone_args args = { + .flags = CLONE_PIDFD | flags, + .exit_signal = SIGCHLD, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(struct clone_args)); +} + +FIXTURE_SETUP(current_nsset) +{ + int i, proc_fd, ret; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + self->nsfds[i] = -EBADF; + self->child_nsfds1[i] = -EBADF; + self->child_nsfds2[i] = -EBADF; + } + + proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(proc_fd, 0) { + TH_LOG("%m - Failed to open /proc/self/ns"); + } + + self->pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + } + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GT(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create task that exits right away. */ + self->child_pid_exited = create_child(&self->child_pidfd_exited, + CLONE_NEWUSER | CLONE_NEWNET); + EXPECT_GT(self->child_pid_exited, 0); + + if (self->child_pid_exited == 0) + _exit(EXIT_SUCCESS); + + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GE(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create tasks that will be stopped. */ + self->child_pid1 = create_child(&self->child_pidfd1, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + self->child_pid2 = create_child(&self->child_pidfd2, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + for (i = 0; i < PIDFD_NS_MAX; i++) { + char p[100]; + + const struct ns_info *info = &ns_info[i]; + + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid1, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds1[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid1); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid2, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds2[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid1); + } + } + } + + close(proc_fd); +} + +FIXTURE_TEARDOWN(current_nsset) +{ + int i; + + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1, + SIGKILL, NULL, 0), 0); + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2, + SIGKILL, NULL, 0), 0); + + for (i = 0; i < PIDFD_NS_MAX; i++) { + if (self->nsfds[i] >= 0) + close(self->nsfds[i]); + if (self->child_nsfds1[i] >= 0) + close(self->child_nsfds1[i]); + if (self->child_nsfds2[i] >= 0) + close(self->child_nsfds2[i]); + } + + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); +} + +static int preserve_ns(const int pid, const char *ns) +{ + int ret; + char path[50]; + + ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns); + if (ret < 0 || (size_t)ret >= sizeof(path)) + return -EIO; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns) +{ + int ns_fd2 = -EBADF; + int ret = -1; + struct stat ns_st1, ns_st2; + + ret = fstat(ns_fd1, &ns_st1); + if (ret < 0) + return -1; + + ns_fd2 = preserve_ns(pid2, ns); + if (ns_fd2 < 0) + return -1; + + ret = fstat(ns_fd2, &ns_st2); + close(ns_fd2); + if (ret < 0) + return -1; + + /* processes are in the same namespace */ + if ((ns_st1.st_dev == ns_st2.st_dev) && + (ns_st1.st_ino == ns_st2.st_ino)) + return 1; + + /* processes are in different namespaces */ + return 0; +} + +/* Test that we can't pass garbage to the kernel. */ +TEST_F(current_nsset, invalid_flags) +{ + ASSERT_NE(setns(self->pidfd, 0), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, -1), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); +} + +/* Test that we can't attach to a task that has already exited. */ +TEST_F(current_nsset, pidfd_exited_child) +{ + int i; + pid_t pid; + + ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET), + 0); + EXPECT_EQ(errno, ESRCH); + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + /* Verify that we haven't changed any namespaces. */ + if (self->nsfds[i] >= 0) + ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1); + } +} + +TEST_F(current_nsset, pidfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid1, + self->child_pidfd1); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d", + info->name, self->child_pid1, + self->child_pidfd1); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid1, self->child_pidfd1); + } +} + +TEST_F(current_nsset, nsfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_nsfds1[i]); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_nsfds1[i]); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, self->child_nsfds1[i]); + } +} + +TEST_F(current_nsset, pidfd_one_shot_setns) +{ + unsigned flags = 0; + int i; + pid_t pid; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds1[i] < 0) + continue; + + flags |= info->flag; + TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", + info->name, self->child_pid1); + } + + ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { + TH_LOG("%m - Failed to setns to namespaces of %d", + self->child_pid1); + } + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d", + info->name, self->child_pid1); + } + TH_LOG("Managed to correctly setns to %s namespace of %d", + info->name, self->child_pid1); + } +} + +TEST_F(current_nsset, no_foul_play) +{ + unsigned flags = 0; + int i; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds1[i] < 0) + continue; + + flags |= info->flag; + if (info->flag) /* No use logging pid_for_children. */ + TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", + info->name, self->child_pid1); + } + + ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { + TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d", + self->child_pid1, self->child_pidfd1); + } + + /* + * Can't setns to a user namespace outside of our hierarchy since we + * don't have caps in there and didn't create it. That means that under + * no circumstances should we be able to setns to any of the other + * ones since they aren't owned by our user namespace. + */ + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds2[i] < 0 || !info->flag) + continue; + + ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid2, + self->child_pidfd2); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid2, + self->child_pidfd2); + + ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_nsfds2[i]); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_nsfds2[i]); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 644770c3b754..0830e63818c1 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -19,6 +19,7 @@ SUB_DIRS = alignment \ copyloops \ dscr \ mm \ + nx-gzip \ pmu \ signal \ primitives \ diff --git a/tools/testing/selftests/powerpc/alignment/.gitignore b/tools/testing/selftests/powerpc/alignment/.gitignore index 6d4fd014511c..28bc6ca13cc6 100644 --- a/tools/testing/selftests/powerpc/alignment/.gitignore +++ b/tools/testing/selftests/powerpc/alignment/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only copy_first_unaligned alignment_handler diff --git a/tools/testing/selftests/powerpc/benchmarks/.gitignore b/tools/testing/selftests/powerpc/benchmarks/.gitignore index 9161679b1e1a..c9ce13983c99 100644 --- a/tools/testing/selftests/powerpc/benchmarks/.gitignore +++ b/tools/testing/selftests/powerpc/benchmarks/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only gettimeofday context_switch fork diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index d40300a65b42..a32a6ab89914 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -2,6 +2,8 @@ TEST_GEN_PROGS := gettimeofday context_switch fork mmap_bench futex_bench null_syscall TEST_GEN_FILES := exec_target +TEST_FILES := settings + CFLAGS += -O2 top_srcdir = ../../../../.. diff --git a/tools/testing/selftests/powerpc/benchmarks/settings b/tools/testing/selftests/powerpc/benchmarks/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/powerpc/benchmarks/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/powerpc/cache_shape/.gitignore b/tools/testing/selftests/powerpc/cache_shape/.gitignore index ec1848434be5..b385eee3012c 100644 --- a/tools/testing/selftests/powerpc/cache_shape/.gitignore +++ b/tools/testing/selftests/powerpc/cache_shape/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only cache_shape diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore index 12ef5b031974..ddaf140b8255 100644 --- a/tools/testing/selftests/powerpc/copyloops/.gitignore +++ b/tools/testing/selftests/powerpc/copyloops/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 diff --git a/tools/testing/selftests/powerpc/dscr/.gitignore b/tools/testing/selftests/powerpc/dscr/.gitignore index b585c6c1564a..1d08b15af697 100644 --- a/tools/testing/selftests/powerpc/dscr/.gitignore +++ b/tools/testing/selftests/powerpc/dscr/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only dscr_default_test dscr_explicit_test dscr_inherit_exec_test diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 5df476364b4d..cfa6eedcb66c 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -3,6 +3,8 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test \ dscr_sysfs_thread_test +TEST_FILES := settings + top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/dscr/settings b/tools/testing/selftests/powerpc/dscr/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/powerpc/dscr/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh index f988d2f42e8f..8a8d0f456946 100755 --- a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh +++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh @@ -41,6 +41,11 @@ for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do continue; fi + if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then + echo "$dev, Skipped: ahci doesn't support recovery" + continue + fi + # Don't inject errosr into an already-frozen PE. This happens with # PEs that contain multiple PCI devices (e.g. multi-function cards) # and injecting new errors during the recovery process will probably diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore index 50ded63e25b7..e31ca6f453ed 100644 --- a/tools/testing/selftests/powerpc/math/.gitignore +++ b/tools/testing/selftests/powerpc/math/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only fpu_syscall vmx_syscall fpu_preempt diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index 0ebeaea22641..2ca523255b1b 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only hugetlb_vs_thp_test subpage_prot tempfile @@ -6,3 +7,4 @@ segv_errors wild_bctr large_vm_fork_separation bad_accesses +tlbie_test diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules new file mode 100644 index 000000000000..5a7118495cb3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules @@ -0,0 +1 @@ +SUBSYSTEM=="nxgzip", KERNEL=="nx-gzip", MODE="0666" diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile new file mode 100644 index 000000000000..640fad6cc2c7 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -O3 -m64 -I./include + +TEST_GEN_FILES := gzfht_test gunz_test +TEST_PROGS := nx-gzip-test.sh + +include ../../lib.mk + +$(TEST_GEN_FILES): gzip_vas.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README new file mode 100644 index 000000000000..9809dbaa1905 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/README @@ -0,0 +1,45 @@ +Test the nx-gzip function: +========================= + +Verify that following device exists: + /dev/crypto/nx-gzip +If you get a permission error run as sudo or set the device permissions: + sudo chmod go+rw /dev/crypto/nx-gzip +However, chmod may not survive across boots. You may create a udev file such +as: + /etc/udev/rules.d/99-nx-gzip.rules + + +To manually build and run: +$ gcc -O3 -I./include -o gzfht_test gzfht_test.c gzip_vas.c +$ gcc -O3 -I./include -o gunz_test gunz_test.c gzip_vas.c + + +Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix: +$ ./gzfht_test gzip_vas.c +file gzip_vas.c read, 6413 bytes +compressed 6413 to 3124 bytes total, crc32 checksum = abd15e8a + + +Uncompress the previous output. Output will have a .nx.gunzip suffix: +./gunz_test gzip_vas.c.nx.gz +gzHeader FLG 0 +00 00 00 00 04 03 +gzHeader MTIME, XFL, OS ignored +computed checksum abd15e8a isize 0000190d +stored checksum abd15e8a isize 0000190d +decomp is complete: fclose + + +Compare two files: +$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c.nx.gz.nx.gunzip +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c + + +Note that the code here are intended for testing the nx-gzip hardware function. +They are not intended for demonstrating performance or compression ratio. +By being simplistic these selftests expect to allocate the entire set of source +and target pages in the memory so it needs enough memory to work. +For more information and source code consider using: +https://github.com/libnxz/power-gzip diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c new file mode 100644 index 000000000000..6ee0fded0391 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c @@ -0,0 +1,1028 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gunzip sample code for demonstrating the P9 NX hardware + * interface. Not intended for productive uses or for performance or + * compression ratio measurements. Note also that /dev/crypto/gzip, + * VAS and skiboot support are required + * + * Copyright 2020 IBM Corp. + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * Definitions of acronyms used here. See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +#define _ISOC11_SOURCE // For aligned_alloc() +#define _DEFAULT_SOURCE // For endian.h + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "nxu.h" +#include "nx.h" +#include "crb.h" + +int nx_dbg; +FILE *nx_gzip_log; + +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y)) +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y)) + +#define GETINPC(X) fgetc(X) +#define FNAME_MAX 1024 + +/* fifo queue management */ +#define fifo_used_bytes(used) (used) +#define fifo_free_bytes(used, len) ((len)-(used)) +/* amount of free bytes in the first and last parts */ +#define fifo_free_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (len)-((cur)+(used)) : 0) +#define fifo_free_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (cur) : (len)-(used)) +/* amount of used bytes in the first and last parts */ +#define fifo_used_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (used) : (len)-(cur)) +#define fifo_used_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? 0 : ((used)+(cur))-(len)) +/* first and last free parts start here */ +#define fifo_free_first_offset(cur, used) ((cur)+(used)) +#define fifo_free_last_offset(cur, used, len) \ + fifo_used_last_bytes(cur, used, len) +/* first and last used parts start here */ +#define fifo_used_first_offset(cur) (cur) +#define fifo_used_last_offset(cur) (0) + +const int fifo_in_len = 1<<24; +const int fifo_out_len = 1<<24; +const int page_sz = 1<<16; +const int line_sz = 1<<7; +const int window_max = 1<<15; + +/* + * Adds an (address, len) pair to the list of ddes (ddl) and updates + * the base dde. ddl[0] is the only dde in a direct dde which + * contains a single (addr,len) pair. For more pairs, ddl[0] becomes + * the indirect (base) dde that points to a list of direct ddes. + * See Section 6.4 of the NX-gzip user manual for DDE description. + * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of + * bytes in ddl. Caller is responsible for allocting the array of + * nx_dde_t *ddl. If N addresses are required in the scatter-gather + * list, the ddl array must have N+1 entries minimum. + */ +static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr, + uint32_t len) +{ + uint32_t ddecnt; + uint32_t bytes; + + if (addr == NULL && len == 0) { + clearp_dde(ddl); + return 0; + } + + NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr, + __func__, len)); + + /* Number of ddes in the dde list ; == 0 when it is a direct dde */ + ddecnt = getpnn(ddl, dde_count); + bytes = getp32(ddl, ddebc); + + if (ddecnt == 0 && bytes == 0) { + /* First dde is unused; make it a direct dde */ + bytes = len; + putp32(ddl, ddebc, bytes); + putp64(ddl, ddead, (uint64_t) addr); + } else if (ddecnt == 0) { + /* Converting direct to indirect dde + * ddl[0] becomes head dde of ddl + * copy direct to indirect first. + */ + ddl[1] = ddl[0]; + + /* Add the new dde next */ + clear_dde(ddl[2]); + put32(ddl[2], ddebc, len); + put64(ddl[2], ddead, (uint64_t) addr); + + /* Ddl head points to 2 direct ddes */ + ddecnt = 2; + putpnn(ddl, dde_count, ddecnt); + bytes = bytes + len; + putp32(ddl, ddebc, bytes); + /* Pointer to the first direct dde */ + putp64(ddl, ddead, (uint64_t) &ddl[1]); + } else { + /* Append a dde to an existing indirect ddl */ + ++ddecnt; + clear_dde(ddl[ddecnt]); + put64(ddl[ddecnt], ddead, (uint64_t) addr); + put32(ddl[ddecnt], ddebc, len); + + putpnn(ddl, dde_count, ddecnt); + bytes = bytes + len; + putp32(ddl, ddebc, bytes); /* byte sum of all dde */ + } + return bytes; +} + +/* + * Touch specified number of pages represented in number bytes + * beginning from the first buffer in a dde list. + * Do not touch the pages past buf_sz-th byte's page. + * + * Set buf_sz = 0 to touch all pages described by the ddep. + */ +static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz, + int wr) +{ + uint32_t indirect_count; + uint32_t buf_len; + long total; + uint64_t buf_addr; + struct nx_dde_t *dde_list; + int i; + + assert(!!ddep); + + indirect_count = getpnn(ddep, dde_count); + + NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__, + indirect_count)); + NXPRT(fprintf(stderr, "0x%lx\n", buf_sz)); + + if (indirect_count == 0) { + /* Direct dde */ + buf_len = getp32(ddep, ddebc); + buf_addr = getp64(ddep, ddead); + + NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n", + buf_len, (void *)buf_addr)); + + if (buf_sz == 0) + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + else + nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len, + buf_sz), page_sz, wr); + + return ERR_NX_OK; + } + + /* Indirect dde */ + if (indirect_count > MAX_DDE_COUNT) + return ERR_NX_EXCESSIVE_DDE; + + /* First address of the list */ + dde_list = (struct nx_dde_t *) getp64(ddep, ddead); + + if (buf_sz == 0) + buf_sz = getp32(ddep, ddebc); + + total = 0; + for (i = 0; i < indirect_count; i++) { + buf_len = get32(dde_list[i], ddebc); + buf_addr = get64(dde_list[i], ddead); + total += buf_len; + + NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ", + buf_len, (void *)buf_addr)); + NXPRT(fprintf(stderr, "0x%lx\n", total)); + + /* Touching fewer pages than encoded in the ddebc */ + if (total > buf_sz) { + buf_len = NX_MIN(buf_len, total - buf_sz); + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + NXPRT(fprintf(stderr, "touch loop break len 0x%x ", + buf_len)); + NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr)); + break; + } + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + } + return ERR_NX_OK; +} + +/* + * Src and dst buffers are supplied in scatter gather lists. + * NX function code and other parameters supplied in cmdp. + */ +static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst, + struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + uint64_t csbaddr; + + memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); + + cmdp->crb.source_dde = *src; + cmdp->crb.target_dde = *dst; + + /* Status, output byte count in tpbc */ + csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask; + put64(cmdp->crb, csb_address, csbaddr); + + /* NX reports input bytes in spbc; cleared */ + cmdp->cpb.out_spbc_comp_wrap = 0; + cmdp->cpb.out_spbc_comp_with_count = 0; + cmdp->cpb.out_spbc_decomp = 0; + + /* Clear output */ + put32(cmdp->cpb, out_crc, INIT_CRC); + put32(cmdp->cpb, out_adler, INIT_ADLER); + + /* Submit the crb, the job descriptor, to the accelerator. */ + return nxu_submit_job(cmdp, handle); +} + +int decompress_file(int argc, char **argv, void *devhandle) +{ + FILE *inpf = NULL; + FILE *outf = NULL; + + int c, expect, i, cc, rc = 0; + char gzfname[FNAME_MAX]; + + /* Queuing, file ops, byte counting */ + char *fifo_in, *fifo_out; + int used_in, cur_in, used_out, cur_out, read_sz, n; + int first_free, last_free, first_used, last_used; + int first_offset, last_offset; + int write_sz, free_space, source_sz; + int source_sz_estimate, target_sz_estimate; + uint64_t last_comp_ratio = 0; /* 1000 max */ + uint64_t total_out = 0; + int is_final, is_eof; + + /* nx hardware */ + int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0; + int history_len = 0; + struct nx_gzip_crb_cpb_t cmd, *cmdp; + struct nx_dde_t *ddl_in; + struct nx_dde_t dde_in[6] __aligned(128); + struct nx_dde_t *ddl_out; + struct nx_dde_t dde_out[6] __aligned(128); + int pgfault_retries; + + /* when using mmap'ed files */ + off_t input_file_offset; + + if (argc > 2) { + fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]); + fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n"); + return -1; + } + + if (argc == 1) { + inpf = stdin; + outf = stdout; + } else if (argc == 2) { + char w[1024]; + char *wp; + + inpf = fopen(argv[1], "r"); + if (inpf == NULL) { + perror(argv[1]); + return -1; + } + + /* Make a new file name to write to. Ignoring '.gz' */ + wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1]; + strcpy(w, wp); + strcat(w, ".nx.gunzip"); + + outf = fopen(w, "w"); + if (outf == NULL) { + perror(w); + return -1; + } + } + + /* Decode the gzip header */ + c = GETINPC(inpf); expect = 0x1f; /* ID1 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x8b; /* ID2 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x08; /* CM */ + if (c != expect) + goto err1; + + int flg = GETINPC(inpf); /* FLG */ + + if (flg & 0xE0 || flg & 0x4 || flg == EOF) + goto err2; + + fprintf(stderr, "gzHeader FLG %x\n", flg); + + /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this + * sample code. + */ + for (i = 0; i < 6; i++) { + char tmp[10]; + + tmp[i] = GETINPC(inpf); + if (tmp[i] == EOF) + goto err3; + fprintf(stderr, "%02x ", tmp[i]); + if (i == 5) + fprintf(stderr, "\n"); + } + fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n"); + + /* FNAME */ + if (flg & 0x8) { + int k = 0; + + do { + c = GETINPC(inpf); + if (c == EOF || k >= FNAME_MAX) + goto err3; + gzfname[k++] = c; + } while (c); + fprintf(stderr, "gzHeader FNAME: %s\n", gzfname); + } + + /* FHCRC */ + if (flg & 0x2) { + c = GETINPC(inpf); + if (c == EOF) + goto err3; + c = GETINPC(inpf); + if (c == EOF) + goto err3; + fprintf(stderr, "gzHeader FHCRC: ignored\n"); + } + + used_in = cur_in = used_out = cur_out = 0; + is_final = is_eof = 0; + + /* Allocate one page larger to prevent page faults due to NX + * overfetching. + * Either do this (char*)(uintptr_t)aligned_alloc or use + * -std=c11 flag to make the int-to-pointer warning go away. + */ + assert((fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_in_len + page_sz)) != NULL); + assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_out_len + page_sz + line_sz)) != NULL); + /* Leave unused space due to history rounding rules */ + fifo_out = fifo_out + line_sz; + nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1); + + ddl_in = &dde_in[0]; + ddl_out = &dde_out[0]; + cmdp = &cmd; + memset(&cmdp->crb, 0, sizeof(cmdp->crb)); + +read_state: + + /* Read from .gz file */ + + NXPRT(fprintf(stderr, "read_state:\n")); + + if (is_eof != 0) + goto write_state; + + /* We read in to fifo_in in two steps: first: read in to from + * cur_in to the end of the buffer. last: if free space wrapped + * around, read from fifo_in offset 0 to offset cur_in. + */ + + /* Reset fifo head to reduce unnecessary wrap arounds */ + cur_in = (used_in == 0) ? 0 : cur_in; + + /* Free space total is reduced by a gap */ + free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len) + - line_sz); + + /* Free space may wrap around as first and last */ + first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len); + last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len); + + /* Start offsets of the free memory */ + first_offset = fifo_free_first_offset(cur_in, used_in); + last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len); + + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, first_free); + n = 0; + if (read_sz > 0) { + /* Read in to offset cur_in + used_in */ + n = fread(fifo_in + first_offset, 1, read_sz, inpf); + used_in = used_in + n; + free_space = free_space - n; + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + + /* If free space wrapped around */ + if (last_free > 0) { + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, last_free); + n = 0; + if (read_sz > 0) { + n = fread(fifo_in + last_offset, 1, read_sz, inpf); + used_in = used_in + n; /* Increase used space */ + free_space = free_space - n; /* Decrease free space */ + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + } + + /* At this point we have used_in bytes in fifo_in with the + * data head starting at cur_in and possibly wrapping around. + */ + +write_state: + + /* Write decompressed data to output file */ + + NXPRT(fprintf(stderr, "write_state:\n")); + + if (used_out == 0) + goto decomp_state; + + /* If fifo_out has data waiting, write it out to the file to + * make free target space for the accelerator used bytes in + * the first and last parts of fifo_out. + */ + + first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len); + last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len); + + write_sz = first_used; + + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out + cur_out, 1, write_sz, outf); + used_out = used_out - n; + /* Move head of the fifo */ + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + + if (last_used > 0) { /* If more data available in the last part */ + write_sz = last_used; /* Keep it here for later */ + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out, 1, write_sz, outf); + used_out = used_out - n; + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + } + +decomp_state: + + /* NX decompresses input data */ + + NXPRT(fprintf(stderr, "decomp_state:\n")); + + if (is_final) + goto finish_state; + + /* Address/len lists */ + clearp_dde(ddl_in); + clearp_dde(ddl_out); + + /* FC, CRC, HistLen, Table 6-6 */ + if (resuming) { + /* Resuming a partially decompressed input. + * The key to resume is supplying the 32KB + * dictionary (history) to NX, which is basically + * the last 32KB of output produced. + */ + fc = GZIP_FC_DECOMPRESS_RESUME; + + cmdp->cpb.in_crc = cmdp->cpb.out_crc; + cmdp->cpb.in_adler = cmdp->cpb.out_adler; + + /* Round up the history size to quadword. Section 2.10 */ + history_len = (history_len + 15) / 16; + putnn(cmdp->cpb, in_histlen, history_len); + history_len = history_len * 16; /* bytes */ + + if (history_len > 0) { + /* Chain in the history buffer to the DDE list */ + if (cur_out >= history_len) { + nx_append_dde(ddl_in, fifo_out + + (cur_out - history_len), + history_len); + } else { + nx_append_dde(ddl_in, fifo_out + + ((fifo_out_len + cur_out) + - history_len), + history_len - cur_out); + /* Up to 32KB history wraps around fifo_out */ + nx_append_dde(ddl_in, fifo_out, cur_out); + } + + } + } else { + /* First decompress job */ + fc = GZIP_FC_DECOMPRESS; + + history_len = 0; + /* Writing 0 clears out subc as well */ + cmdp->cpb.in_histlen = 0; + total_out = 0; + + put32(cmdp->cpb, in_crc, INIT_CRC); + put32(cmdp->cpb, in_adler, INIT_ADLER); + put32(cmdp->cpb, out_crc, INIT_CRC); + put32(cmdp->cpb, out_adler, INIT_ADLER); + + /* Assuming 10% compression ratio initially; use the + * most recently measured compression ratio as a + * heuristic to estimate the input and output + * sizes. If we give too much input, the target buffer + * overflows and NX cycles are wasted, and then we + * must retry with smaller input size. 1000 is 100%. + */ + last_comp_ratio = 100UL; + } + cmdp->crb.gzip_fc = 0; + putnn(cmdp->crb, gzip_fc, fc); + + /* + * NX source buffers + */ + first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len); + last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len); + + if (first_used > 0) + nx_append_dde(ddl_in, fifo_in + cur_in, first_used); + + if (last_used > 0) + nx_append_dde(ddl_in, fifo_in, last_used); + + /* + * NX target buffers + */ + first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len); + last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len); + + /* Reduce output free space amount not to overwrite the history */ + int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len) + - (1<<16)); + + NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max, + target_max)); + + first_free = NX_MIN(target_max, first_free); + if (first_free > 0) { + first_offset = fifo_free_first_offset(cur_out, used_out); + nx_append_dde(ddl_out, fifo_out + first_offset, first_free); + } + + if (last_free > 0) { + last_free = NX_MIN(target_max - first_free, last_free); + if (last_free > 0) { + last_offset = fifo_free_last_offset(cur_out, used_out, + fifo_out_len); + nx_append_dde(ddl_out, fifo_out + last_offset, + last_free); + } + } + + /* Target buffer size is used to limit the source data size + * based on previous measurements of compression ratio. + */ + + /* source_sz includes history */ + source_sz = getp32(ddl_in, ddebc); + assert(source_sz > history_len); + source_sz = source_sz - history_len; + + /* Estimating how much source is needed to 3/4 fill a + * target_max size target buffer. If we overshoot, then NX + * must repeat the job with smaller input and we waste + * bandwidth. If we undershoot then we use more NX calls than + * necessary. + */ + + source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL) + / 4000; + + if (source_sz_estimate < source_sz) { + /* Target might be small, therefore limiting the + * source data. + */ + source_sz = source_sz_estimate; + target_sz_estimate = target_max; + } else { + /* Source file might be small, therefore limiting target + * touch pages to a smaller value to save processor cycles. + */ + target_sz_estimate = ((uint64_t)source_sz * 1000UL) + / (last_comp_ratio + 1); + target_sz_estimate = NX_MIN(2 * target_sz_estimate, + target_max); + } + + source_sz = source_sz + history_len; + + /* Some NX condition codes require submitting the NX job again. + * Kernel doesn't handle NX page faults. Expects user code to + * touch pages. + */ + pgfault_retries = NX_MAX_FAULTS; + +restart_nx: + + putp32(ddl_in, ddebc, source_sz); + + /* Fault in pages */ + nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1); + nx_touch_pages_dde(ddl_in, 0, page_sz, 0); + nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1); + + /* Send job to NX */ + cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle); + + switch (cc) { + + case ERR_NX_TRANSLATION: + + /* We touched the pages ahead of time. In the most common case + * we shouldn't be here. But may be some pages were paged out. + * Kernel should have placed the faulting address to fsaddr. + */ + NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n", + (void *)cmdp->crb.csb.fsaddr)); + + if (pgfault_retries == NX_MAX_FAULTS) { + /* Try once with exact number of pages */ + --pgfault_retries; + goto restart_nx; + } else if (pgfault_retries > 0) { + /* If still faulting try fewer input pages + * assuming memory outage + */ + if (source_sz > page_sz) + source_sz = NX_MAX(source_sz / 2, page_sz); + --pgfault_retries; + goto restart_nx; + } else { + fprintf(stderr, "cannot make progress; too many "); + fprintf(stderr, "page fault retries cc= %d\n", cc); + rc = -1; + goto err5; + } + + case ERR_NX_DATA_LENGTH: + + NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; ")); + NXPRT(fprintf(stderr, "stream may have trailing data\n")); + + /* Not an error in the most common case; it just says + * there is trailing data that we must examine. + * + * CC=3 CE(1)=0 CE(0)=1 indicates partial completion + * Fig.6-7 and Table 6-8. + */ + nx_ce = get_csb_ce_ms3b(cmdp->crb.csb); + + if (!csb_ce_termination(nx_ce) && + csb_ce_partial_completion(nx_ce)) { + /* Check CPB for more information + * spbc and tpbc are valid + */ + sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */ + subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */ + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + + goto ok_cc3; /* not an error */ + } else { + /* History length error when CE(1)=1 CE(0)=0. */ + rc = -1; + fprintf(stderr, "history length error cc= %d\n", cc); + goto err5; + } + + case ERR_NX_TARGET_SPACE: + + /* Target buffer not large enough; retry smaller input + * data; give at least 1 byte. SPBC/TPBC are not valid. + */ + assert(source_sz > history_len); + source_sz = ((source_sz - history_len + 2) / 2) + history_len; + NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with ")); + NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n", + source_sz, history_len)); + goto restart_nx; + + case ERR_NX_OK: + + /* This should not happen for gzip formatted data; + * we need trailing crc and isize + */ + fprintf(stderr, "ERR_NX_OK\n"); + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + assert(spbc >= history_len); + source_sz = spbc - history_len; + goto offsets_state; + + default: + fprintf(stderr, "error: cc= %d\n", cc); + rc = -1; + goto err5; + } + +ok_cc3: + + NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt)); + + assert(spbc > history_len); + source_sz = spbc - history_len; + + /* Table 6-4: Source Final Block Type (SFBT) describes the + * last processed deflate block and clues the software how to + * resume the next job. SUBC indicates how many input bits NX + * consumed but did not process. SPBC indicates how many + * bytes of source were given to the accelerator including + * history bytes. + */ + + switch (sfbt) { + int dhtlen; + + case 0x0: /* Deflate final EOB received */ + + /* Calculating the checksum start position. */ + + source_sz = source_sz - subc / 8; + is_final = 1; + break; + + /* Resume decompression cases are below. Basically + * indicates where NX has suspended and how to resume + * the input stream. + */ + + case 0x8: /* Within a literal block; use rembytecount */ + case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */ + + /* Supply the partially processed source byte again */ + source_sz = source_sz - ((subc + 7) / 8); + + /* SUBC LS 3bits: number of bits in the first source byte need + * to be processed. + * 000 means all 8 bits; Table 6-3 + * Clear subc, histlen, sfbt, rembytecnt, dhtlen + */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb, + out_rembytecnt)); + break; + + case 0xA: /* Within a FH block; */ + case 0xB: /* Within a FH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + break; + + case 0xC: /* Within a DH block; */ + case 0xD: /* Within a DH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + dhtlen = getnn(cmdp->cpb, out_dhtlen); + putnn(cmdp->cpb, in_dhtlen, dhtlen); + assert(dhtlen >= 42); + + /* Round up to a qword */ + dhtlen = (dhtlen + 127) / 128; + + while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */ + --dhtlen; + cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen]; + } + break; + + case 0xE: /* Within a block header; bfinal=0; */ + /* Also given if source data exactly ends (SUBC=0) with + * EOB code with BFINAL=0. Means the next byte will + * contain a block header. + */ + case 0xF: /* within a block header with BFINAL=1. */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + /* Engine did not process any data */ + if (is_eof && (source_sz == 0)) + is_final = 1; + } + +offsets_state: + + /* Adjust the source and target buffer offsets and lengths */ + + NXPRT(fprintf(stderr, "offsets_state:\n")); + + /* Delete input data from fifo_in */ + used_in = used_in - source_sz; + cur_in = (cur_in + source_sz) % fifo_in_len; + input_file_offset = input_file_offset + source_sz; + + /* Add output data to fifo_out */ + used_out = used_out + tpbc; + + assert(used_out <= fifo_out_len); + + total_out = total_out + tpbc; + + /* Deflate history is 32KB max. No need to supply more + * than 32KB on a resume. + */ + history_len = (total_out > window_max) ? window_max : total_out; + + /* To estimate expected expansion in the next NX job; 500 means 50%. + * Deflate best case is around 1 to 1000. + */ + last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1)) + / ((uint64_t)tpbc + 1); + last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1); + NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n", + last_comp_ratio, source_sz, spbc, tpbc)); + + resuming = 1; + +finish_state: + + NXPRT(fprintf(stderr, "finish_state:\n")); + + if (is_final) { + if (used_out) + goto write_state; /* More data to write out */ + else if (used_in < 8) { + /* Need at least 8 more bytes containing gzip crc + * and isize. + */ + rc = -1; + goto err4; + } else { + /* Compare checksums and exit */ + int i; + unsigned char tail[8]; + uint32_t cksum, isize; + + for (i = 0; i < 8; i++) + tail[i] = fifo_in[(cur_in + i) % fifo_in_len]; + fprintf(stderr, "computed checksum %08x isize %08x\n", + cmdp->cpb.out_crc, (uint32_t) (total_out + % (1ULL<<32))); + cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8 + | (uint32_t) tail[2]<<16 + | (uint32_t) tail[3]<<24); + isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8 + | (uint32_t) tail[6]<<16 + | (uint32_t) tail[7]<<24); + fprintf(stderr, "stored checksum %08x isize %08x\n", + cksum, isize); + + if (cksum == cmdp->cpb.out_crc && isize == (uint32_t) + (total_out % (1ULL<<32))) { + rc = 0; goto ok1; + } else { + rc = -1; goto err4; + } + } + } else + goto read_state; + + return -1; + +err1: + fprintf(stderr, "error: not a gzip file, expect %x, read %x\n", + expect, c); + return -1; + +err2: + fprintf(stderr, "error: the FLG byte is wrong or not being handled\n"); + return -1; + +err3: + fprintf(stderr, "error: gzip header\n"); + return -1; + +err4: + fprintf(stderr, "error: checksum missing or mismatch\n"); + +err5: +ok1: + fprintf(stderr, "decomp is complete: fclose\n"); + fclose(outf); + + return rc; +} + + +int main(int argc, char **argv) +{ + int rc; + struct sigaction act; + void *handle; + + nx_dbg = 0; + nx_gzip_log = NULL; + act.sa_handler = 0; + act.sa_sigaction = nxu_sigsegv_handler; + act.sa_flags = SA_SIGINFO; + act.sa_restorer = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + + handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); + if (!handle) { + fprintf(stderr, "Unable to init NX, errno %d\n", errno); + exit(-1); + } + + rc = decompress_file(argc, argv, handle); + + nx_function_end(handle); + + return rc; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c new file mode 100644 index 000000000000..7496a83f9c9d --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gzip sample code for demonstrating the P9 NX hardware interface. + * Not intended for productive uses or for performance or compression + * ratio measurements. For simplicity of demonstration, this sample + * code compresses in to fixed Huffman blocks only (Deflate btype=1) + * and has very simple memory management. Dynamic Huffman blocks + * (Deflate btype=2) are more involved as detailed in the user guide. + * Note also that /dev/crypto/gzip, VAS and skiboot support are + * required. + * + * Copyright 2020 IBM Corp. + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * Definitions of acronyms used here. See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +#define _ISOC11_SOURCE // For aligned_alloc() +#define _DEFAULT_SOURCE // For endian.h + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "nxu.h" +#include "nx.h" + +int nx_dbg; +FILE *nx_gzip_log; + +#define NX_MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define FNAME_MAX 1024 +#define FEXT ".nx.gz" + +/* + * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure. + */ +static int compress_fht_sample(char *src, uint32_t srclen, char *dst, + uint32_t dstlen, int with_count, + struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + uint32_t fc; + + assert(!!cmdp); + + put32(cmdp->crb, gzip_fc, 0); /* clear */ + fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT : + GZIP_FC_COMPRESS_RESUME_FHT; + putnn(cmdp->crb, gzip_fc, fc); + putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */ + memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); + + /* Section 6.6 programming notes; spbc may be in two different + * places depending on FC. + */ + if (!with_count) + put32(cmdp->cpb, out_spbc_comp, 0); + else + put32(cmdp->cpb, out_spbc_comp_with_count, 0); + + /* Figure 6-3 6-4; CSB location */ + put64(cmdp->crb, csb_address, 0); + put64(cmdp->crb, csb_address, + (uint64_t) &cmdp->crb.csb & csb_address_mask); + + /* Source direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.source_dde); + putnn(cmdp->crb.source_dde, dde_count, 0); + put32(cmdp->crb.source_dde, ddebc, srclen); + put64(cmdp->crb.source_dde, ddead, (uint64_t) src); + + /* Target direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.target_dde); + putnn(cmdp->crb.target_dde, dde_count, 0); + put32(cmdp->crb.target_dde, ddebc, dstlen); + put64(cmdp->crb.target_dde, ddead, (uint64_t) dst); + + /* Submit the crb, the job descriptor, to the accelerator */ + return nxu_submit_job(cmdp, handle); +} + +/* + * Prepares a blank no filename no timestamp gzip header and returns + * the number of bytes written to buf. + * Gzip specification at https://tools.ietf.org/html/rfc1952 + */ +int gzip_header_blank(char *buf) +{ + int i = 0; + + buf[i++] = 0x1f; /* ID1 */ + buf[i++] = 0x8b; /* ID2 */ + buf[i++] = 0x08; /* CM */ + buf[i++] = 0x00; /* FLG */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x04; /* XFL 4=fastest */ + buf[i++] = 0x03; /* OS UNIX */ + + return i; +} + +/* Caller must free the allocated buffer return nonzero on error. */ +int read_alloc_input_file(char *fname, char **buf, size_t *bufsize) +{ + struct stat statbuf; + FILE *fp; + char *p; + size_t num_bytes; + + if (stat(fname, &statbuf)) { + perror(fname); + return(-1); + } + fp = fopen(fname, "r"); + if (fp == NULL) { + perror(fname); + return(-1); + } + assert(NULL != (p = (char *) malloc(statbuf.st_size))); + num_bytes = fread(p, 1, statbuf.st_size, fp); + if (ferror(fp) || (num_bytes != statbuf.st_size)) { + perror(fname); + return(-1); + } + *buf = p; + *bufsize = num_bytes; + return 0; +} + +/* Returns nonzero on error */ +int write_output_file(char *fname, char *buf, size_t bufsize) +{ + FILE *fp; + size_t num_bytes; + + fp = fopen(fname, "w"); + if (fp == NULL) { + perror(fname); + return(-1); + } + num_bytes = fwrite(buf, 1, bufsize, fp); + if (ferror(fp) || (num_bytes != bufsize)) { + perror(fname); + return(-1); + } + fclose(fp); + return 0; +} + +/* + * Z_SYNC_FLUSH as described in zlib.h. + * Returns number of appended bytes + */ +int append_sync_flush(char *buf, int tebc, int final) +{ + uint64_t flush; + int shift = (tebc & 0x7); + + if (tebc > 0) { + /* Last byte is partially full */ + buf = buf - 1; + *buf = *buf & (unsigned char) ((1<<tebc)-1); + } else + *buf = 0; + flush = ((0x1ULL & final) << shift) | *buf; + shift = shift + 3; /* BFINAL and BTYPE written */ + shift = (shift <= 8) ? 8 : 16; + flush |= (0xFFFF0000ULL) << shift; /* Zero length block */ + shift = shift + 32; + while (shift > 0) { + *buf++ = (unsigned char) (flush & 0xffULL); + flush = flush >> 8; + shift = shift - 8; + } + return(((tebc > 5) || (tebc == 0)) ? 5 : 4); +} + +/* + * Final deflate block bit. This call assumes the block + * beginning is byte aligned. + */ +static void set_bfinal(void *buf, int bfinal) +{ + char *b = buf; + + if (bfinal) + *b = *b | (unsigned char) 0x01; + else + *b = *b & (unsigned char) 0xfe; +} + +int compress_file(int argc, char **argv, void *handle) +{ + char *inbuf, *outbuf, *srcbuf, *dstbuf; + char outname[FNAME_MAX]; + uint32_t srclen, dstlen; + uint32_t flushlen, chunk; + size_t inlen, outlen, dsttotlen, srctotlen; + uint32_t crc, spbc, tpbc, tebc; + int lzcounts = 0; + int cc; + int num_hdr_bytes; + struct nx_gzip_crb_cpb_t *cmdp; + uint32_t pagelen = 65536; + int fault_tries = NX_MAX_FAULTS; + + cmdp = (void *)(uintptr_t) + aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t), + sizeof(struct nx_gzip_crb_cpb_t)); + + if (argc != 2) { + fprintf(stderr, "usage: %s <fname>\n", argv[0]); + exit(-1); + } + if (read_alloc_input_file(argv[1], &inbuf, &inlen)) + exit(-1); + fprintf(stderr, "file %s read, %ld bytes\n", argv[1], inlen); + + /* Generous output buffer for header/trailer */ + outlen = 2 * inlen + 1024; + + assert(NULL != (outbuf = (char *)malloc(outlen))); + nxu_touch_pages(outbuf, outlen, pagelen, 1); + + /* Compress piecemeal in smallish chunks */ + chunk = 1<<22; + + /* Write the gzip header to the stream */ + num_hdr_bytes = gzip_header_blank(outbuf); + dstbuf = outbuf + num_hdr_bytes; + outlen = outlen - num_hdr_bytes; + dsttotlen = num_hdr_bytes; + + srcbuf = inbuf; + srctotlen = 0; + + /* Init the CRB, the coprocessor request block */ + memset(&cmdp->crb, 0, sizeof(cmdp->crb)); + + /* Initial gzip crc32 */ + put32(cmdp->cpb, in_crc, 0); + + while (inlen > 0) { + + /* Submit chunk size source data per job */ + srclen = NX_MIN(chunk, inlen); + /* Supply large target in case data expands */ + dstlen = NX_MIN(2*srclen, outlen); + + /* Page faults are handled by the user code */ + + /* Fault-in pages; an improved code wouldn't touch so + * many pages but would try to estimate the + * compression ratio and adjust both the src and dst + * touch amounts. + */ + nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), pagelen, + 1); + nxu_touch_pages(srcbuf, srclen, pagelen, 0); + nxu_touch_pages(dstbuf, dstlen, pagelen, 1); + + cc = compress_fht_sample( + srcbuf, srclen, + dstbuf, dstlen, + lzcounts, cmdp, handle); + + if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC && + cc != ERR_NX_TRANSLATION) { + fprintf(stderr, "nx error: cc= %d\n", cc); + exit(-1); + } + + /* Page faults are handled by the user code */ + if (cc == ERR_NX_TRANSLATION) { + NXPRT(fprintf(stderr, "page fault: cc= %d, ", cc)); + NXPRT(fprintf(stderr, "try= %d, fsa= %08llx\n", + fault_tries, + (unsigned long long) cmdp->crb.csb.fsaddr)); + fault_tries--; + if (fault_tries > 0) { + continue; + } else { + fprintf(stderr, "error: cannot progress; "); + fprintf(stderr, "too many faults\n"); + exit(-1); + }; + } + + fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */ + + inlen = inlen - srclen; + srcbuf = srcbuf + srclen; + srctotlen = srctotlen + srclen; + + /* Two possible locations for spbc depending on the function + * code. + */ + spbc = (!lzcounts) ? get32(cmdp->cpb, out_spbc_comp) : + get32(cmdp->cpb, out_spbc_comp_with_count); + assert(spbc == srclen); + + /* Target byte count */ + tpbc = get32(cmdp->crb.csb, tpbc); + /* Target ending bit count */ + tebc = getnn(cmdp->cpb, out_tebc); + NXPRT(fprintf(stderr, "compressed chunk %d ", spbc)); + NXPRT(fprintf(stderr, "to %d bytes, tebc= %d\n", tpbc, tebc)); + + if (inlen > 0) { /* More chunks to go */ + set_bfinal(dstbuf, 0); + dstbuf = dstbuf + tpbc; + dsttotlen = dsttotlen + tpbc; + outlen = outlen - tpbc; + /* Round up to the next byte with a flush + * block; do not set the BFINAqL bit. + */ + flushlen = append_sync_flush(dstbuf, tebc, 0); + dsttotlen = dsttotlen + flushlen; + outlen = outlen - flushlen; + dstbuf = dstbuf + flushlen; + NXPRT(fprintf(stderr, "added sync_flush %d bytes\n", + flushlen)); + } else { /* Done */ + /* Set the BFINAL bit of the last block per Deflate + * specification. + */ + set_bfinal(dstbuf, 1); + dstbuf = dstbuf + tpbc; + dsttotlen = dsttotlen + tpbc; + outlen = outlen - tpbc; + } + + /* Resuming crc32 for the next chunk */ + crc = get32(cmdp->cpb, out_crc); + put32(cmdp->cpb, in_crc, crc); + crc = be32toh(crc); + } + + /* Append crc32 and ISIZE to the end */ + memcpy(dstbuf, &crc, 4); + memcpy(dstbuf+4, &srctotlen, 4); + dsttotlen = dsttotlen + 8; + outlen = outlen - 8; + + assert(FNAME_MAX > (strlen(argv[1]) + strlen(FEXT))); + strcpy(outname, argv[1]); + strcat(outname, FEXT); + if (write_output_file(outname, outbuf, dsttotlen)) { + fprintf(stderr, "write error: %s\n", outname); + exit(-1); + } + + fprintf(stderr, "compressed %ld to %ld bytes total, ", srctotlen, + dsttotlen); + fprintf(stderr, "crc32 checksum = %08x\n", crc); + + if (inbuf != NULL) + free(inbuf); + + if (outbuf != NULL) + free(outbuf); + + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + struct sigaction act; + void *handle; + + nx_dbg = 0; + nx_gzip_log = NULL; + act.sa_handler = 0; + act.sa_sigaction = nxu_sigsegv_handler; + act.sa_flags = SA_SIGINFO; + act.sa_restorer = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + + handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); + if (!handle) { + fprintf(stderr, "Unable to init NX, errno %d\n", errno); + exit(-1); + } + + rc = compress_file(argc, argv, handle); + + nx_function_end(handle); + + return rc; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c new file mode 100644 index 000000000000..c055885da40a --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Copyright 2020 IBM Corp. + * + * Author: Bulent Abali <abali@us.ibm.com> + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "vas-api.h" +#include "nx.h" +#include "copy-paste.h" +#include "nxu.h" +#include "nx_dbg.h" +#include <sys/platform/ppc.h> + +#define barrier() +#define hwsync() ({ asm volatile("sync" ::: "memory"); }) + +#ifndef NX_NO_CPU_PRI +#define cpu_pri_default() ({ asm volatile ("or 2, 2, 2"); }) +#define cpu_pri_low() ({ asm volatile ("or 31, 31, 31"); }) +#else +#define cpu_pri_default() +#define cpu_pri_low() +#endif + +void *nx_fault_storage_address; + +struct nx_handle { + int fd; + int function; + void *paste_addr; +}; + +static int open_device_nodes(char *devname, int pri, struct nx_handle *handle) +{ + int rc, fd; + void *addr; + struct vas_tx_win_open_attr txattr; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, " open device name %s\n", devname); + return -errno; + } + + memset(&txattr, 0, sizeof(txattr)); + txattr.version = 1; + txattr.vas_id = pri; + rc = ioctl(fd, VAS_TX_WIN_OPEN, (unsigned long)&txattr); + if (rc < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", rc, errno); + rc = -errno; + goto out; + } + + addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0ULL); + if (addr == MAP_FAILED) { + fprintf(stderr, "mmap() failed, errno %d\n", errno); + rc = -errno; + goto out; + } + handle->fd = fd; + handle->paste_addr = (void *)((char *)addr + 0x400); + + rc = 0; +out: + close(fd); + return rc; +} + +void *nx_function_begin(int function, int pri) +{ + int rc; + char *devname = "/dev/crypto/nx-gzip"; + struct nx_handle *nxhandle; + + if (function != NX_FUNC_COMP_GZIP) { + errno = EINVAL; + fprintf(stderr, " NX_FUNC_COMP_GZIP not found\n"); + return NULL; + } + + + nxhandle = malloc(sizeof(*nxhandle)); + if (!nxhandle) { + errno = ENOMEM; + fprintf(stderr, " No memory\n"); + return NULL; + } + + nxhandle->function = function; + rc = open_device_nodes(devname, pri, nxhandle); + if (rc < 0) { + errno = -rc; + fprintf(stderr, " open_device_nodes failed\n"); + return NULL; + } + + return nxhandle; +} + +int nx_function_end(void *handle) +{ + int rc = 0; + struct nx_handle *nxhandle = handle; + + rc = munmap(nxhandle->paste_addr - 0x400, 4096); + if (rc < 0) { + fprintf(stderr, "munmap() failed, errno %d\n", errno); + return rc; + } + close(nxhandle->fd); + free(nxhandle); + + return rc; +} + +static int nx_wait_for_csb(struct nx_gzip_crb_cpb_t *cmdp) +{ + long poll = 0; + uint64_t t; + + /* Save power and let other threads use the h/w. top may show + * 100% but only because OS doesn't know we slowed the this + * h/w thread while polling. We're letting other threads have + * higher throughput on the core. + */ + cpu_pri_low(); + +#define CSB_MAX_POLL 200000000UL +#define USLEEP_TH 300000UL + + t = __ppc_get_timebase(); + + while (getnn(cmdp->crb.csb, csb_v) == 0) { + ++poll; + hwsync(); + + cpu_pri_low(); + + /* usleep(0) takes around 29000 ticks ~60 us. + * 300000 is spinning for about 600 us then + * start sleeping. + */ + if ((__ppc_get_timebase() - t) > USLEEP_TH) { + cpu_pri_default(); + usleep(1); + } + + if (poll > CSB_MAX_POLL) + break; + + /* Fault address from signal handler */ + if (nx_fault_storage_address) { + cpu_pri_default(); + return -EAGAIN; + } + + } + + cpu_pri_default(); + + /* hw has updated csb and output buffer */ + hwsync(); + + /* Check CSB flags. */ + if (getnn(cmdp->crb.csb, csb_v) == 0) { + fprintf(stderr, "CSB still not valid after %d polls.\n", + (int) poll); + prt_err("CSB still not valid after %d polls, giving up.\n", + (int) poll); + return -ETIMEDOUT; + } + + return 0; +} + +static int nxu_run_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + int i, ret, retries; + struct nx_handle *nxhandle = handle; + + assert(handle != NULL); + i = 0; + retries = 5000; + while (i++ < retries) { + hwsync(); + vas_copy(&cmdp->crb, 0); + ret = vas_paste(nxhandle->paste_addr, 0); + hwsync(); + + NXPRT(fprintf(stderr, "Paste attempt %d/%d returns 0x%x\n", + i, retries, ret)); + + if ((ret == 2) || (ret == 3)) { + + ret = nx_wait_for_csb(cmdp); + if (!ret) { + goto out; + } else if (ret == -EAGAIN) { + long x; + + prt_err("Touching address %p, 0x%lx\n", + nx_fault_storage_address, + *(long *) nx_fault_storage_address); + x = *(long *) nx_fault_storage_address; + *(long *) nx_fault_storage_address = x; + nx_fault_storage_address = 0; + continue; + } else { + prt_err("wait_for_csb() returns %d\n", ret); + break; + } + } else { + if (i < 10) { + /* spin for few ticks */ +#define SPIN_TH 500UL + uint64_t fail_spin; + + fail_spin = __ppc_get_timebase(); + while ((__ppc_get_timebase() - fail_spin) < + SPIN_TH) + ; + } else { + /* sleep */ + unsigned int pr = 0; + + if (pr++ % 100 == 0) { + prt_err("Paste attempt %d/", i); + prt_err("%d, failed pid= %d\n", retries, + getpid()); + } + usleep(1); + } + continue; + } + } + +out: + cpu_pri_default(); + + return ret; +} + +int nxu_submit_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + int cc; + + cc = nxu_run_job(cmdp, handle); + + if (!cc) + cc = getnn(cmdp->crb.csb, csb_cc); /* CC Table 6-8 */ + + return cc; +} + + +void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx) +{ + fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(), + sig, info->si_code, info->si_addr); + + nx_fault_storage_address = info->si_addr; +} + +/* + * Fault in pages prior to NX job submission. wr=1 may be required to + * touch writeable pages. System zero pages do not fault-in the page as + * intended. Typically set wr=1 for NX target pages and set wr=0 for NX + * source pages. + */ +int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr) +{ + char *begin = buf; + char *end = (char *) buf + buf_len - 1; + volatile char t; + + assert(buf_len >= 0 && !!buf); + + NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf, + (buf + buf_len), buf_len, wr)); + + if (buf_len <= 0 || buf == NULL) + return -1; + + do { + t = *begin; + if (wr) + *begin = t; + begin = begin + page_len; + } while (begin < end); + + /* When buf_sz is small or buf tail is in another page */ + t = *end; + if (wr) + *end = t; + + return 0; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h new file mode 100644 index 000000000000..0db2d6485037 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* From asm-compat.h */ +#define __stringify_in_c(...) #__VA_ARGS__ +#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " + +/* + * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other + * header files. + */ +#define ___PPC_RA(a) (((a) & 0x1f) << 16) +#define ___PPC_RB(b) (((b) & 0x1f) << 11) + +#define PPC_INST_COPY 0x7c20060c +#define PPC_INST_PASTE 0x7c20070d + +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_PASTE(a, b) stringify_in_c(.long PPC_INST_PASTE | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define CR0_SHIFT 28 +#define CR0_MASK 0xF +/* + * Copy/paste instructions: + * + * copy RA,RB + * Copy contents of address (RA) + effective_address(RB) + * to internal copy-buffer. + * + * paste RA,RB + * Paste contents of internal copy-buffer to the address + * (RA) + effective_address(RB) + */ +static inline int vas_copy(void *crb, int offset) +{ + asm volatile(PPC_COPY(%0, %1)";" + : + : "b" (offset), "b" (crb) + : "memory"); + + return 0; +} + +static inline int vas_paste(void *paste_address, int offset) +{ + __u32 cr; + + cr = 0; + asm volatile(PPC_PASTE(%1, %2)";" + "mfocrf %0, 0x80;" + : "=r" (cr) + : "b" (offset), "b" (paste_address) + : "memory", "cr0"); + + return (cr >> CR0_SHIFT) & CR0_MASK; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/crb.h b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h new file mode 100644 index 000000000000..ab101085fa7e --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __CRB_H +#define __CRB_H +#include <linux/types.h> +#include "nx.h" + +/* CCW 842 CI/FC masks + * NX P8 workbook, section 4.3.1, figure 4-6 + * "CI/FC Boundary by NX CT type" + */ +#define CCW_CI_842 (0x00003ff8) +#define CCW_FC_842 (0x00000007) + +/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */ + +#define CCB_VALUE (0x3fffffffffffffff) +#define CCB_ADDRESS (0xfffffffffffffff8) +#define CCB_CM (0x0000000000000007) +#define CCB_CM0 (0x0000000000000004) +#define CCB_CM12 (0x0000000000000003) + +#define CCB_CM0_ALL_COMPLETIONS (0x0) +#define CCB_CM0_LAST_IN_CHAIN (0x4) +#define CCB_CM12_STORE (0x0) +#define CCB_CM12_INTERRUPT (0x1) + +#define CCB_SIZE (0x10) +#define CCB_ALIGN CCB_SIZE + +struct coprocessor_completion_block { + __be64 value; + __be64 address; +} __aligned(CCB_ALIGN); + + +/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */ + +#define CSB_V (0x80) +#define CSB_F (0x04) +#define CSB_CH (0x03) +#define CSB_CE_INCOMPLETE (0x80) +#define CSB_CE_TERMINATION (0x40) +#define CSB_CE_TPBC (0x20) + +#define CSB_CC_SUCCESS (0) +#define CSB_CC_INVALID_ALIGN (1) +#define CSB_CC_OPERAND_OVERLAP (2) +#define CSB_CC_DATA_LENGTH (3) +#define CSB_CC_TRANSLATION (5) +#define CSB_CC_PROTECTION (6) +#define CSB_CC_RD_EXTERNAL (7) +#define CSB_CC_INVALID_OPERAND (8) +#define CSB_CC_PRIVILEGE (9) +#define CSB_CC_INTERNAL (10) +#define CSB_CC_WR_EXTERNAL (12) +#define CSB_CC_NOSPC (13) +#define CSB_CC_EXCESSIVE_DDE (14) +#define CSB_CC_WR_TRANSLATION (15) +#define CSB_CC_WR_PROTECTION (16) +#define CSB_CC_UNKNOWN_CODE (17) +#define CSB_CC_ABORT (18) +#define CSB_CC_TRANSPORT (20) +#define CSB_CC_SEGMENTED_DDL (31) +#define CSB_CC_PROGRESS_POINT (32) +#define CSB_CC_DDE_OVERFLOW (33) +#define CSB_CC_SESSION (34) +#define CSB_CC_PROVISION (36) +#define CSB_CC_CHAIN (37) +#define CSB_CC_SEQUENCE (38) +#define CSB_CC_HW (39) + +#define CSB_SIZE (0x10) +#define CSB_ALIGN CSB_SIZE + +struct coprocessor_status_block { + __u8 flags; + __u8 cs; + __u8 cc; + __u8 ce; + __be32 count; + __be64 address; +} __aligned(CSB_ALIGN); + + +/* Chapter 6.5.10 Data-Descriptor List (DDL) + * each list contains one or more Data-Descriptor Entries (DDE) + */ + +#define DDE_P (0x8000) + +#define DDE_SIZE (0x10) +#define DDE_ALIGN DDE_SIZE + +struct data_descriptor_entry { + __be16 flags; + __u8 count; + __u8 index; + __be32 length; + __be64 address; +} __aligned(DDE_ALIGN); + + +/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */ + +#define CRB_SIZE (0x80) +#define CRB_ALIGN (0x100) /* Errata: requires 256 alignment */ + + +/* Coprocessor Status Block field + * ADDRESS address of CSB + * C CCB is valid + * AT 0 = addrs are virtual, 1 = addrs are phys + * M enable perf monitor + */ +#define CRB_CSB_ADDRESS (0xfffffffffffffff0) +#define CRB_CSB_C (0x0000000000000008) +#define CRB_CSB_AT (0x0000000000000002) +#define CRB_CSB_M (0x0000000000000001) + +struct coprocessor_request_block { + __be32 ccw; + __be32 flags; + __be64 csb_addr; + + struct data_descriptor_entry source; + struct data_descriptor_entry target; + + struct coprocessor_completion_block ccb; + + __u8 reserved[48]; + + struct coprocessor_status_block csb; +} __aligned(CRB_ALIGN); + +#define crb_csb_addr(c) __be64_to_cpu(c->csb_addr) +#define crb_nx_fault_addr(c) __be64_to_cpu(c->stamp.nx.fault_storage_addr) +#define crb_nx_flags(c) c->stamp.nx.flags +#define crb_nx_fault_status(c) c->stamp.nx.fault_status +#define crb_nx_pswid(c) c->stamp.nx.pswid + + +/* RFC02167 Initiate Coprocessor Instructions document + * Chapter 8.2.1.1.1 RS + * Chapter 8.2.3 Coprocessor Directive + * Chapter 8.2.4 Execution + * + * The CCW must be converted to BE before passing to icswx() + */ + +#define CCW_PS (0xff000000) +#define CCW_CT (0x00ff0000) +#define CCW_CD (0x0000ffff) +#define CCW_CL (0x0000c000) + +#endif diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h new file mode 100644 index 000000000000..1abe23fc29e8 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corp. + * + */ +#ifndef _NX_H +#define _NX_H + +#include <stdbool.h> + +#define NX_FUNC_COMP_842 1 +#define NX_FUNC_COMP_GZIP 2 + +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif + +struct nx842_func_args { + bool use_crc; + bool decompress; /* true decompress; false compress */ + bool move_data; + int timeout; /* seconds */ +}; + +struct nxbuf_t { + int len; + char *buf; +}; + +/* @function should be EFT (aka 842), GZIP etc */ +void *nx_function_begin(int function, int pri); + +int nx_function(void *handle, struct nxbuf_t *in, struct nxbuf_t *out, + void *arg); + +int nx_function_end(void *handle); + +#endif /* _NX_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h new file mode 100644 index 000000000000..16464e19c47f --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corporation + * + */ + +#ifndef _NXU_DBG_H_ +#define _NXU_DBG_H_ + +#include <sys/file.h> +#include <stdint.h> +#include <stdio.h> +#include <time.h> +#include <pthread.h> + +extern FILE * nx_gzip_log; +extern int nx_gzip_trace; +extern unsigned int nx_gzip_inflate_impl; +extern unsigned int nx_gzip_deflate_impl; +extern unsigned int nx_gzip_inflate_flags; +extern unsigned int nx_gzip_deflate_flags; + +extern int nx_dbg; +pthread_mutex_t mutex_log; + +#define nx_gzip_trace_enabled() (nx_gzip_trace & 0x1) +#define nx_gzip_hw_trace_enabled() (nx_gzip_trace & 0x2) +#define nx_gzip_sw_trace_enabled() (nx_gzip_trace & 0x4) +#define nx_gzip_gather_statistics() (nx_gzip_trace & 0x8) +#define nx_gzip_per_stream_stat() (nx_gzip_trace & 0x10) + +#define prt(fmt, ...) do { \ + pthread_mutex_lock(&mutex_log); \ + flock(nx_gzip_log->_fileno, LOCK_EX); \ + time_t t; struct tm *m; time(&t); m = localtime(&t); \ + fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] " \ + "pid %d: " fmt, \ + (int)m->tm_year + 1900, (int)m->tm_mon+1, (int)m->tm_mday, \ + (int)m->tm_hour, (int)m->tm_min, (int)m->tm_sec, \ + (int)getpid(), ## __VA_ARGS__); \ + fflush(nx_gzip_log); \ + flock(nx_gzip_log->_fileno, LOCK_UN); \ + pthread_mutex_unlock(&mutex_log); \ +} while (0) + +/* Use in case of an error */ +#define prt_err(fmt, ...) do { if (nx_dbg >= 0) { \ + prt("%s:%u: Error: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Use in case of an warning */ +#define prt_warn(fmt, ...) do { if (nx_dbg >= 1) { \ + prt("%s:%u: Warning: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Informational printouts */ +#define prt_info(fmt, ...) do { if (nx_dbg >= 2) { \ + prt("Info: "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib wrapper code */ +#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace statistics */ +#define prt_stat(fmt, ...) do { if (nx_gzip_gather_statistics()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib hardware implementation */ +#define hw_trace(fmt, ...) do { \ + if (nx_gzip_hw_trace_enabled()) \ + fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \ + } while (0) + +/* Trace zlib software implementation */ +#define sw_trace(fmt, ...) do { \ + if (nx_gzip_sw_trace_enabled()) \ + fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \ + } while (0) + + +/** + * str_to_num - Convert string into number and copy with endings like + * KiB for kilobyte + * MiB for megabyte + * GiB for gigabyte + */ +uint64_t str_to_num(char *str); +void nx_lib_debug(int onoff); + +#endif /* _NXU_DBG_H_ */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h new file mode 100644 index 000000000000..20a4e883e0d3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h @@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Hardware interface of the NX-GZIP compression accelerator + * + * Copyright (C) IBM Corporation, 2020 + * + * Author: Bulent Abali <abali@us.ibm.com> + * + */ + +#ifndef _NXU_H +#define _NXU_H + +#include <stdint.h> +#include <endian.h> +#include "nx.h" + +/* deflate */ +#define LLSZ 286 +#define DSZ 30 + +/* nx */ +#define DHTSZ 18 +#define DHT_MAXSZ 288 +#define MAX_DDE_COUNT 256 + +/* util */ +#ifdef NXDBG +#define NXPRT(X) X +#else +#define NXPRT(X) +#endif + +#ifdef NXTIMER +#include <sys/platform/ppc.h> +#define NX_CLK(X) X +#define nx_get_time() __ppc_get_timebase() +#define nx_get_freq() __ppc_get_timebase_freq() +#else +#define NX_CLK(X) +#define nx_get_time() (-1) +#define nx_get_freq() (-1) +#endif + +#define NX_MAX_FAULTS 500 + +/* + * Definitions of acronyms used here. See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +union nx_qw_t { + uint32_t word[4]; + uint64_t dword[2]; +} __aligned(16); + +/* + * Note: NX registers with fewer than 32 bits are declared by + * convention as uint32_t variables in unions. If *_offset and *_mask + * are defined for a variable, then use get_ put_ macros to + * conveniently access the register fields for endian conversions. + */ + +struct nx_dde_t { + /* Data Descriptor Element, Section 6.4 */ + union { + uint32_t dde_count; + /* When dde_count == 0 ddead is a pointer to a data buffer; + * ddebc is the buffer length bytes. + * When dde_count > 0 dde is an indirect dde; ddead is a + * pointer to a contiguous list of direct ddes; ddebc is the + * total length of all data pointed to by the list of direct + * ddes. Note that only one level of indirection is permitted. + * See Section 6.4 of the user manual for additional details. + */ + }; + uint32_t ddebc; /* dde byte count */ + uint64_t ddead; /* dde address */ +} __aligned(16); + +struct nx_csb_t { + /* Coprocessor Status Block, Section 6.6 */ + union { + uint32_t csb_v; + /* Valid bit. v must be set to 0 by the program + * before submitting the coprocessor command. + * Software can poll for the v bit + */ + + uint32_t csb_f; + /* 16B CSB size. Written to 0 by DMA when it writes the CPB */ + + uint32_t csb_cs; + /* cs completion sequence; unused */ + + uint32_t csb_cc; + /* cc completion code; cc != 0 exception occurred */ + + uint32_t csb_ce; + /* ce completion extension */ + + }; + uint32_t tpbc; + /* target processed byte count TPBC */ + + uint64_t fsaddr; + /* Section 6.12.1 CSB NonZero error summary. FSA Failing storage + * address. Address where error occurred. When available, written + * to A field of CSB + */ +} __aligned(16); + +struct nx_ccb_t { + /* Coprocessor Completion Block, Section 6.7 */ + + uint32_t reserved[3]; + union { + /* When crb.c==0 (no ccb defined) it is reserved; + * When crb.c==1 (ccb defined) it is cm + */ + + uint32_t ccb_cm; + /* Signal interrupt of crb.c==1 and cm==1 */ + + uint32_t word; + /* generic access to the 32bit word */ + }; +} __aligned(16); + +struct vas_stamped_crb_t { + /* + * CRB operand of the paste coprocessor instruction is stamped + * in quadword 4 with the information shown here as its written + * in to the receive FIFO of the coprocessor + */ + + union { + uint32_t vas_buf_num; + /* Verification only vas buffer number which correlates to + * the low order bits of the atag in the paste command + */ + + uint32_t send_wc_id; + /* Pointer to Send Window Context that provides for NX address + * translation information, such as MSR and LPCR bits, job + * completion interrupt RA, PSWID, and job utilization counter. + */ + + }; + union { + uint32_t recv_wc_id; + /* Pointer to Receive Window Context. NX uses this to return + * credits to a Receive FIFO as entries are dequeued. + */ + + }; + uint32_t reserved2; + union { + uint32_t vas_invalid; + /* Invalid bit. If this bit is 1 the CRB is discarded by + * NX upon fetching from the receive FIFO. If this bit is 0 + * the CRB is processed normally. The bit is stamped to 0 + * by VAS and may be written to 1 by hypervisor while + * the CRB is in the receive FIFO (in memory). + */ + + }; +}; + +struct nx_stamped_fault_crb_t { + /* + * A CRB that has a translation fault is stamped by NX in quadword 4 + * and pasted to the Fault Send Window in VAS. + */ + uint64_t fsa; + union { + uint32_t nxsf_t; + uint32_t nxsf_fs; + }; + uint32_t pswid; +}; + +union stamped_crb_t { + struct vas_stamped_crb_t vas; + struct nx_stamped_fault_crb_t nx; +}; + +struct nx_gzip_cpb_t { + /* + * Coprocessor Parameter Block In/Out are used to pass metadata + * to/from accelerator. Tables 6.5 and 6.6 of the user manual. + */ + + /* CPBInput */ + + struct { + union { + union nx_qw_t qw0; + struct { + uint32_t in_adler; /* bits 0:31 */ + uint32_t in_crc; /* bits 32:63 */ + union { + uint32_t in_histlen; /* bits 64:75 */ + uint32_t in_subc; /* bits 93:95 */ + }; + union { + /* bits 108:111 */ + uint32_t in_sfbt; + /* bits 112:127 */ + uint32_t in_rembytecnt; + /* bits 116:127 */ + uint32_t in_dhtlen; + }; + }; + }; + union { + union nx_qw_t in_dht[DHTSZ]; /* qw[1:18] */ + char in_dht_char[DHT_MAXSZ]; /* byte access */ + }; + union nx_qw_t reserved[5]; /* qw[19:23] */ + }; + + /* CPBOutput */ + + volatile struct { + union { + union nx_qw_t qw24; + struct { + uint32_t out_adler; /* bits 0:31 qw[24] */ + uint32_t out_crc; /* bits 32:63 qw[24] */ + union { + /* bits 77:79 qw[24] */ + uint32_t out_tebc; + /* bits 80:95 qw[24] */ + uint32_t out_subc; + }; + union { + /* bits 108:111 qw[24] */ + uint32_t out_sfbt; + /* bits 112:127 qw[24] */ + uint32_t out_rembytecnt; + /* bits 116:127 qw[24] */ + uint32_t out_dhtlen; + }; + }; + }; + union { + union nx_qw_t qw25[79]; /* qw[25:103] */ + /* qw[25] compress no lzcounts or wrap */ + uint32_t out_spbc_comp_wrap; + uint32_t out_spbc_wrap; /* qw[25] wrap */ + /* qw[25] compress no lzcounts */ + uint32_t out_spbc_comp; + /* 286 LL and 30 D symbol counts */ + uint32_t out_lzcount[LLSZ+DSZ]; + struct { + union nx_qw_t out_dht[DHTSZ]; /* qw[25:42] */ + /* qw[43] decompress */ + uint32_t out_spbc_decomp; + }; + }; + /* qw[104] compress with lzcounts */ + uint32_t out_spbc_comp_with_count; + }; +} __aligned(128); + +struct nx_gzip_crb_t { + union { /* byte[0:3] */ + uint32_t gzip_fc; /* bits[24-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. + */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + volatile struct nx_ccb_t ccb; /* byte[48:63] */ + volatile union { + /* byte[64:239] shift csb by 128 bytes out of the crb; csb was + * in crb earlier; JReilly says csb written with partial inject + */ + union nx_qw_t reserved64[11]; + union stamped_crb_t stamp; /* byte[64:79] */ + }; + volatile struct nx_csb_t csb; +} __aligned(128); + +struct nx_gzip_crb_cpb_t { + struct nx_gzip_crb_t crb; + struct nx_gzip_cpb_t cpb; +} __aligned(2048); + + +/* + * NX hardware convention has the msb bit on the left numbered 0. + * The defines below has *_offset defined as the right most bit + * position of a field. x of size_mask(x) is the field width in bits. + */ + +#define size_mask(x) ((1U<<(x))-1) + +/* + * Offsets and Widths within the containing 32 bits of the various NX + * gzip hardware registers. Use the getnn/putnn macros to access + * these regs + */ + +#define dde_count_mask size_mask(8) +#define dde_count_offset 23 + +/* CSB */ + +#define csb_v_mask size_mask(1) +#define csb_v_offset 0 +#define csb_f_mask size_mask(1) +#define csb_f_offset 6 +#define csb_cs_mask size_mask(8) +#define csb_cs_offset 15 +#define csb_cc_mask size_mask(8) +#define csb_cc_offset 23 +#define csb_ce_mask size_mask(8) +#define csb_ce_offset 31 + +/* CCB */ + +#define ccb_cm_mask size_mask(3) +#define ccb_cm_offset 31 + +/* VAS stamped CRB fields */ + +#define vas_buf_num_mask size_mask(6) +#define vas_buf_num_offset 5 +#define send_wc_id_mask size_mask(16) +#define send_wc_id_offset 31 +#define recv_wc_id_mask size_mask(16) +#define recv_wc_id_offset 31 +#define vas_invalid_mask size_mask(1) +#define vas_invalid_offset 31 + +/* NX stamped fault CRB fields */ + +#define nxsf_t_mask size_mask(1) +#define nxsf_t_offset 23 +#define nxsf_fs_mask size_mask(8) +#define nxsf_fs_offset 31 + +/* CPB input */ + +#define in_histlen_mask size_mask(12) +#define in_histlen_offset 11 +#define in_dhtlen_mask size_mask(12) +#define in_dhtlen_offset 31 +#define in_subc_mask size_mask(3) +#define in_subc_offset 31 +#define in_sfbt_mask size_mask(4) +#define in_sfbt_offset 15 +#define in_rembytecnt_mask size_mask(16) +#define in_rembytecnt_offset 31 + +/* CPB output */ + +#define out_tebc_mask size_mask(3) +#define out_tebc_offset 15 +#define out_subc_mask size_mask(16) +#define out_subc_offset 31 +#define out_sfbt_mask size_mask(4) +#define out_sfbt_offset 15 +#define out_rembytecnt_mask size_mask(16) +#define out_rembytecnt_offset 31 +#define out_dhtlen_mask size_mask(12) +#define out_dhtlen_offset 31 + +/* CRB */ + +#define gzip_fc_mask size_mask(8) +#define gzip_fc_offset 31 +#define crb_c_mask size_mask(1) +#define crb_c_offset 28 +#define crb_at_mask size_mask(1) +#define crb_at_offset 30 +#define csb_address_mask ~(15UL) /* mask off bottom 4b */ + +/* + * Access macros for the registers. Do not access registers directly + * because of the endian conversion. P9 processor may run either as + * Little or Big endian. However the NX coprocessor regs are always + * big endian. + * Use the 32 and 64b macros to access respective + * register sizes. + * Use nn forms for the register fields shorter than 32 bits. + */ + +#define getnn(ST, REG) ((be32toh(ST.REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define getpnn(ST, REG) ((be32toh((ST)->REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define get32(ST, REG) (be32toh(ST.REG)) +#define getp32(ST, REG) (be32toh((ST)->REG)) +#define get64(ST, REG) (be64toh(ST.REG)) +#define getp64(ST, REG) (be64toh((ST)->REG)) + +#define unget32(ST, REG) (get32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define ungetp32(ST, REG) (getp32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define clear_regs(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define clear_dde(ST) do { ST.dde_count = ST.ddebc = 0; ST.ddead = 0; \ + } while (0) +#define clearp_dde(ST) do { (ST)->dde_count = (ST)->ddebc = 0; \ + (ST)->ddead = 0; \ + } while (0) +#define clear_struct(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define putnn(ST, REG, X) (ST.REG = htobe32(unget32(ST, REG) | (((X) \ + & REG##_mask) << (31-REG##_offset)))) +#define putpnn(ST, REG, X) ((ST)->REG = htobe32(ungetp32(ST, REG) \ + | (((X) & REG##_mask) << (31-REG##_offset)))) + +#define put32(ST, REG, X) (ST.REG = htobe32(X)) +#define putp32(ST, REG, X) ((ST)->REG = htobe32(X)) +#define put64(ST, REG, X) (ST.REG = htobe64(X)) +#define putp64(ST, REG, X) ((ST)->REG = htobe64(X)) + +/* + * Completion extension ce(0) ce(1) ce(2). Bits ce(3-7) + * unused. Section 6.6 Figure 6.7. + */ + +#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce)) +#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5) +#define put_csb_ce_ms3b(ST, X) putnn(ST, csb_ce, ((uint32_t)(X) << 5)) + +#define CSB_CE_PARTIAL 0x4 +#define CSB_CE_TERMINATE 0x2 +#define CSB_CE_TPBC_VALID 0x1 + +#define csb_ce_termination(X) (!!((X) & CSB_CE_TERMINATE)) +/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */ + +#define csb_ce_check_completion(X) (!csb_ce_termination(X)) +/* if not terminated then check full or partial completion */ + +#define csb_ce_partial_completion(X) (!!((X) & CSB_CE_PARTIAL)) +#define csb_ce_full_completion(X) (!csb_ce_partial_completion(X)) +#define csb_ce_tpbc_valid(X) (!!((X) & CSB_CE_TPBC_VALID)) +/* TPBC indicates successfully stored data count */ + +#define csb_ce_default_err(X) csb_ce_termination(X) +/* most error CEs have CE(0)=0 and CE(1)=1 */ + +#define csb_ce_cc3_partial(X) csb_ce_partial_completion(X) +/* some CC=3 are partially completed, Table 6-8 */ + +#define csb_ce_cc64(X) ((X)&(CSB_CE_PARTIAL \ + | CSB_CE_TERMINATE) == 0) +/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't + * compress smaller than source. + */ + +/* Decompress SFBT combinations Tables 5-3, 6-4, 6-6 */ + +#define SFBT_BFINAL 0x1 +#define SFBT_LIT 0x4 +#define SFBT_FHT 0x5 +#define SFBT_DHT 0x6 +#define SFBT_HDR 0x7 + +/* + * NX gzip function codes. Table 6.2. + * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to + * select one of the two Byte Count Limits. + */ + +#define GZIP_FC_LIMIT_MASK 0x01 +#define GZIP_FC_COMPRESS_FHT 0x00 +#define GZIP_FC_COMPRESS_DHT 0x02 +#define GZIP_FC_COMPRESS_FHT_COUNT 0x04 +#define GZIP_FC_COMPRESS_DHT_COUNT 0x06 +#define GZIP_FC_COMPRESS_RESUME_FHT 0x08 +#define GZIP_FC_COMPRESS_RESUME_DHT 0x0a +#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT 0x0c +#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT 0x0e +#define GZIP_FC_DECOMPRESS 0x10 +#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND 0x12 +#define GZIP_FC_DECOMPRESS_RESUME 0x14 +#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND 0x16 +#define GZIP_FC_WRAP 0x1e + +#define fc_is_compress(fc) (((fc) & 0x10) == 0) +#define fc_has_count(fc) (fc_is_compress(fc) && (((fc) & 0x4) != 0)) + +/* CSB.CC Error codes */ + +#define ERR_NX_OK 0 +#define ERR_NX_ALIGNMENT 1 +#define ERR_NX_OPOVERLAP 2 +#define ERR_NX_DATA_LENGTH 3 +#define ERR_NX_TRANSLATION 5 +#define ERR_NX_PROTECTION 6 +#define ERR_NX_EXTERNAL_UE7 7 +#define ERR_NX_INVALID_OP 8 +#define ERR_NX_PRIVILEGE 9 +#define ERR_NX_INTERNAL_UE 10 +#define ERR_NX_EXTERN_UE_WR 12 +#define ERR_NX_TARGET_SPACE 13 +#define ERR_NX_EXCESSIVE_DDE 14 +#define ERR_NX_TRANSL_WR 15 +#define ERR_NX_PROTECT_WR 16 +#define ERR_NX_SUBFUNCTION 17 +#define ERR_NX_FUNC_ABORT 18 +#define ERR_NX_BYTE_MAX 19 +#define ERR_NX_CORRUPT_CRB 20 +#define ERR_NX_INVALID_CRB 21 +#define ERR_NX_INVALID_DDE 30 +#define ERR_NX_SEGMENTED_DDL 31 +#define ERR_NX_DDE_OVERFLOW 33 +#define ERR_NX_TPBC_GT_SPBC 64 +#define ERR_NX_MISSING_CODE 66 +#define ERR_NX_INVALID_DIST 67 +#define ERR_NX_INVALID_DHT 68 +#define ERR_NX_EXTERNAL_UE90 90 +#define ERR_NX_WDOG_TIMER 224 +#define ERR_NX_AT_FAULT 250 +#define ERR_NX_INTR_SERVER 252 +#define ERR_NX_UE253 253 +#define ERR_NX_NO_HW 254 +#define ERR_NX_HUNG_OP 255 +#define ERR_NX_END 256 + +/* initial values for non-resume operations */ +#define INIT_CRC 0 /* crc32(0L, Z_NULL, 0) */ +#define INIT_ADLER 1 /* adler32(0L, Z_NULL, 0) adler is initialized to 1 */ + +/* prototypes */ +int nxu_submit_job(struct nx_gzip_crb_cpb_t *c, void *handle); + +extern void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx); +extern int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr); + +/* caller supplies a print buffer 4*sizeof(crb) */ + +char *nx_crb_str(struct nx_gzip_crb_t *crb, char *prbuf); +char *nx_cpb_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_prt_hex(void *cp, int sz, char *prbuf); +char *nx_lzcount_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_strerror(int e); + +#ifdef NX_SIM +#include <stdio.h> +int nx_sim_init(void *ctx); +int nx_sim_end(void *ctx); +int nxu_run_sim_job(struct nx_gzip_crb_cpb_t *c, void *ctx); +#endif /* NX_SIM */ + +/* Deflate stream manipulation */ + +#define set_final_bit(x) (x |= (unsigned char)1) +#define clr_final_bit(x) (x &= ~(unsigned char)1) + +#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \ + } while (0) +/* append 10 bits 0000001b 00...... ; + * assumes appending starts on a byte boundary; b is the final bit. + */ + + +#ifdef NX_842 + +/* 842 Engine */ + +struct nx_eft_crb_t { + union { /* byte[0:3] */ + uint32_t eft_fc; /* bits[29-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. + */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + struct nx_ccb_t ccb; /* byte[48:63] */ + union { + union nx_qw_t reserved64[3]; /* byte[64:96] */ + }; + struct nx_csb_t csb; +} __aligned(128); + +/* 842 CRB */ + +#define EFT_FC_MASK size_mask(3) +#define EFT_FC_OFFSET 31 +#define EFT_FC_COMPRESS 0x0 +#define EFT_FC_COMPRESS_WITH_CRC 0x1 +#define EFT_FC_DECOMPRESS 0x2 +#define EFT_FC_DECOMPRESS_WITH_CRC 0x3 +#define EFT_FC_BLK_DATA_MOVE 0x4 +#endif /* NX_842 */ + +#endif /* _NXU_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h new file mode 120000 index 000000000000..77fb4c7236d0 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file diff --git a/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh new file mode 100755 index 000000000000..c7b46c5fd7b3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "Can't access /dev/crypto/nx-gzip, skipping" + echo "skip: $0" + exit 4 +fi + +set -e + +function cleanup +{ + rm -f nx-tempfile* +} + +trap cleanup EXIT + +function test_sizes +{ + local n=$1 + local fname="nx-tempfile.$n" + + for size in 4K 64K 1M 64M + do + echo "Testing $size ($n) ..." + dd if=/dev/urandom of=$fname bs=$size count=1 + ./gzfht_test $fname + ./gunz_test ${fname}.nx.gz + done +} + +echo "Doing basic test of different sizes ..." +test_sizes 0 + +echo "Running tests in parallel ..." +for i in {1..16} +do + test_sizes $i & +done + +wait + +echo "OK" + +exit 0 diff --git a/tools/testing/selftests/powerpc/pmu/.gitignore b/tools/testing/selftests/powerpc/pmu/.gitignore index e748f336eed3..f69b1e2641a1 100644 --- a/tools/testing/selftests/powerpc/pmu/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/.gitignore @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only count_instructions l3_bank_test per_event_excludes +count_stcx_fail diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 19046db995fe..904672fb78dd 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -2,7 +2,7 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := count_instructions l3_bank_test per_event_excludes +TEST_GEN_PROGS := count_instructions count_stcx_fail l3_bank_test per_event_excludes EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. @@ -13,8 +13,12 @@ all: $(TEST_GEN_PROGS) ebb $(TEST_GEN_PROGS): $(EXTRA_SOURCES) # loop.S can only be built 64-bit +$(OUTPUT)/count_instructions: CFLAGS += -m64 $(OUTPUT)/count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES) - $(CC) $(CFLAGS) -m64 -o $@ $^ + +$(OUTPUT)/count_stcx_fail: CFLAGS += -m64 +$(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) + $(OUTPUT)/per_event_excludes: ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c new file mode 100644 index 000000000000..7b4ac4537702 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c @@ -0,0 +1,161 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corp. + * Licensed under GPLv2. + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <sys/prctl.h> + +#include "event.h" +#include "utils.h" +#include "lib.h" + +extern void thirty_two_instruction_loop_with_ll_sc(u64 loops, u64 *ll_sc_target); + +static void setup_event(struct event *e, u64 config, int type, char *name) +{ + event_init_opts(e, config, type, name); + + e->attr.disabled = 1; + e->attr.exclude_kernel = 1; + e->attr.exclude_hv = 1; + e->attr.exclude_idle = 1; +} + +static int do_count_loop(struct event *events, u64 instructions, + u64 overhead, bool report) +{ + s64 difference, expected; + double percentage; + u64 dummy; + + prctl(PR_TASK_PERF_EVENTS_ENABLE); + + /* Run for 1M instructions */ + thirty_two_instruction_loop_with_ll_sc(instructions >> 5, &dummy); + + prctl(PR_TASK_PERF_EVENTS_DISABLE); + + event_read(&events[0]); + event_read(&events[1]); + event_read(&events[2]); + + expected = instructions + overhead + (events[2].result.value * 10); + difference = events[0].result.value - expected; + percentage = (double)difference / events[0].result.value * 100; + + if (report) { + printf("-----\n"); + event_report(&events[0]); + event_report(&events[1]); + event_report(&events[2]); + + printf("Looped for %llu instructions, overhead %llu\n", instructions, overhead); + printf("Expected %llu\n", expected); + printf("Actual %llu\n", events[0].result.value); + printf("Delta %lld, %f%%\n", difference, percentage); + } + + event_reset(&events[0]); + event_reset(&events[1]); + event_reset(&events[2]); + + if (difference < 0) + difference = -difference; + + /* Tolerate a difference below 0.0001 % */ + difference *= 10000 * 100; + if (difference / events[0].result.value) + return -1; + + return 0; +} + +/* Count how many instructions it takes to do a null loop */ +static u64 determine_overhead(struct event *events) +{ + u64 current, overhead; + int i; + + do_count_loop(events, 0, 0, false); + overhead = events[0].result.value; + + for (i = 0; i < 100; i++) { + do_count_loop(events, 0, 0, false); + current = events[0].result.value; + if (current < overhead) { + printf("Replacing overhead %llu with %llu\n", overhead, current); + overhead = current; + } + } + + return overhead; +} + +#define PM_MRK_STCX_FAIL 0x03e158 +#define PM_STCX_FAIL 0x01e058 + +static int test_body(void) +{ + struct event events[3]; + u64 overhead; + + setup_event(&events[0], PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, "instructions"); + setup_event(&events[1], PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "cycles"); + setup_event(&events[2], PM_STCX_FAIL, PERF_TYPE_RAW, "stcx_fail"); + + if (event_open(&events[0])) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[1], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[2], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + overhead = determine_overhead(events); + printf("Overhead of null loop: %llu instructions\n", overhead); + + /* Run for 1Mi instructions */ + FAIL_IF(do_count_loop(events, 1000000, overhead, true)); + + /* Run for 10Mi instructions */ + FAIL_IF(do_count_loop(events, 10000000, overhead, true)); + + /* Run for 100Mi instructions */ + FAIL_IF(do_count_loop(events, 100000000, overhead, true)); + + /* Run for 1Bi instructions */ + FAIL_IF(do_count_loop(events, 1000000000, overhead, true)); + + /* Run for 16Bi instructions */ + FAIL_IF(do_count_loop(events, 16000000000, overhead, true)); + + /* Run for 64Bi instructions */ + FAIL_IF(do_count_loop(events, 64000000000, overhead, true)); + + event_close(&events[0]); + event_close(&events[1]); + + return 0; +} + +static int count_ll_sc(void) +{ + return eat_cpu(test_body); +} + +int main(void) +{ + return test_harness(count_ll_sc, "count_ll_sc"); +} diff --git a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore index 42bddbed8b64..2920fb39439b 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only reg_access_test event_attributes_test cycles_test diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 417306353e07..ca35dd8848b0 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -7,6 +7,7 @@ noarg: # The EBB handler is 64-bit code and everything links against it CFLAGS += -m64 +TMPOUT = $(OUTPUT)/ # Toolchains may build PIE by default which breaks the assembly no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) diff --git a/tools/testing/selftests/powerpc/pmu/ebb/trace.h b/tools/testing/selftests/powerpc/pmu/ebb/trace.h index 7c0fb5d2bdb1..da2a3be5441f 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/trace.h +++ b/tools/testing/selftests/powerpc/pmu/ebb/trace.h @@ -18,7 +18,7 @@ struct trace_entry { u8 type; u8 length; - u8 data[0]; + u8 data[]; }; struct trace_buffer @@ -26,7 +26,7 @@ struct trace_buffer u64 size; bool overflow; void *tail; - u8 data[0]; + u8 data[]; }; struct trace_buffer *trace_buffer_allocate(u64 size); diff --git a/tools/testing/selftests/powerpc/pmu/loop.S b/tools/testing/selftests/powerpc/pmu/loop.S index 8cc9b5e2c9de..c52ba09b6fed 100644 --- a/tools/testing/selftests/powerpc/pmu/loop.S +++ b/tools/testing/selftests/powerpc/pmu/loop.S @@ -41,3 +41,38 @@ FUNC_START(thirty_two_instruction_loop) subi r3,r3,1 b FUNC_NAME(thirty_two_instruction_loop) FUNC_END(thirty_two_instruction_loop) + +FUNC_START(thirty_two_instruction_loop_with_ll_sc) + cmpdi r3,0 + beqlr + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 5 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 +1: ldarx r6,0,r4 # 10 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 15 + addi r5,r5,1 + addi r5,r5,1 + stdcx. r6,0,r4 + bne- 1b + addi r5,r5,1 # 20 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 25 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 30 + subi r3,r3,1 + b FUNC_NAME(thirty_two_instruction_loop_with_ll_sc) +FUNC_END(thirty_two_instruction_loop_with_ll_sc) diff --git a/tools/testing/selftests/powerpc/primitives/.gitignore b/tools/testing/selftests/powerpc/primitives/.gitignore index 4cc4e31bed1d..1e5c04e24254 100644 --- a/tools/testing/selftests/powerpc/primitives/.gitignore +++ b/tools/testing/selftests/powerpc/primitives/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only load_unaligned_zeropad diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore index dce19f221c46..0e96150b7c7e 100644 --- a/tools/testing/selftests/powerpc/ptrace/.gitignore +++ b/tools/testing/selftests/powerpc/ptrace/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore index 0b969fba3beb..f795e06f5ae3 100644 --- a/tools/testing/selftests/powerpc/security/.gitignore +++ b/tools/testing/selftests/powerpc/security/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only rfi_flush diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore index dca5852a1546..405b5364044c 100644 --- a/tools/testing/selftests/powerpc/signal/.gitignore +++ b/tools/testing/selftests/powerpc/signal/.gitignore @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only signal signal_tm sigfuz +sigreturn_vdso diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 113838fbbe7f..d6ae54663aed 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,10 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := signal signal_tm sigfuz +TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 +TEST_FILES := settings + top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/signal/settings b/tools/testing/selftests/powerpc/signal/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c new file mode 100644 index 000000000000..e3972264615b --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test that a syscall does not get restarted twice, handled by trap_norestart() + * + * Based on Al's description, and a test for the bug fixed in this commit: + * + * commit 9a81c16b527528ad307843be5571111aa8d35a80 + * Author: Al Viro <viro@zeniv.linux.org.uk> + * Date: Mon Sep 20 21:48:57 2010 +0100 + * + * powerpc: fix double syscall restarts + * + * Make sigreturn zero regs->trap, make do_signal() do the same on all + * paths. As it is, signal interrupting e.g. read() from fd 512 (== + * ERESTARTSYS) with another signal getting unblocked when the first + * handler finishes will lead to restart one insn earlier than it ought + * to. Same for multiple signals with in-kernel handlers interrupting + * that sucker at the same time. Same for multiple signals of any kind + * interrupting that sucker on 64bit... + */ +#define _GNU_SOURCE +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "utils.h" + +static void SIGUSR1_handler(int sig) +{ + kill(getpid(), SIGUSR2); + /* + * SIGUSR2 is blocked until the handler exits, at which point it will + * be raised again and think there is a restart to be done because the + * pending restarted syscall has 512 (ERESTARTSYS) in r3. The second + * restart will retreat NIP another 4 bytes to fail case branch. + */ +} + +static void SIGUSR2_handler(int sig) +{ +} + +static ssize_t raw_read(int fd, void *buf, size_t count) +{ + register long nr asm("r0") = __NR_read; + register long _fd asm("r3") = fd; + register void *_buf asm("r4") = buf; + register size_t _count asm("r5") = count; + + asm volatile( +" b 0f \n" +" b 1f \n" +" 0: sc 0 \n" +" bns 2f \n" +" neg %0,%0 \n" +" b 2f \n" +" 1: \n" +" li %0,%4 \n" +" 2: \n" + : "+r"(_fd), "+r"(nr), "+r"(_buf), "+r"(_count) + : "i"(-ENOANO) + : "memory", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "ctr", "cr0"); + + if (_fd < 0) { + errno = -_fd; + _fd = -1; + } + + return _fd; +} + +#define DATA "test 123" +#define DLEN (strlen(DATA)+1) + +int test_restart(void) +{ + int pipefd[2]; + pid_t pid; + char buf[512]; + + if (pipe(pipefd) == -1) { + perror("pipe"); + exit(EXIT_FAILURE); + } + + pid = fork(); + if (pid == -1) { + perror("fork"); + exit(EXIT_FAILURE); + } + + if (pid == 0) { /* Child reads from pipe */ + struct sigaction act; + int fd; + + memset(&act, 0, sizeof(act)); + sigaddset(&act.sa_mask, SIGUSR2); + act.sa_handler = SIGUSR1_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &act, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + act.sa_handler = SIGUSR2_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR2, &act, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + /* Let's get ERESTARTSYS into r3 */ + while ((fd = dup(pipefd[0])) != 512) { + if (fd == -1) { + perror("dup"); + exit(EXIT_FAILURE); + } + } + + if (raw_read(fd, buf, 512) == -1) { + if (errno == ENOANO) { + fprintf(stderr, "Double restart moved restart before sc instruction.\n"); + _exit(EXIT_FAILURE); + } + perror("read"); + exit(EXIT_FAILURE); + } + + if (strncmp(buf, DATA, DLEN)) { + fprintf(stderr, "bad test string %s\n", buf); + exit(EXIT_FAILURE); + } + + return 0; + + } else { + int wstatus; + + usleep(100000); /* Hack to get reader waiting */ + kill(pid, SIGUSR1); + usleep(100000); + if (write(pipefd[1], DATA, DLEN) != DLEN) { + perror("write"); + exit(EXIT_FAILURE); + } + close(pipefd[0]); + close(pipefd[1]); + if (wait(&wstatus) == -1) { + perror("wait"); + exit(EXIT_FAILURE); + } + if (!WIFEXITED(wstatus)) { + fprintf(stderr, "child exited abnormally\n"); + exit(EXIT_FAILURE); + } + + FAIL_IF(WEXITSTATUS(wstatus) != EXIT_SUCCESS); + + return 0; + } +} + +int main(void) +{ + test_harness_set_timeout(10); + return test_harness(test_restart, "sig sys restart"); +} diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c b/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c new file mode 100644 index 000000000000..e282fff0fe25 --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that we can take signals with and without the VDSO mapped, which trigger + * different paths in the signal handling code. + * + * See handle_rt_signal64() and setup_trampoline() in signal_64.c + */ + +#define _GNU_SOURCE + +#include <errno.h> +#include <stdio.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +// Ensure assert() is not compiled out +#undef NDEBUG +#include <assert.h> + +#include "utils.h" + +static int search_proc_maps(char *needle, unsigned long *low, unsigned long *high) +{ + unsigned long start, end; + static char buf[4096]; + char name[128]; + FILE *f; + int rc = -1; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + perror("fopen"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + rc = sscanf(buf, "%lx-%lx %*c%*c%*c%*c %*x %*d:%*d %*d %127s\n", + &start, &end, name); + if (rc == 2) + continue; + + if (rc != 3) { + printf("sscanf errored\n"); + rc = -1; + break; + } + + if (strstr(name, needle)) { + *low = start; + *high = end - 1; + rc = 0; + break; + } + } + + fclose(f); + + return rc; +} + +static volatile sig_atomic_t took_signal = 0; + +static void sigusr1_handler(int sig) +{ + took_signal++; +} + +int test_sigreturn_vdso(void) +{ + unsigned long low, high, size; + struct sigaction act; + char *p; + + act.sa_handler = sigusr1_handler; + act.sa_flags = 0; + sigemptyset(&act.sa_mask); + + assert(sigaction(SIGUSR1, &act, NULL) == 0); + + // Confirm the VDSO is mapped, and work out where it is + assert(search_proc_maps("[vdso]", &low, &high) == 0); + size = high - low + 1; + printf("VDSO is at 0x%lx-0x%lx (%lu bytes)\n", low, high, size); + + kill(getpid(), SIGUSR1); + assert(took_signal == 1); + printf("Signal delivered OK with VDSO mapped\n"); + + // Remap the VDSO somewhere else + p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + assert(p != MAP_FAILED); + assert(mremap((void *)low, size, size, MREMAP_MAYMOVE|MREMAP_FIXED, p) != MAP_FAILED); + assert(search_proc_maps("[vdso]", &low, &high) == 0); + size = high - low + 1; + printf("VDSO moved to 0x%lx-0x%lx (%lu bytes)\n", low, high, size); + + kill(getpid(), SIGUSR1); + assert(took_signal == 2); + printf("Signal delivered OK with VDSO moved\n"); + + assert(munmap((void *)low, size) == 0); + printf("Unmapped VDSO\n"); + + // Confirm the VDSO is not mapped anymore + assert(search_proc_maps("[vdso]", &low, &high) != 0); + + // Make the stack executable + assert(search_proc_maps("[stack]", &low, &high) == 0); + size = high - low + 1; + mprotect((void *)low, size, PROT_READ|PROT_WRITE|PROT_EXEC); + printf("Remapped the stack executable\n"); + + kill(getpid(), SIGUSR1); + assert(took_signal == 3); + printf("Signal delivered OK with VDSO unmapped\n"); + + return 0; +} + +int main(void) +{ + return test_harness(test_sigreturn_vdso, "sigreturn_vdso"); +} diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore index 31a17e0ba884..b0dfc74aa57e 100644 --- a/tools/testing/selftests/powerpc/stringloops/.gitignore +++ b/tools/testing/selftests/powerpc/stringloops/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only memcmp_64 memcmp_32 strlen diff --git a/tools/testing/selftests/powerpc/switch_endian/.gitignore b/tools/testing/selftests/powerpc/switch_endian/.gitignore index 89e762eab676..30e962cf84d1 100644 --- a/tools/testing/selftests/powerpc/switch_endian/.gitignore +++ b/tools/testing/selftests/powerpc/switch_endian/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only switch_endian_test check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/.gitignore b/tools/testing/selftests/powerpc/syscalls/.gitignore index f0f3fcc9d802..b00cab225476 100644 --- a/tools/testing/selftests/powerpc/syscalls/.gitignore +++ b/tools/testing/selftests/powerpc/syscalls/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only ipc_unmuxed diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 98f2708d86cc..d8900a0c47a1 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only tm-resched-dscr tm-syscall tm-signal-msr-resv @@ -13,6 +14,7 @@ tm-signal-context-chk-vmx tm-signal-context-chk-vsx tm-signal-context-force-tm tm-signal-sigreturn-nt +tm-signal-pagefault tm-vmx-unavail tm-unavailable tm-trap diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index b15a1a325bd0..5881e97c73c1 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -5,7 +5,9 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \ - tm-signal-context-force-tm tm-poison + tm-signal-context-force-tm tm-poison tm-signal-pagefault + +TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk @@ -22,6 +24,8 @@ $(OUTPUT)/tm-resched-dscr: ../pmu/lib.c $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64 $(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64 +$(OUTPUT)/tm-signal-pagefault: CFLAGS += -pthread -m64 +$(OUTPUT)/tm-poison: CFLAGS += -m64 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS)) $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S diff --git a/tools/testing/selftests/powerpc/tm/settings b/tools/testing/selftests/powerpc/tm/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c index 31717625f318..421cb082f6be 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c @@ -42,9 +42,10 @@ #endif /* Setting contexts because the test will crash and we want to recover */ -ucontext_t init_context, main_context; +ucontext_t init_context; -static int count, first_time; +/* count is changed in the signal handler, so it must be volatile */ +static volatile int count; void usr_signal_handler(int signo, siginfo_t *si, void *uc) { @@ -98,11 +99,6 @@ void usr_signal_handler(int signo, siginfo_t *si, void *uc) void seg_signal_handler(int signo, siginfo_t *si, void *uc) { - if (count == COUNT_MAX) { - /* Return to tm_signal_force_msr() and exit */ - setcontext(&main_context); - } - count++; /* Reexecute the test */ @@ -126,37 +122,41 @@ void tm_trap_test(void) */ getcontext(&init_context); - /* Allocated an alternative signal stack area */ - ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - ss.ss_size = SIGSTKSZ; - ss.ss_flags = 0; + while (count < COUNT_MAX) { + /* Allocated an alternative signal stack area */ + ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + ss.ss_size = SIGSTKSZ; + ss.ss_flags = 0; - if (ss.ss_sp == (void *)-1) { - perror("mmap error\n"); - exit(-1); - } + if (ss.ss_sp == (void *)-1) { + perror("mmap error\n"); + exit(-1); + } - /* Force the allocation through a page fault */ - if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) { - perror("madvise\n"); - exit(-1); - } + /* Force the allocation through a page fault */ + if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) { + perror("madvise\n"); + exit(-1); + } - /* Setting an alternative stack to generate a page fault when - * the signal is raised. - */ - if (sigaltstack(&ss, NULL)) { - perror("sigaltstack\n"); - exit(-1); + /* + * Setting an alternative stack to generate a page fault when + * the signal is raised. + */ + if (sigaltstack(&ss, NULL)) { + perror("sigaltstack\n"); + exit(-1); + } + + /* The signal handler will enable MSR_TS */ + sigaction(SIGUSR1, &usr_sa, NULL); + /* If it does not crash, it might segfault, avoid it to retest */ + sigaction(SIGSEGV, &seg_sa, NULL); + + raise(SIGUSR1); + count++; } - - /* The signal handler will enable MSR_TS */ - sigaction(SIGUSR1, &usr_sa, NULL); - /* If it does not crash, it will segfault, avoid it to retest */ - sigaction(SIGSEGV, &seg_sa, NULL); - - raise(SIGUSR1); } int tm_signal_context_force_tm(void) @@ -169,11 +169,7 @@ int tm_signal_context_force_tm(void) */ SKIP_IF(!is_ppc64le()); - /* Will get back here after COUNT_MAX interactions */ - getcontext(&main_context); - - if (!first_time++) - tm_trap_test(); + tm_trap_test(); return EXIT_SUCCESS; } diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c new file mode 100644 index 000000000000..5908bc6abe60 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020, Gustavo Luiz Duarte, IBM Corp. + * + * This test starts a transaction and triggers a signal, forcing a pagefault to + * happen when the kernel signal handling code touches the user signal stack. + * + * In order to avoid pre-faulting the signal stack memory and to force the + * pagefault to happen precisely in the kernel signal handling code, the + * pagefault handling is done in userspace using the userfaultfd facility. + * + * Further pagefaults are triggered by crafting the signal handler's ucontext + * to point to additional memory regions managed by the userfaultfd, so using + * the same mechanism used to avoid pre-faulting the signal stack memory. + * + * On failure (bug is present) kernel crashes or never returns control back to + * userspace. If bug is not present, tests completes almost immediately. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/userfaultfd.h> +#include <poll.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <pthread.h> +#include <signal.h> +#include <errno.h> + +#include "tm.h" + + +#define UF_MEM_SIZE 655360 /* 10 x 64k pages */ + +/* Memory handled by userfaultfd */ +static char *uf_mem; +static size_t uf_mem_offset = 0; + +/* + * Data that will be copied into the faulting pages (instead of zero-filled + * pages). This is used to make the test more reliable and avoid segfaulting + * when we return from the signal handler. Since we are making the signal + * handler's ucontext point to newly allocated memory, when that memory is + * paged-in it will contain the expected content. + */ +static char backing_mem[UF_MEM_SIZE]; + +static size_t pagesize; + +/* + * Return a chunk of at least 'size' bytes of memory that will be handled by + * userfaultfd. If 'backing_data' is not NULL, its content will be save to + * 'backing_mem' and then copied into the faulting pages when the page fault + * is handled. + */ +void *get_uf_mem(size_t size, void *backing_data) +{ + void *ret; + + if (uf_mem_offset + size > UF_MEM_SIZE) { + fprintf(stderr, "Requesting more uf_mem than expected!\n"); + exit(EXIT_FAILURE); + } + + ret = &uf_mem[uf_mem_offset]; + + /* Save the data that will be copied into the faulting page */ + if (backing_data != NULL) + memcpy(&backing_mem[uf_mem_offset], backing_data, size); + + /* Reserve the requested amount of uf_mem */ + uf_mem_offset += size; + /* Keep uf_mem_offset aligned to the page size (round up) */ + uf_mem_offset = (uf_mem_offset + pagesize - 1) & ~(pagesize - 1); + + return ret; +} + +void *fault_handler_thread(void *arg) +{ + struct uffd_msg msg; /* Data read from userfaultfd */ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_copy uffdio_copy; + struct pollfd pollfd; + ssize_t nread, offset; + + uffd = (long) arg; + + for (;;) { + pollfd.fd = uffd; + pollfd.events = POLLIN; + if (poll(&pollfd, 1, -1) == -1) { + perror("poll() failed"); + exit(EXIT_FAILURE); + } + + nread = read(uffd, &msg, sizeof(msg)); + if (nread == 0) { + fprintf(stderr, "read(): EOF on userfaultfd\n"); + exit(EXIT_FAILURE); + } + + if (nread == -1) { + perror("read() failed"); + exit(EXIT_FAILURE); + } + + /* We expect only one kind of event */ + if (msg.event != UFFD_EVENT_PAGEFAULT) { + fprintf(stderr, "Unexpected event on userfaultfd\n"); + exit(EXIT_FAILURE); + } + + /* + * We need to handle page faults in units of pages(!). + * So, round faulting address down to page boundary. + */ + uffdio_copy.dst = msg.arg.pagefault.address & ~(pagesize-1); + + offset = (char *) uffdio_copy.dst - uf_mem; + uffdio_copy.src = (unsigned long) &backing_mem[offset]; + + uffdio_copy.len = pagesize; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) { + perror("ioctl-UFFDIO_COPY failed"); + exit(EXIT_FAILURE); + } + } +} + +void setup_uf_mem(void) +{ + long uffd; /* userfaultfd file descriptor */ + pthread_t thr; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + int ret; + + pagesize = sysconf(_SC_PAGE_SIZE); + + /* Create and enable userfaultfd object */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd() failed"); + exit(EXIT_FAILURE); + } + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API failed"); + exit(EXIT_FAILURE); + } + + /* + * Create a private anonymous mapping. The memory will be demand-zero + * paged, that is, not yet allocated. When we actually touch the memory + * the related page will be allocated via the userfaultfd mechanism. + */ + uf_mem = mmap(NULL, UF_MEM_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (uf_mem == MAP_FAILED) { + perror("mmap() failed"); + exit(EXIT_FAILURE); + } + + /* + * Register the memory range of the mapping we've just mapped to be + * handled by the userfaultfd object. In 'mode' we request to track + * missing pages (i.e. pages that have not yet been faulted-in). + */ + uffdio_register.range.start = (unsigned long) uf_mem; + uffdio_register.range.len = UF_MEM_SIZE; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(EXIT_FAILURE); + } + + /* Create a thread that will process the userfaultfd events */ + ret = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd); + if (ret != 0) { + fprintf(stderr, "pthread_create(): Error. Returned %d\n", ret); + exit(EXIT_FAILURE); + } +} + +/* + * Assumption: the signal was delivered while userspace was in transactional or + * suspended state, i.e. uc->uc_link != NULL. + */ +void signal_handler(int signo, siginfo_t *si, void *uc) +{ + ucontext_t *ucp = uc; + + /* Skip 'trap' after returning, otherwise we get a SIGTRAP again */ + ucp->uc_link->uc_mcontext.regs->nip += 4; + + ucp->uc_mcontext.v_regs = + get_uf_mem(sizeof(elf_vrreg_t), ucp->uc_mcontext.v_regs); + + ucp->uc_link->uc_mcontext.v_regs = + get_uf_mem(sizeof(elf_vrreg_t), ucp->uc_link->uc_mcontext.v_regs); + + ucp->uc_link = get_uf_mem(sizeof(ucontext_t), ucp->uc_link); +} + +bool have_userfaultfd(void) +{ + long rc; + + errno = 0; + rc = syscall(__NR_userfaultfd, -1); + + return rc == 0 || errno != ENOSYS; +} + +int tm_signal_pagefault(void) +{ + struct sigaction sa; + stack_t ss; + + SKIP_IF(!have_htm()); + SKIP_IF(!have_userfaultfd()); + + setup_uf_mem(); + + /* + * Set an alternative stack that will generate a page fault when the + * signal is raised. The page fault will be treated via userfaultfd, + * i.e. via fault_handler_thread. + */ + ss.ss_sp = get_uf_mem(SIGSTKSZ, NULL); + ss.ss_size = SIGSTKSZ; + ss.ss_flags = 0; + if (sigaltstack(&ss, NULL) == -1) { + perror("sigaltstack() failed"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = signal_handler; + if (sigaction(SIGTRAP, &sa, NULL) == -1) { + perror("sigaction() failed"); + exit(EXIT_FAILURE); + } + + /* Trigger a SIGTRAP in transactional state */ + asm __volatile__( + "tbegin.;" + "beq 1f;" + "trap;" + "1: ;" + : : : "memory"); + + /* Trigger a SIGTRAP in suspended state */ + asm __volatile__( + "tbegin.;" + "beq 1f;" + "tsuspend.;" + "trap;" + "tresume.;" + "1: ;" + : : : "memory"); + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + /* + * Depending on kernel config, the TM Bad Thing might not result in a + * crash, instead the kernel never returns control back to userspace, so + * set a tight timeout. If the test passes it completes almost + * immediately. + */ + test_harness_set_timeout(2); + return test_harness(tm_signal_pagefault, "tm_signal_pagefault"); +} diff --git a/tools/testing/selftests/powerpc/vphn/.gitignore b/tools/testing/selftests/powerpc/vphn/.gitignore index 7c04395010cb..b744aedfd1f2 100644 --- a/tools/testing/selftests/powerpc/vphn/.gitignore +++ b/tools/testing/selftests/powerpc/vphn/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only test-vphn diff --git a/tools/testing/selftests/prctl/.gitignore b/tools/testing/selftests/prctl/.gitignore index 0b5c27447bf6..91af2b631bc9 100644 --- a/tools/testing/selftests/prctl/.gitignore +++ b/tools/testing/selftests/prctl/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test disable-tsc-test diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 66fab4c58ed4..bed4b5318a86 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only /fd-001-lookup /fd-002-posix-eq /fd-003-kthread +/proc-fsconfig-hidepid /proc-loadavg-001 +/proc-multiple-procfs /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index a8ed0f684829..8be8a03d2973 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -19,5 +19,7 @@ TEST_GEN_PROGS += self TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self +TEST_GEN_PROGS += proc-multiple-procfs +TEST_GEN_PROGS += proc-fsconfig-hidepid include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-fsconfig-hidepid.c b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c new file mode 100644 index 000000000000..b9af8f537185 --- /dev/null +++ b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <assert.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <linux/mount.h> +#include <linux/unistd.h> + +static inline int fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static inline int fsconfig(int fd, unsigned int cmd, const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, val, aux); +} + +int main(void) +{ + int fsfd, ret; + int hidepid = 2; + + assert((fsfd = fsopen("proc", 0)) != -1); + + ret = fsconfig(fsfd, FSCONFIG_SET_BINARY, "hidepid", &hidepid, 0); + assert(ret == -1); + assert(errno == EINVAL); + + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "2", 0)); + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "invisible", 0)); + + assert(!close(fsfd)); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-multiple-procfs.c b/tools/testing/selftests/proc/proc-multiple-procfs.c new file mode 100644 index 000000000000..ab912ad95dab --- /dev/null +++ b/tools/testing/selftests/proc/proc-multiple-procfs.c @@ -0,0 +1,48 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> + +int main(void) +{ + struct stat proc_st1, proc_st2; + char procbuff[] = "/tmp/proc.XXXXXX/meminfo"; + char procdir1[] = "/tmp/proc.XXXXXX"; + char procdir2[] = "/tmp/proc.XXXXXX"; + + assert(mkdtemp(procdir1) != NULL); + assert(mkdtemp(procdir2) != NULL); + + assert(!mount("proc", procdir1, "proc", 0, "hidepid=1")); + assert(!mount("proc", procdir2, "proc", 0, "hidepid=2")); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir1); + assert(!stat(procbuff, &proc_st1)); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir2); + assert(!stat(procbuff, &proc_st2)); + + umount(procdir1); + umount(procdir2); + + assert(proc_st1.st_dev != proc_st2.st_dev); + + return 0; +} diff --git a/tools/testing/selftests/pstore/.gitignore b/tools/testing/selftests/pstore/.gitignore index 5a4a26e5464b..9938fb406389 100644 --- a/tools/testing/selftests/pstore/.gitignore +++ b/tools/testing/selftests/pstore/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only logs *uuid diff --git a/tools/testing/selftests/pstore/pstore_tests b/tools/testing/selftests/pstore/pstore_tests index 1cef54458aff..2aa9a3852a84 100755 --- a/tools/testing/selftests/pstore/pstore_tests +++ b/tools/testing/selftests/pstore/pstore_tests @@ -10,7 +10,7 @@ . ./common_tests prlog -n "Checking pstore console is registered ... " -dmesg | grep -q "console \[pstore" +dmesg | grep -Eq "console \[(pstore|${backend})" show_result $? prlog -n "Checking /dev/pmsg0 exists ... " diff --git a/tools/testing/selftests/ptp/.gitignore b/tools/testing/selftests/ptp/.gitignore index f562e49d6917..534ca26eee48 100644 --- a/tools/testing/selftests/ptp/.gitignore +++ b/tools/testing/selftests/ptp/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only testptp diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index c0dd10257df5..da7a9dda9490 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -269,14 +269,16 @@ int main(int argc, char *argv[]) " %d programmable periodic signals\n" " %d pulse per second\n" " %d programmable pins\n" - " %d cross timestamping\n", + " %d cross timestamping\n" + " %d adjust_phase\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, caps.n_pins, - caps.cross_timestamping); + caps.cross_timestamping, + caps.adjust_phase); } } diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore index cfcc49a7def7..7bebf9534a86 100644 --- a/tools/testing/selftests/ptrace/.gitignore +++ b/tools/testing/selftests/ptrace/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only get_syscall_info peeksiginfo diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index c0b7f89f0930..2f1f532c39db 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -iquote../../../../include/uapi -Wall +CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall -TEST_GEN_PROGS := get_syscall_info peeksiginfo +TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess include ../lib.mk diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c new file mode 100644 index 000000000000..4db327b44586 --- /dev/null +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de> + * All rights reserved. + * + * Check whether /proc/$pid/mem can be accessed without causing deadlocks + * when de_thread is blocked with ->cred_guard_mutex held. + */ + +#include "../kselftest_harness.h" +#include <stdio.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <unistd.h> +#include <sys/ptrace.h> + +static void *thread(void *arg) +{ + ptrace(PTRACE_TRACEME, 0, 0L, 0L); + return NULL; +} + +TEST(vmaccess) +{ + int f, pid = fork(); + char mm[64]; + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread, NULL); + pthread_join(pt, NULL); + execlp("true", "true", NULL); + } + + sleep(1); + sprintf(mm, "/proc/%d/mem", pid); + f = open(mm, O_RDONLY); + ASSERT_GE(f, 0); + close(f); + f = kill(pid, SIGCONT); + ASSERT_EQ(f, 0); +} + +TEST(attach) +{ + int s, k, pid = fork(); + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread, NULL); + pthread_join(pt, NULL); + execlp("sleep", "sleep", "2", NULL); + } + + sleep(1); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(errno, EAGAIN); + ASSERT_EQ(k, -1); + k = waitpid(-1, &s, WNOHANG); + ASSERT_NE(k, -1); + ASSERT_NE(k, 0); + ASSERT_NE(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + sleep(1); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGSTOP); + k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + k = waitpid(-1, NULL, 0); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ECHILD); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index ccc240275d1c..f6cbce77460b 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only initrd b[0-9]* res diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index c3a49fb4d6f6..12810229fddc 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -12,7 +12,7 @@ # Returns 1 if the specified boot-parameter string tells rcutorture to # test CPU-hotplug operations. bootparam_hotplug_cpu () { - echo "$1" | grep -q "rcutorture\.onoff_" + echo "$1" | grep -q "torture\.onoff_" } # checkarg --argname argtype $# arg mustmatch cannotmatch diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh new file mode 100755 index 000000000000..e5cc6b2f195e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# If this was a KCSAN run, collapse the reports in the various console.log +# files onto pairs of functions. +# +# Usage: kcsan-collapse.sh resultsdir +# +# Copyright (C) 2020 Facebook, Inc. +# +# Authors: Paul E. McKenney <paulmck@kernel.org> + +if test -z "$TORTURE_KCONFIG_KCSAN_ARG" +then + exit 0 +fi +cat $1/*/console.log | + grep "BUG: KCSAN: " | + sed -e 's/^\[[^]]*] //' | + sort | + uniq -c | + sort -k1nr > $1/kcsan.sum diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh index 1871d00bccd7..6f50722f251f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh @@ -20,7 +20,9 @@ rundir="${1}" if test -z "$rundir" -o ! -d "$rundir" then + echo Directory "$rundir" not found. echo Usage: $0 directory + exit 1 fi editor=${EDITOR-vi} diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index 9d9a41625dd9..1706cd4466b4 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -41,7 +41,21 @@ else title="$title ($ngpsps/s)" fi echo $title $stopstate $fwdprog - nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'` + nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | \ + awk -v sum=0 ' + { + for (i = 0; i <= NF; i++) { + sum += $i; + if ($i ~ /Batch:/) { + sum = 0; + i = i + 2; + } + } + } + + END { + print sum + }'` if test -z "$nclosecalls" then exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index e5edd5198725..736f04749b90 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -13,6 +13,9 @@ # # Authors: Paul E. McKenney <paulmck@linux.ibm.com> +T=/tmp/kvm-recheck.sh.$$ +trap 'rm -f $T' 0 2 + PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . functions.sh for rd in "$@" @@ -67,5 +70,26 @@ do fi fi done + if test -f "$rd/kcsan.sum" + then + if test -s "$rd/kcsan.sum" + then + echo KCSAN summary in $rd/kcsan.sum + else + echo Clean KCSAN run in $rd + fi + fi done -EDITOR=echo kvm-find-errors.sh "${@: -1}" > /dev/null 2>&1 +EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1 +ret=$? +builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`" +if test "$builderrors" -gt 0 +then + echo $builderrors runs with build errors. +fi +runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`" +if test "$runerrors" -gt 0 +then + echo $runerrors runs with runtime errors. +fi +exit $ret diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index e0352304b98b..6ff611c630d1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -44,30 +44,32 @@ then fi echo ' ---' `date`: Starting build echo ' ---' Kconfig fragment at: $config_template >> $resdir/log -touch $resdir/ConfigFragment.input $resdir/ConfigFragment -if test -r "$config_dir/CFcommon" -then - echo " --- $config_dir/CFcommon" >> $resdir/ConfigFragment.input - cat < $config_dir/CFcommon >> $resdir/ConfigFragment.input - config_override.sh $config_dir/CFcommon $config_template > $T/Kc1 - grep '#CHECK#' $config_dir/CFcommon >> $resdir/ConfigFragment -else - cp $config_template $T/Kc1 -fi -echo " --- $config_template" >> $resdir/ConfigFragment.input -cat $config_template >> $resdir/ConfigFragment.input -grep '#CHECK#' $config_template >> $resdir/ConfigFragment -if test -n "$TORTURE_KCONFIG_ARG" -then - echo $TORTURE_KCONFIG_ARG | tr -s " " "\012" > $T/cmdline - echo " --- --kconfig argument" >> $resdir/ConfigFragment.input - cat $T/cmdline >> $resdir/ConfigFragment.input - config_override.sh $T/Kc1 $T/cmdline > $T/Kc2 - # Note that "#CHECK#" is not permitted on commandline. -else - cp $T/Kc1 $T/Kc2 -fi -cat $T/Kc2 >> $resdir/ConfigFragment +touch $resdir/ConfigFragment.input + +# Combine additional Kconfig options into an existing set such that +# newer options win. The first argument is the Kconfig source ID, the +# second the to-be-updated file within $T, and the third and final the +# list of additional Kconfig options. Note that a $2.tmp file is +# created when doing the update. +config_override_param () { + if test -n "$3" + then + echo $3 | sed -e 's/^ *//' -e 's/ *$//' | tr -s " " "\012" > $T/Kconfig_args + echo " --- $1" >> $resdir/ConfigFragment.input + cat $T/Kconfig_args >> $resdir/ConfigFragment.input + config_override.sh $T/$2 $T/Kconfig_args > $T/$2.tmp + mv $T/$2.tmp $T/$2 + # Note that "#CHECK#" is not permitted on commandline. + fi +} + +echo > $T/KcList +config_override_param "$config_dir/CFcommon" KcList "`cat $config_dir/CFcommon 2> /dev/null`" +config_override_param "$config_template" KcList "`cat $config_template 2> /dev/null`" +config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" +config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" +config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" +cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux @@ -80,7 +82,7 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel -elif kvm-build.sh $T/Kc2 $resdir +elif kvm-build.sh $T/KcList $resdir then # Had to build a kernel for this test. QEMU="`identify_qemu vmlinux`" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 78d18ab8e954..c279cf9cb010 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -31,6 +31,8 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" +TORTURE_KCONFIG_KASAN_ARG="" +TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 @@ -39,7 +41,7 @@ TORTURE_TRUST_MAKE="" resdir="" configs="" cpus=0 -ds=`date +%Y.%m.%d-%H:%M:%S` +ds=`date +%Y.%m.%d-%H.%M.%S` jitter="-1" usage () { @@ -133,6 +135,12 @@ do TORTURE_KCONFIG_ARG="$2" shift ;; + --kasan) + TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + ;; + --kcsan) + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -310,6 +318,8 @@ TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG +TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG +TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE @@ -464,6 +474,7 @@ echo echo echo " --- `date` Test summary:" echo Results directory: $resdir/$ds +kcsan-collapse.sh $resdir/$ds kvm-recheck.sh $resdir/$ds ___EOF___ diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index c3c1fb5a9e1f..f2b20db9e296 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -14,3 +14,6 @@ TINY02 TASKS01 TASKS02 TASKS03 +RUDE01 +TRACE01 +TRACE02 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon index e19a444a0684..0e92d85313aa 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon @@ -3,3 +3,5 @@ CONFIG_PRINTK_TIME=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n +CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 new file mode 100644 index 000000000000..bafe94cbd739 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 @@ -0,0 +1,10 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=2 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot new file mode 100644 index 000000000000..9363708c9075 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot @@ -0,0 +1 @@ +rcutorture.torture_type=tasks-rude diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 new file mode 100644 index 000000000000..12e7661b86f5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -0,0 +1,11 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y +CONFIG_TASKS_TRACE_RCU_READ_MB=y +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot new file mode 100644 index 000000000000..9675ad632dcc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot @@ -0,0 +1 @@ +rcutorture.torture_type=tasks-tracing diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 new file mode 100644 index 000000000000..b69ed6673c41 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -0,0 +1,11 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_TASKS_TRACE_RCU_READ_MB=n +CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot new file mode 100644 index 000000000000..9675ad632dcc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot @@ -0,0 +1 @@ +rcutorture.torture_type=tasks-tracing diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 new file mode 100644 index 000000000000..7311f84a5876 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -0,0 +1,18 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=56 +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TREE_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_OBJECTS=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore index 712a3d41a325..24e27957efcc 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only srcu.c diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore index 1d016e66980a..57d296341304 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only srcu.h diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore index f47cb2045f13..d65462d64816 100644 --- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore +++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only *.out diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile new file mode 100644 index 000000000000..d585cc1948cc --- /dev/null +++ b/tools/testing/selftests/resctrl/Makefile @@ -0,0 +1,17 @@ +CC = $(CROSS_COMPILE)gcc +CFLAGS = -g -Wall +SRCS=$(wildcard *.c) +OBJS=$(SRCS:.c=.o) + +all: resctrl_tests + +$(OBJS): $(SRCS) + $(CC) $(CFLAGS) -c $(SRCS) + +resctrl_tests: $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ + +.PHONY: clean + +clean: + $(RM) $(OBJS) resctrl_tests diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README new file mode 100644 index 000000000000..6e5a0ffa18e8 --- /dev/null +++ b/tools/testing/selftests/resctrl/README @@ -0,0 +1,53 @@ +resctrl_tests - resctrl file system test suit + +Authors: + Fenghua Yu <fenghua.yu@intel.com> + Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + +resctrl_tests tests various resctrl functionalities and interfaces including +both software and hardware. + +Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth +Allocation test on Intel RDT hardware. More tests will be added in the future. +And the test suit can be extended to cover AMD QoS and ARM MPAM hardware +as well. + +BUILD +----- + +Run "make" to build executable file "resctrl_tests". + +RUN +--- + +To use resctrl_tests, root or sudoer privileges are required. This is because +the test needs to mount resctrl file system and change contents in the file +system. + +Executing the test without any parameter will run all supported tests: + + sudo ./resctrl_tests + +OVERVIEW OF EXECUTION +--------------------- + +A test case has four stages: + + - setup: mount resctrl file system, create group, setup schemata, move test + process pids to tasks, start benchmark. + - execute: let benchmark run + - verify: get resctrl data and verify the data with another source, e.g. + perf event. + - teardown: umount resctrl and clear temporary files. + +ARGUMENTS +--------- + +Parameter '-h' shows usage information. + +usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -n no_of_bits: run cache tests using specified no of bits in cache bit mask + -p cpu_no: specify CPU number to run the test. 1 is default + -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c new file mode 100644 index 000000000000..38dbf4962e33 --- /dev/null +++ b/tools/testing/selftests/resctrl/cache.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <stdint.h> +#include "resctrl.h" + +struct read_format { + __u64 nr; /* The number of events */ + struct { + __u64 value; /* The value of the event */ + } values[2]; +}; + +static struct perf_event_attr pea_llc_miss; +static struct read_format rf_cqm; +static int fd_lm; +char llc_occup_path[1024]; + +static void initialize_perf_event_attr(void) +{ + pea_llc_miss.type = PERF_TYPE_HARDWARE; + pea_llc_miss.size = sizeof(struct perf_event_attr); + pea_llc_miss.read_format = PERF_FORMAT_GROUP; + pea_llc_miss.exclude_kernel = 1; + pea_llc_miss.exclude_hv = 1; + pea_llc_miss.exclude_idle = 1; + pea_llc_miss.exclude_callchain_kernel = 1; + pea_llc_miss.inherit = 1; + pea_llc_miss.exclude_guest = 1; + pea_llc_miss.disabled = 1; +} + +static void ioctl_perf_event_ioc_reset_enable(void) +{ + ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0); + ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0); +} + +static int perf_event_open_llc_miss(pid_t pid, int cpu_no) +{ + fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + if (fd_lm == -1) { + perror("Error opening leader"); + ctrlc_handler(0, NULL, NULL); + return -1; + } + + return 0; +} + +static int initialize_llc_perf(void) +{ + memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr)); + memset(&rf_cqm, 0, sizeof(struct read_format)); + + /* Initialize perf_event_attr structures for HW_CACHE_MISSES */ + initialize_perf_event_attr(); + + pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES; + + rf_cqm.nr = 1; + + return 0; +} + +static int reset_enable_llc_perf(pid_t pid, int cpu_no) +{ + int ret = 0; + + ret = perf_event_open_llc_miss(pid, cpu_no); + if (ret < 0) + return ret; + + /* Start counters to log values */ + ioctl_perf_event_ioc_reset_enable(); + + return 0; +} + +/* + * get_llc_perf: llc cache miss through perf events + * @cpu_no: CPU number that the benchmark PID is binded to + * + * Perf events like HW_CACHE_MISSES could be used to validate number of + * cache lines allocated. + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_perf(unsigned long *llc_perf_miss) +{ + __u64 total_misses; + + /* Stop counters after one span to get miss rate */ + + ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0); + + if (read(fd_lm, &rf_cqm, sizeof(struct read_format)) == -1) { + perror("Could not get llc misses through perf"); + + return -1; + } + + total_misses = rf_cqm.values[0].value; + + close(fd_lm); + + *llc_perf_miss = total_misses; + + return 0; +} + +/* + * Get LLC Occupancy as reported by RESCTRL FS + * For CQM, + * 1. If con_mon grp and mon grp given, then read from mon grp in + * con_mon grp + * 2. If only con_mon grp given, then read from con_mon grp + * 3. If both not given, then read from root con_mon grp + * For CAT, + * 1. If con_mon grp given, then read from it + * 2. If con_mon grp not given, then read from root con_mon grp + * + * Return: =0 on success. <0 on failure. + */ +static int get_llc_occu_resctrl(unsigned long *llc_occupancy) +{ + FILE *fp; + + fp = fopen(llc_occup_path, "r"); + if (!fp) { + perror("Failed to open results file"); + + return errno; + } + if (fscanf(fp, "%lu", llc_occupancy) <= 0) { + perror("Could not get llc occupancy"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * print_results_cache: the cache results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @llc_value: perf miss value / + * llc occupancy value reported by resctrl FS + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_cache(char *filename, int bm_pid, + unsigned long llc_value) +{ + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t LLC_value: %lu\n", bm_pid, + llc_value); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + fprintf(fp, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value); + fclose(fp); + } + + return 0; +} + +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) +{ + unsigned long llc_perf_miss = 0, llc_occu_resc = 0, llc_value = 0; + int ret; + + /* + * Measure cache miss from perf. + */ + if (!strcmp(param->resctrl_val, "cat")) { + ret = get_llc_perf(&llc_perf_miss); + if (ret < 0) + return ret; + llc_value = llc_perf_miss; + } + + /* + * Measure llc occupancy from resctrl. + */ + if (!strcmp(param->resctrl_val, "cqm")) { + ret = get_llc_occu_resctrl(&llc_occu_resc); + if (ret < 0) + return ret; + llc_value = llc_occu_resc; + } + ret = print_results_cache(param->filename, bm_pid, llc_value); + if (ret) + return ret; + + return 0; +} + +/* + * cache_val: execute benchmark and measure LLC occupancy resctrl + * and perf cache miss for the benchmark + * @param: parameters passed to cache_val() + * + * Return: 0 on success. non-zero on failure. + */ +int cat_val(struct resctrl_val_param *param) +{ + int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0; + char *resctrl_val = param->resctrl_val; + pid_t bm_pid; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + bm_pid = getpid(); + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + return ret; + + /* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + return ret; + + if ((strcmp(resctrl_val, "cat") == 0)) { + ret = initialize_llc_perf(); + if (ret) + return ret; + } + + /* Test runs until the callback setup() tells the test to stop. */ + while (1) { + if (strcmp(resctrl_val, "cat") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + ret = reset_enable_llc_perf(bm_pid, param->cpu_no); + if (ret) + break; + + if (run_fill_buf(param->span, malloc_and_init_memory, + memflush, operation, resctrl_val)) { + fprintf(stderr, "Error-running fill buffer\n"); + ret = -1; + break; + } + + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c new file mode 100644 index 000000000000..5da43767b973 --- /dev/null +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Allocation Technology (CAT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" +#include <unistd.h> + +#define RESULT_FILE_NAME1 "result_cat1" +#define RESULT_FILE_NAME2 "result_cat2" +#define NUM_OF_RUNS 5 +#define MAX_DIFF_PERCENT 4 +#define MAX_DIFF 1000000 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +/* + * Change schemata. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * Run 5 times in order to get average values. + */ +static int cat_setup(int num, ...) +{ + struct resctrl_val_param *p; + char schemata[64]; + va_list param; + int ret = 0; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + if (p->num_of_runs == 0) { + sprintf(schemata, "%lx", p->mask); + ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no, + p->resctrl_val); + } + p->num_of_runs++; + + return ret; +} + +static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, + unsigned long span) +{ + unsigned long allocated_cache_lines = span / 64; + unsigned long avg_llc_perf_miss = 0; + float diff_percent; + + avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); + diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / + allocated_cache_lines * 100; + + printf("%sok CAT: cache miss rate within %d%%\n", + !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? + "not " : "", MAX_DIFF_PERCENT); + tests_run++; + printf("# Percent diff=%d\n", abs((int)diff_percent)); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); + printf("# Allocated cache lines: %lu\n", allocated_cache_lines); +} + +static int check_results(struct resctrl_val_param *param) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_perf_miss = 0; + int runs = 0, no_of_bits = 0; + FILE *fp; + + printf("# Checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Cannot open file"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + /* + * Discard the first value which is inaccurate due to monitoring + * setup transition phase. + */ + if (runs > 0) + sum_llc_perf_miss += strtoul(token_array[3], NULL, 0); + runs++; + } + + fclose(fp); + no_of_bits = count_bits(param->mask); + + show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); + + return 0; +} + +void cat_test_cleanup(void) +{ + remove(RESULT_FILE_NAME1); + remove(RESULT_FILE_NAME2); +} + +int cat_perf_miss_val(int cpu_no, int n, char *cache_type) +{ + unsigned long l_mask, l_mask_1; + int ret, pipefd[2], sibling_cpu_no; + char pipe_message; + pid_t bm_pid; + + cache_size = 0; + + ret = remount_resctrlfs(true); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cat")) + return -1; + + /* Get default cbm mask for L3/L2 cache */ + ret = get_cbm_mask(cache_type); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + /* Get L3/L2 cache size */ + ret = get_cache_size(cpu_no, cache_type, &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + /* Get max number of bits from default-cabm mask */ + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits - 1) { + printf("Invalid input value for no_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", + count_of_bits - 1); + return -1; + } + + /* Get core id from same socket for running another thread */ + sibling_cpu_no = get_core_sibling(cpu_no); + if (sibling_cpu_no < 0) + return -1; + + struct resctrl_val_param param = { + .resctrl_val = "cat", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .setup = cat_setup, + }; + + l_mask = long_mask >> n; + l_mask_1 = ~l_mask & long_mask; + + /* Set param values for parent thread which will be allocated bitmask + * with (max_bits - n) bits + */ + param.span = cache_size * (count_of_bits - n) / count_of_bits; + strcpy(param.ctrlgrp, "c2"); + strcpy(param.mongrp, "m2"); + strcpy(param.filename, RESULT_FILE_NAME2); + param.mask = l_mask; + param.num_of_runs = 0; + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + return errno; + } + + bm_pid = fork(); + + /* Set param values for child thread which will be allocated bitmask + * with n bits + */ + if (bm_pid == 0) { + param.mask = l_mask_1; + strcpy(param.ctrlgrp, "c1"); + strcpy(param.mongrp, "m1"); + param.span = cache_size * n / count_of_bits; + strcpy(param.filename, RESULT_FILE_NAME1); + param.num_of_runs = 0; + param.cpu_no = sibling_cpu_no; + } + + remove(param.filename); + + ret = cat_val(¶m); + if (ret) + return ret; + + ret = check_results(¶m); + if (ret) + return ret; + + if (bm_pid == 0) { + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + close(pipefd[1]); + perror("# failed signaling parent process"); + return errno; + } + + close(pipefd[1]); + while (1) + ; + } else { + /* Parent waits for child to be ready. */ + close(pipefd[1]); + pipe_message = 0; + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, + sizeof(pipe_message)) < sizeof(pipe_message)) { + perror("# failed reading from child process"); + break; + } + } + close(pipefd[0]); + kill(bm_pid, SIGKILL); + } + + cat_test_cleanup(); + if (bm_pid) + umount_resctrlfs(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c new file mode 100644 index 000000000000..c8756152bd61 --- /dev/null +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Monitoring Technology (CQM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" +#include <unistd.h> + +#define RESULT_FILE_NAME "result_cqm" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 2000000 +#define MAX_DIFF_PERCENT 15 + +int count_of_bits; +char cbm_mask[256]; +unsigned long long_mask; +unsigned long cache_size; + +static int cqm_setup(int num, ...) +{ + struct resctrl_val_param *p; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + p->num_of_runs++; + + return 0; +} + +static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) +{ + unsigned long avg_llc_occu_resc = 0; + float diff_percent; + long avg_diff = 0; + bool res; + + avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); + avg_diff = (long)abs(span - avg_llc_occu_resc); + + diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; + + if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || + (abs(avg_diff) <= MAX_DIFF)) + res = true; + else + res = false; + + printf("%sok CQM: diff within %d, %d\%%\n", res ? "" : "not", + MAX_DIFF, (int)MAX_DIFF_PERCENT); + + printf("# diff: %ld\n", avg_diff); + printf("# percent diff=%d\n", abs((int)diff_percent)); + printf("# Results are displayed in (Bytes)\n"); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + printf("# llc_occu_exp (span): %lu\n", span); + + tests_run++; +} + +static int check_results(struct resctrl_val_param *param, int no_of_bits) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_occu_resc = 0; + int runs = 0; + FILE *fp; + + printf("# checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Error in opening file\n"); + + return errno; + } + + while (fgets(temp, 1024, fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is llc occ resc value */ + if (runs > 0) + sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); + runs++; + } + fclose(fp); + show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + + return 0; +} + +void cqm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +{ + int ret, mum_resctrlfs; + + cache_size = 0; + mum_resctrlfs = 1; + + ret = remount_resctrlfs(mum_resctrlfs); + if (ret) + return ret; + + if (!validate_resctrl_feature_request("cqm")) + return -1; + + ret = get_cbm_mask("L3"); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + ret = get_cache_size(cpu_no, "L3", &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits) { + printf("Invalid input value for numbr_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", count_of_bits); + return -1; + } + + struct resctrl_val_param param = { + .resctrl_val = "cqm", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .filename = RESULT_FILE_NAME, + .mask = ~(long_mask << n) & long_mask, + .span = cache_size * n / count_of_bits, + .num_of_runs = 0, + .setup = cqm_setup, + }; + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) + sprintf(benchmark_cmd[1], "%lu", param.span); + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(¶m, n); + if (ret) + return ret; + + cqm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c new file mode 100644 index 000000000000..79c611c99a3d --- /dev/null +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fill_buf benchmark + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <inttypes.h> +#include <malloc.h> +#include <string.h> + +#include "resctrl.h" + +#define CL_SIZE (64) +#define PAGE_SIZE (4 * 1024) +#define MB (1024 * 1024) + +static unsigned char *startptr; + +static void sb(void) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("sfence\n\t" + : : : "memory"); +#endif +} + +static void ctrl_handler(int signo) +{ + free(startptr); + printf("\nEnding\n"); + sb(); + exit(EXIT_SUCCESS); +} + +static void cl_flush(void *p) +{ +#if defined(__i386) || defined(__x86_64) + asm volatile("clflush (%0)\n\t" + : : "r"(p) : "memory"); +#endif +} + +static void mem_flush(void *p, size_t s) +{ + char *cp = (char *)p; + size_t i = 0; + + s = s / CL_SIZE; /* mem size in cache llines */ + + for (i = 0; i < s; i++) + cl_flush(&cp[i * CL_SIZE]); + + sb(); +} + +static void *malloc_and_init_memory(size_t s) +{ + uint64_t *p64; + size_t s64; + + void *p = memalign(PAGE_SIZE, s); + + p64 = (uint64_t *)p; + s64 = s / sizeof(uint64_t); + + while (s64 > 0) { + *p64 = (uint64_t)rand(); + p64 += (CL_SIZE / sizeof(uint64_t)); + s64 -= (CL_SIZE / sizeof(uint64_t)); + } + + return p; +} + +static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char sum, *p; + + sum = 0; + p = start_ptr; + while (p < end_ptr) { + sum += *p; + p += (CL_SIZE / 2); + } + + return sum; +} + +static +void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr) +{ + unsigned char *p; + + p = start_ptr; + while (p < end_ptr) { + *p = '1'; + p += (CL_SIZE / 2); + } +} + +static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + int ret = 0; + FILE *fp; + + while (1) { + ret = fill_one_span_read(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } + + /* Consume read result so that reading memory is not optimized out. */ + fp = fopen("/dev/null", "w"); + if (!fp) + perror("Unable to write to /dev/null"); + fprintf(fp, "Sum: %d ", ret); + fclose(fp); + + return 0; +} + +static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, + char *resctrl_val) +{ + while (1) { + fill_one_span_write(start_ptr, end_ptr); + if (!strcmp(resctrl_val, "cat")) + break; + } + + return 0; +} + +static int +fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush, + int op, char *resctrl_val) +{ + unsigned char *start_ptr, *end_ptr; + unsigned long long i; + int ret; + + if (malloc_and_init) + start_ptr = malloc_and_init_memory(buf_size); + else + start_ptr = malloc(buf_size); + + if (!start_ptr) + return -1; + + startptr = start_ptr; + end_ptr = start_ptr + buf_size; + + /* + * It's better to touch the memory once to avoid any compiler + * optimizations + */ + if (!malloc_and_init) { + for (i = 0; i < buf_size; i++) + *start_ptr++ = (unsigned char)rand(); + } + + start_ptr = startptr; + + /* Flush the memory before using to avoid "cache hot pages" effect */ + if (memflush) + mem_flush(start_ptr, buf_size); + + if (op == 0) + ret = fill_cache_read(start_ptr, end_ptr, resctrl_val); + else + ret = fill_cache_write(start_ptr, end_ptr, resctrl_val); + + if (ret) { + printf("\n Error in fill cache read/write...\n"); + return -1; + } + + free(startptr); + + return 0; +} + +int run_fill_buf(unsigned long span, int malloc_and_init_memory, + int memflush, int op, char *resctrl_val) +{ + unsigned long long cache_size = span; + int ret; + + /* set up ctrl-c handler */ + if (signal(SIGINT, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGINT!\n"); + if (signal(SIGHUP, ctrl_handler) == SIG_ERR) + printf("Failed to catch SIGHUP!\n"); + + ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op, + resctrl_val); + if (ret) { + printf("\n Error in fill cache\n"); + return -1; + } + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c new file mode 100644 index 000000000000..7bf8eaa6204b --- /dev/null +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Allocation (MBA) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mba" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 300 +#define ALLOCATION_MAX 100 +#define ALLOCATION_MIN 10 +#define ALLOCATION_STEP 10 + +/* + * Change schemata percentage from 100 to 10%. Write schemata to specified + * con_mon grp, mon_grp in resctrl FS. + * For each allocation, run 5 times in order to get average values. + */ +static int mba_setup(int num, ...) +{ + static int runs_per_allocation, allocation = 100; + struct resctrl_val_param *p; + char allocation_str[64]; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + if (runs_per_allocation >= NUM_OF_RUNS) + runs_per_allocation = 0; + + /* Only set up schemata once every NUM_OF_RUNS of allocations */ + if (runs_per_allocation++ != 0) + return 0; + + if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX) + return -1; + + sprintf(allocation_str, "%d", allocation); + + write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val); + allocation -= ALLOCATION_STEP; + + return 0; +} + +static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) +{ + int allocation, runs; + bool failed = false; + + printf("# Results are displayed in (MB)\n"); + /* Memory bandwidth from 100% down to 10% */ + for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; + allocation++) { + unsigned long avg_bw_imc, avg_bw_resc; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + unsigned long avg_diff; + + /* + * The first run is discarded due to inaccurate value from + * phase transition. + */ + for (runs = NUM_OF_RUNS * allocation + 1; + runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); + avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); + avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); + + printf("%sok MBA schemata percentage %u smaller than %d %%\n", + avg_diff > MAX_DIFF ? "not " : "", + ALLOCATION_MAX - ALLOCATION_STEP * allocation, + MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", avg_diff); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); + if (avg_diff > MAX_DIFF) + failed = true; + } + + printf("%sok schemata change using MBA%s\n", failed ? "not " : "", + failed ? " # at least one test failed" : ""); + tests_run++; +} + +static int check_results(void) +{ + char *token_array[8], output[] = RESULT_FILE_NAME, temp[512]; + unsigned long bw_imc[1024], bw_resc[1024]; + int runs; + FILE *fp; + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is perf imc value */ + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + /* Field 5 is resctrl value */ + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + runs++; + } + + fclose(fp); + + show_mba_info(bw_imc, bw_resc); + + return 0; +} + +void mba_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mba", + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mba_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mba")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(); + if (ret) + return ret; + + mba_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c new file mode 100644 index 000000000000..4700f7453f81 --- /dev/null +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Bandwidth Monitoring (MBM) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define RESULT_FILE_NAME "result_mbm" +#define MAX_DIFF 300 +#define NUM_OF_RUNS 5 + +static void +show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) +{ + unsigned long avg_bw_imc = 0, avg_bw_resc = 0; + unsigned long sum_bw_imc = 0, sum_bw_resc = 0; + long avg_diff = 0; + int runs; + + /* + * Discard the first value which is inaccurate due to monitoring setup + * transition phase. + */ + for (runs = 1; runs < NUM_OF_RUNS ; runs++) { + sum_bw_imc += bw_imc[runs]; + sum_bw_resc += bw_resc[runs]; + } + + avg_bw_imc = sum_bw_imc / 4; + avg_bw_resc = sum_bw_resc / 4; + avg_diff = avg_bw_resc - avg_bw_imc; + + printf("%sok MBM: diff within %d%%\n", + labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF); + tests_run++; + printf("# avg_diff: %lu\n", labs(avg_diff)); + printf("# Span (MB): %d\n", span); + printf("# avg_bw_imc: %lu\n", avg_bw_imc); + printf("# avg_bw_resc: %lu\n", avg_bw_resc); +} + +static int check_results(int span) +{ + unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; + char temp[1024], *token_array[8]; + char output[] = RESULT_FILE_NAME; + int runs; + FILE *fp; + + printf("# Checking for pass/fail\n"); + + fp = fopen(output, "r"); + if (!fp) { + perror(output); + + return errno; + } + + runs = 0; + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int i = 0; + + while (token) { + token_array[i++] = token; + token = strtok(NULL, ":\t"); + } + + bw_resc[runs] = strtoul(token_array[5], NULL, 0); + bw_imc[runs] = strtoul(token_array[3], NULL, 0); + runs++; + } + + show_bw_info(bw_imc, bw_resc, span); + + fclose(fp); + + return 0; +} + +static int mbm_setup(int num, ...) +{ + struct resctrl_val_param *p; + static int num_of_runs; + va_list param; + int ret = 0; + + /* Run NUM_OF_RUNS times */ + if (num_of_runs++ >= NUM_OF_RUNS) + return -1; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Set up shemata with 100% allocation on the first run. */ + if (num_of_runs == 0) + ret = write_schemata(p->ctrlgrp, "100", p->cpu_no, + p->resctrl_val); + + return ret; +} + +void mbm_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) +{ + struct resctrl_val_param param = { + .resctrl_val = "mbm", + .ctrlgrp = "c1", + .mongrp = "m1", + .span = span, + .cpu_no = cpu_no, + .mum_resctrlfs = 1, + .filename = RESULT_FILE_NAME, + .bw_report = bw_report, + .setup = mbm_setup + }; + int ret; + + remove(RESULT_FILE_NAME); + + if (!validate_resctrl_feature_request("mbm")) + return -1; + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(span); + if (ret) + return ret; + + mbm_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h new file mode 100644 index 000000000000..39bf59c6b9c5 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE +#ifndef RESCTRL_H +#define RESCTRL_H +#include <stdio.h> +#include <stdarg.h> +#include <math.h> +#include <errno.h> +#include <sched.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <signal.h> +#include <dirent.h> +#include <stdbool.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/select.h> +#include <sys/time.h> +#include <sys/eventfd.h> +#include <asm/unistd.h> +#include <linux/perf_event.h> + +#define MB (1024 * 1024) +#define RESCTRL_PATH "/sys/fs/resctrl" +#define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" +#define CBM_MASK_PATH "/sys/fs/resctrl/info" + +#define PARENT_EXIT(err_msg) \ + do { \ + perror(err_msg); \ + kill(ppid, SIGKILL); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* + * resctrl_val_param: resctrl test parameters + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number to which the benchmark would be binded + * @span: Memory bytes accessed in each benchmark iteration + * @mum_resctrlfs: Should the resctrl FS be remounted? + * @filename: Name of file to which the o/p should be written + * @bw_report: Bandwidth report type (reads vs writes) + * @setup: Call back function to setup test environment + */ +struct resctrl_val_param { + char *resctrl_val; + char ctrlgrp[64]; + char mongrp[64]; + int cpu_no; + unsigned long span; + int mum_resctrlfs; + char filename[64]; + char *bw_report; + unsigned long mask; + int num_of_runs; + int (*setup)(int num, ...); +}; + +pid_t bm_pid, ppid; +int tests_run; + +char llc_occup_path[1024]; +bool is_amd; + +bool check_resctrlfs_support(void); +int filter_dmesg(void); +int remount_resctrlfs(bool mum_resctrlfs); +int get_resource_id(int cpu_no, int *resource_id); +int umount_resctrlfs(void); +int validate_bw_report_request(char *bw_report); +bool validate_resctrl_feature_request(char *resctrl_val); +char *fgrep(FILE *inf, const char *str); +int taskset_benchmark(pid_t bm_pid, int cpu_no); +void run_benchmark(int signum, siginfo_t *info, void *ucontext); +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, + char *resctrl_val); +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val); +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags); +int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush, + int op, char *resctrl_va); +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param); +int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd); +void tests_cleanup(void); +void mbm_test_cleanup(void); +int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); +void mba_test_cleanup(void); +int get_cbm_mask(char *cache_type); +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); +void ctrlc_handler(int signum, siginfo_t *info, void *ptr); +int cat_val(struct resctrl_val_param *param); +void cat_test_cleanup(void); +int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); +int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +unsigned int count_bits(unsigned long n); +void cqm_test_cleanup(void); +int get_core_sibling(int cpu_no); +int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); + +#endif /* RESCTRL_H */ diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c new file mode 100644 index 000000000000..425cc85ac883 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resctrl tests + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define BENCHMARK_ARGS 64 +#define BENCHMARK_ARG_SIZE 64 + +bool is_amd; + +void detect_amd(void) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + char *res; + + if (!inf) + return; + + res = fgrep(inf, "vendor_id"); + + if (res) { + char *s = strchr(res, ':'); + + is_amd = s && !strcmp(s, ": AuthenticAMD\n"); + free(res); + } + fclose(inf); +} + +static void cmd_help(void) +{ + printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); + printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-t test list: run tests specified in the test list, "); + printf("e.g. -t mbm, mba, cqm, cat\n"); + printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); + printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); + printf("\t-h: help\n"); +} + +void tests_cleanup(void) +{ + mbm_test_cleanup(); + mba_test_cleanup(); + cqm_test_cleanup(); + cat_test_cleanup(); +} + +int main(int argc, char **argv) +{ + bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; + char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; + char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; + int ben_ind, ben_count; + bool cat_test = true; + + for (i = 0; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + ben_ind = i + 1; + ben_count = argc - ben_ind; + argc_new = ben_ind - 1; + has_ben = true; + break; + } + } + + while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + char *token; + + switch (c) { + case 't': + token = strtok(optarg, ","); + + mbm_test = false; + mba_test = false; + cqm_test = false; + cat_test = false; + while (token) { + if (!strcmp(token, "mbm")) { + mbm_test = true; + } else if (!strcmp(token, "mba")) { + mba_test = true; + } else if (!strcmp(token, "cqm")) { + cqm_test = true; + } else if (!strcmp(token, "cat")) { + cat_test = true; + } else { + printf("invalid argument\n"); + + return -1; + } + token = strtok(NULL, ":\t"); + } + break; + case 'p': + cpu_no = atoi(optarg); + break; + case 'n': + no_of_bits = atoi(optarg); + break; + case 'h': + cmd_help(); + + return 0; + default: + printf("invalid argument\n"); + + return -1; + } + } + + printf("TAP version 13\n"); + + /* + * Typically we need root privileges, because: + * 1. We write to resctrl FS + * 2. We execute perf commands + */ + if (geteuid() != 0) + printf("# WARNING: not running as root, tests may fail.\n"); + + /* Detect AMD vendor */ + detect_amd(); + + if (has_ben) { + /* Extract benchmark command from command line. */ + for (i = ben_ind; i < argc; i++) { + benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i]; + sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]); + } + benchmark_cmd[ben_count] = NULL; + } else { + /* If no benchmark is given by "-b" argument, use fill_buf. */ + for (i = 0; i < 6; i++) + benchmark_cmd[i] = benchmark_cmd_area[i]; + + strcpy(benchmark_cmd[0], "fill_buf"); + sprintf(benchmark_cmd[1], "%d", span); + strcpy(benchmark_cmd[2], "1"); + strcpy(benchmark_cmd[3], "1"); + strcpy(benchmark_cmd[4], "0"); + strcpy(benchmark_cmd[5], ""); + benchmark_cmd[6] = NULL; + } + + sprintf(bw_report, "reads"); + sprintf(bm_type, "fill_buf"); + + check_resctrlfs_support(); + filter_dmesg(); + + if (!is_amd && mbm_test) { + printf("# Starting MBM BW change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "mba"); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + printf("%sok MBM: bw change\n", res ? "not " : ""); + mbm_test_cleanup(); + tests_run++; + } + + if (!is_amd && mba_test) { + printf("# Starting MBA Schemata change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + printf("%sok MBA: schemata change\n", res ? "not " : ""); + mba_test_cleanup(); + tests_run++; + } + + if (cqm_test) { + printf("# Starting CQM test ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", "cqm"); + res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + printf("%sok CQM: test\n", res ? "not " : ""); + cqm_test_cleanup(); + tests_run++; + } + + if (cat_test) { + printf("# Starting CAT test ...\n"); + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + printf("%sok CAT: test\n", res ? "not " : ""); + tests_run++; + cat_test_cleanup(); + } + + printf("1..%d\n", tests_run); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c new file mode 100644 index 000000000000..520fea3606d1 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory bandwidth monitoring and allocation library + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +#define UNCORE_IMC "uncore_imc" +#define READ_FILE_NAME "events/cas_count_read" +#define WRITE_FILE_NAME "events/cas_count_write" +#define DYN_PMU_PATH "/sys/bus/event_source/devices" +#define SCALE 0.00006103515625 +#define MAX_IMCS 20 +#define MAX_TOKENS 5 +#define READ 0 +#define WRITE 1 +#define CON_MON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MBM_LOCAL_BYTES_PATH \ + "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MON_MBM_LOCAL_BYTES_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define MBM_LOCAL_BYTES_PATH \ + "%s/mon_data/mon_L3_%02d/mbm_local_bytes" + +#define CON_MON_LCC_OCCUP_PATH \ + "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define CON_LCC_OCCUP_PATH \ + "%s/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define MON_LCC_OCCUP_PATH \ + "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy" + +#define LCC_OCCUP_PATH \ + "%s/mon_data/mon_L3_%02d/llc_occupancy" + +struct membw_read_format { + __u64 value; /* The value of the event */ + __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + __u64 id; /* if PERF_FORMAT_ID */ +}; + +struct imc_counter_config { + __u32 type; + __u64 event; + __u64 umask; + struct perf_event_attr pe; + struct membw_read_format return_value; + int fd; +}; + +static char mbm_total_path[1024]; +static int imcs; +static struct imc_counter_config imc_counters_config[MAX_IMCS][2]; + +void membw_initialize_perf_event_attr(int i, int j) +{ + memset(&imc_counters_config[i][j].pe, 0, + sizeof(struct perf_event_attr)); + imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type; + imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr); + imc_counters_config[i][j].pe.disabled = 1; + imc_counters_config[i][j].pe.inherit = 1; + imc_counters_config[i][j].pe.exclude_guest = 0; + imc_counters_config[i][j].pe.config = + imc_counters_config[i][j].umask << 8 | + imc_counters_config[i][j].event; + imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER; + imc_counters_config[i][j].pe.read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; +} + +void membw_ioctl_perf_event_ioc_reset_enable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0); + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void membw_ioctl_perf_event_ioc_disable(int i, int j) +{ + ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0); +} + +/* + * get_event_and_umask: Parse config into event and umask + * @cas_count_cfg: Config + * @count: iMC number + * @op: Operation (read/write) + */ +void get_event_and_umask(char *cas_count_cfg, int count, bool op) +{ + char *token[MAX_TOKENS]; + int i = 0; + + strcat(cas_count_cfg, ","); + token[0] = strtok(cas_count_cfg, "=,"); + + for (i = 1; i < MAX_TOKENS; i++) + token[i] = strtok(NULL, "=,"); + + for (i = 0; i < MAX_TOKENS; i++) { + if (!token[i]) + break; + if (strcmp(token[i], "event") == 0) { + if (op == READ) + imc_counters_config[count][READ].event = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].event = + strtol(token[i + 1], NULL, 16); + } + if (strcmp(token[i], "umask") == 0) { + if (op == READ) + imc_counters_config[count][READ].umask = + strtol(token[i + 1], NULL, 16); + else + imc_counters_config[count][WRITE].umask = + strtol(token[i + 1], NULL, 16); + } + } +} + +static int open_perf_event(int i, int cpu_no, int j) +{ + imc_counters_config[i][j].fd = + perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1, + PERF_FLAG_FD_CLOEXEC); + + if (imc_counters_config[i][j].fd == -1) { + fprintf(stderr, "Error opening leader %llx\n", + imc_counters_config[i][j].pe.config); + + return -1; + } + + return 0; +} + +/* Get type and config (read and write) of an iMC counter */ +static int read_from_imc_dir(char *imc_dir, int count) +{ + char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024]; + FILE *fp; + + /* Get type of iMC counter */ + sprintf(imc_counter_type, "%s%s", imc_dir, "type"); + fp = fopen(imc_counter_type, "r"); + if (!fp) { + perror("Failed to open imc counter type file"); + + return -1; + } + if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) { + perror("Could not get imc type"); + fclose(fp); + + return -1; + } + fclose(fp); + + imc_counters_config[count][WRITE].type = + imc_counters_config[count][READ].type; + + /* Get read config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count read"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, READ); + + /* Get write config */ + sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME); + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + perror("Failed to open imc config file"); + + return -1; + } + if (fscanf(fp, "%s", cas_count_cfg) <= 0) { + perror("Could not get imc cas count write"); + fclose(fp); + + return -1; + } + fclose(fp); + + get_event_and_umask(cas_count_cfg, count, WRITE); + + return 0; +} + +/* + * A system can have 'n' number of iMC (Integrated Memory Controller) + * counters, get that 'n'. For each iMC counter get it's type and config. + * Also, each counter has two configs, one for read and the other for write. + * A config again has two parts, event and umask. + * Enumerate all these details into an array of structures. + * + * Return: >= 0 on success. < 0 on failure. + */ +static int num_of_imcs(void) +{ + unsigned int count = 0; + char imc_dir[512]; + struct dirent *ep; + int ret; + DIR *dp; + + dp = opendir(DYN_PMU_PATH); + if (dp) { + while ((ep = readdir(dp))) { + if (strstr(ep->d_name, UNCORE_IMC)) { + sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, + ep->d_name); + ret = read_from_imc_dir(imc_dir, count); + if (ret) { + closedir(dp); + + return ret; + } + count++; + } + } + closedir(dp); + if (count == 0) { + perror("Unable find iMC counters!\n"); + + return -1; + } + } else { + perror("Unable to open PMU directory!\n"); + + return -1; + } + + return count; +} + +static int initialize_mem_bw_imc(void) +{ + int imc, j; + + imcs = num_of_imcs(); + if (imcs <= 0) + return imcs; + + /* Initialize perf_event_attr structures for all iMC's */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_initialize_perf_event_attr(imc, j); + } + + return 0; +} + +/* + * get_mem_bw_imc: Memory band width as reported by iMC counters + * @cpu_no: CPU number that the benchmark PID is binded to + * @bw_report: Bandwidth report type (reads, writes) + * + * Memory B/W utilized by a process on a socket can be calculated using + * iMC counters. Perf events are used to read these counters. + * + * Return: >= 0 on success. < 0 on failure. + */ +static float get_mem_bw_imc(int cpu_no, char *bw_report) +{ + float reads, writes, of_mul_read, of_mul_write; + int imc, j, ret; + + /* Start all iMC counters to log values (both read and write) */ + reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1; + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) { + ret = open_perf_event(imc, cpu_no, j); + if (ret) + return -1; + } + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_reset_enable(imc, j); + } + + sleep(1); + + /* Stop counters after a second to get results (both read and write) */ + for (imc = 0; imc < imcs; imc++) { + for (j = 0; j < 2; j++) + membw_ioctl_perf_event_ioc_disable(imc, j); + } + + /* + * Get results which are stored in struct type imc_counter_config + * Take over flow into consideration before calculating total b/w + */ + for (imc = 0; imc < imcs; imc++) { + struct imc_counter_config *r = + &imc_counters_config[imc][READ]; + struct imc_counter_config *w = + &imc_counters_config[imc][WRITE]; + + if (read(r->fd, &r->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get read b/w through iMC"); + + return -1; + } + + if (read(w->fd, &w->return_value, + sizeof(struct membw_read_format)) == -1) { + perror("Couldn't get write bw through iMC"); + + return -1; + } + + __u64 r_time_enabled = r->return_value.time_enabled; + __u64 r_time_running = r->return_value.time_running; + + if (r_time_enabled != r_time_running) + of_mul_read = (float)r_time_enabled / + (float)r_time_running; + + __u64 w_time_enabled = w->return_value.time_enabled; + __u64 w_time_running = w->return_value.time_running; + + if (w_time_enabled != w_time_running) + of_mul_write = (float)w_time_enabled / + (float)w_time_running; + reads += r->return_value.value * of_mul_read * SCALE; + writes += w->return_value.value * of_mul_write * SCALE; + } + + for (imc = 0; imc < imcs; imc++) { + close(imc_counters_config[imc][READ].fd); + close(imc_counters_config[imc][WRITE].fd); + } + + if (strcmp(bw_report, "reads") == 0) + return reads; + + if (strcmp(bw_report, "writes") == 0) + return writes; + + return (reads + writes); +} + +void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) +{ + if (ctrlgrp && mongrp) + sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, mongrp, resource_id); + else if (!ctrlgrp && mongrp) + sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + mongrp, resource_id); + else if (ctrlgrp && !mongrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + ctrlgrp, resource_id); + else if (!ctrlgrp && !mongrp) + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, + resource_id); +} + +/* + * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + */ +static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("Could not get resource_id"); + return; + } + + if (strcmp(resctrl_val, "mbm") == 0) + set_mbm_path(ctrlgrp, mongrp, resource_id); + + if ((strcmp(resctrl_val, "mba") == 0)) { + if (ctrlgrp) + sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, ctrlgrp, resource_id); + else + sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, + RESCTRL_PATH, resource_id); + } +} + +/* + * Get MBM Local bytes as reported by resctrl FS + * For MBM, + * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp + * 2. If only con_mon grp is given, then read from con_mon grp + * 3. If both are not given, then read from root con_mon grp + * For MBA, + * 1. If con_mon grp is given, then read from it + * 2. If con_mon grp is not given, then read from root con_mon grp + */ +static unsigned long get_mem_bw_resctrl(void) +{ + unsigned long mbm_total = 0; + FILE *fp; + + fp = fopen(mbm_total_path, "r"); + if (!fp) { + perror("Failed to open total bw file"); + + return -1; + } + if (fscanf(fp, "%lu", &mbm_total) <= 0) { + perror("Could not get mbm local bytes"); + fclose(fp); + + return -1; + } + fclose(fp); + + return mbm_total; +} + +pid_t bm_pid, ppid; + +void ctrlc_handler(int signum, siginfo_t *info, void *ptr) +{ + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + tests_cleanup(); + printf("Ending\n\n"); + + exit(EXIT_SUCCESS); +} + +/* + * print_results_bw: the memory bandwidth results are stored in a file + * @filename: file that stores the results + * @bm_pid: child pid that runs benchmark + * @bw_imc: perf imc counter value + * @bw_resc: memory bandwidth value + * + * Return: 0 on success. non-zero on failure. + */ +static int print_results_bw(char *filename, int bm_pid, float bw_imc, + unsigned long bw_resc) +{ + unsigned long diff = fabs(bw_imc - bw_resc); + FILE *fp; + + if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { + printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc); + printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff); + } else { + fp = fopen(filename, "a"); + if (!fp) { + perror("Cannot open results file"); + + return errno; + } + if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n", + bm_pid, bw_imc, bw_resc, diff) <= 0) { + fclose(fp); + perror("Could not log results."); + + return errno; + } + fclose(fp); + } + + return 0; +} + +static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +{ + if (strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, mongrp, sock_num); + else if (!strlen(ctrlgrp) && strlen(mongrp)) + sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH, + mongrp, sock_num); + else if (strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH, + ctrlgrp, sock_num); + else if (!strlen(ctrlgrp) && !strlen(mongrp)) + sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num); +} + +/* + * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path" + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + */ +static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, + int cpu_no, char *resctrl_val) +{ + int resource_id; + + if (get_resource_id(cpu_no, &resource_id) < 0) { + perror("# Unable to resource_id"); + return; + } + + if (strcmp(resctrl_val, "cqm") == 0) + set_cqm_path(ctrlgrp, mongrp, resource_id); +} + +static int +measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) +{ + unsigned long bw_imc, bw_resc, bw_resc_end; + int ret; + + /* + * Measure memory bandwidth from resctrl and from + * another source which is perf imc value or could + * be something else if perf imc event is not available. + * Compare the two values to validate resctrl value. + * It takes 1sec to measure the data. + */ + bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); + if (bw_imc <= 0) + return bw_imc; + + bw_resc_end = get_mem_bw_resctrl(); + if (bw_resc_end <= 0) + return bw_resc_end; + + bw_resc = (bw_resc_end - *bw_resc_start) / MB; + ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); + if (ret) + return ret; + + *bw_resc_start = bw_resc_end; + + return 0; +} + +/* + * resctrl_val: execute benchmark and measure memory bandwidth on + * the benchmark + * @benchmark_cmd: benchmark command and its arguments + * @param: parameters passed to resctrl_val() + * + * Return: 0 on success. non-zero on failure. + */ +int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) +{ + char *resctrl_val = param->resctrl_val; + unsigned long bw_resc_start = 0; + struct sigaction sigact; + int ret = 0, pipefd[2]; + char pipe_message = 0; + union sigval value; + + if (strcmp(param->filename, "") == 0) + sprintf(param->filename, "stdio"); + + if ((strcmp(resctrl_val, "mba")) == 0 || + (strcmp(resctrl_val, "mbm")) == 0) { + ret = validate_bw_report_request(param->bw_report); + if (ret) + return ret; + } + + ret = remount_resctrlfs(param->mum_resctrlfs); + if (ret) + return ret; + + /* + * If benchmark wasn't successfully started by child, then child should + * kill parent, so save parent's pid + */ + ppid = getpid(); + + if (pipe(pipefd)) { + perror("# Unable to create pipe"); + + return -1; + } + + /* + * Fork to start benchmark, save child's pid so that it can be killed + * when needed + */ + bm_pid = fork(); + if (bm_pid == -1) { + perror("# Unable to fork"); + + return -1; + } + + if (bm_pid == 0) { + /* + * Mask all signals except SIGUSR1, parent uses SIGUSR1 to + * start benchmark + */ + sigfillset(&sigact.sa_mask); + sigdelset(&sigact.sa_mask, SIGUSR1); + + sigact.sa_sigaction = run_benchmark; + sigact.sa_flags = SA_SIGINFO; + + /* Register for "SIGUSR1" signal from parent */ + if (sigaction(SIGUSR1, &sigact, NULL)) + PARENT_EXIT("Can't register child for signal"); + + /* Tell parent that child is ready */ + close(pipefd[0]); + pipe_message = 1; + if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed signaling parent process"); + close(pipefd[1]); + return -1; + } + close(pipefd[1]); + + /* Suspend child until delivery of "SIGUSR1" from parent */ + sigsuspend(&sigact.sa_mask); + + PARENT_EXIT("Child is done"); + } + + printf("# benchmark PID: %d\n", bm_pid); + + /* + * Register CTRL-C handler for parent, as it has to kill benchmark + * before exiting + */ + sigact.sa_sigaction = ctrlc_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_SIGINFO; + if (sigaction(SIGINT, &sigact, NULL) || + sigaction(SIGHUP, &sigact, NULL)) { + perror("# sigaction"); + ret = errno; + goto out; + } + + value.sival_ptr = benchmark_cmd; + + /* Taskset benchmark to specified cpu */ + ret = taskset_benchmark(bm_pid, param->cpu_no); + if (ret) + goto out; + + /* Write benchmark to specified control&monitoring grp in resctrl FS */ + ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp, + resctrl_val); + if (ret) + goto out; + + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { + ret = initialize_mem_bw_imc(); + if (ret) + goto out; + + initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + } else if (strcmp(resctrl_val, "cqm") == 0) + initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, + param->cpu_no, resctrl_val); + + /* Parent waits for child to be ready. */ + close(pipefd[1]); + while (pipe_message != 1) { + if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) < + sizeof(pipe_message)) { + perror("# failed reading message from child process"); + close(pipefd[0]); + goto out; + } + } + close(pipefd[0]); + + /* Signal child to start benchmark */ + if (sigqueue(bm_pid, SIGUSR1, value) == -1) { + perror("# sigqueue SIGUSR1 to child"); + ret = errno; + goto out; + } + + /* Give benchmark enough time to fully run */ + sleep(1); + + /* Test runs until the callback setup() tells the test to stop. */ + while (1) { + if ((strcmp(resctrl_val, "mbm") == 0) || + (strcmp(resctrl_val, "mba") == 0)) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + + ret = measure_vals(param, &bw_resc_start); + if (ret) + break; + } else if (strcmp(resctrl_val, "cqm") == 0) { + ret = param->setup(1, param); + if (ret) { + ret = 0; + break; + } + sleep(1); + ret = measure_cache_vals(param, bm_pid); + if (ret) + break; + } else { + break; + } + } + +out: + kill(bm_pid, SIGKILL); + umount_resctrlfs(); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c new file mode 100644 index 000000000000..19c0ec4045a4 --- /dev/null +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -0,0 +1,722 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic resctrl file system operations + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, + * Fenghua Yu <fenghua.yu@intel.com> + */ +#include "resctrl.h" + +int tests_run; + +static int find_resctrl_mount(char *buffer) +{ + FILE *mounts; + char line[256], *fs, *mntpoint; + + mounts = fopen("/proc/mounts", "r"); + if (!mounts) { + perror("/proc/mounts"); + return -ENXIO; + } + while (!feof(mounts)) { + if (!fgets(line, 256, mounts)) + break; + fs = strtok(line, " \t"); + if (!fs) + continue; + mntpoint = strtok(NULL, " \t"); + if (!mntpoint) + continue; + fs = strtok(NULL, " \t"); + if (!fs) + continue; + if (strcmp(fs, "resctrl")) + continue; + + fclose(mounts); + if (buffer) + strncpy(buffer, mntpoint, 256); + + return 0; + } + + fclose(mounts); + + return -ENOENT; +} + +char cbm_mask[256]; + +/* + * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl + * @mum_resctrlfs: Should the resctrl FS be remounted? + * + * If not mounted, mount it. + * If mounted and mum_resctrlfs then remount resctrl FS. + * If mounted and !mum_resctrlfs then noop + * + * Return: 0 on success, non-zero on failure + */ +int remount_resctrlfs(bool mum_resctrlfs) +{ + char mountpoint[256]; + int ret; + + ret = find_resctrl_mount(mountpoint); + if (ret) + strcpy(mountpoint, RESCTRL_PATH); + + if (!ret && mum_resctrlfs && umount(mountpoint)) { + printf("not ok unmounting \"%s\"\n", mountpoint); + perror("# umount"); + tests_run++; + } + + if (!ret && !mum_resctrlfs) + return 0; + + ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); + printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", + RESCTRL_PATH); + if (ret) + perror("# mount"); + + tests_run++; + + return ret; +} + +int umount_resctrlfs(void) +{ + if (umount(RESCTRL_PATH)) { + perror("# Unable to umount resctrl"); + + return errno; + } + + return 0; +} + +/* + * get_resource_id - Get socket number/l3 id for a specified CPU + * @cpu_no: CPU number + * @resource_id: Socket number or l3_id + * + * Return: >= 0 on success, < 0 on failure. + */ +int get_resource_id(int cpu_no, int *resource_id) +{ + char phys_pkg_path[1024]; + FILE *fp; + + if (is_amd) + sprintf(phys_pkg_path, "%s%d/cache/index3/id", + PHYS_ID_PATH, cpu_no); + else + sprintf(phys_pkg_path, "%s%d/topology/physical_package_id", + PHYS_ID_PATH, cpu_no); + + fp = fopen(phys_pkg_path, "r"); + if (!fp) { + perror("Failed to open physical_package_id"); + + return -1; + } + if (fscanf(fp, "%d", resource_id) <= 0) { + perror("Could not get socket number or l3 id"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_cache_size - Get cache size for a specified CPU + * @cpu_no: CPU number + * @cache_type: Cache level L2/L3 + * @cache_size: pointer to cache_size + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) +{ + char cache_path[1024], cache_str[64]; + int length, i, cache_num; + FILE *fp; + + if (!strcmp(cache_type, "L3")) { + cache_num = 3; + } else if (!strcmp(cache_type, "L2")) { + cache_num = 2; + } else { + perror("Invalid cache level"); + return -1; + } + + sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size", + cpu_no, cache_num); + fp = fopen(cache_path, "r"); + if (!fp) { + perror("Failed to open cache size"); + + return -1; + } + if (fscanf(fp, "%s", cache_str) <= 0) { + perror("Could not get cache_size"); + fclose(fp); + + return -1; + } + fclose(fp); + + length = (int)strlen(cache_str); + + *cache_size = 0; + + for (i = 0; i < length; i++) { + if ((cache_str[i] >= '0') && (cache_str[i] <= '9')) + + *cache_size = *cache_size * 10 + (cache_str[i] - '0'); + + else if (cache_str[i] == 'K') + + *cache_size = *cache_size * 1024; + + else if (cache_str[i] == 'M') + + *cache_size = *cache_size * 1024 * 1024; + + else + break; + } + + return 0; +} + +#define CORE_SIBLINGS_PATH "/sys/bus/cpu/devices/cpu" + +/* + * get_cbm_mask - Get cbm mask for given cache + * @cache_type: Cache level L2/L3 + * + * Mask is stored in cbm_mask which is global variable. + * + * Return: = 0 on success, < 0 on failure. + */ +int get_cbm_mask(char *cache_type) +{ + char cbm_mask_path[1024]; + FILE *fp; + + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); + + fp = fopen(cbm_mask_path, "r"); + if (!fp) { + perror("Failed to open cache level"); + + return -1; + } + if (fscanf(fp, "%s", cbm_mask) <= 0) { + perror("Could not get max cbm_mask"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * get_core_sibling - Get sibling core id from the same socket for given CPU + * @cpu_no: CPU number + * + * Return: > 0 on success, < 0 on failure. + */ +int get_core_sibling(int cpu_no) +{ + char core_siblings_path[1024], cpu_list_str[64]; + int sibling_cpu_no = -1; + FILE *fp; + + sprintf(core_siblings_path, "%s%d/topology/core_siblings_list", + CORE_SIBLINGS_PATH, cpu_no); + + fp = fopen(core_siblings_path, "r"); + if (!fp) { + perror("Failed to open core siblings path"); + + return -1; + } + if (fscanf(fp, "%s", cpu_list_str) <= 0) { + perror("Could not get core_siblings list"); + fclose(fp); + + return -1; + } + fclose(fp); + + char *token = strtok(cpu_list_str, "-,"); + + while (token) { + sibling_cpu_no = atoi(token); + /* Skipping core 0 as we don't want to run test on core 0 */ + if (sibling_cpu_no != 0) + break; + token = strtok(NULL, "-,"); + } + + return sibling_cpu_no; +} + +/* + * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu + * @bm_pid: PID that should be binded + * @cpu_no: CPU number at which the PID would be binded + * + * Return: 0 on success, non-zero on failure + */ +int taskset_benchmark(pid_t bm_pid, int cpu_no) +{ + cpu_set_t my_set; + + CPU_ZERO(&my_set); + CPU_SET(cpu_no, &my_set); + + if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &my_set)) { + perror("Unable to taskset benchmark"); + + return -1; + } + + return 0; +} + +/* + * run_benchmark - Run a specified benchmark or fill_buf (default benchmark) + * in specified signal. Direct benchmark stdio to /dev/null. + * @signum: signal number + * @info: signal info + * @ucontext: user context in signal handling + * + * Return: void + */ +void run_benchmark(int signum, siginfo_t *info, void *ucontext) +{ + int operation, ret, malloc_and_init_memory, memflush; + unsigned long span, buffer_span; + char **benchmark_cmd; + char resctrl_val[64]; + FILE *fp; + + benchmark_cmd = info->si_ptr; + + /* + * Direct stdio of child to /dev/null, so that only parent writes to + * stdio (console) + */ + fp = freopen("/dev/null", "w", stdout); + if (!fp) + PARENT_EXIT("Unable to direct benchmark status to /dev/null"); + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) { + /* Execute default fill_buf benchmark */ + span = strtoul(benchmark_cmd[1], NULL, 10); + malloc_and_init_memory = atoi(benchmark_cmd[2]); + memflush = atoi(benchmark_cmd[3]); + operation = atoi(benchmark_cmd[4]); + sprintf(resctrl_val, "%s", benchmark_cmd[5]); + + if (strcmp(resctrl_val, "cqm") != 0) + buffer_span = span * MB; + else + buffer_span = span; + + if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush, + operation, resctrl_val)) + fprintf(stderr, "Error in running fill buffer\n"); + } else { + /* Execute specified benchmark */ + ret = execvp(benchmark_cmd[0], benchmark_cmd); + if (ret) + perror("wrong\n"); + } + + fclose(stdout); + PARENT_EXIT("Unable to run specified benchmark"); +} + +/* + * create_grp - Create a group only if one doesn't exist + * @grp_name: Name of the group + * @grp: Full path and name of the group + * @parent_grp: Full path and name of the parent group + * + * Return: 0 on success, non-zero on failure + */ +static int create_grp(const char *grp_name, char *grp, const char *parent_grp) +{ + int found_grp = 0; + struct dirent *ep; + DIR *dp; + + /* + * At this point, we are guaranteed to have resctrl FS mounted and if + * length of grp_name == 0, it means, user wants to use root con_mon + * grp, so do nothing + */ + if (strlen(grp_name) == 0) + return 0; + + /* Check if requested grp exists or not */ + dp = opendir(parent_grp); + if (dp) { + while ((ep = readdir(dp)) != NULL) { + if (strcmp(ep->d_name, grp_name) == 0) + found_grp = 1; + } + closedir(dp); + } else { + perror("Unable to open resctrl for group"); + + return -1; + } + + /* Requested grp doesn't exist, hence create it */ + if (found_grp == 0) { + if (mkdir(grp, 0) == -1) { + perror("Unable to create group"); + + return -1; + } + } + + return 0; +} + +static int write_pid_to_tasks(char *tasks, pid_t pid) +{ + FILE *fp; + + fp = fopen(tasks, "w"); + if (!fp) { + perror("Failed to open tasks file"); + + return -1; + } + if (fprintf(fp, "%d\n", pid) < 0) { + perror("Failed to wr pid to tasks file"); + fclose(fp); + + return -1; + } + fclose(fp); + + return 0; +} + +/* + * write_bm_pid_to_resctrl - Write a PID (i.e. benchmark) to resctrl FS + * @bm_pid: PID that should be written + * @ctrlgrp: Name of the control monitor group (con_mon grp) + * @mongrp: Name of the monitor group (mon grp) + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * If a con_mon grp is requested, create it and write pid to it, otherwise + * write pid to root con_mon grp. + * If a mon grp is requested, create it and write pid to it, otherwise + * pid is not written, this means that pid is in con_mon grp and hence + * should consult con_mon grp's mon_data directory for results. + * + * Return: 0 on success, non-zero on failure + */ +int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, + char *resctrl_val) +{ + char controlgroup[128], monitorgroup[512], monitorgroup_p[256]; + char tasks[1024]; + int ret = 0; + + if (strlen(ctrlgrp)) + sprintf(controlgroup, "%s/%s", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s", RESCTRL_PATH); + + /* Create control and monitoring group and write pid into it */ + ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH); + if (ret) + goto out; + sprintf(tasks, "%s/tasks", controlgroup); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + + /* Create mon grp and write pid into it for "mbm" and "cqm" test */ + if ((strcmp(resctrl_val, "cqm") == 0) || + (strcmp(resctrl_val, "mbm") == 0)) { + if (strlen(mongrp)) { + sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); + sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); + ret = create_grp(mongrp, monitorgroup, monitorgroup_p); + if (ret) + goto out; + + sprintf(tasks, "%s/mon_groups/%s/tasks", + controlgroup, mongrp); + ret = write_pid_to_tasks(tasks, bm_pid); + if (ret) + goto out; + } + } + +out: + printf("%sok writing benchmark parameters to resctrl FS\n", + ret ? "not " : ""); + if (ret) + perror("# writing to resctrlfs"); + + tests_run++; + + return ret; +} + +/* + * write_schemata - Update schemata of a con_mon grp + * @ctrlgrp: Name of the con_mon grp + * @schemata: Schemata that should be updated to + * @cpu_no: CPU number that the benchmark PID is binded to + * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc) + * + * Update schemata of a con_mon grp *only* if requested resctrl feature is + * allocation type + * + * Return: 0 on success, non-zero on failure + */ +int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) +{ + char controlgroup[1024], schema[1024], reason[64]; + int resource_id, ret = 0; + FILE *fp; + + if ((strcmp(resctrl_val, "mba") != 0) && + (strcmp(resctrl_val, "cat") != 0) && + (strcmp(resctrl_val, "cqm") != 0)) + return -ENOENT; + + if (!schemata) { + printf("# Skipping empty schemata update\n"); + + return -1; + } + + if (get_resource_id(cpu_no, &resource_id) < 0) { + sprintf(reason, "Failed to get resource id"); + ret = -1; + + goto out; + } + + if (strlen(ctrlgrp) != 0) + sprintf(controlgroup, "%s/%s/schemata", RESCTRL_PATH, ctrlgrp); + else + sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); + + if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) + sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); + if (strcmp(resctrl_val, "mba") == 0) + sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); + + fp = fopen(controlgroup, "w"); + if (!fp) { + sprintf(reason, "Failed to open control group"); + ret = -1; + + goto out; + } + + if (fprintf(fp, "%s\n", schema) < 0) { + sprintf(reason, "Failed to write schemata in control group"); + fclose(fp); + ret = -1; + + goto out; + } + fclose(fp); + +out: + printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", + ret ? "not " : "", schema, ret ? " # " : "", + ret ? reason : ""); + tests_run++; + + return ret; +} + +bool check_resctrlfs_support(void) +{ + FILE *inf = fopen("/proc/filesystems", "r"); + DIR *dp; + char *res; + bool ret = false; + + if (!inf) + return false; + + res = fgrep(inf, "nodev\tresctrl\n"); + + if (res) { + ret = true; + free(res); + } + + fclose(inf); + + printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not "); + tests_run++; + + dp = opendir(RESCTRL_PATH); + printf("%sok resctrl mountpoint \"%s\" exists\n", + dp ? "" : "not ", RESCTRL_PATH); + if (dp) + closedir(dp); + tests_run++; + + printf("# resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); + + return ret; +} + +char *fgrep(FILE *inf, const char *str) +{ + char line[256]; + int slen = strlen(str); + + while (!feof(inf)) { + if (!fgets(line, 256, inf)) + break; + if (strncmp(line, str, slen)) + continue; + + return strdup(line); + } + + return NULL; +} + +/* + * validate_resctrl_feature_request - Check if requested feature is valid. + * @resctrl_val: Requested feature + * + * Return: 0 on success, non-zero on failure + */ +bool validate_resctrl_feature_request(char *resctrl_val) +{ + FILE *inf = fopen("/proc/cpuinfo", "r"); + bool found = false; + char *res; + + if (!inf) + return false; + + res = fgrep(inf, "flags"); + + if (res) { + char *s = strchr(res, ':'); + + found = s && !strstr(s, resctrl_val); + free(res); + } + fclose(inf); + + return found; +} + +int filter_dmesg(void) +{ + char line[1024]; + FILE *fp; + int pipefds[2]; + pid_t pid; + int ret; + + ret = pipe(pipefds); + if (ret) { + perror("pipe"); + return ret; + } + pid = fork(); + if (pid == 0) { + close(pipefds[0]); + dup2(pipefds[1], STDOUT_FILENO); + execlp("dmesg", "dmesg", NULL); + perror("executing dmesg"); + exit(1); + } + close(pipefds[1]); + fp = fdopen(pipefds[0], "r"); + if (!fp) { + perror("fdopen(pipe)"); + kill(pid, SIGTERM); + + return -1; + } + + while (fgets(line, 1024, fp)) { + if (strstr(line, "intel_rdt:")) + printf("# dmesg: %s", line); + if (strstr(line, "resctrl:")) + printf("# dmesg: %s", line); + } + fclose(fp); + waitpid(pid, NULL, 0); + + return 0; +} + +int validate_bw_report_request(char *bw_report) +{ + if (strcmp(bw_report, "reads") == 0) + return 0; + if (strcmp(bw_report, "writes") == 0) + return 0; + if (strcmp(bw_report, "nt-writes") == 0) { + strcpy(bw_report, "writes"); + return 0; + } + if (strcmp(bw_report, "total") == 0) + return 0; + + fprintf(stderr, "Requested iMC B/W report type unavailable\n"); + + return -1; +} + +int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, + int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} + +unsigned int count_bits(unsigned long n) +{ + unsigned int count = 0; + + while (n) { + count += n & 1; + n >>= 1; + } + + return count; +} diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index cc610da7e369..5910888ebfe1 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only basic_percpu_ops_test basic_test basic_rseq_op_test diff --git a/tools/testing/selftests/rtc/.gitignore b/tools/testing/selftests/rtc/.gitignore index d0ad44f6294a..fb2d533aa575 100644 --- a/tools/testing/selftests/rtc/.gitignore +++ b/tools/testing/selftests/rtc/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only rtctest setdate diff --git a/tools/testing/selftests/safesetid/.gitignore b/tools/testing/selftests/safesetid/.gitignore index 9c1a629bca01..25d3db172907 100644 --- a/tools/testing/selftests/safesetid/.gitignore +++ b/tools/testing/selftests/safesetid/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only safesetid-test diff --git a/tools/testing/selftests/seccomp/.gitignore b/tools/testing/selftests/seccomp/.gitignore index 5af29d3a1b0a..dec678577f9c 100644 --- a/tools/testing/selftests/seccomp/.gitignore +++ b/tools/testing/selftests/seccomp/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only seccomp_bpf seccomp_benchmark diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile index 1760b3e39730..0ebfe8b0e147 100644 --- a/tools/testing/selftests/seccomp/Makefile +++ b/tools/testing/selftests/seccomp/Makefile @@ -1,17 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -all: - -include ../lib.mk - -.PHONY: all clean - -BINARIES := seccomp_bpf seccomp_benchmark CFLAGS += -Wl,-no-as-needed -Wall +LDFLAGS += -lpthread -seccomp_bpf: seccomp_bpf.c ../kselftest_harness.h - $(CC) $(CFLAGS) $(LDFLAGS) $< -lpthread -o $@ - -TEST_PROGS += $(BINARIES) -EXTRA_CLEAN := $(BINARIES) - -all: $(BINARIES) +TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark +include ../lib.mk diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index ee1b727ede04..c0aa46ce14f6 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -212,6 +212,10 @@ struct seccomp_notif_sizes { #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001 #endif +#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4) +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -909,7 +913,7 @@ TEST(ERRNO_order) EXPECT_EQ(12, errno); } -FIXTURE_DATA(TRAP) { +FIXTURE(TRAP) { struct sock_fprog prog; }; @@ -1020,7 +1024,7 @@ TEST_F(TRAP, handler) EXPECT_NE(0, (unsigned long)sigsys->_call_addr); } -FIXTURE_DATA(precedence) { +FIXTURE(precedence) { struct sock_fprog allow; struct sock_fprog log; struct sock_fprog trace; @@ -1509,7 +1513,7 @@ void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status, EXPECT_EQ(0, ret); } -FIXTURE_DATA(TRACE_poke) { +FIXTURE(TRACE_poke) { struct sock_fprog prog; pid_t tracer; long poked; @@ -1817,7 +1821,7 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, change_syscall(_metadata, tracee, -1, -ESRCH); } -FIXTURE_DATA(TRACE_syscall) { +FIXTURE(TRACE_syscall) { struct sock_fprog prog; pid_t tracer, mytid, mypid, parent; }; @@ -2187,7 +2191,8 @@ TEST(detect_seccomp_filter_flags) unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_SPEC_ALLOW, - SECCOMP_FILTER_FLAG_NEW_LISTENER }; + SECCOMP_FILTER_FLAG_NEW_LISTENER, + SECCOMP_FILTER_FLAG_TSYNC_ESRCH }; unsigned int exclusive[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_NEW_LISTENER }; @@ -2321,7 +2326,7 @@ struct tsync_sibling { } \ } while (0) -FIXTURE_DATA(TSYNC) { +FIXTURE(TSYNC) { struct sock_fprog root_prog, apply_prog; struct tsync_sibling sibling[TSYNC_SIBLINGS]; sem_t started; @@ -2645,6 +2650,55 @@ TEST_F(TSYNC, two_siblings_with_one_divergence) EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); } +TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err) +{ + long ret, flags; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + flags = SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog); + ASSERT_EQ(ESRCH, errno) { + TH_LOG("Did not return ESRCH for diverged sibling."); + } + ASSERT_EQ(-1, ret) { + TH_LOG("Did not fail on diverged sibling."); + } + + /* Wake the threads */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both unkilled. */ + PTHREAD_JOIN(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); + PTHREAD_JOIN(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); +} + TEST_F(TSYNC, two_siblings_not_under_filter) { long ret, sib; @@ -2749,12 +2803,13 @@ TEST(syscall_restart) offsetof(struct seccomp_data, nr)), #ifdef __NR_sigreturn - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0), #endif - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 4, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0), BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0), /* Allow __NR_write for easy logging. */ @@ -2841,7 +2896,8 @@ TEST(syscall_restart) ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16)); ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg)); ASSERT_EQ(0x100, msg); - EXPECT_EQ(__NR_nanosleep, get_syscall(_metadata, child_pid)); + ret = get_syscall(_metadata, child_pid); + EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep); /* Might as well check siginfo for sanity while we're here. */ ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info)); @@ -3196,6 +3252,24 @@ TEST(user_notification_basic) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(user_notification_with_tsync) +{ + int ret; + unsigned int flags; + + /* these were exclusive */ + flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_TSYNC; + ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags)); + ASSERT_EQ(EINVAL, errno); + + /* but now they're not */ + flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = user_trap_syscall(__NR_getppid, flags); + close(ret); + ASSERT_LE(0, ret); +} + TEST(user_notification_kill_in_middle) { pid_t pid; diff --git a/tools/testing/selftests/sigaltstack/.gitignore b/tools/testing/selftests/sigaltstack/.gitignore index 35897b0a3f44..50a19a8888ce 100644 --- a/tools/testing/selftests/sigaltstack/.gitignore +++ b/tools/testing/selftests/sigaltstack/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only sas diff --git a/tools/testing/selftests/size/.gitignore b/tools/testing/selftests/size/.gitignore index 189b7818de34..923e18eed1a0 100644 --- a/tools/testing/selftests/size/.gitignore +++ b/tools/testing/selftests/size/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only get_size diff --git a/tools/testing/selftests/sparc64/drivers/.gitignore b/tools/testing/selftests/sparc64/drivers/.gitignore index 90e835ed74e6..0331f77373b5 100644 --- a/tools/testing/selftests/sparc64/drivers/.gitignore +++ b/tools/testing/selftests/sparc64/drivers/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only adi-test diff --git a/tools/testing/selftests/splice/.gitignore b/tools/testing/selftests/splice/.gitignore index 1e23fefd68e8..d5a2da428752 100644 --- a/tools/testing/selftests/splice/.gitignore +++ b/tools/testing/selftests/splice/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only default_file_splice_read diff --git a/tools/testing/selftests/sync/.gitignore b/tools/testing/selftests/sync/.gitignore index f5091e7792f2..f1152357712f 100644 --- a/tools/testing/selftests/sync/.gitignore +++ b/tools/testing/selftests/sync/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only sync_test diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config index 6ca14800d755..fc263efd1fad 100644 --- a/tools/testing/selftests/sysctl/config +++ b/tools/testing/selftests/sysctl/config @@ -1 +1 @@ -CONFIG_TEST_SYSCTL=y +CONFIG_TEST_SYSCTL=m diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..19515dcb7d04 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,16 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" - -test_modprobe() -{ - if [ ! -d $DIR ]; then - echo "$0: $DIR not present" >&2 - echo "You must have the following enabled in your kernel:" >&2 - cat $TEST_DIR/config >&2 - exit $ksft_skip - fi -} +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" function allow_user_defaults() { @@ -122,13 +113,15 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" + echo "You must set CONFIG_TEST_SYSCTL=m in your kernel" >&2 exit $ksft_skip fi modprobe $TEST_DRIVER if [ $? -ne 0 ]; then + echo "$0: modprobe $TEST_DRIVER failed." exit fi fi @@ -752,6 +745,46 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return $ksft_skip + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." + echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip +} + list_tests() { echo "Test ID list:" @@ -766,6 +799,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() @@ -929,7 +963,6 @@ test_reqs allow_user_defaults check_production_sysctl_writes_strict load_req_mod -test_modprobe trap "test_finish" EXIT diff --git a/tools/testing/selftests/tc-testing/.gitignore b/tools/testing/selftests/tc-testing/.gitignore index c26d72e0166f..d52f65de23b4 100644 --- a/tools/testing/selftests/tc-testing/.gitignore +++ b/tools/testing/selftests/tc-testing/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only __pycache__/ *.pyc plugins/ diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index c03af4600281..c33a7aac27ff 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -31,6 +31,7 @@ CONFIG_NET_EMATCH_U32=m CONFIG_NET_EMATCH_META=m CONFIG_NET_EMATCH_TEXT=m CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_CANID=m CONFIG_NET_EMATCH_IPT=m CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m @@ -58,3 +59,8 @@ CONFIG_NET_IFE_SKBPRIO=m CONFIG_NET_IFE_SKBTCINDEX=m CONFIG_NET_SCH_FIFO=y CONFIG_NET_SCH_ETS=m + +# +## Network testing +# +CONFIG_CAN=m diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json index f8ea6f5fa8e9..72cdc3c800a5 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -1472,6 +1472,31 @@ ] }, { + "id": "94bb", + "name": "Add pedit action with LAYERED_OP ip6 traffic_class", + "category": [ + "actions", + "pedit", + "layered_op" + ], + "setup": [ + [ + "$TC actions flush action pedit", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action pedit ex munge ip6 traffic_class set 0x40 continue", + "expExitCode": "0", + "verifyCmd": "$TC actions list action pedit", + "matchPattern": "ipv6\\+0: val 04000000 mask f00fffff", + "matchCount": "1", + "teardown": [ + "$TC actions flush action pedit" + ] + }, + { "id": "6f5e", "name": "Add pedit action with LAYERED_OP ip6 flow_lbl", "category": [ diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json index 98a20faf3198..e788c114a484 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json @@ -372,5 +372,907 @@ "teardown": [ "$TC qdisc del dev $DEV1 ingress" ] + }, + { + "id": "bae4", + "name": "Add basic filter with u32 ematch u8/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 0x0f at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(01000000/0f000000 at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e6cb", + "name": "Add basic filter with u32 ematch u8/zero offset and invalid value >0xFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 0x0f at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/0f000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7727", + "name": "Add basic filter with u32 ematch u8/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0x1f at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17000000/1f000000 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "a429", + "name": "Add basic filter with u32 ematch u8/invalid mask >0xFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff00 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000/ff000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "8373", + "name": "Add basic filter with u32 ematch u8/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "ab8e", + "name": "Add basic filter with u32 ematch u8/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "712d", + "name": "Add basic filter with u32 ematch u8/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "350f", + "name": "Add basic filter with u32 ematch u8/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 zero 0xff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ff000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e28f", + "name": "Add basic filter with u32 ematch u8/non-numeric mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11000000/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "6d5f", + "name": "Add basic filter with u32 ematch u8/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at -14)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(0000a000/0000f000 at -16\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "12dc", + "name": "Add basic filter with u32 ematch u8/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0000000/f0000000 at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "1d85", + "name": "Add basic filter with u32 ematch u16/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x1122 0xffff at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/ffff0000 at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "3672", + "name": "Add basic filter with u32 ematch u16/zero offset and invalid value >0xFFFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x112233 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223300/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7fb0", + "name": "Add basic filter with u32 ematch u16/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0x1fff at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17880000/1fff0000 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "19af", + "name": "Add basic filter with u32 ematch u16/invalid mask >0xFFFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffffffff at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffffffff at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "446d", + "name": "Add basic filter with u32 ematch u16/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000 at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "151b", + "name": "Add basic filter with u32 ematch u16/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "bb23", + "name": "Add basic filter with u32 ematch u16/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "decc", + "name": "Add basic filter with u32 ematch u16/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 zero 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "e988", + "name": "Add basic filter with u32 ematch u16/non-numeric mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "07d8", + "name": "Add basic filter with u32 ematch u16/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xffff at -12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabb0000/ffff0000 at -12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f474", + "name": "Add basic filter with u32 ematch u16/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xf0f0 at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0b00000/f0f00000 at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "47a0", + "name": "Add basic filter with u32 ematch u32/zero offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at 0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at 0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "849f", + "name": "Add basic filter with u32 ematch u32/positive offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0x1ffff0f0 at 12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227080/1ffff0f0 at 12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "d288", + "name": "Add basic filter with u32 ematch u32/missing offset", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0xffffffff at)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227788/ffffffff at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "4998", + "name": "Add basic filter with u32 ematch u32/missing AT keyword", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x77889900 0xfffff0f0 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77889900/fffff0f0 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "1f0a", + "name": "Add basic filter with u32 ematch u32/missing value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 at 12)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "848e", + "name": "Add basic filter with u32 ematch u32/non-numeric value", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 zero 0xffff at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f748", + "name": "Add basic filter with u32 ematch u32/non-numeric mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11223344 mask at 0)' classid 1:1", + "expExitCode": "1", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223344/00000000 at 0\\)", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "55a6", + "name": "Add basic filter with u32 ematch u32/negative offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xff00ff00 at -12)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aa00cc00/ff00ff00 at -12\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "7282", + "name": "Add basic filter with u32 ematch u32/nexthdr+ offset and default action", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at nexthdr+0)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at nexthdr\\+0\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "b2b6", + "name": "Add basic filter with canid ematch and single SFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "f67f", + "name": "Add basic filter with canid ematch and single SFF with mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaabb:0x00ff)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x2BB:0xFF\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "bd5c", + "name": "Add basic filter with canid ematch and multiple SFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1 sff 2 sff 3)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1 sff 0x2 sff 0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "83c7", + "name": "Add basic filter with canid ematch and multiple SFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaa:0x01 sff 0xbb:0x02 sff 0xcc:0x03)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0xAA:0x1 sff 0xBB:0x2 sff 0xCC:0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "a8f5", + "name": "Add basic filter with canid ematch and single EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "98ae", + "name": "Add basic filter with canid ematch and single EFF with mask", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 0xaabb:0xf1f1)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAABB:0xF1F1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "6056", + "name": "Add basic filter with canid ematch and multiple EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1 eff 2 eff 3)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1 eff 0x2 eff 0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "d188", + "name": "Add basic filter with canid ematch and multiple EFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 0xaa:0x01 eff 0xbb:0x02 eff 0xcc:0x03)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAA:0x1 eff 0xBB:0x2 eff 0xCC:0x3\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "25d1", + "name": "Add basic filter with canid ematch and a combination of SFF/EFF", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01 eff 0x02)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2 sff 0x1\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] + }, + { + "id": "b438", + "name": "Add basic filter with canid ematch and a combination of SFF/EFF with masks", + "category": [ + "filter", + "basic" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01:0xf eff 0x02:0xf)' classid 1:1", + "expExitCode": "0", + "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic", + "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2:0xF sff 0x1:0xF\\)", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index 8877f7b2b809..bb543bf69d69 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -32,7 +32,7 @@ "setup": [ "$TC qdisc add dev $DEV2 ingress" ], - "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 parent ffff: handle 0xffffffff flower action ok", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress handle 0xffffffff flower action ok", "expExitCode": "0", "verifyCmd": "$TC filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower.*handle 0xffffffff", @@ -77,9 +77,9 @@ }, "setup": [ "$TC qdisc add dev $DEV2 ingress", - "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" + "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop" ], - "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 parent ffff: flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop", "expExitCode": "2", "verifyCmd": "$TC -s filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower chain 0 handle", @@ -87,5 +87,43 @@ "teardown": [ "$TC qdisc del dev $DEV2 ingress" ] + }, + { + "id": "7c65", + "name": "Add flower filter and then terse dump it", + "category": [ + "filter", + "flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": "filter protocol ip pref 1 flower.*handle", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] + }, + { + "id": "d45e", + "name": "Add flower filter and verify that terse dump doesn't output filter key", + "category": [ + "filter", + "flower" + ], + "setup": [ + "$TC qdisc add dev $DEV2 ingress" + ], + "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", + "expExitCode": "0", + "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "matchPattern": " dst_mac e4:11:22:11:4a:51", + "matchCount": "0", + "teardown": [ + "$TC qdisc del dev $DEV2 ingress" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json new file mode 100644 index 000000000000..1cda2e11b3ad --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json @@ -0,0 +1,21 @@ +[ + { + "id": "83be", + "name": "Create FQ-PIE with invalid number of flows", + "category": [ + "qdisc", + "fq_pie" + ], + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY root fq_pie flows 65536", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY" + ] + } +] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json new file mode 100644 index 000000000000..0703a2a255eb --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json @@ -0,0 +1,185 @@ +[ + { + "id": "8b6e", + "name": "Create RED with no flags", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "342e", + "name": "Create RED with adaptive flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red adaptive limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb adaptive $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "2d4b", + "name": "Create RED with ECN flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "650f", + "name": "Create RED with flags ECN, adaptive", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn adaptive limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn adaptive $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "5f15", + "name": "Create RED with flags ECN, harddrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "53e8", + "name": "Create RED with flags ECN, nodrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn nodrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "d091", + "name": "Fail to create RED with only nodrop flag", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "af8e", + "name": "Create RED with flags ECN, nodrop, harddrop", + "category": [ + "qdisc", + "red" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop nodrop limit 1M avpkt 1500 min 100K max 300K", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $DUMMY", + "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop nodrop $", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP link del dev $DUMMY type dummy" + ] + } +] diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py index e566c70e64a1..a3e43189d940 100755 --- a/tools/testing/selftests/tc-testing/tdc.py +++ b/tools/testing/selftests/tc-testing/tdc.py @@ -713,9 +713,8 @@ def set_operation_mode(pm, parser, args, remaining): exit(0) if args.list: - if args.list: - list_test_cases(alltests) - exit(0) + list_test_cases(alltests) + exit(0) if len(alltests): req_plugins = pm.get_required_plugins(alltests) diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py b/tools/testing/selftests/tc-testing/tdc_batch.py index 6a2bd2cf528e..995f66ce43eb 100755 --- a/tools/testing/selftests/tc-testing/tdc_batch.py +++ b/tools/testing/selftests/tc-testing/tdc_batch.py @@ -72,21 +72,21 @@ mac_prefix = args.mac_prefix def format_add_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter add dev {} {} protocol ip parent ffff: handle {} " + return ("filter add dev {} {} protocol ip ingress handle {} " " flower {} src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_rep_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter replace dev {} {} protocol ip parent ffff: handle {} " + return ("filter replace dev {} {} protocol ip ingress handle {} " " flower {} src_mac {} dst_mac {} action drop {}".format( device, prio, handle, skip, src_mac, dst_mac, share_action)) def format_del_filter(device, prio, handle, skip, src_mac, dst_mac, share_action): - return ("filter del dev {} {} protocol ip parent ffff: handle {} " + return ("filter del dev {} {} protocol ip ingress handle {} " "flower".format(device, prio, handle)) diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore index 789f21e81028..2e43851b47c1 100644 --- a/tools/testing/selftests/timens/.gitignore +++ b/tools/testing/selftests/timens/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only clock_nanosleep exec gettime_perf diff --git a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c index 8e7b7c72ef65..72d41b955fb2 100644 --- a/tools/testing/selftests/timens/clock_nanosleep.c +++ b/tools/testing/selftests/timens/clock_nanosleep.c @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) ksft_set_plan(4); - check_config_posix_timers(); + check_supported_timers(); if (unshare_timens()) return 1; diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c index 87b47b557a7a..e40dc5be2f66 100644 --- a/tools/testing/selftests/timens/exec.c +++ b/tools/testing/selftests/timens/exec.c @@ -11,7 +11,6 @@ #include <sys/wait.h> #include <time.h> #include <unistd.h> -#include <time.h> #include <string.h> #include "log.h" diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c index 43d93f4006b9..7f14f0fdac84 100644 --- a/tools/testing/selftests/timens/procfs.c +++ b/tools/testing/selftests/timens/procfs.c @@ -12,7 +12,6 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> -#include <time.h> #include "log.h" #include "timens.h" diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c index 559d26e21ba0..52b6a1185f52 100644 --- a/tools/testing/selftests/timens/timens.c +++ b/tools/testing/selftests/timens/timens.c @@ -10,7 +10,6 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> -#include <time.h> #include <string.h> #include "log.h" @@ -156,7 +155,7 @@ int main(int argc, char *argv[]) nscheck(); - check_config_posix_timers(); + check_supported_timers(); ksft_set_plan(ARRAY_SIZE(clocks) * 2); diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h index e09e7e39bc52..d4fc52d47146 100644 --- a/tools/testing/selftests/timens/timens.h +++ b/tools/testing/selftests/timens/timens.h @@ -14,15 +14,26 @@ #endif static int config_posix_timers = true; +static int config_alarm_timers = true; -static inline void check_config_posix_timers(void) +static inline void check_supported_timers(void) { + struct timespec ts; + if (timer_create(-1, 0, 0) == -1 && errno == ENOSYS) config_posix_timers = false; + + if (clock_gettime(CLOCK_BOOTTIME_ALARM, &ts) == -1 && errno == EINVAL) + config_alarm_timers = false; } static inline bool check_skip(int clockid) { + if (!config_alarm_timers && clockid == CLOCK_BOOTTIME_ALARM) { + ksft_test_result_skip("CLOCK_BOOTTIME_ALARM isn't supported\n"); + return true; + } + if (config_posix_timers) return false; diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c index 0cca7aafc4bd..5e7f0051bd7b 100644 --- a/tools/testing/selftests/timens/timer.c +++ b/tools/testing/selftests/timens/timer.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdint.h> #include <signal.h> -#include <time.h> #include "log.h" #include "timens.h" @@ -23,6 +22,9 @@ int run_test(int clockid, struct timespec now) timer_t fd; int i; + if (check_skip(clockid)) + return 0; + for (i = 0; i < 2; i++) { struct sigevent sevp = {.sigev_notify = SIGEV_NONE}; int flags = 0; @@ -75,6 +77,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c index eff1ec5ff215..9edd43d6b2c1 100644 --- a/tools/testing/selftests/timens/timerfd.c +++ b/tools/testing/selftests/timens/timerfd.c @@ -28,6 +28,9 @@ int run_test(int clockid, struct timespec now) long long elapsed; int fd, i; + if (check_skip(clockid)) + return 0; + if (tclock_gettime(clockid, &now)) return pr_perror("clock_gettime(%d)", clockid); @@ -81,6 +84,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); diff --git a/tools/testing/selftests/timers/.gitignore b/tools/testing/selftests/timers/.gitignore index 32a9eadb2d4e..bb5326ff900b 100644 --- a/tools/testing/selftests/timers/.gitignore +++ b/tools/testing/selftests/timers/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only alarmtimer-suspend change_skew clocksource-switch diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore index a96838fad74d..b1afaa925905 100644 --- a/tools/testing/selftests/tmpfs/.gitignore +++ b/tools/testing/selftests/tmpfs/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only /bug-link-o-tmpfile diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index b630c7b5950a..663062701d5a 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -1,17 +1,13 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) -self.flags = flags # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 +[ -f /dev/tpm0 ] || exit $ksft_skip -if [ -f /dev/tpm0 ] ; then - python -m unittest -v tpm2_tests.SmokeTest - python -m unittest -v tpm2_tests.AsyncTest -else - exit $ksft_skip -fi +python -m unittest -v tpm2_tests.SmokeTest +python -m unittest -v tpm2_tests.AsyncTest CLEAR_CMD=$(which tpm2_clear) if [ -n $CLEAR_CMD ]; then diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index 180b469c53b4..36c9d030a1c6 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -4,8 +4,6 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -if [ -f /dev/tpmrm0 ] ; then - python -m unittest -v tpm2_tests.SpaceTest -else - exit $ksft_skip -fi +[ -f /dev/tpmrm0 ] || exit $ksft_skip + +python -m unittest -v tpm2_tests.SpaceTest diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 133bf9ee986c..5eb64d41e541 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,2 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only vdso_test +vdso_test_gettimeofday +vdso_test_getcpu vdso_standalone_test_x86 diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 9e03d61f52fd..0069f2f83f86 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,7 @@ include ../lib.mk uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test +TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu ifeq ($(ARCH),x86) TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif @@ -17,7 +17,8 @@ LDLIBS += -lgcc_s endif all: $(TEST_GEN_PROGS) -$(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c +$(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c +$(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ vdso_standalone_test_x86.c parse_vdso.c \ diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 1dbb4b87268f..413f75620a35 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -21,29 +21,7 @@ #include <limits.h> #include <elf.h> -/* - * To use this vDSO parser, first call one of the vdso_init_* functions. - * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR - * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. - * Then call vdso_sym for each symbol you want. For example, to look up - * gettimeofday on x86_64, use: - * - * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); - * or - * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); - * - * vdso_sym will return 0 if the symbol doesn't exist or if the init function - * failed or was not called. vdso_sym is a little slow, so its return value - * should be cached. - * - * vdso_sym is threadsafe; the init functions are not. - * - * These are the prototypes: - */ -extern void vdso_init_from_auxv(void *auxv); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void *vdso_sym(const char *version, const char *name); - +#include "parse_vdso.h" /* And here's the code. */ #ifndef ELF_BITS diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h new file mode 100644 index 000000000000..de0453067d7c --- /dev/null +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef PARSE_VDSO_H +#define PARSE_VDSO_H + +#include <stdint.h> + +/* + * To use this vDSO parser, first call one of the vdso_init_* functions. + * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR + * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. + * Then call vdso_sym for each symbol you want. For example, to look up + * gettimeofday on x86_64, use: + * + * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); + * or + * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + * + * vdso_sym will return 0 if the symbol doesn't exist or if the init function + * failed or was not called. vdso_sym is a little slow, so its return value + * should be cached. + * + * vdso_sym is threadsafe; the init functions are not. + * + * These are the prototypes: + */ +void *vdso_sym(const char *version, const char *name); +void vdso_init_from_sysinfo_ehdr(uintptr_t base); +void vdso_init_from_auxv(void *auxv); + +#endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 5ac4b00acfbc..8a44ff973ee1 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -16,9 +16,7 @@ #include <unistd.h> #include <stdint.h> -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" /* We need a libc functions... */ int strcmp(const char *a, const char *b) diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c new file mode 100644 index 000000000000..fc25ede131b8 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vdso_test_getcpu.c: Sample code to test parse_vdso.c and vDSO getcpu() + * + * Copyright (c) 2020 Arm Ltd + */ + +#include <stdint.h> +#include <elf.h> +#include <stdio.h> +#include <sys/auxv.h> +#include <sys/time.h> + +#include "../kselftest.h" +#include "parse_vdso.h" + +const char *version = "LINUX_2.6"; +const char *name = "__vdso_getcpu"; + +struct getcpu_cache; +typedef long (*getcpu_t)(unsigned int *, unsigned int *, + struct getcpu_cache *); + +int main(int argc, char **argv) +{ + unsigned long sysinfo_ehdr; + unsigned int cpu, node; + getcpu_t get_cpu; + long ret; + + sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; + } + + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); + + get_cpu = (getcpu_t)vdso_sym(version, name); + if (!get_cpu) { + printf("Could not find %s\n", name); + return KSFT_SKIP; + } + + ret = get_cpu(&cpu, &node, 0); + if (ret == 0) { + printf("Running on CPU %u node %u\n", cpu, node); + } else { + printf("%s failed\n", name); + return KSFT_FAIL; + } + + return 0; +} diff --git a/tools/testing/selftests/vDSO/vdso_test.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index 719d5a6bd664..8ccc73ed8240 100644 --- a/tools/testing/selftests/vDSO/vdso_test.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -1,10 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vdso_test.c: Sample code to test parse_vdso.c + * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and + * vDSO gettimeofday() * Copyright (c) 2014 Andy Lutomirski * * Compile with: - * gcc -std=gnu99 vdso_test.c parse_vdso.c + * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c * * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ @@ -16,10 +17,7 @@ #include <sys/time.h> #include "../kselftest.h" - -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" /* * ARM64's vDSO exports its gettimeofday() implementation with a different diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 31b3c98b6d34..849e8226395a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,12 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +khugepaged map_hugetlb map_populate thuge-gen compaction_test mlock2-tests +mremap_dontunmap on-fault-limit transhuge-stress +protection_keys userfaultfd mlock-intersect-test mlock-random-test @@ -14,3 +18,5 @@ virtual_address_range gup_benchmark va_128TBswitch map_fixed_noreplace +write_to_hugetlbfs +hmm-tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7f9a8a8c31da..a9026706d597 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for vm selftests uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') +MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) LDLIBS = -lrt TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_benchmark +TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb @@ -14,14 +15,41 @@ TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += khugepaged -ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) + +TARGETS := protection_keys +BINARIES_32 := $(TARGETS:%=%_32) +BINARIES_64 := $(TARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else +TEST_GEN_FILES += protection_keys +endif + +ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs endif TEST_PROGS := run_vmtests @@ -31,6 +59,57 @@ TEST_FILES := test_vmalloc.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk +$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread + +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): %_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): %_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + $(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh new file mode 100644 index 000000000000..18d33684faad --- /dev/null +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -0,0 +1,575 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit 0 +fi + +fault_limit_file=limit_in_bytes +reservation_limit_file=rsvd.limit_in_bytes +fault_usage_file=usage_in_bytes +reservation_usage_file=rsvd.usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + fault_limit_file=max + reservation_limit_file=rsvd.max + fault_usage_file=current + reservation_usage_file=rsvd.current +fi + +cgroup_path=/dev/cgroup/memory +if [[ ! -e $cgroup_path ]]; then + mkdir -p $cgroup_path + if [[ $cgroup2 ]]; then + mount -t cgroup2 none $cgroup_path + else + mount -t cgroup memory,hugetlb $cgroup_path + fi +fi + +if [[ $cgroup2 ]]; then + echo "+hugetlb" >/dev/cgroup/memory/cgroup.subtree_control +fi + +function cleanup() { + if [[ $cgroup2 ]]; then + echo $$ >$cgroup_path/cgroup.procs + else + echo $$ >$cgroup_path/tasks + fi + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge || echo error + rmdir /mnt/huge + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test1 + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test2 + fi + echo 0 >/proc/sys/vm/nr_hugepages + echo CLEANUP DONE +} + +function expect_equal() { + local expected="$1" + local actual="$2" + local error="$3" + + if [[ "$expected" != "$actual" ]]; then + echo "expected ($expected) != actual ($actual): $3" + cleanup + exit 1 + fi +} + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function setup_cgroup() { + local name="$1" + local cgroup_limit="$2" + local reservation_limit="$3" + + mkdir $cgroup_path/$name + + echo writing cgroup limit: "$cgroup_limit" + echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file + + echo writing reseravation limit: "$reservation_limit" + echo "$reservation_limit" > \ + $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file + + if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then + echo 0 >$cgroup_path/$name/cpuset.cpus + fi + if [ -e "$cgroup_path/$name/cpuset.mems" ]; then + echo 0 >$cgroup_path/$name/cpuset.mems + fi +} + +function wait_for_hugetlb_memory_to_get_depleted() { + local cgroup="$1" + local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get depleted. + while [ $(cat $path) != 0 ]; do + echo Waiting for hugetlb memory to get depleted. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_reserved() { + local cgroup="$1" + local size="$2" + + local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory reservation to reach size $size. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_written() { + local cgroup="$1" + local size="$2" + + local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory to reach size $size. + cat $path + sleep 0.5 + done +} + +function write_hugetlbfs_and_get_usage() { + local cgroup="$1" + local size="$2" + local populate="$3" + local write="$4" + local path="$5" + local method="$6" + local private="$7" + local expect_failure="$8" + local reserve="$9" + + # Function return values. + reservation_failed=0 + oom_killed=0 + hugetlb_difference=0 + reserved_difference=0 + + local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file + local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file + + local hugetlb_before=$(cat $hugetlb_usage) + local reserved_before=$(cat $reserved_usage) + + echo + echo Starting: + echo hugetlb_usage="$hugetlb_before" + echo reserved_usage="$reserved_before" + echo expect_failure is "$expect_failure" + + output=$(mktemp) + set +e + if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || + [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then + + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & + + local write_result=$? + local write_pid=$! + + until grep -q -i "DONE" $output; do + echo waiting for DONE signal. + if ! ps $write_pid > /dev/null + then + echo "FAIL: The write died" + cleanup + exit 1 + fi + sleep 0.5 + done + + echo ================= write_hugetlb_memory.sh output is: + cat $output + echo ================= end output. + + if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then + wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" + elif [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + else + # This case doesn't produce visible effects, but we still have + # to wait for the async process to start and execute... + sleep 0.5 + fi + + echo write_result is $write_result + else + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "$reserve" + local write_result=$? + + if [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + fi + fi + set -e + + if [[ "$write_result" == 1 ]]; then + reservation_failed=1 + fi + + # On linus/master, the above process gets SIGBUS'd on oomkill, with + # return code 135. On earlier kernels, it gets actual oomkill, with return + # code 137, so just check for both conditions in case we're testing + # against an earlier kernel. + if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then + oom_killed=1 + fi + + local hugetlb_after=$(cat $hugetlb_usage) + local reserved_after=$(cat $reserved_usage) + + echo After write: + echo hugetlb_usage="$hugetlb_after" + echo reserved_usage="$reserved_after" + + hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) + reserved_difference=$(($reserved_after - $reserved_before)) +} + +function cleanup_hugetlb_memory() { + set +e + local cgroup="$1" + if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then + echo killing write_to_hugetlbfs + killall -2 write_to_hugetlbfs + wait_for_hugetlb_memory_to_get_depleted $cgroup + fi + set -e + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge + rmdir /mnt/huge + fi +} + +function run_test() { + local size=$(($1 * ${MB} * 1024 * 1024)) + local populate="$2" + local write="$3" + local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) + local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) + local nr_hugepages="$6" + local method="$7" + local private="$8" + local expect_failure="$9" + local reserve="${10}" + + # Function return values. + hugetlb_difference=0 + reserved_difference=0 + reservation_failed=0 + oom_killed=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ + "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ + "$reserve" + + cleanup_hugetlb_memory "hugetlb_cgroup_test" + + local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) + local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) + + echo $hugetlb_difference + echo $reserved_difference + expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" + expect_equal "0" "$final_reservation" "final reservation is not zero" +} + +function run_multiple_cgroup_test() { + local size1="$1" + local populate1="$2" + local write1="$3" + local cgroup_limit1="$4" + local reservation_limit1="$5" + + local size2="$6" + local populate2="$7" + local write2="$8" + local cgroup_limit2="$9" + local reservation_limit2="${10}" + + local nr_hugepages="${11}" + local method="${12}" + local private="${13}" + local expect_failure="${14}" + local reserve="${15}" + + # Function return values. + hugetlb_difference1=0 + reserved_difference1=0 + reservation_failed1=0 + oom_killed1=0 + + hugetlb_difference2=0 + reserved_difference2=0 + reservation_failed2=0 + oom_killed2=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" + setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ + "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference1=$hugetlb_difference + reserved_difference1=$reserved_difference + reservation_failed1=$reservation_failed + oom_killed1=$oom_killed + + local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file + local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file + local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file + local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file + + local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) + local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ + "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference2=$hugetlb_difference + reserved_difference2=$reserved_difference + reservation_failed2=$reservation_failed + oom_killed2=$oom_killed + + expect_equal "$usage_before_second_write" \ + "$(cat $cgroup1_hugetlb_usage)" "Usage changed." + expect_equal "$reservation_usage_before_second_write" \ + "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." + + cleanup_hugetlb_memory + + local final_hugetlb=$(cat $cgroup1_hugetlb_usage) + local final_reservation=$(cat $cgroup1_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlbt_cgroup_test1 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlbt_cgroup_test1 final reservation is not zero" + + local final_hugetlb=$(cat $cgroup2_hugetlb_usage) + local final_reservation=$(cat $cgroup2_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlb_cgroup_test2 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlb_cgroup_test2 final reservation is not zero" +} + +cleanup + +for populate in "" "-o"; do + for method in 0 1 2; do + for private in "" "-r"; do + for reserve in "" "-n"; do + + # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. + if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then + continue + fi + + # Skip populated shmem tests. Doesn't seem to be supported. + if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then + continue + fi + + if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then + continue + fi + + cleanup + echo + echo + echo + echo Test normal case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + if [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + else + expect_equal "0" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + fi + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + else + expect_equal "0" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + fi + + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + + echo 'PASS' + + cleanup + continue + echo + echo + echo + echo Test more than reservation case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + if [ "$reserve" != "-n" ]; then + run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ + "$reserve" + + expect_equal "1" "$reservation_failed" "Reservation succeeded." + fi + + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test more than cgroup limit case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + # Not sure if shm memory can be cleaned up when the process gets sigbus'd. + if [[ "$method" != 2 ]]; then + run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" + + expect_equal "1" "$oom_killed" "Not oom killed." + fi + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test normal case, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ + "$populate" "" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "5" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + + else + expect_equal "0" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "0" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + fi + + if [[ "$populate" == "-o" ]]; then + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + else + expect_equal "0" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "0" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + fi + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ + "$populate" "-w" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservation charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + expect_equal "5" "$reserved_difference2" \ + "Incorrected reservation charged to cgroup 2." + echo 'PASS' + + cleanup + + done # reserve + done # private + done # populate +done # method + +umount $cgroup_path +rmdir $cgroup_path diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config index 93b90a9b1eeb..3ba674b64fa9 100644 --- a/tools/testing/selftests/vm/config +++ b/tools/testing/selftests/vm/config @@ -1,3 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_USERFAULTFD=y CONFIG_TEST_VMALLOC=m +CONFIG_DEVICE_PRIVATE=y +CONFIG_TEST_HMM=m diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c index 389327e9b30a..43b4dfe161a2 100644 --- a/tools/testing/selftests/vm/gup_benchmark.c +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -18,6 +18,10 @@ #define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) #define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */ +#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark) +#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark) + /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ @@ -40,8 +44,14 @@ int main(int argc, char **argv) char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:f:tTLUwSH")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:f:abtTLUuwSH")) != -1) { switch (opt) { + case 'a': + cmd = PIN_FAST_BENCHMARK; + break; + case 'b': + cmd = PIN_BENCHMARK; + break; case 'm': size = atoi(optarg) * MB; break; @@ -63,6 +73,9 @@ int main(int argc, char **argv) case 'U': cmd = GUP_BENCHMARK; break; + case 'u': + cmd = GUP_FAST_BENCHMARK; + break; case 'w': write = 1; break; diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c new file mode 100644 index 000000000000..79db22604019 --- /dev/null +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -0,0 +1,1359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HMM stands for Heterogeneous Memory Management, it is a helper layer inside + * the linux kernel to help device drivers mirror a process address space in + * the device. This allows the device to use the same address space which + * makes communication and data exchange a lot easier. + * + * This framework's sole purpose is to exercise various code paths inside + * the kernel to make sure that HMM performs as expected and to flush out any + * bugs. + */ + +#include "../kselftest_harness.h" + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <strings.h> +#include <time.h> +#include <pthread.h> +#include <hugetlbfs.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/ioctl.h> + +/* + * This is a private UAPI to the kernel test module so it isn't exported + * in the usual include/uapi/... directory. + */ +#include "../../../../lib/test_hmm_uapi.h" + +struct hmm_buffer { + void *ptr; + void *mirror; + unsigned long size; + int fd; + uint64_t cpages; + uint64_t faults; +}; + +#define TWOMEG (1 << 21) +#define HMM_BUFFER_SIZE (1024 << 12) +#define HMM_PATH_MAX 64 +#define NTIMES 256 + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(hmm) +{ + int fd; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE(hmm2) +{ + int fd0; + int fd1; + unsigned int page_size; + unsigned int page_shift; +}; + +static int hmm_open(int unit) +{ + char pathname[HMM_PATH_MAX]; + int fd; + + snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); + fd = open(pathname, O_RDWR, 0); + if (fd < 0) + fprintf(stderr, "could not open hmm dmirror driver (%s)\n", + pathname); + return fd; +} + +FIXTURE_SETUP(hmm) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd = hmm_open(0); + ASSERT_GE(self->fd, 0); +} + +FIXTURE_SETUP(hmm2) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd0 = hmm_open(0); + ASSERT_GE(self->fd0, 0); + self->fd1 = hmm_open(1); + ASSERT_GE(self->fd1, 0); +} + +FIXTURE_TEARDOWN(hmm) +{ + int ret = close(self->fd); + + ASSERT_EQ(ret, 0); + self->fd = -1; +} + +FIXTURE_TEARDOWN(hmm2) +{ + int ret = close(self->fd0); + + ASSERT_EQ(ret, 0); + self->fd0 = -1; + + ret = close(self->fd1); + ASSERT_EQ(ret, 0); + self->fd1 = -1; +} + +static int hmm_dmirror_cmd(int fd, + unsigned long request, + struct hmm_buffer *buffer, + unsigned long npages) +{ + struct hmm_dmirror_cmd cmd; + int ret; + + /* Simulate a device reading system memory. */ + cmd.addr = (__u64)buffer->ptr; + cmd.ptr = (__u64)buffer->mirror; + cmd.npages = npages; + + for (;;) { + ret = ioctl(fd, request, &cmd); + if (ret == 0) + break; + if (errno == EINTR) + continue; + return -errno; + } + buffer->cpages = cmd.cpages; + buffer->faults = cmd.faults; + + return 0; +} + +static void hmm_buffer_free(struct hmm_buffer *buffer) +{ + if (buffer == NULL) + return; + + if (buffer->ptr) + munmap(buffer->ptr, buffer->size); + free(buffer->mirror); + free(buffer); +} + +/* + * Create a temporary file that will be deleted on close. + */ +static int hmm_create_file(unsigned long size) +{ + char path[HMM_PATH_MAX]; + int fd; + + strcpy(path, "/tmp"); + fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); + if (fd >= 0) { + int r; + + do { + r = ftruncate(fd, size); + } while (r == -1 && errno == EINTR); + if (!r) + return fd; + close(fd); + } + return -1; +} + +/* + * Return a random unsigned number. + */ +static unsigned int hmm_random(void) +{ + static int fd = -1; + unsigned int r; + + if (fd < 0) { + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s:%d failed to open /dev/urandom\n", + __FILE__, __LINE__); + return ~0U; + } + } + read(fd, &r, sizeof(r)); + return r; +} + +static void hmm_nanosleep(unsigned int n) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = n; + nanosleep(&t, NULL); +} + +/* + * Simple NULL test of device open/close. + */ +TEST_F(hmm, open_close) +{ +} + +/* + * Read private anonymous memory. + */ +TEST_F(hmm, anon_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int val; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* + * Initialize buffer in system memory but leave the first two pages + * zero (pte_none and pfn_zero). + */ + i = 2 * self->page_size / sizeof(*ptr); + for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Set buffer permission to read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Populate the CPU page table with a special zero page. */ + val = *(int *)(buffer->ptr + self->page_size); + ASSERT_EQ(val, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + ptr = buffer->mirror; + for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + for (; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Read private anonymous memory which has been protected with + * mprotect() PROT_NONE. + */ +TEST_F(hmm, anon_read_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize mirror buffer so we can verify it isn't written. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + /* Protect buffer from reading. */ + ret = mprotect(buffer->ptr, size, PROT_NONE); + ASSERT_EQ(ret, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, -EFAULT); + + /* Allow CPU to read the buffer so we can check it. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory. + */ +TEST_F(hmm, anon_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory which has been protected with + * mprotect() PROT_READ. + */ +TEST_F(hmm, anon_write_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading a zero page of memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + ASSERT_EQ(buffer->faults, 1); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Now allow writing and see that the zero page is replaced. */ + ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Check that a device writing an anonymous private mapping + * will copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Check that a device writing an anonymous shared mapping + * will not copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Write private anonymous huge page. + */ +TEST_F(hmm, anon_write_huge) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = 2 * TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + size = TWOMEG; + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Write huge TLBFS page. + */ +TEST_F(hmm, anon_write_hugetlbfs) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + long pagesizes[4]; + int n, idx; + + /* Skip test if we can't allocate a hugetlbfs page. */ + + n = gethugepagesizes(pagesizes, 4); + if (n <= 0) + return; + for (idx = 0; --n > 0; ) { + if (pagesizes[n] < pagesizes[idx]) + idx = n; + } + size = ALIGN(TWOMEG, pagesizes[idx]); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = get_hugepage_region(size, GHR_STRICT); + if (buffer->ptr == NULL) { + free(buffer); + return; + } + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + free_hugepage_region(buffer->ptr); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Read mmap'ed file memory. + */ +TEST_F(hmm, file_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Write initial contents of the file. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + len = pwrite(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + memset(buffer->mirror, 0, size); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write mmap'ed file memory. + */ +TEST_F(hmm, file_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check that the device also wrote the file. */ + len = pread(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory. + */ +TEST_F(hmm, migrate) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault it back to system + * memory. + */ +TEST_F(hmm, migrate_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Try to migrate various memory types to device private memory. + */ +TEST_F(hmm2, migrate_mixed) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + int ret; + int val; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Migrating a protected area should be an error. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ASSERT_EQ(ret, -EINVAL); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* We expect an error if the vma doesn't cover the range. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3); + ASSERT_EQ(ret, -EINVAL); + + /* Page 2 will be a read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-5 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + ptr = (int *)(buffer->ptr + 5 * self->page_size); + *ptr = val; + + /* Now try to migrate pages 2-5 to device 1. */ + buffer->ptr = p + 2 * self->page_size; + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 4); + + /* Page 5 won't be migrated to device 0 because it's on device 1. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ASSERT_EQ(ret, -ENOENT); + buffer->ptr = p; + + buffer->ptr = p; + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault it back to system + * memory multiple times. + */ +TEST_F(hmm, migrate_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); + } +} + +/* + * Read anonymous memory multiple times. + */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. + */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh new file mode 100644 index 000000000000..d11d1febccc3 --- /dev/null +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -0,0 +1,244 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit 0 +fi + +usage_file=usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + usage_file=current +fi + +CGROUP_ROOT='/dev/cgroup/memory' +MNT='/mnt/huge/' + +if [[ ! -e $CGROUP_ROOT ]]; then + mkdir -p $CGROUP_ROOT + if [[ $cgroup2 ]]; then + mount -t cgroup2 none $CGROUP_ROOT + sleep 1 + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control + else + mount -t cgroup memory,hugetlb $CGROUP_ROOT + fi +fi + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function cleanup() { + echo cleanup + set +e + rm -rf "$MNT"/* 2>/dev/null + umount "$MNT" 2>/dev/null + rmdir "$MNT" 2>/dev/null + rmdir "$CGROUP_ROOT"/a/b 2>/dev/null + rmdir "$CGROUP_ROOT"/a 2>/dev/null + rmdir "$CGROUP_ROOT"/test1 2>/dev/null + echo 0 >/proc/sys/vm/nr_hugepages + set -e +} + +function assert_state() { + local expected_a="$1" + local expected_a_hugetlb="$2" + local expected_b="" + local expected_b_hugetlb="" + + if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then + expected_b="$3" + expected_b_hugetlb="$4" + fi + local tolerance=$((5 * 1024 * 1024)) + + local actual_a + actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" + if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || + [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then + echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB + echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_a_hugetlb + actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || + [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then + echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB + echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then + return + fi + + local actual_b + actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" + if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || + [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then + echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB + echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_b_hugetlb + actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || + [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then + echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB + echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi +} + +function setup() { + echo 100 >/proc/sys/vm/nr_hugepages + mkdir "$CGROUP_ROOT"/a + sleep 1 + if [[ $cgroup2 ]]; then + echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control + else + echo 0 >$CGROUP_ROOT/a/cpuset.mems + echo 0 >$CGROUP_ROOT/a/cpuset.cpus + fi + + mkdir "$CGROUP_ROOT"/a/b + + if [[ ! $cgroup2 ]]; then + echo 0 >$CGROUP_ROOT/a/b/cpuset.mems + echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus + fi + + mkdir -p "$MNT" + mount -t hugetlbfs none "$MNT" +} + +write_hugetlbfs() { + local cgroup="$1" + local path="$2" + local size="$3" + + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs + else + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus + echo $$ >"$CGROUP_ROOT/$cgroup/tasks" + fi + ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/cgroup.procs + else + echo $$ >"$CGROUP_ROOT/tasks" + fi + echo +} + +set -e + +size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. + +cleanup + +echo +echo +echo Test charge, rmdir, uncharge +setup +echo mkdir +mkdir $CGROUP_ROOT/test1 + +echo write +write_hugetlbfs test1 "$MNT"/test $size + +echo rmdir +rmdir $CGROUP_ROOT/test1 +mkdir $CGROUP_ROOT/test1 + +echo uncharge +rm -rf /mnt/huge/* + +cleanup + +echo done +echo +echo +if [[ ! $cgroup2 ]]; then + echo "Test parent and child hugetlb usage" + setup + + echo write + write_hugetlbfs a "$MNT"/test $size + + echo Assert memory charged correctly for parent use. + assert_state 0 $size 0 0 + + write_hugetlbfs a/b "$MNT"/test2 $size + + echo Assert memory charged correctly for child use. + assert_state 0 $(($size * 2)) 0 $size + + rmdir "$CGROUP_ROOT"/a/b + sleep 5 + echo Assert memory reparent correctly. + assert_state 0 $(($size * 2)) + + rm -rf "$MNT"/* + umount "$MNT" + echo Assert memory uncharged correctly. + assert_state 0 0 + + cleanup +fi + +echo +echo +echo "Test child only hugetlb usage" +echo setup +setup + +echo write +write_hugetlbfs a/b "$MNT"/test2 $size + +echo Assert memory charged correctly for child only use. +assert_state 0 $(($size)) 0 $size + +rmdir "$CGROUP_ROOT"/a/b +echo Assert memory reparent correctly. +assert_state 0 $size + +rm -rf "$MNT"/* +umount "$MNT" +echo Assert memory uncharged correctly. +assert_state 0 0 + +cleanup + +echo ALL PASS + +umount $CGROUP_ROOT +rm -rf $CGROUP_ROOT diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c new file mode 100644 index 000000000000..51b89cedd09d --- /dev/null +++ b/tools/testing/selftests/vm/khugepaged.c @@ -0,0 +1,1035 @@ +#define _GNU_SOURCE +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> +#include <sys/wait.h> + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool debug_cow; + bool use_zero_page; + struct khugepaged_settings khugepaged; +}; + +static struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .debug_cow = 0, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static const unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (ret < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("debug_cow", settings->debug_cow); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .debug_cow = read_num("debug_cow"), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void adjust_settings(void) +{ + + printf("Adjust settings..."); + write_settings(&default_settings); + success("OK"); +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static bool check_huge(void *addr) +{ + bool thp = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", + hpage_pmd_size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the AnonHugePages: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + thp = true; +err_out: + fclose(fp); + return thp; +} + + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(void) +{ + void *p; + + p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (check_huge(p)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (check_huge(p)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + return !timeout; +} + +static void alloc_at_fault(void) +{ + struct settings settings = default_settings; + char *p; + + settings.thp_enabled = THP_ALWAYS; + write_settings(&settings); + + p = alloc_mapping(); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + write_settings(&default_settings); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + if (wait_for_scan("Collapse fully populated PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_empty(void) +{ + void *p; + + p = alloc_mapping(); + if (wait_for_scan("Do not collapse empty PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, page_size); + if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(void) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = default_settings; + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + write_settings(&settings); + + p = alloc_mapping(); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + + munmap(p, hpage_pmd_size); + write_settings(&default_settings); +} + +static void collapse_swapin_single_pte(void) +{ + void *p; + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with swapping in single PTE entry", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(void) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = alloc_mapping(); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, hpage_pmd_size); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(void) +{ + void *p; + int i; + + p = alloc_mapping(); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(BASE_ADDR, 0, hpage_pmd_size); + if (!check_huge(BASE_ADDR)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + munmap(BASE_ADDR, hpage_pmd_size); + fill_memory(p, 0, hpage_pmd_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of different compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate small page..."); + fill_memory(p, 0, page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + fill_memory(p, page_size, 2 * page_size); + + if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork_compound(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + fill_memory(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + write_num("khugepaged/max_ptes_shared", + default_settings.khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared() +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) + fail("Timeout"); + else if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + + if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(void) +{ + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + adjust_settings(); + + alloc_at_fault(); + collapse_full(); + collapse_empty(); + collapse_single_pte_entry(); + collapse_max_ptes_none(); + collapse_swapin_single_pte(); + collapse_max_ptes_swap(); + collapse_single_pte_entry_compound(); + collapse_full_of_compound(); + collapse_compound_extreme(); + collapse_fork(); + collapse_fork_compound(); + collapse_max_ptes_shared(); + + restore_settings(0); +} diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c index 5a2d7b8efc40..6af951900aa3 100644 --- a/tools/testing/selftests/vm/map_hugetlb.c +++ b/tools/testing/selftests/vm/map_hugetlb.c @@ -45,20 +45,20 @@ static void check_bytes(char *addr) printf("First hex is %x\n", *((unsigned int *)addr)); } -static void write_bytes(char *addr) +static void write_bytes(char *addr, size_t length) { unsigned long i; - for (i = 0; i < LENGTH; i++) + for (i = 0; i < length; i++) *(addr + i) = (char)i; } -static int read_bytes(char *addr) +static int read_bytes(char *addr, size_t length) { unsigned long i; check_bytes(addr); - for (i = 0; i < LENGTH; i++) + for (i = 0; i < length; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); return 1; @@ -96,11 +96,11 @@ int main(int argc, char **argv) printf("Returned address is %p\n", addr); check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, LENGTH)) { + if (munmap(addr, length)) { perror("munmap"); exit(1); } diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c index 637b6d0ac0d0..11b2301f3aa3 100644 --- a/tools/testing/selftests/vm/mlock2-tests.c +++ b/tools/testing/selftests/vm/mlock2-tests.c @@ -67,59 +67,6 @@ out: return ret; } -static uint64_t get_pageflags(unsigned long addr) -{ - FILE *file; - uint64_t pfn; - unsigned long offset; - - file = fopen("/proc/self/pagemap", "r"); - if (!file) { - perror("fopen pagemap"); - _exit(1); - } - - offset = addr / getpagesize() * sizeof(pfn); - - if (fseek(file, offset, SEEK_SET)) { - perror("fseek pagemap"); - _exit(1); - } - - if (fread(&pfn, sizeof(pfn), 1, file) != 1) { - perror("fread pagemap"); - _exit(1); - } - - fclose(file); - return pfn; -} - -static uint64_t get_kpageflags(unsigned long pfn) -{ - uint64_t flags; - FILE *file; - - file = fopen("/proc/kpageflags", "r"); - if (!file) { - perror("fopen kpageflags"); - _exit(1); - } - - if (fseek(file, pfn * sizeof(flags), SEEK_SET)) { - perror("fseek kpageflags"); - _exit(1); - } - - if (fread(&flags, sizeof(flags), 1, file) != 1) { - perror("fread kpageflags"); - _exit(1); - } - - fclose(file); - return flags; -} - #define VMFLAGS "VmFlags:" static bool is_vmflag_set(unsigned long addr, const char *vmflag) @@ -159,19 +106,13 @@ out: #define RSS "Rss:" #define LOCKED "lo" -static bool is_vma_lock_on_fault(unsigned long addr) +static unsigned long get_value_for_name(unsigned long addr, const char *name) { - bool ret = false; - bool locked; - FILE *smaps = NULL; - unsigned long vma_size, vma_rss; char *line = NULL; - char *value; size_t size = 0; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - goto out; + char *value_ptr; + FILE *smaps = NULL; + unsigned long value = -1UL; smaps = seek_to_smaps_entry(addr); if (!smaps) { @@ -180,112 +121,70 @@ static bool is_vma_lock_on_fault(unsigned long addr) } while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, SIZE)) { + if (!strstr(line, name)) { free(line); line = NULL; size = 0; continue; } - value = line + strlen(SIZE); - if (sscanf(value, "%lu kB", &vma_size) < 1) { + value_ptr = line + strlen(name); + if (sscanf(value_ptr, "%lu kB", &value) < 1) { printf("Unable to parse smaps entry for Size\n"); goto out; } break; } - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, RSS)) { - free(line); - line = NULL; - size = 0; - continue; - } - - value = line + strlen(RSS); - if (sscanf(value, "%lu kB", &vma_rss) < 1) { - printf("Unable to parse smaps entry for Rss\n"); - goto out; - } - break; - } - - ret = locked && (vma_rss < vma_size); out: - free(line); if (smaps) fclose(smaps); - return ret; + free(line); + return value; } -#define PRESENT_BIT 0x8000000000000000ULL -#define PFN_MASK 0x007FFFFFFFFFFFFFULL -#define UNEVICTABLE_BIT (1UL << 18) - -static int lock_check(char *map) +static bool is_vma_lock_on_fault(unsigned long addr) { - unsigned long page_size = getpagesize(); - uint64_t page1_flags, page2_flags; + bool locked; + unsigned long vma_size, vma_rss; - page1_flags = get_pageflags((unsigned long)map); - page2_flags = get_pageflags((unsigned long)map + page_size); + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; - /* Both pages should be present */ - if (((page1_flags & PRESENT_BIT) == 0) || - ((page2_flags & PRESENT_BIT) == 0)) { - printf("Failed to make both pages present\n"); - return 1; - } + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); - page1_flags = get_kpageflags(page1_flags & PFN_MASK); - page2_flags = get_kpageflags(page2_flags & PFN_MASK); + /* only one page is faulted in */ + return (vma_rss < vma_size); +} - /* Both pages should be unevictable */ - if (((page1_flags & UNEVICTABLE_BIT) == 0) || - ((page2_flags & UNEVICTABLE_BIT) == 0)) { - printf("Failed to make both pages unevictable\n"); - return 1; - } +#define PRESENT_BIT 0x8000000000000000ULL +#define PFN_MASK 0x007FFFFFFFFFFFFFULL +#define UNEVICTABLE_BIT (1UL << 18) - if (!is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is missing on page 1\n", LOCKED); - return 1; - } +static int lock_check(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; - if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) { - printf("VMA flag %s is missing on page 2\n", LOCKED); - return 1; - } + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; - return 0; + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + return (vma_rss == vma_size); } static int unlock_lock_check(char *map) { - unsigned long page_size = getpagesize(); - uint64_t page1_flags, page2_flags; - - page1_flags = get_pageflags((unsigned long)map); - page2_flags = get_pageflags((unsigned long)map + page_size); - page1_flags = get_kpageflags(page1_flags & PFN_MASK); - page2_flags = get_kpageflags(page2_flags & PFN_MASK); - - if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) { - printf("A page is still marked unevictable after unlock\n"); - return 1; - } - if (is_vmflag_set((unsigned long)map, LOCKED)) { printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); return 1; } - if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) { - printf("VMA flag %s is present on page 2 after unlock\n", LOCKED); - return 1; - } - return 0; } @@ -311,7 +210,7 @@ static int test_mlock_lock() goto unmap; } - if (lock_check(map)) + if (!lock_check((unsigned long)map)) goto unmap; /* Now unlock and recheck attributes */ @@ -330,64 +229,18 @@ out: static int onfault_check(char *map) { - unsigned long page_size = getpagesize(); - uint64_t page1_flags, page2_flags; - - page1_flags = get_pageflags((unsigned long)map); - page2_flags = get_pageflags((unsigned long)map + page_size); - - /* Neither page should be present */ - if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) { - printf("Pages were made present by MLOCK_ONFAULT\n"); - return 1; - } - *map = 'a'; - page1_flags = get_pageflags((unsigned long)map); - page2_flags = get_pageflags((unsigned long)map + page_size); - - /* Only page 1 should be present */ - if ((page1_flags & PRESENT_BIT) == 0) { - printf("Page 1 is not present after fault\n"); - return 1; - } else if (page2_flags & PRESENT_BIT) { - printf("Page 2 was made present\n"); - return 1; - } - - page1_flags = get_kpageflags(page1_flags & PFN_MASK); - - /* Page 1 should be unevictable */ - if ((page1_flags & UNEVICTABLE_BIT) == 0) { - printf("Failed to make faulted page unevictable\n"); - return 1; - } - if (!is_vma_lock_on_fault((unsigned long)map)) { printf("VMA is not marked for lock on fault\n"); return 1; } - if (!is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is not marked for lock on fault\n"); - return 1; - } - return 0; } static int unlock_onfault_check(char *map) { unsigned long page_size = getpagesize(); - uint64_t page1_flags; - - page1_flags = get_pageflags((unsigned long)map); - page1_flags = get_kpageflags(page1_flags & PFN_MASK); - - if (page1_flags & UNEVICTABLE_BIT) { - printf("Page 1 is still marked unevictable after unlock\n"); - return 1; - } if (is_vma_lock_on_fault((unsigned long)map) || is_vma_lock_on_fault((unsigned long)map + page_size)) { @@ -445,7 +298,6 @@ static int test_lock_onfault_of_present() char *map; int ret = 1; unsigned long page_size = getpagesize(); - uint64_t page1_flags, page2_flags; map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -465,17 +317,6 @@ static int test_lock_onfault_of_present() goto unmap; } - page1_flags = get_pageflags((unsigned long)map); - page2_flags = get_pageflags((unsigned long)map + page_size); - page1_flags = get_kpageflags(page1_flags & PFN_MASK); - page2_flags = get_kpageflags(page2_flags & PFN_MASK); - - /* Page 1 should be unevictable */ - if ((page1_flags & UNEVICTABLE_BIT) == 0) { - printf("Failed to make present page unevictable\n"); - goto unmap; - } - if (!is_vma_lock_on_fault((unsigned long)map) || !is_vma_lock_on_fault((unsigned long)map + page_size)) { printf("VMA with present pages is not marked lock on fault\n"); @@ -507,7 +348,7 @@ static int test_munlockall() goto out; } - if (lock_check(map)) + if (!lock_check((unsigned long)map)) goto unmap; if (munlockall()) { @@ -549,7 +390,7 @@ static int test_munlockall() goto out; } - if (lock_check(map)) + if (!lock_check((unsigned long)map)) goto unmap; if (munlockall()) { diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c new file mode 100644 index 000000000000..3a7b5ef0b0c6 --- /dev/null +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Tests for mremap w/ MREMAP_DONTUNMAP. + * + * Copyright 2020, Brian Geffon <bgeffon@google.com> + */ +#define _GNU_SOURCE +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "../kselftest.h" + +#ifndef MREMAP_DONTUNMAP +#define MREMAP_DONTUNMAP 4 +#endif + +unsigned long page_size; +char *page_buffer; + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + dump_maps(); \ + exit(1); \ + } \ + } while (0) + +// Try a simple operation for to "test" for kernel support this prevents +// reporting tests as failed when it's run on an older kernel. +static int kernel_support_for_mremap_dontunmap() +{ + int ret = 0; + unsigned long num_pages = 1; + void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + // This simple remap should only fail if MREMAP_DONTUNMAP isn't + // supported. + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); + if (dest_mapping == MAP_FAILED) { + ret = errno; + } else { + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + } + + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return ret; +} + +// This helper will just validate that an entire mapping contains the expected +// byte. +static int check_region_contains_byte(void *addr, unsigned long size, char byte) +{ + BUG_ON(size & (page_size - 1), + "check_region_contains_byte expects page multiples"); + BUG_ON((unsigned long)addr & (page_size - 1), + "check_region_contains_byte expects page alignment"); + + memset(page_buffer, byte, page_size); + + unsigned long num_pages = size / page_size; + unsigned long i; + + // Compare each page checking that it contains our expected byte. + for (i = 0; i < num_pages; ++i) { + int ret = + memcmp(addr + (i * page_size), page_buffer, page_size); + if (ret) { + return ret; + } + } + + return 0; +} + +// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving +// the source mapping mapped. +static void mremap_dontunmap_simple() +{ + unsigned long num_pages = 5; + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates MREMAP_DONTUNMAP will move page tables to a specific +// destination using MREMAP_FIXED, also while validating that the source +// remains intact. +static void mremap_dontunmap_simple_fixed() +{ + unsigned long num_pages = 5; + + // Since we want to guarantee that we can remap to a point, we will + // create a mapping up front. + void *dest_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', num_pages * page_size); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + void *remapped_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, + dest_mapping); + BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); + BUG_ON(remapped_mapping != dest_mapping, + "mremap should have placed the remapped mapping at dest_mapping"); + + // The dest mapping will have been unmap by mremap so we expect the Xs + // to be gone and replaced with a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // And the source mapping will have had its ptes dropped. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can MREMAP_DONTUNMAP for a portion of an +// existing mapping. +static void mremap_dontunmap_partial_mapping() +{ + /* + * source mapping: + * -------------- + * | aaaaaaaaaa | + * -------------- + * to become: + * -------------- + * | aaaaa00000 | + * -------------- + * With the destination mapping containing 5 pages of As. + * --------- + * | aaaaa | + * --------- + */ + unsigned long num_pages = 10; + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', num_pages * page_size); + + // We will grab the last 5 pages of the source and move them. + void *dest_mapping = + mremap(source_mapping + (5 * page_size), 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // We expect the first 5 pages of the source to contain a's and the + // final 5 pages to contain zeros. + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != + 0, "first 5 pages of source should have original pages"); + BUG_ON(check_region_contains_byte + (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, + "final 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != + 0, "dest mapping should contain ptes from the source"); + + BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that we can remap over only a portion of a mapping. +static void mremap_dontunmap_partial_mapping_overwrite(void) +{ + /* + * source mapping: + * --------- + * |aaaaa| + * --------- + * dest mapping initially: + * ----------- + * |XXXXXXXXXX| + * ------------ + * Source to become: + * --------- + * |00000| + * --------- + * With the destination mapping containing 5 pages of As. + * ------------ + * |aaaaaXXXXX| + * ------------ + */ + void *source_mapping = + mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + memset(source_mapping, 'a', 5 * page_size); + + void *dest_mapping = + mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(dest_mapping == MAP_FAILED, "mmap"); + memset(dest_mapping, 'X', 10 * page_size); + + // We will grab the last 5 pages of the source and move them. + void *remapped_mapping = + mremap(source_mapping, 5 * page_size, + 5 * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); + + BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != + 0, "first 5 pages of source should have no ptes"); + + // Finally we expect the destination to have 5 pages worth of a's. + BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, + "dest mapping should contain ptes from the source"); + + // Finally the last 5 pages shouldn't have been touched. + BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), + 5 * page_size, 'X') != 0, + "dest mapping should have retained the last 5 pages"); + + BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, 5 * page_size) == -1, + "unable to unmap source mapping"); +} + +int main(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); + + // test for kernel support for MREMAP_DONTUNMAP skipping the test if + // not. + if (kernel_support_for_mremap_dontunmap() != 0) { + printf("No kernel support for MREMAP_DONTUNMAP\n"); + return KSFT_SKIP; + } + + // Keep a page sized buffer around for when we need it. + page_buffer = + mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); + + mremap_dontunmap_simple(); + mremap_dontunmap_simple_fixed(); + mremap_dontunmap_partial_mapping(); + mremap_dontunmap_partial_mapping_overwrite(); + + BUG_ON(munmap(page_buffer, page_size) == -1, + "unable to unmap page buffer"); + + printf("OK\n"); + return 0; +} diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new file mode 100644 index 000000000000..622a85848f61 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); + write_pkey_reg(pkey_reg); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..3be20f5d5275 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 480995bceefa..fc19addcb5c8 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt) * * There are examples in here of: * * how to set protection keys on memory - * * how to set/clear bits in PKRU (the rights register) - * * how to handle SEGV_PKRU signals and extract pkey-relevant + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant * information from the siginfo * * Things to add: @@ -22,8 +22,10 @@ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ #include <errno.h> #include <linux/futex.h> +#include <time.h> #include <sys/time.h> #include <sys/syscall.h> #include <string.h> @@ -48,34 +50,10 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -158,12 +136,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -174,7 +146,12 @@ static inline void __page_o_noops(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. */ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); @@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; -#ifdef __i386__ + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 + return (u32) get_pkey_bits(pkey_reg, pkey); +} -#else +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 + __write_pkey_reg(new_pkey_reg); -#endif + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} -void dump_mem(void *dumpme, int len_bytes) +void pkey_disable_set(int pkey, int flags) { - char *c = (void *)dumpme; - int i; + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); - } + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } /* Failed address bound checks: */ @@ -255,7 +315,7 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkru_faults; +int pkey_faults; int last_si_pkey = -1; void signal_handler(int signum, siginfo_t *si, void *vucontext) { @@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; - u32 *pkru_ptr; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. */ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -391,143 +466,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. - */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -561,33 +499,44 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -612,10 +561,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) @@ -638,8 +587,9 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -814,11 +772,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -826,13 +789,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } @@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: @@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -940,24 +905,28 @@ void expected_pk_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler shold have cleared out PKEY register to let the * test program continue. We now have to restore it. */ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. */ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. Should clear the @@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, @@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) @@ -1442,7 +1515,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1530,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,18 +1544,19 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; @@ -1495,7 +1569,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index f33714843198..a3f4f30f0a2e 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -59,7 +59,7 @@ else fi #filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64" +ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" if [ -z $ARCH ]; then ARCH=`uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/'` fi @@ -123,6 +123,28 @@ else echo "[PASS]" fi +echo "--------------------------------------------" +echo "running 'gup_benchmark -U' (normal/slow gup)" +echo "--------------------------------------------" +./gup_benchmark -U +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + +echo "------------------------------------------" +echo "running gup_benchmark -b (pin_user_pages)" +echo "------------------------------------------" +./gup_benchmark -b +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "-------------------" echo "running userfaultfd" echo "-------------------" @@ -270,4 +292,35 @@ else exitcode=1 fi +echo "------------------------------------" +echo "running MREMAP_DONTUNMAP smoke test" +echo "------------------------------------" +./mremap_dontunmap +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "running HMM smoke test" +echo "------------------------------------" +./test_hmm.sh smoke +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh new file mode 100755 index 000000000000..0647b525a625 --- /dev/null +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> +# +# This is a test script for the kernel test driver to analyse vmalloc +# allocator. Therefore it is just a kernel module loader. You can specify +# and pass different parameters in order to: +# a) analyse performance of vmalloc allocations; +# b) stressing and stability check of vmalloc subsystem. + +TEST_NAME="test_hmm" +DRIVER="test_hmm" + +# 1 if fails +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +check_test_requirements() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo "$0: Must be run as root" + exit $ksft_skip + fi + + if ! which modprobe > /dev/null 2>&1; then + echo "$0: You need modprobe installed" + exit $ksft_skip + fi + + if ! modinfo $DRIVER > /dev/null 2>&1; then + echo "$0: You must have the following enabled in your kernel:" + echo "CONFIG_TEST_HMM=m" + exit $ksft_skip + fi +} + +load_driver() +{ + modprobe $DRIVER > /dev/null 2>&1 + if [ $? == 0 ]; then + major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) + mknod /dev/hmm_dmirror0 c $major 0 + mknod /dev/hmm_dmirror1 c $major 1 + fi +} + +unload_driver() +{ + modprobe -r $DRIVER > /dev/null 2>&1 + rm -f /dev/hmm_dmirror? +} + +run_smoke() +{ + echo "Running smoke test. Note, this test provides basic coverage." + + load_driver + $(dirname "${BASH_SOURCE[0]}")/hmm-tests + unload_driver +} + +usage() +{ + echo -n "Usage: $0" + echo + echo "Example usage:" + echo + echo "# Shows help message" + echo "./${TEST_NAME}.sh" + echo + echo "# Smoke testing" + echo "./${TEST_NAME}.sh smoke" + echo + exit 0 +} + +function run_test() +{ + if [ $# -eq 0 ]; then + usage + else + if [ "$1" = "smoke" ]; then + run_smoke + else + usage + fi + fi +} + +check_test_requirements +run_test $@ + +exit 0 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d3362777a425..61e5cfeb1350 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -54,6 +54,7 @@ #include <linux/userfaultfd.h> #include <setjmp.h> #include <stdbool.h> +#include <assert.h> #include "../kselftest.h" @@ -76,6 +77,8 @@ static int test_type; #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; +/* Whether to test uffd write-protection */ +static bool test_uffdio_wp = false; static bool map_shared; static int huge_fd; @@ -86,6 +89,13 @@ static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; static char *zeropage; pthread_attr_t attr; +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; +}; + /* pthread_mutex_t starts at page offset 0 */ #define area_mutex(___area, ___nr) \ ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) @@ -125,6 +135,37 @@ static void usage(void) exit(1); } +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + } +} + +static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + } + + printf("userfaults: %llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b), %llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b)\n"); +} + static int anon_release_pages(char *rel_area) { int ret = 0; @@ -245,10 +286,15 @@ struct uffd_test_ops { void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); }; -#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ +#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ (1 << _UFFDIO_COPY) | \ (1 << _UFFDIO_ZEROPAGE)) +#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE) | \ + (1 << _UFFDIO_WRITEPROTECT)) + static struct uffd_test_ops anon_uffd_test_ops = { .expected_ioctls = ANON_EXPECTED_IOCTLS, .allocate_area = anon_allocate_area, @@ -257,7 +303,7 @@ static struct uffd_test_ops anon_uffd_test_ops = { }; static struct uffd_test_ops shmem_uffd_test_ops = { - .expected_ioctls = ANON_EXPECTED_IOCTLS, + .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = noop_alias_mapping, @@ -281,6 +327,21 @@ static int my_bcmp(char *str1, char *str2, size_t n) return 0; } +static void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms = { 0 }; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + fprintf(stderr, "clear WP failed for address 0x%Lx\n", + start), exit(1); +} + static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -419,7 +480,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) uffdio_copy.dst = (unsigned long) area_dst + offset; uffdio_copy.src = (unsigned long) area_src + offset; uffdio_copy.len = page_size; - uffdio_copy.mode = 0; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; uffdio_copy.copy = 0; if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ @@ -467,8 +531,8 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) return 0; } -/* Return 1 if page fault handled by us; otherwise 0 */ -static int uffd_handle_page_fault(struct uffd_msg *msg) +static void uffd_handle_page_fault(struct uffd_msg *msg, + struct uffd_stats *stats) { unsigned long offset; @@ -476,25 +540,32 @@ static int uffd_handle_page_fault(struct uffd_msg *msg) fprintf(stderr, "unexpected msg event %u\n", msg->event), exit(1); - if (bounces & BOUNCE_VERIFY && - msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - fprintf(stderr, "unexpected write fault\n"), exit(1); + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else { + /* Missing page faults */ + if (bounces & BOUNCE_VERIFY && + msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); - return copy_page(uffd, offset); + if (copy_page(uffd, offset)) + stats->missing_faults++; + } } static void *uffd_poll_thread(void *arg) { - unsigned long cpu = (unsigned long) arg; + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; struct pollfd pollfd[2]; struct uffd_msg msg; struct uffdio_register uffd_reg; int ret; char tmp_chr; - unsigned long userfaults = 0; pollfd[0].fd = uffd; pollfd[0].events = POLLIN; @@ -524,7 +595,7 @@ static void *uffd_poll_thread(void *arg) msg.event), exit(1); break; case UFFD_EVENT_PAGEFAULT: - userfaults += uffd_handle_page_fault(&msg); + uffd_handle_page_fault(&msg, stats); break; case UFFD_EVENT_FORK: close(uffd); @@ -543,50 +614,67 @@ static void *uffd_poll_thread(void *arg) break; } } - return (void *)userfaults; + + return NULL; } pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg) { - unsigned long *this_cpu_userfaults; + struct uffd_stats *stats = (struct uffd_stats *)arg; struct uffd_msg msg; - this_cpu_userfaults = (unsigned long *) arg; - *this_cpu_userfaults = 0; - pthread_mutex_unlock(&uffd_read_mutex); /* from here cancellation is ok */ for (;;) { if (uffd_read_msg(uffd, &msg)) continue; - (*this_cpu_userfaults) += uffd_handle_page_fault(&msg); + uffd_handle_page_fault(&msg, stats); } - return (void *)NULL; + + return NULL; } static void *background_thread(void *arg) { unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; - for (page_nr = cpu * nr_pages_per_cpu; - page_nr < (cpu+1) * nr_pages_per_cpu; - page_nr++) + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) copy_page_retry(uffd, page_nr * page_size); return NULL; } -static int stress(unsigned long *userfaults) +static int stress(struct uffd_stats *uffd_stats) { unsigned long cpu; pthread_t locking_threads[nr_cpus]; pthread_t uffd_threads[nr_cpus]; pthread_t background_threads[nr_cpus]; - void **_userfaults = (void **) userfaults; finished = 0; for (cpu = 0; cpu < nr_cpus; cpu++) { @@ -595,12 +683,13 @@ static int stress(unsigned long *userfaults) return 1; if (bounces & BOUNCE_POLL) { if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, (void *)cpu)) + uffd_poll_thread, + (void *)&uffd_stats[cpu])) return 1; } else { if (pthread_create(&uffd_threads[cpu], &attr, uffd_read_thread, - &_userfaults[cpu])) + (void *)&uffd_stats[cpu])) return 1; pthread_mutex_lock(&uffd_read_mutex); } @@ -637,7 +726,8 @@ static int stress(unsigned long *userfaults) fprintf(stderr, "pipefd write error\n"); return 1; } - if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) return 1; } else { if (pthread_cancel(uffd_threads[cpu])) @@ -735,17 +825,31 @@ static int faulting_process(int signal_test) } for (nr = 0; nr < split_nr_pages; nr++) { + int steps = 1; + unsigned long offset = nr * page_size; + if (signal_test) { if (sigsetjmp(*sigbuf, 1) != 0) { - if (nr == lastnr) { + if (steps == 1 && nr == lastnr) { fprintf(stderr, "Signal repeated\n"); return 1; } lastnr = nr; if (signal_test == 1) { - if (copy_page(uffd, nr * page_size)) - signalled++; + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } } else { signalled++; continue; @@ -758,8 +862,13 @@ static int faulting_process(int signal_test) fprintf(stderr, "nr %lu memory corruption %Lu %Lu\n", nr, count, - count_verify[nr]), exit(1); - } + count_verify[nr]); + } + /* + * Trigger write protection if there is by writting + * the same value back. + */ + *area_count(area_dst, nr) = count; } if (signal_test) @@ -781,6 +890,11 @@ static int faulting_process(int signal_test) nr, count, count_verify[nr]), exit(1); } + /* + * Trigger write protection if there is by writting + * the same value back. + */ + *area_count(area_dst, nr) = count; } if (uffd_test_ops->release_pages(area_dst)) @@ -884,6 +998,8 @@ static int userfaultfd_zeropage_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -908,11 +1024,11 @@ static int userfaultfd_events_test(void) { struct uffdio_register uffdio_register; unsigned long expected_ioctls; - unsigned long userfaults; pthread_t uffd_mon; int err, features; pid_t pid; char c; + struct uffd_stats stats = { 0 }; printf("testing events (fork, remap, remove): "); fflush(stdout); @@ -929,6 +1045,8 @@ static int userfaultfd_events_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -939,7 +1057,7 @@ static int userfaultfd_events_test(void) "unexpected missing ioctl for anon memory\n"), exit(1); - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) perror("uffd_poll_thread create"), exit(1); pid = fork(); @@ -955,13 +1073,14 @@ static int userfaultfd_events_test(void) if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) perror("pipe write"), exit(1); - if (pthread_join(uffd_mon, (void **)&userfaults)) + if (pthread_join(uffd_mon, NULL)) return 1; close(uffd); - printf("userfaults: %ld\n", userfaults); - return userfaults != nr_pages; + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; } static int userfaultfd_sig_test(void) @@ -973,6 +1092,7 @@ static int userfaultfd_sig_test(void) int err, features; pid_t pid; char c; + struct uffd_stats stats = { 0 }; printf("testing signal delivery: "); fflush(stdout); @@ -988,6 +1108,8 @@ static int userfaultfd_sig_test(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); @@ -1004,7 +1126,7 @@ static int userfaultfd_sig_test(void) if (uffd_test_ops->release_pages(area_dst)) return 1; - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL)) + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) perror("uffd_poll_thread create"), exit(1); pid = fork(); @@ -1030,6 +1152,7 @@ static int userfaultfd_sig_test(void) close(uffd); return userfaults != 0; } + static int userfaultfd_stress(void) { void *area; @@ -1038,7 +1161,7 @@ static int userfaultfd_stress(void) struct uffdio_register uffdio_register; unsigned long cpu; int err; - unsigned long userfaults[nr_cpus]; + struct uffd_stats uffd_stats[nr_cpus]; uffd_test_ops->allocate_area((void **)&area_src); if (!area_src) @@ -1119,6 +1242,8 @@ static int userfaultfd_stress(void) uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { fprintf(stderr, "register failure\n"); return 1; @@ -1167,10 +1292,17 @@ static int userfaultfd_stress(void) if (uffd_test_ops->release_pages(area_dst)) return 1; + uffd_stats_reset(uffd_stats, nr_cpus); + /* bounce pass */ - if (stress(userfaults)) + if (stress(uffd_stats)) return 1; + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + /* unregister */ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { fprintf(stderr, "unregister failure\n"); @@ -1209,10 +1341,7 @@ static int userfaultfd_stress(void) area_src_alias = area_dst_alias; area_dst_alias = tmp_area; - printf("userfaults:"); - for (cpu = 0; cpu < nr_cpus; cpu++) - printf(" %lu", userfaults[cpu]); - printf("\n"); + uffd_stats_report(uffd_stats, nr_cpus); } if (err) @@ -1252,6 +1381,8 @@ static void set_test_type(const char *type) if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; + /* Only enable write-protect test for anonymous test */ + test_uffdio_wp = true; } else if (!strcmp(type, "hugetlb")) { test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh new file mode 100644 index 000000000000..d3d0d108924d --- /dev/null +++ b/tools/testing/selftests/vm/write_hugetlb_memory.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +set -e + +size=$1 +populate=$2 +write=$3 +cgroup=$4 +path=$5 +method=$6 +private=$7 +want_sleep=$8 +reserve=$9 + +echo "Putting task in cgroup '$cgroup'" +echo $$ > /dev/cgroup/memory/"$cgroup"/cgroup.procs + +echo "Method is $method" + +set +e +./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ + "$private" "$want_sleep" "$reserve" diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c new file mode 100644 index 000000000000..6a2caba19ee1 --- /dev/null +++ b/tools/testing/selftests/vm/write_to_hugetlbfs.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This program reserves and uses hugetlb memory, supporting a bunch of + * scenarios needed by the charged_reserved_hugetlb.sh test. + */ + +#include <err.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <sys/mman.h> + +/* Global definitions. */ +enum method { + HUGETLBFS, + MMAP_MAP_HUGETLB, + SHM, + MAX_METHOD +}; + + +/* Global variables. */ +static const char *self; +static char *shmaddr; +static int shmid; + +/* + * Show usage and exit. + */ +static void exit_usage(void) +{ + printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> " + "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " + "[-o] [-w] [-n]\n", + self); + exit(EXIT_FAILURE); +} + +void sig_handler(int signo) +{ + printf("Received %d.\n", signo); + if (signo == SIGINT) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + printf("Done deleting the memory\n"); + } + exit(2); +} + +int main(int argc, char **argv) +{ + int fd = 0; + int key = 0; + int *ptr = NULL; + int c = 0; + int size = 0; + char path[256] = ""; + enum method method = MAX_METHOD; + int want_sleep = 0, private = 0; + int populate = 0; + int write = 0; + int reserve = 1; + + if (signal(SIGINT, sig_handler) == SIG_ERR) + err(1, "\ncan't catch SIGINT\n"); + + /* Parse command-line arguments. */ + setvbuf(stdout, NULL, _IONBF, 0); + self = argv[0]; + + while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { + switch (c) { + case 's': + size = atoi(optarg); + break; + case 'p': + strncpy(path, optarg, sizeof(path)); + break; + case 'm': + if (atoi(optarg) >= MAX_METHOD) { + errno = EINVAL; + perror("Invalid -m."); + exit_usage(); + } + method = atoi(optarg); + break; + case 'o': + populate = 1; + break; + case 'w': + write = 1; + break; + case 'l': + want_sleep = 1; + break; + case 'r': + private + = 1; + break; + case 'n': + reserve = 0; + break; + default: + errno = EINVAL; + perror("Invalid arg"); + exit_usage(); + } + } + + if (strncmp(path, "", sizeof(path)) != 0) { + printf("Writing to this path: %s\n", path); + } else { + errno = EINVAL; + perror("path not found"); + exit_usage(); + } + + if (size != 0) { + printf("Writing this size: %d\n", size); + } else { + errno = EINVAL; + perror("size not found"); + exit_usage(); + } + + if (!populate) + printf("Not populating.\n"); + else + printf("Populating.\n"); + + if (!write) + printf("Not writing to memory.\n"); + + if (method == MAX_METHOD) { + errno = EINVAL; + perror("-m Invalid"); + exit_usage(); + } else + printf("Using method=%d\n", method); + + if (!private) + printf("Shared mapping.\n"); + else + printf("Private mapping.\n"); + + if (!reserve) + printf("NO_RESERVE mapping.\n"); + else + printf("RESERVE mapping.\n"); + + switch (method) { + case HUGETLBFS: + printf("Allocating using HUGETLBFS.\n"); + fd = open(path, O_CREAT | O_RDWR, 0777); + if (fd == -1) + err(1, "Failed to open file."); + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? MAP_PRIVATE : MAP_SHARED) | + (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + fd, 0); + + if (ptr == MAP_FAILED) { + close(fd); + err(1, "Error mapping the file"); + } + break; + case MMAP_MAP_HUGETLB: + printf("Allocating using MAP_HUGETLB.\n"); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : + MAP_SHARED) | + MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + -1, 0); + + if (ptr == MAP_FAILED) + err(1, "mmap"); + + printf("Returned address is %p\n", ptr); + break; + case SHM: + printf("Allocating using SHM.\n"); + shmid = shmget(key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + shmid = shmget(++key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) + err(1, "shmget"); + } + printf("shmid: 0x%x, shmget key:%d\n", shmid, key); + + ptr = shmat(shmid, NULL, 0); + if (ptr == (int *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", ptr); + + break; + default: + errno = EINVAL; + err(1, "Invalid method."); + } + + if (write) { + printf("Writing to memory.\n"); + memset(ptr, 1, size); + } + + if (want_sleep) { + /* Signal to caller that we're done. */ + printf("DONE\n"); + + /* Hold memory until external kill signal is delivered. */ + while (1) + sleep(100); + } + + if (method == HUGETLBFS) + close(fd); + + return 0; +} diff --git a/tools/testing/selftests/watchdog/.gitignore b/tools/testing/selftests/watchdog/.gitignore index 5aac51575c7e..61d7b89cdbca 100644 --- a/tools/testing/selftests/watchdog/.gitignore +++ b/tools/testing/selftests/watchdog/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only watchdog-test diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 936e1ca9410e..17a1f53ceba0 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -48,8 +48,11 @@ cleanup() { exec 2>/dev/null printf "$orig_message_cost" > /proc/sys/net/core/message_cost ip0 link del dev wg0 + ip0 link del dev wg1 ip1 link del dev wg0 + ip1 link del dev wg1 ip2 link del dev wg0 + ip2 link del dev wg1 local to_kill="$(ip netns pids $netns0) $(ip netns pids $netns1) $(ip netns pids $netns2)" [[ -n $to_kill ]] && kill $to_kill pp ip netns del $netns1 @@ -77,18 +80,20 @@ ip0 link set wg0 netns $netns2 key1="$(pp wg genkey)" key2="$(pp wg genkey)" key3="$(pp wg genkey)" +key4="$(pp wg genkey)" pub1="$(pp wg pubkey <<<"$key1")" pub2="$(pp wg pubkey <<<"$key2")" pub3="$(pp wg pubkey <<<"$key3")" +pub4="$(pp wg pubkey <<<"$key4")" psk="$(pp wg genpsk)" [[ -n $key1 && -n $key2 && -n $psk ]] configure_peers() { ip1 addr add 192.168.241.1/24 dev wg0 - ip1 addr add fd00::1/24 dev wg0 + ip1 addr add fd00::1/112 dev wg0 ip2 addr add 192.168.241.2/24 dev wg0 - ip2 addr add fd00::2/24 dev wg0 + ip2 addr add fd00::2/112 dev wg0 n1 wg set wg0 \ private-key <(echo "$key1") \ @@ -230,9 +235,38 @@ n1 ping -W 1 -c 1 192.168.241.2 n1 wg set wg0 private-key <(echo "$key3") n2 wg set wg0 peer "$pub3" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32 peer "$pub1" remove n1 ping -W 1 -c 1 192.168.241.2 +n2 wg set wg0 peer "$pub3" remove + +# Test that we can route wg through wg +ip1 addr flush dev wg0 +ip2 addr flush dev wg0 +ip1 addr add fd00::5:1/112 dev wg0 +ip2 addr add fd00::5:2/112 dev wg0 +n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips fd00::5:2/128 endpoint 127.0.0.1:2 +n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips fd00::5:1/128 endpoint 127.212.121.99:9998 +ip1 link add wg1 type wireguard +ip2 link add wg1 type wireguard +ip1 addr add 192.168.241.1/24 dev wg1 +ip1 addr add fd00::1/112 dev wg1 +ip2 addr add 192.168.241.2/24 dev wg1 +ip2 addr add fd00::2/112 dev wg1 +ip1 link set mtu 1340 up dev wg1 +ip2 link set mtu 1340 up dev wg1 +n1 wg set wg1 listen-port 5 private-key <(echo "$key3") peer "$pub4" allowed-ips 192.168.241.2/32,fd00::2/128 endpoint [fd00::5:2]:5 +n2 wg set wg1 listen-port 5 private-key <(echo "$key4") peer "$pub3" allowed-ips 192.168.241.1/32,fd00::1/128 endpoint [fd00::5:1]:5 +tests +# Try to set up a routing loop between the two namespaces +ip1 link set netns $netns0 dev wg1 +ip0 addr add 192.168.241.1/24 dev wg1 +ip0 link set up dev wg1 +n0 ping -W 1 -c 1 192.168.241.2 +n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7 +ip2 link del wg0 +ip2 link del wg1 +! n0 ping -W 1 -c 10 -f 192.168.241.2 || false # Should not crash kernel +ip0 link del wg1 ip1 link del wg0 -ip2 link del wg0 # Test using NAT. We now change the topology to this: # ┌────────────────────────────────────────┐ ┌────────────────────────────────────────────────┐ ┌────────────────────────────────────────┐ @@ -282,6 +316,20 @@ pp sleep 3 n2 ping -W 1 -c 1 192.168.241.1 n1 wg set wg0 peer "$pub2" persistent-keepalive 0 +# Test that onion routing works, even when it loops +n1 wg set wg0 peer "$pub3" allowed-ips 192.168.242.2/32 endpoint 192.168.241.2:5 +ip1 addr add 192.168.242.1/24 dev wg0 +ip2 link add wg1 type wireguard +ip2 addr add 192.168.242.2/24 dev wg1 +n2 wg set wg1 private-key <(echo "$key3") listen-port 5 peer "$pub1" allowed-ips 192.168.242.1/32 +ip2 link set wg1 up +n1 ping -W 1 -c 1 192.168.242.2 +ip2 link del wg1 +n1 wg set wg0 peer "$pub3" endpoint 192.168.242.2:5 +! n1 ping -W 1 -c 1 192.168.242.2 || false # Should not crash kernel +n1 wg set wg0 peer "$pub3" remove +ip1 addr del 192.168.242.1/24 dev wg0 + # Do a wg-quick(8)-style policy routing for the default route, making sure vethc has a v6 address to tease out bugs. ip1 -6 addr add fc00::9/96 dev vethc ip1 -6 route add default via fc00::1 diff --git a/tools/testing/selftests/wireguard/qemu/.gitignore b/tools/testing/selftests/wireguard/qemu/.gitignore index 415b542a9d59..bfa15e6feb2f 100644 --- a/tools/testing/selftests/wireguard/qemu/.gitignore +++ b/tools/testing/selftests/wireguard/qemu/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only build/ distfiles/ diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 90598a425c18..4bdd6c1a19d3 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -44,7 +44,7 @@ endef $(eval $(call tar_download,MUSL,musl,1.2.0,.tar.gz,https://musl.libc.org/releases/,c6de7b191139142d3f9a7b5b702c9cae1b5ee6e7f57e582da9328629408fd4e8)) $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) -$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) +$(eval $(call tar_download,IPROUTE2,iproute2,5.6.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,1b5b0e25ce6e23da7526ea1da044e814ad85ba761b10dd29c2b027c056b04692)) $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c)) $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa)) $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) diff --git a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config index 990c510a9cfa..f52f1e2bc7f6 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config +++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config @@ -10,3 +10,4 @@ CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=hvc0 wg.success=hvc1" CONFIG_SECTION_MISMATCH_WARN_ONLY=y CONFIG_FRAME_WARN=1280 +CONFIG_THREAD_SHIFT=14 diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 5909e7ef2a5c..b50c2085c1ac 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -25,7 +25,6 @@ CONFIG_KASAN=y CONFIG_KASAN_INLINE=y CONFIG_UBSAN=y CONFIG_UBSAN_SANITIZE_ALL=y -CONFIG_UBSAN_NO_ALIGNMENT=y CONFIG_UBSAN_NULL=y CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192 @@ -58,7 +57,6 @@ CONFIG_RCU_EQS_DEBUG=y CONFIG_USER_STACKTRACE_SUPPORT=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DOUBLEFAULT=y CONFIG_X86_DEBUG_FPU=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_DEBUG_PAGEALLOC=y diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 7757f73ff9a3..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only *_32 *_64 single_step_syscall @@ -11,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall mov_ss_trap \ syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. - */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - dprintf4("pkru now: %08x\n", rdpkru()); - wrpkru(pkru); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - long pkru = rdpkru(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - wrpkru(pkru); - dprintf4("pkru now: %08x\n", rdpkru()); -} - -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKRU_BIT (9) -#define XSTATE_PKRU 0x200 - -int pkru_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKRU is set in XCR0 */ - leaf = XSTATE_PKRU_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKRU_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKRU in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 6f22238f3217..12aaa063196e 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -414,8 +414,12 @@ int main() #if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16) vsyscall32 = (void *)getauxval(AT_SYSINFO); - printf("[RUN]\tCheck AT_SYSINFO return regs\n"); - test_sys32_regs(do_full_vsyscall32); + if (vsyscall32) { + printf("[RUN]\tCheck AT_SYSINFO return regs\n"); + test_sys32_regs(do_full_vsyscall32); + } else { + printf("[SKIP]\tAT_SYSINFO is not available\n"); + } #endif test_ptrace_syscall_restart(); diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 35edd61d1663..42052db0f870 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -259,6 +259,11 @@ static void test_one_clock_gettime(int clock, const char *name) static void test_clock_gettime(void) { + if (!vdso_clock_gettime) { + printf("[SKIP]\tNo vDSO, so skipping clock_gettime() tests\n"); + return; + } + for (int clock = 0; clock < sizeof(clocknames) / sizeof(clocknames[0]); clock++) { test_one_clock_gettime(clock, clocknames[clock]); diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c index 29a5c94c4b50..fe99f2434155 100644 --- a/tools/testing/selftests/x86/vdso_restorer.c +++ b/tools/testing/selftests/x86/vdso_restorer.c @@ -15,6 +15,7 @@ #include <err.h> #include <stdio.h> +#include <dlfcn.h> #include <string.h> #include <signal.h> #include <unistd.h> @@ -46,11 +47,23 @@ int main() int nerrs = 0; struct real_sigaction sa; + void *vdso = dlopen("linux-vdso.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + printf("[SKIP]\tFailed to find vDSO. Tests are not expected to work.\n"); + return 0; + } + memset(&sa, 0, sizeof(sa)); sa.handler = handler_with_siginfo; sa.flags = SA_SIGINFO; sa.restorer = NULL; /* request kernel-provided restorer */ + printf("[RUN]\tRaise a signal, SA_SIGINFO, sa.restorer == NULL\n"); + if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0) err(1, "raw rt_sigaction syscall"); @@ -63,6 +76,8 @@ int main() nerrs++; } + printf("[RUN]\tRaise a signal, !SA_SIGINFO, sa.restorer == NULL\n"); + sa.flags = 0; sa.handler = handler_without_siginfo; if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0) |