summaryrefslogtreecommitdiff
path: root/tools/testing/selftests
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing/selftests')
-rw-r--r--tools/testing/selftests/Makefile2
-rw-r--r--tools/testing/selftests/bpf/.gitignore1
-rw-r--r--tools/testing/selftests/bpf/Makefile5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/recursive_attach.c67
-rw-r--r--tools/testing/selftests/bpf/prog_tests/snprintf.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_sysctl.c (renamed from tools/testing/selftests/bpf/test_sysctl.c)37
-rw-r--r--tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c6
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_map_resize.c16
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_vfs_accept.c18
-rw-r--r--tools/testing/selftests/bpf/progs/verifier_vfs_reject.c15
-rw-r--r--tools/testing/selftests/bpf/test_kmods/bpf_testmod.c6
-rw-r--r--tools/testing/selftests/bpf/test_lru_map.c105
-rw-r--r--tools/testing/selftests/cgroup/Makefile21
-rw-r--r--tools/testing/selftests/cgroup/lib/cgroup_util.c (renamed from tools/testing/selftests/cgroup/cgroup_util.c)118
-rw-r--r--tools/testing/selftests/cgroup/lib/include/cgroup_util.h (renamed from tools/testing/selftests/cgroup/cgroup_util.h)13
-rw-r--r--tools/testing/selftests/cgroup/lib/libcgroup.mk19
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c100
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c5
-rw-r--r--tools/testing/selftests/damon/Makefile2
-rw-r--r--tools/testing/selftests/damon/_chk_dependency.sh52
-rw-r--r--tools/testing/selftests/damon/_damon_sysfs.py9
-rw-r--r--tools/testing/selftests/damon/_debugfs_common.sh64
-rw-r--r--tools/testing/selftests/drivers/net/hw/config5
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/rss_ctx.py59
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/rss_input_xfrm.py2
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/tso.py4
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/peer.sh3
-rw-r--r--tools/testing/selftests/filesystems/anon_inode_test.c2
-rw-r--r--tools/testing/selftests/filesystems/eventfd/eventfd_test.c7
-rw-r--r--tools/testing/selftests/filesystems/file_stressor.c2
-rw-r--r--tools/testing/selftests/futex/functional/.gitignore1
-rw-r--r--tools/testing/selftests/futex/functional/futex_numa_mpol.c10
-rw-r--r--tools/testing/selftests/futex/functional/futex_priv_hash.c2
-rw-r--r--tools/testing/selftests/futex/include/futex2test.h8
-rw-r--r--tools/testing/selftests/hid/tests/test_mouse.py70
-rw-r--r--tools/testing/selftests/iommu/iommufd.c40
-rw-r--r--tools/testing/selftests/iommu/iommufd_utils.h9
-rw-r--r--tools/testing/selftests/kmod/config5
-rw-r--r--tools/testing/selftests/kvm/Makefile.kvm6
-rw-r--r--tools/testing/selftests/kvm/access_tracking_perf_test.c281
-rw-r--r--tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c55
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h35
-rw-r--r--tools/testing/selftests/kvm/include/lru_gen_util.h51
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h1
-rw-r--r--tools/testing/selftests/kvm/include/x86/processor.h1
-rw-r--r--tools/testing/selftests/kvm/include/x86/sev.h53
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c21
-rw-r--r--tools/testing/selftests/kvm/lib/lru_gen_util.c387
-rw-r--r--tools/testing/selftests/kvm/lib/test_util.c42
-rw-r--r--tools/testing/selftests/kvm/lib/x86/processor.c4
-rw-r--r--tools/testing/selftests/kvm/lib/x86/sev.c76
-rw-r--r--tools/testing/selftests/kvm/x86/fastops_test.c165
-rw-r--r--tools/testing/selftests/kvm/x86/hyperv_cpuid.c21
-rw-r--r--tools/testing/selftests/kvm/x86/kvm_buslock_test.c135
-rw-r--r--tools/testing/selftests/kvm/x86/monitor_mwait_test.c1
-rw-r--r--tools/testing/selftests/kvm/x86/sev_init2_tests.c13
-rw-r--r--tools/testing/selftests/kvm/x86/sev_smoke_test.c75
-rw-r--r--tools/testing/selftests/mm/.gitignore2
-rw-r--r--tools/testing/selftests/mm/Makefile2
-rw-r--r--tools/testing/selftests/mm/config3
-rw-r--r--tools/testing/selftests/mm/cow.c340
-rw-r--r--tools/testing/selftests/mm/guard-regions.c74
-rw-r--r--tools/testing/selftests/mm/gup_longterm.c157
-rwxr-xr-xtools/testing/selftests/mm/hugetlb_reparenting_test.sh98
-rw-r--r--tools/testing/selftests/mm/ksm_tests.c32
-rw-r--r--tools/testing/selftests/mm/madv_populate.c18
-rw-r--r--tools/testing/selftests/mm/map_fixed_noreplace.c2
-rw-r--r--tools/testing/selftests/mm/merge.c501
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c2
-rw-r--r--tools/testing/selftests/mm/pagemap_ioctl.c16
-rw-r--r--tools/testing/selftests/mm/pfnmap.c249
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh8
-rw-r--r--tools/testing/selftests/mm/settings2
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c10
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c202
-rwxr-xr-xtools/testing/selftests/mm/va_high_addr_switch.sh26
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c7
-rw-r--r--tools/testing/selftests/mm/vm_util.c100
-rw-r--r--tools/testing/selftests/mm/vm_util.h43
-rw-r--r--tools/testing/selftests/mount_setattr/mount_setattr_test.c17
-rw-r--r--tools/testing/selftests/net/.gitignore1
-rw-r--r--tools/testing/selftests/net/Makefile3
-rw-r--r--tools/testing/selftests/net/af_unix/msg_oob.c142
-rwxr-xr-xtools/testing/selftests/net/gre_ipv6_lladdr.sh27
-rw-r--r--tools/testing/selftests/net/lib.sh2
-rwxr-xr-xtools/testing/selftests/net/nat6to4.sh15
-rw-r--r--tools/testing/selftests/net/netfilter/.gitignore1
-rw-r--r--tools/testing/selftests/net/netfilter/Makefile3
-rwxr-xr-xtools/testing/selftests/net/netfilter/conntrack_clash.sh175
-rwxr-xr-xtools/testing/selftests/net/netfilter/conntrack_resize.sh97
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_concat_range.sh105
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_nat.sh81
-rw-r--r--tools/testing/selftests/net/netfilter/udpclash.c158
-rw-r--r--tools/testing/selftests/net/ovpn/ovpn-cli.c1
-rwxr-xr-xtools/testing/selftests/net/ovpn/test-large-mtu.sh9
-rw-r--r--tools/testing/selftests/net/packetdrill/tcp_ooo-before-and-after-accept.pkt53
-rw-r--r--tools/testing/selftests/net/tfo.c171
-rwxr-xr-xtools/testing/selftests/net/tfo_passive.sh112
-rwxr-xr-xtools/testing/selftests/net/udpgro.sh8
-rwxr-xr-xtools/testing/selftests/net/vlan_hw_filter.sh98
-rw-r--r--tools/testing/selftests/ptrace/Makefile2
-rw-r--r--tools/testing/selftests/ptrace/set_syscall_info.c519
-rw-r--r--tools/testing/selftests/sched_ext/exit.c8
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json129
-rw-r--r--tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c2
-rw-r--r--tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c4
-rw-r--r--tools/testing/selftests/ublk/Makefile1
-rw-r--r--tools/testing/selftests/ublk/fault_inject.c4
-rw-r--r--tools/testing/selftests/ublk/file_backed.c20
-rw-r--r--tools/testing/selftests/ublk/kublk.c374
-rw-r--r--tools/testing/selftests/ublk/kublk.h73
-rw-r--r--tools/testing/selftests/ublk/null.c22
-rw-r--r--tools/testing/selftests/ublk/stripe.c17
-rwxr-xr-xtools/testing/selftests/ublk/test_common.sh5
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_12.sh55
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_03.sh11
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_04.sh7
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_05.sh7
-rw-r--r--tools/testing/selftests/ublk/trace/count_ios_per_tid.bt11
-rw-r--r--tools/testing/selftests/vDSO/vgetrandom-chacha.S2
-rw-r--r--tools/testing/selftests/x86/Makefile2
-rw-r--r--tools/testing/selftests/x86/sigtrap_loop.c101
122 files changed, 5844 insertions, 1040 deletions
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 6aa11cd3db42..339b31e6a6b5 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -205,7 +205,7 @@ export KHDR_INCLUDES
all:
@ret=1; \
- for TARGET in $(TARGETS); do \
+ for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET; \
mkdir $$BUILD_TARGET -p; \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET \
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index e2a2c46c008b..3d8378972d26 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -21,7 +21,6 @@ test_lirc_mode2_user
flow_dissector_load
test_tcpnotify_user
test_libbpf
-test_sysctl
xdping
test_cpp
*.d
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index cf5ed3bee573..910d8d6402ef 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -73,7 +73,7 @@ endif
# Order correspond to 'make run_tests' order
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
test_sockmap \
- test_tcpnotify_user test_sysctl \
+ test_tcpnotify_user \
test_progs-no_alu32
TEST_INST_SUBDIRS := no_alu32
@@ -220,7 +220,7 @@ ifeq ($(VMLINUX_BTF),)
$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
endif
-# Define simple and short `make test_progs`, `make test_sysctl`, etc targets
+# Define simple and short `make test_progs`, `make test_maps`, etc targets
# to build individual tests.
# NOTE: Semicolon at the end is critical to override lib.mk's default static
# rule for binaries.
@@ -329,7 +329,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o
$(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS)
$(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS)
$(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS)
-$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS)
$(OUTPUT)/test_tag: $(TESTING_HELPERS)
$(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS)
$(OUTPUT)/xdping: $(TESTING_HELPERS)
diff --git a/tools/testing/selftests/bpf/prog_tests/recursive_attach.c b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c
index 8100509e561b..0ffa01d54ce2 100644
--- a/tools/testing/selftests/bpf/prog_tests/recursive_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/recursive_attach.c
@@ -149,3 +149,70 @@ close_prog:
fentry_recursive_target__destroy(target_skel);
fentry_recursive__destroy(tracing_skel);
}
+
+static void *fentry_target_test_run(void *arg)
+{
+ for (;;) {
+ int prog_fd = __atomic_load_n((int *)arg, __ATOMIC_SEQ_CST);
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
+ int err;
+
+ if (prog_fd == -1)
+ break;
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ if (!ASSERT_OK(err, "fentry_target test_run"))
+ break;
+ }
+
+ return NULL;
+}
+
+void test_fentry_attach_stress(void)
+{
+ struct fentry_recursive_target *target_skel = NULL;
+ struct fentry_recursive *tracing_skel = NULL;
+ struct bpf_program *prog;
+ int err, i, tgt_prog_fd;
+ pthread_t thread;
+
+ target_skel = fentry_recursive_target__open_and_load();
+ if (!ASSERT_OK_PTR(target_skel,
+ "fentry_recursive_target__open_and_load"))
+ goto close_prog;
+ tgt_prog_fd = bpf_program__fd(target_skel->progs.fentry_target);
+ err = pthread_create(&thread, NULL,
+ fentry_target_test_run, &tgt_prog_fd);
+ if (!ASSERT_OK(err, "bpf_program__set_attach_target"))
+ goto close_prog;
+
+ for (i = 0; i < 1000; i++) {
+ tracing_skel = fentry_recursive__open();
+ if (!ASSERT_OK_PTR(tracing_skel, "fentry_recursive__open"))
+ goto stop_thread;
+
+ prog = tracing_skel->progs.recursive_attach;
+ err = bpf_program__set_attach_target(prog, tgt_prog_fd,
+ "fentry_target");
+ if (!ASSERT_OK(err, "bpf_program__set_attach_target"))
+ goto stop_thread;
+
+ err = fentry_recursive__load(tracing_skel);
+ if (!ASSERT_OK(err, "fentry_recursive__load"))
+ goto stop_thread;
+
+ err = fentry_recursive__attach(tracing_skel);
+ if (!ASSERT_OK(err, "fentry_recursive__attach"))
+ goto stop_thread;
+
+ fentry_recursive__destroy(tracing_skel);
+ tracing_skel = NULL;
+ }
+
+stop_thread:
+ __atomic_store_n(&tgt_prog_fd, -1, __ATOMIC_SEQ_CST);
+ err = pthread_join(thread, NULL);
+ ASSERT_OK(err, "pthread_join");
+close_prog:
+ fentry_recursive__destroy(tracing_skel);
+ fentry_recursive_target__destroy(target_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c
index 4be6fdb78c6a..594441acb707 100644
--- a/tools/testing/selftests/bpf/prog_tests/snprintf.c
+++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c
@@ -116,6 +116,8 @@ static void test_snprintf_negative(void)
ASSERT_ERR(load_single_snprintf("%llc"), "invalid specifier 7");
ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character");
ASSERT_ERR(load_single_snprintf("\x1"), "non printable character");
+ ASSERT_ERR(load_single_snprintf("%p%"), "invalid specifier 8");
+ ASSERT_ERR(load_single_snprintf("%s%"), "invalid specifier 9");
}
void test_snprintf(void)
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c
index bcdbd27f22f0..273dd41ca09e 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c
@@ -1,22 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <linux/filter.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include <bpf/bpf_endian.h>
-#include "bpf_util.h"
+#include "test_progs.h"
#include "cgroup_helpers.h"
-#include "testing_helpers.h"
#define CG_PATH "/foo"
#define MAX_INSNS 512
@@ -1608,26 +1594,19 @@ static int run_tests(int cgfd)
return fails ? -1 : 0;
}
-int main(int argc, char **argv)
+void test_sysctl(void)
{
- int cgfd = -1;
- int err = 0;
+ int cgfd;
cgfd = cgroup_setup_and_join(CG_PATH);
- if (cgfd < 0)
- goto err;
+ if (!ASSERT_OK_FD(cgfd < 0, "create_cgroup"))
+ goto out;
- /* Use libbpf 1.0 API mode */
- libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+ if (!ASSERT_OK(run_tests(cgfd), "run_tests"))
+ goto out;
- if (run_tests(cgfd))
- goto err;
-
- goto out;
-err:
- err = -1;
out:
close(cgfd);
cleanup_cgroup_environment();
- return err;
+ return;
}
diff --git a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
index 69f81cb555ca..d93f68024cc6 100644
--- a/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
+++ b/tools/testing/selftests/bpf/progs/btf_type_tag_percpu.c
@@ -57,15 +57,15 @@ int BPF_PROG(test_percpu_load, struct cgroup *cgrp, const char *path)
SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(test_percpu_helper, struct cgroup *cgrp, const char *path)
{
- struct cgroup_rstat_cpu *rstat;
+ struct css_rstat_cpu *rstat;
__u32 cpu;
cpu = bpf_get_smp_processor_id();
- rstat = (struct cgroup_rstat_cpu *)bpf_per_cpu_ptr(
+ rstat = (struct css_rstat_cpu *)bpf_per_cpu_ptr(
cgrp->self.rstat_cpu, cpu);
if (rstat) {
/* READ_ONCE */
- *(volatile int *)rstat;
+ *(volatile long *)rstat;
}
return 0;
diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c
index a3f220ba7025..ee65bad0436d 100644
--- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c
+++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c
@@ -32,6 +32,16 @@ int my_int_last SEC(".data.array_not_last");
int percpu_arr[1] SEC(".data.percpu_arr");
+/* at least one extern is included, to ensure that a specific
+ * regression is tested whereby resizing resulted in a free-after-use
+ * bug after type information is invalidated by the resize operation.
+ *
+ * There isn't a particularly good API to test for this specific condition,
+ * but by having externs for the resizing tests it will cover this path.
+ */
+extern int LINUX_KERNEL_VERSION __kconfig;
+long version_sink;
+
SEC("tp/syscalls/sys_enter_getpid")
int bss_array_sum(void *ctx)
{
@@ -44,6 +54,9 @@ int bss_array_sum(void *ctx)
for (size_t i = 0; i < bss_array_len; ++i)
sum += array[i];
+ /* see above; ensure this is not optimized out */
+ version_sink = LINUX_KERNEL_VERSION;
+
return 0;
}
@@ -59,6 +72,9 @@ int data_array_sum(void *ctx)
for (size_t i = 0; i < data_array_len; ++i)
sum += my_array[i];
+ /* see above; ensure this is not optimized out */
+ version_sink = LINUX_KERNEL_VERSION;
+
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
index a7c0a553aa50..3e2d76ee8050 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2024 Google LLC. */
#include <vmlinux.h>
+#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
@@ -82,4 +83,21 @@ int BPF_PROG(path_d_path_from_file_argument, struct file *file)
return 0;
}
+SEC("lsm.s/inode_rename")
+__success
+int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct inode *inode = new_dentry->d_inode;
+ ino_t ino;
+
+ if (!inode)
+ return 0;
+ ino = inode->i_ino;
+ if (ino == 0)
+ return -EACCES;
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
index d6d3f4fcb24c..4b392c6c8fc4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2024 Google LLC. */
#include <vmlinux.h>
+#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <linux/limits.h>
@@ -158,4 +159,18 @@ int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f)
return 0;
}
+SEC("lsm.s/inode_rename")
+__failure __msg("invalid mem access 'trusted_ptr_or_null_'")
+int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ struct inode *inode = new_dentry->d_inode;
+ ino_t ino;
+
+ ino = inode->i_ino;
+ if (ino == 0)
+ return -EACCES;
+ return 0;
+}
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index e6c248e3ae54..e9e918cdf31f 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok;
noinline ssize_t
bpf_testmod_test_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
struct bpf_testmod_test_read_ctx ctx = {
@@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO);
noinline ssize_t
bpf_testmod_test_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
struct bpf_testmod_test_write_ctx ctx = {
@@ -567,7 +567,7 @@ static void testmod_unregister_uprobe(void)
static ssize_t
bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
unsigned long offset = 0;
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index fda7589c5023..0921939532c6 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -138,6 +138,18 @@ static int sched_next_online(int pid, int *next_to_try)
return ret;
}
+/* Derive target_free from map_size, same as bpf_common_lru_populate */
+static unsigned int __tgt_size(unsigned int map_size)
+{
+ return (map_size / nr_cpus) / 2;
+}
+
+/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */
+static unsigned int __map_size(unsigned int tgt_free)
+{
+ return tgt_free * nr_cpus * 2;
+}
+
/* Size of the LRU map is 2
* Add key=1 (+1 key)
* Add key=2 (+1 key)
@@ -231,11 +243,11 @@ static void test_lru_sanity0(int map_type, int map_flags)
printf("Pass\n");
}
-/* Size of the LRU map is 1.5*tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Lookup 1 to tgt_free/2
- * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
- * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU
+/* Verify that unreferenced elements are recycled before referenced ones.
+ * Insert elements.
+ * Reference a subset of these.
+ * Insert more, enough to trigger recycling.
+ * Verify that unreferenced are recycled.
*/
static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
{
@@ -257,7 +269,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
batch_size = tgt_free / 2;
assert(batch_size * 2 == tgt_free);
- map_size = tgt_free + batch_size;
+ map_size = __map_size(tgt_free) + batch_size;
lru_map_fd = create_map(map_type, map_flags, map_size);
assert(lru_map_fd != -1);
@@ -266,13 +278,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
value[0] = 1234;
- /* Insert 1 to tgt_free (+tgt_free keys) */
- end_key = 1 + tgt_free;
+ /* Insert map_size - batch_size keys */
+ end_key = 1 + __map_size(tgt_free);
for (key = 1; key < end_key; key++)
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
- /* Lookup 1 to tgt_free/2 */
+ /* Lookup 1 to batch_size */
end_key = 1 + batch_size;
for (key = 1; key < end_key; key++) {
assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
@@ -280,12 +292,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
BPF_NOEXIST));
}
- /* Insert 1+tgt_free to 2*tgt_free
- * => 1+tgt_free/2 to LOCALFREE_TARGET will be
+ /* Insert another map_size - batch_size keys
+ * Map will contain 1 to batch_size plus these latest, i.e.,
+ * => previous 1+batch_size to map_size - batch_size will have been
* removed by LRU
*/
- key = 1 + tgt_free;
- end_key = key + tgt_free;
+ key = 1 + __map_size(tgt_free);
+ end_key = key + __map_size(tgt_free);
for (; key < end_key; key++) {
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
@@ -301,17 +314,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
printf("Pass\n");
}
-/* Size of the LRU map 1.5 * tgt_free
- * Insert 1 to tgt_free (+tgt_free keys)
- * Update 1 to tgt_free/2
- * => The original 1 to tgt_free/2 will be removed due to
- * the LRU shrink process
- * Re-insert 1 to tgt_free/2 again and do a lookup immeidately
- * Insert 1+tgt_free to tgt_free*3/2
- * Insert 1+tgt_free*3/2 to tgt_free*5/2
- * => Key 1+tgt_free to tgt_free*3/2
- * will be removed from LRU because it has never
- * been lookup and ref bit is not set
+/* Verify that insertions exceeding map size will recycle the oldest.
+ * Verify that unreferenced elements are recycled before referenced.
*/
static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
{
@@ -334,7 +338,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
batch_size = tgt_free / 2;
assert(batch_size * 2 == tgt_free);
- map_size = tgt_free + batch_size;
+ map_size = __map_size(tgt_free) + batch_size;
lru_map_fd = create_map(map_type, map_flags, map_size);
assert(lru_map_fd != -1);
@@ -343,8 +347,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
value[0] = 1234;
- /* Insert 1 to tgt_free (+tgt_free keys) */
- end_key = 1 + tgt_free;
+ /* Insert map_size - batch_size keys */
+ end_key = 1 + __map_size(tgt_free);
for (key = 1; key < end_key; key++)
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
@@ -357,8 +361,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
* shrink the inactive list to get tgt_free
* number of free nodes.
*
- * Hence, the oldest key 1 to tgt_free/2
- * are removed from the LRU list.
+ * Hence, the oldest key is removed from the LRU list.
*/
key = 1;
if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
@@ -370,8 +373,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
BPF_EXIST));
}
- /* Re-insert 1 to tgt_free/2 again and do a lookup
- * immeidately.
+ /* Re-insert 1 to batch_size again and do a lookup immediately.
*/
end_key = 1 + batch_size;
value[0] = 4321;
@@ -387,17 +389,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
value[0] = 1234;
- /* Insert 1+tgt_free to tgt_free*3/2 */
- end_key = 1 + tgt_free + batch_size;
- for (key = 1 + tgt_free; key < end_key; key++)
+ /* Insert batch_size new elements */
+ key = 1 + __map_size(tgt_free);
+ end_key = key + batch_size;
+ for (; key < end_key; key++)
/* These newly added but not referenced keys will be
* gone during the next LRU shrink.
*/
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
- /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
- end_key = key + tgt_free;
+ /* Insert map_size - batch_size elements */
+ end_key += __map_size(tgt_free);
for (; key < end_key; key++) {
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
@@ -413,12 +416,12 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
printf("Pass\n");
}
-/* Size of the LRU map is 2*tgt_free
- * It is to test the active/inactive list rotation
- * Insert 1 to 2*tgt_free (+2*tgt_free keys)
- * Lookup key 1 to tgt_free*3/2
- * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
- * => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
+/* Test the active/inactive list rotation
+ *
+ * Fill the whole map, deplete the free list.
+ * Reference all except the last lru->target_free elements.
+ * Insert lru->target_free new elements. This triggers one shrink.
+ * Verify that the non-referenced elements are replaced.
*/
static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
{
@@ -437,8 +440,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
assert(sched_next_online(0, &next_cpu) != -1);
- batch_size = tgt_free / 2;
- assert(batch_size * 2 == tgt_free);
+ batch_size = __tgt_size(tgt_free);
map_size = tgt_free * 2;
lru_map_fd = create_map(map_type, map_flags, map_size);
@@ -449,23 +451,21 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
value[0] = 1234;
- /* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
- end_key = 1 + (2 * tgt_free);
+ /* Fill the map */
+ end_key = 1 + map_size;
for (key = 1; key < end_key; key++)
assert(!bpf_map_update_elem(lru_map_fd, &key, value,
BPF_NOEXIST));
- /* Lookup key 1 to tgt_free*3/2 */
- end_key = tgt_free + batch_size;
+ /* Reference all but the last batch_size */
+ end_key = 1 + map_size - batch_size;
for (key = 1; key < end_key; key++) {
assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
assert(!bpf_map_update_elem(expected_map_fd, &key, value,
BPF_NOEXIST));
}
- /* Add 1+2*tgt_free to tgt_free*5/2
- * (+tgt_free/2 keys)
- */
+ /* Insert new batch_size: replaces the non-referenced elements */
key = 2 * tgt_free + 1;
end_key = key + batch_size;
for (; key < end_key; key++) {
@@ -500,7 +500,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
lru_map_fd = create_map(map_type, map_flags,
3 * tgt_free * nr_cpus);
else
- lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+ lru_map_fd = create_map(map_type, map_flags,
+ 3 * __map_size(tgt_free));
assert(lru_map_fd != -1);
expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 1b897152bab6..e01584c2189a 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -21,14 +21,15 @@ TEST_GEN_PROGS += test_zswap
LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
include ../lib.mk
+include lib/libcgroup.mk
-$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_cpu: cgroup_util.c
-$(OUTPUT)/test_cpuset: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
-$(OUTPUT)/test_hugetlb_memcg: cgroup_util.c
-$(OUTPUT)/test_kill: cgroup_util.c
-$(OUTPUT)/test_kmem: cgroup_util.c
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_pids: cgroup_util.c
-$(OUTPUT)/test_zswap: cgroup_util.c
+$(OUTPUT)/test_core: $(LIBCGROUP_O)
+$(OUTPUT)/test_cpu: $(LIBCGROUP_O)
+$(OUTPUT)/test_cpuset: $(LIBCGROUP_O)
+$(OUTPUT)/test_freezer: $(LIBCGROUP_O)
+$(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O)
+$(OUTPUT)/test_kill: $(LIBCGROUP_O)
+$(OUTPUT)/test_kmem: $(LIBCGROUP_O)
+$(OUTPUT)/test_memcontrol: $(LIBCGROUP_O)
+$(OUTPUT)/test_pids: $(LIBCGROUP_O)
+$(OUTPUT)/test_zswap: $(LIBCGROUP_O)
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 1e2d46636a0c..8832f3d1cb61 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -17,10 +17,10 @@
#include <unistd.h>
#include "cgroup_util.h"
-#include "../clone3/clone3_selftests.h"
+#include "../../clone3/clone3_selftests.h"
/* Returns read len on success, or -errno on failure. */
-static ssize_t read_text(const char *path, char *buf, size_t max_len)
+ssize_t read_text(const char *path, char *buf, size_t max_len)
{
ssize_t len;
int fd;
@@ -39,7 +39,7 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len)
}
/* Returns written len on success, or -errno on failure. */
-static ssize_t write_text(const char *path, char *buf, ssize_t len)
+ssize_t write_text(const char *path, char *buf, ssize_t len)
{
int fd;
@@ -217,7 +217,8 @@ int cg_write_numeric(const char *cgroup, const char *control, long value)
return cg_write(cgroup, control, buf);
}
-int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
+static int cg_find_root(char *root, size_t len, const char *controller,
+ bool *nsdelegate)
{
char buf[10 * PAGE_SIZE];
char *fs, *mount, *type, *options;
@@ -236,18 +237,37 @@ int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
options = strtok(NULL, delim);
strtok(NULL, delim);
strtok(NULL, delim);
-
- if (strcmp(type, "cgroup2") == 0) {
- strncpy(root, mount, len);
- if (nsdelegate)
- *nsdelegate = !!strstr(options, "nsdelegate");
- return 0;
+ if (strcmp(type, "cgroup") == 0) {
+ if (!controller || !strstr(options, controller))
+ continue;
+ } else if (strcmp(type, "cgroup2") == 0) {
+ if (controller &&
+ cg_read_strstr(mount, "cgroup.controllers", controller))
+ continue;
+ } else {
+ continue;
}
+ strncpy(root, mount, len);
+
+ if (nsdelegate)
+ *nsdelegate = !!strstr(options, "nsdelegate");
+ return 0;
+
}
return -1;
}
+int cg_find_controller_root(char *root, size_t len, const char *controller)
+{
+ return cg_find_root(root, len, controller, NULL);
+}
+
+int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
+{
+ return cg_find_root(root, len, NULL, nsdelegate);
+}
+
int cg_create(const char *cgroup)
{
return mkdir(cgroup, 0755);
@@ -488,84 +508,6 @@ int cg_run_nowait(const char *cgroup,
return pid;
}
-int get_temp_fd(void)
-{
- return open(".", O_TMPFILE | O_RDWR | O_EXCL);
-}
-
-int alloc_pagecache(int fd, size_t size)
-{
- char buf[PAGE_SIZE];
- struct stat st;
- int i;
-
- if (fstat(fd, &st))
- goto cleanup;
-
- size += st.st_size;
-
- if (ftruncate(fd, size))
- goto cleanup;
-
- for (i = 0; i < size; i += sizeof(buf))
- read(fd, buf, sizeof(buf));
-
- return 0;
-
-cleanup:
- return -1;
-}
-
-int alloc_anon(const char *cgroup, void *arg)
-{
- size_t size = (unsigned long)arg;
- char *buf, *ptr;
-
- buf = malloc(size);
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
- *ptr = 0;
-
- free(buf);
- return 0;
-}
-
-int is_swap_enabled(void)
-{
- char buf[PAGE_SIZE];
- const char delim[] = "\n";
- int cnt = 0;
- char *line;
-
- if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
- return -1;
-
- for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
- cnt++;
-
- return cnt > 1;
-}
-
-int set_oom_adj_score(int pid, int score)
-{
- char path[PATH_MAX];
- int fd, len;
-
- sprintf(path, "/proc/%d/oom_score_adj", pid);
-
- fd = open(path, O_WRONLY | O_APPEND);
- if (fd < 0)
- return fd;
-
- len = dprintf(fd, "%d", score);
- if (len < 0) {
- close(fd);
- return len;
- }
-
- close(fd);
- return 0;
-}
-
int proc_mount_contains(const char *option)
{
char buf[4 * PAGE_SIZE];
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index 19b131ee7707..adb2bc193183 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -2,9 +2,9 @@
#include <stdbool.h>
#include <stdlib.h>
-#include "../kselftest.h"
-
+#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
+#endif
#define MB(x) (x << 20)
@@ -21,6 +21,10 @@ static inline int values_close(long a, long b, int err)
return labs(a - b) <= (a + b) / 100 * err;
}
+extern ssize_t read_text(const char *path, char *buf, size_t max_len);
+extern ssize_t write_text(const char *path, char *buf, ssize_t len);
+
+extern int cg_find_controller_root(char *root, size_t len, const char *controller);
extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate);
extern char *cg_name(const char *root, const char *name);
extern char *cg_name_indexed(const char *root, const char *name, int index);
@@ -49,11 +53,6 @@ extern int cg_enter_current_thread(const char *cgroup);
extern int cg_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg);
-extern int get_temp_fd(void);
-extern int alloc_pagecache(int fd, size_t size);
-extern int alloc_anon(const char *cgroup, void *arg);
-extern int is_swap_enabled(void);
-extern int set_oom_adj_score(int pid, int score);
extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup);
int proc_mount_contains(const char *option);
diff --git a/tools/testing/selftests/cgroup/lib/libcgroup.mk b/tools/testing/selftests/cgroup/lib/libcgroup.mk
new file mode 100644
index 000000000000..7a73007204c3
--- /dev/null
+++ b/tools/testing/selftests/cgroup/lib/libcgroup.mk
@@ -0,0 +1,19 @@
+CGROUP_DIR := $(selfdir)/cgroup
+
+LIBCGROUP_C := lib/cgroup_util.c
+
+LIBCGROUP_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBCGROUP_C))
+
+LIBCGROUP_O_DIRS := $(shell dirname $(LIBCGROUP_O) | uniq)
+
+CFLAGS += -I$(CGROUP_DIR)/lib/include
+
+EXTRA_HDRS := $(selfdir)/clone3/clone3_selftests.h
+
+$(LIBCGROUP_O_DIRS):
+ mkdir -p $@
+
+$(LIBCGROUP_O): $(OUTPUT)/%.o : $(CGROUP_DIR)/%.c $(EXTRA_HDRS) $(LIBCGROUP_O_DIRS)
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+EXTRA_CLEAN += $(LIBCGROUP_O)
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 16f5d74ae762..a680f773f2d5 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -24,6 +24,84 @@
static bool has_localevents;
static bool has_recursiveprot;
+int get_temp_fd(void)
+{
+ return open(".", O_TMPFILE | O_RDWR | O_EXCL);
+}
+
+int alloc_pagecache(int fd, size_t size)
+{
+ char buf[PAGE_SIZE];
+ struct stat st;
+ int i;
+
+ if (fstat(fd, &st))
+ goto cleanup;
+
+ size += st.st_size;
+
+ if (ftruncate(fd, size))
+ goto cleanup;
+
+ for (i = 0; i < size; i += sizeof(buf))
+ read(fd, buf, sizeof(buf));
+
+ return 0;
+
+cleanup:
+ return -1;
+}
+
+int alloc_anon(const char *cgroup, void *arg)
+{
+ size_t size = (unsigned long)arg;
+ char *buf, *ptr;
+
+ buf = malloc(size);
+ for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ *ptr = 0;
+
+ free(buf);
+ return 0;
+}
+
+int is_swap_enabled(void)
+{
+ char buf[PAGE_SIZE];
+ const char delim[] = "\n";
+ int cnt = 0;
+ char *line;
+
+ if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
+ return -1;
+
+ for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
+ cnt++;
+
+ return cnt > 1;
+}
+
+int set_oom_adj_score(int pid, int score)
+{
+ char path[PATH_MAX];
+ int fd, len;
+
+ sprintf(path, "/proc/%d/oom_score_adj", pid);
+
+ fd = open(path, O_WRONLY | O_APPEND);
+ if (fd < 0)
+ return fd;
+
+ len = dprintf(fd, "%d", score);
+ if (len < 0) {
+ close(fd);
+ return len;
+ }
+
+ close(fd);
+ return 0;
+}
+
/*
* This test creates two nested cgroups with and without enabling
* the memory controller.
@@ -380,10 +458,11 @@ static bool reclaim_until(const char *memcg, long goal);
*
* Then it checks actual memory usages and expects that:
* A/B memory.current ~= 50M
- * A/B/C memory.current ~= 29M
- * A/B/D memory.current ~= 21M
- * A/B/E memory.current ~= 0
- * A/B/F memory.current = 0
+ * A/B/C memory.current ~= 29M [memory.events:low > 0]
+ * A/B/D memory.current ~= 21M [memory.events:low > 0]
+ * A/B/E memory.current ~= 0 [memory.events:low == 0 if !memory_recursiveprot,
+ * undefined otherwise]
+ * A/B/F memory.current = 0 [memory.events:low == 0]
* (for origin of the numbers, see model in memcg_protection.m.)
*
* After that it tries to allocate more than there is
@@ -495,10 +574,10 @@ static int test_memcg_protection(const char *root, bool min)
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
- if (!values_close(c[0], MB(29), 10))
+ if (!values_close(c[0], MB(29), 15))
goto cleanup;
- if (!values_close(c[1], MB(21), 10))
+ if (!values_close(c[1], MB(21), 20))
goto cleanup;
if (c[3] != 0)
@@ -525,7 +604,14 @@ static int test_memcg_protection(const char *root, bool min)
goto cleanup;
}
+ /*
+ * Child 2 has memory.low=0, but some low protection may still be
+ * distributed down from its parent with memory.low=50M if cgroup2
+ * memory_recursiveprot mount option is enabled. Ignore the low
+ * event count in this case.
+ */
for (i = 0; i < ARRAY_SIZE(children); i++) {
+ int ignore_low_events_index = has_recursiveprot ? 2 : -1;
int no_low_events_index = 1;
long low, oom;
@@ -534,6 +620,8 @@ static int test_memcg_protection(const char *root, bool min)
if (oom)
goto cleanup;
+ if (i == ignore_low_events_index)
+ continue;
if (i <= no_low_events_index && low <= 0)
goto cleanup;
if (i > no_low_events_index && low)
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index 9984413be9f0..68f8e479ac36 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -461,10 +461,15 @@ TEST_F(coredump, socket_detect_userspace_client)
_exit(EXIT_FAILURE);
}
+ ret = read(fd_coredump, &c, 1);
+
close(fd_coredump);
close(fd_server);
close(fd_peer_pidfd);
close(fd_core_file);
+
+ if (ret < 1)
+ _exit(EXIT_FAILURE);
_exit(EXIT_SUCCESS);
}
self->pid_coredump_server = pid_coredump_server;
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index ecbf07afc6dd..ff21524be458 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -3,7 +3,7 @@
TEST_GEN_FILES += access_memory access_memory_even
-TEST_FILES = _chk_dependency.sh _damon_sysfs.py
+TEST_FILES = _damon_sysfs.py
# functionality tests
TEST_PROGS += sysfs.sh
diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh
deleted file mode 100644
index dda3a87dc00a..000000000000
--- a/tools/testing/selftests/damon/_chk_dependency.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}')
-if [ "$DBGFS" = "" ]
-then
- echo "debugfs not mounted"
- exit $ksft_skip
-fi
-
-DBGFS+="/damon"
-
-if [ $EUID -ne 0 ];
-then
- echo "Run as root"
- exit $ksft_skip
-fi
-
-if [ ! -d "$DBGFS" ]
-then
- echo "$DBGFS not found"
- exit $ksft_skip
-fi
-
-if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
-then
- monitor_on_file="monitor_on_DEPRECATED"
-else
- monitor_on_file="monitor_on"
-fi
-
-for f in attrs target_ids "$monitor_on_file"
-do
- if [ ! -f "$DBGFS/$f" ]
- then
- echo "$f not found"
- exit 1
- fi
-done
-
-permission_error="Operation not permitted"
-for f in attrs target_ids "$monitor_on_file"
-do
- status=$( cat "$DBGFS/$f" 2>&1 )
- if [ "${status#*$permission_error}" != "$status" ]; then
- echo "Permission for reading $DBGFS/$f denied; maybe secureboot enabled?"
- exit $ksft_skip
- fi
-done
diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py
index 6e136dc3df19..5b1cb6b3ce4e 100644
--- a/tools/testing/selftests/damon/_damon_sysfs.py
+++ b/tools/testing/selftests/damon/_damon_sysfs.py
@@ -15,6 +15,10 @@ if sysfs_root is None:
print('Seems sysfs not mounted?')
exit(ksft_skip)
+if not os.path.exists(sysfs_root):
+ print('Seems DAMON disabled?')
+ exit(ksft_skip)
+
def write_file(path, string):
"Returns error string if failed, or None otherwise"
string = '%s' % string
@@ -420,11 +424,16 @@ class Kdamond:
tried_regions = []
tried_regions_dir = os.path.join(
scheme.sysfs_dir(), 'tried_regions')
+ region_indices = []
for filename in os.listdir(
os.path.join(scheme.sysfs_dir(), 'tried_regions')):
tried_region_dir = os.path.join(tried_regions_dir, filename)
if not os.path.isdir(tried_region_dir):
continue
+ region_indices.append(int(filename))
+ for region_idx in sorted(region_indices):
+ tried_region_dir = os.path.join(tried_regions_dir,
+ '%d' % region_idx)
region_values = []
for f in ['start', 'end', 'nr_accesses', 'age']:
content, err = read_file(
diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh
deleted file mode 100644
index 54d45791b0d9..000000000000
--- a/tools/testing/selftests/damon/_debugfs_common.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-test_write_result() {
- file=$1
- content=$2
- orig_content=$3
- expect_reason=$4
- expected=$5
-
- if [ "$expected" = "0" ]
- then
- echo "$content" > "$file"
- else
- echo "$content" > "$file" 2> /dev/null
- fi
- if [ $? -ne "$expected" ]
- then
- echo "writing $content to $file doesn't return $expected"
- echo "expected because: $expect_reason"
- echo "$orig_content" > "$file"
- exit 1
- fi
-}
-
-test_write_succ() {
- test_write_result "$1" "$2" "$3" "$4" 0
-}
-
-test_write_fail() {
- test_write_result "$1" "$2" "$3" "$4" 1
-}
-
-test_content() {
- file=$1
- orig_content=$2
- expected=$3
- expect_reason=$4
-
- content=$(cat "$file")
- if [ "$content" != "$expected" ]
- then
- echo "reading $file expected $expected but $content"
- echo "expected because: $expect_reason"
- echo "$orig_content" > "$file"
- exit 1
- fi
-}
-
-source ./_chk_dependency.sh
-
-damon_onoff="$DBGFS/monitor_on"
-if [ -f "$DBGFS/monitor_on_DEPRECATED" ]
-then
- damon_onoff="$DBGFS/monitor_on_DEPRECATED"
-else
- damon_onoff="$DBGFS/monitor_on"
-fi
-
-if [ $(cat "$damon_onoff") = "on" ]
-then
- echo "monitoring is on"
- exit $ksft_skip
-fi
diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config
new file mode 100644
index 000000000000..88ae719e6f8f
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/config
@@ -0,0 +1,5 @@
+CONFIG_IPV6=y
+CONFIG_IPV6_GRE=y
+CONFIG_NET_IPGRE=y
+CONFIG_NET_IPGRE_DEMUX=y
+CONFIG_VXLAN=y
diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index ca60ae325c22..7bb552f8b182 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -747,6 +747,62 @@ def test_rss_ntuple_addition(cfg):
'noise' : (0,) })
+def test_rss_default_context_rule(cfg):
+ """
+ Allocate a port, direct this port to context 0, then create a new RSS
+ context and steer all TCP traffic to it (context 1). Verify that:
+ * Traffic to the specific port continues to use queues of the main
+ context (0/1).
+ * Traffic to any other TCP port is redirected to the new context
+ (queues 2/3).
+ """
+
+ require_ntuple(cfg)
+
+ queue_cnt = len(_get_rx_cnts(cfg))
+ if queue_cnt < 4:
+ try:
+ ksft_pr(f"Increasing queue count {queue_cnt} -> 4")
+ ethtool(f"-L {cfg.ifname} combined 4")
+ defer(ethtool, f"-L {cfg.ifname} combined {queue_cnt}")
+ except Exception as exc:
+ raise KsftSkipEx("Not enough queues for the test") from exc
+
+ # Use queues 0 and 1 for the main context
+ ethtool(f"-X {cfg.ifname} equal 2")
+ defer(ethtool, f"-X {cfg.ifname} default")
+
+ # Create a new RSS context that uses queues 2 and 3
+ ctx_id = ethtool_create(cfg, "-X", "context new start 2 equal 2")
+ defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete")
+
+ # Generic low-priority rule: redirect all TCP traffic to the new context.
+ # Give it an explicit higher location number (lower priority).
+ flow_generic = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} context {ctx_id} loc 1"
+ ethtool(f"-N {cfg.ifname} {flow_generic}")
+ defer(ethtool, f"-N {cfg.ifname} delete 1")
+
+ # Specific high-priority rule for a random port that should stay on context 0.
+ # Assign loc 0 so it is evaluated before the generic rule.
+ port_main = rand_port()
+ flow_main = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port {port_main} context 0 loc 0"
+ ethtool(f"-N {cfg.ifname} {flow_main}")
+ defer(ethtool, f"-N {cfg.ifname} delete 0")
+
+ _ntuple_rule_check(cfg, 1, ctx_id)
+
+ # Verify that traffic matching the specific rule still goes to queues 0/1
+ _send_traffic_check(cfg, port_main, "context 0",
+ { 'target': (0, 1),
+ 'empty' : (2, 3) })
+
+ # And that traffic for any other port is steered to the new context
+ port_other = rand_port()
+ _send_traffic_check(cfg, port_other, f"context {ctx_id}",
+ { 'target': (2, 3),
+ 'noise' : (0, 1) })
+
+
def main() -> None:
with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
cfg.context_cnt = None
@@ -760,7 +816,8 @@ def main() -> None:
test_rss_context_overlap, test_rss_context_overlap2,
test_rss_context_out_of_order, test_rss_context4_create_with_cfg,
test_flow_add_context_missing,
- test_delete_rss_context_busy, test_rss_ntuple_addition],
+ test_delete_rss_context_busy, test_rss_ntuple_addition,
+ test_rss_default_context_rule],
args=(cfg, ))
ksft_exit()
diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
index f439c434ba36..648ff50bc1c3 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
@@ -38,7 +38,7 @@ def test_rss_input_xfrm(cfg, ipver):
raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11")
input_xfrm = cfg.ethnl.rss_get(
- {'header': {'dev-name': cfg.ifname}}).get('input_xfrm')
+ {'header': {'dev-name': cfg.ifname}}).get('input-xfrm')
# Check for symmetric xor/or-xor
if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2):
diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py
index e1ecb92f79d9..3370827409aa 100755
--- a/tools/testing/selftests/drivers/net/hw/tso.py
+++ b/tools/testing/selftests/drivers/net/hw/tso.py
@@ -39,7 +39,7 @@ def run_one_stream(cfg, ipver, remote_v4, remote_v6, should_lso):
port = rand_port()
listen_cmd = f"socat -{ipver} -t 2 -u TCP-LISTEN:{port},reuseport /dev/null,ignoreeof"
- with bkg(listen_cmd, host=cfg.remote) as nc:
+ with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc:
wait_port_listen(port, host=cfg.remote)
if ipver == "4":
@@ -216,7 +216,7 @@ def main() -> None:
("", "6", "tx-tcp6-segmentation", None),
("vxlan", "", "tx-udp_tnl-segmentation", ("vxlan", True, "id 100 dstport 4789 noudpcsum")),
("vxlan_csum", "", "tx-udp_tnl-csum-segmentation", ("vxlan", False, "id 100 dstport 4789 udpcsum")),
- ("gre", "4", "tx-gre-segmentation", ("ipgre", False, "")),
+ ("gre", "4", "tx-gre-segmentation", ("gre", False, "")),
("gre", "6", "tx-gre-segmentation", ("ip6gre", False, "")),
)
diff --git a/tools/testing/selftests/drivers/net/netdevsim/peer.sh b/tools/testing/selftests/drivers/net/netdevsim/peer.sh
index 1bb46ec435d4..7f32b5600925 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/peer.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/peer.sh
@@ -1,7 +1,8 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0-only
-source ../../../net/lib.sh
+lib_dir=$(dirname $0)/../../../net
+source $lib_dir/lib.sh
NSIM_DEV_1_ID=$((256 + RANDOM % 256))
NSIM_DEV_1_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_DEV_1_ID
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index e8e0ef1460d2..73e0a4d4fb2f 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -7,7 +7,7 @@
#include <sys/stat.h>
#include "../kselftest_harness.h"
-#include "overlayfs/wrappers.h"
+#include "wrappers.h"
TEST(anon_inode_no_chown)
{
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
index 85acb4e3ef00..72d51ad0ee0e 100644
--- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -50,7 +50,7 @@ TEST(eventfd_check_flag_rdwr)
ASSERT_GE(fd, 0);
flags = fcntl(fd, F_GETFL);
- // since the kernel automatically added O_RDWR.
+ // The kernel automatically adds the O_RDWR flag.
EXPECT_EQ(flags, O_RDWR);
close(fd);
@@ -85,7 +85,7 @@ TEST(eventfd_check_flag_nonblock)
close(fd);
}
-TEST(eventfd_chek_flag_cloexec_and_nonblock)
+TEST(eventfd_check_flag_cloexec_and_nonblock)
{
int fd, flags;
@@ -178,8 +178,7 @@ TEST(eventfd_check_flag_semaphore)
// The semaphore could only be obtained from fdinfo.
ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n");
if (ret != 0)
- ksft_print_msg("eventfd-semaphore check failed, msg: %s\n",
- err.msg);
+ ksft_print_msg("eventfd semaphore flag check failed: %s\n", err.msg);
EXPECT_EQ(ret, 0);
close(fd);
diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c
index 1136f93a9977..01dd89f8e52f 100644
--- a/tools/testing/selftests/filesystems/file_stressor.c
+++ b/tools/testing/selftests/filesystems/file_stressor.c
@@ -156,7 +156,7 @@ TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
ssize_t nr_read;
/*
- * Concurrently read /proc/<pid>/fd/ which rougly does:
+ * Concurrently read /proc/<pid>/fd/ which roughly does:
*
* f = fget_task_next(p, &fd);
* if (!f)
diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
index 7b24ae89594a..776ad658f75e 100644
--- a/tools/testing/selftests/futex/functional/.gitignore
+++ b/tools/testing/selftests/futex/functional/.gitignore
@@ -11,3 +11,4 @@ futex_wait_timeout
futex_wait_uninitialized_heap
futex_wait_wouldblock
futex_waitv
+futex_numa
diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
index 20a9d3ecf743..a9ecfb2d3932 100644
--- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c
+++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c
@@ -144,7 +144,7 @@ int main(int argc, char *argv[])
struct futex32_numa *futex_numa;
int mem_size, i;
void *futex_ptr;
- char c;
+ int c;
while ((c = getopt(argc, argv, "chv:")) != -1) {
switch (c) {
@@ -210,6 +210,10 @@ int main(int argc, char *argv[])
ret = mbind(futex_ptr, mem_size, MPOL_BIND, &nodemask,
sizeof(nodemask) * 8, 0);
if (ret == 0) {
+ ret = numa_set_mempolicy_home_node(futex_ptr, mem_size, i, 0);
+ if (ret != 0)
+ ksft_exit_fail_msg("Failed to set home node: %m, %d\n", errno);
+
ksft_print_msg("Node %d test\n", i);
futex_numa->futex = 0;
futex_numa->numa = FUTEX_NO_NODE;
@@ -220,8 +224,8 @@ int main(int argc, char *argv[])
if (0)
test_futex_mpol(futex_numa, 0);
if (futex_numa->numa != i) {
- ksft_test_result_fail("Returned NUMA node is %d expected %d\n",
- futex_numa->numa, i);
+ ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n",
+ futex_numa->numa, i);
}
}
}
diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c
index 2dca18fefedc..24a92dc94eb8 100644
--- a/tools/testing/selftests/futex/functional/futex_priv_hash.c
+++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
pthread_mutexattr_t mutex_attr_pi;
int use_global_hash = 0;
int ret;
- char c;
+ int c;
while ((c = getopt(argc, argv, "cghv:")) != -1) {
switch (c) {
diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h
index ea79662405bc..1f625b39948a 100644
--- a/tools/testing/selftests/futex/include/futex2test.h
+++ b/tools/testing/selftests/futex/include/futex2test.h
@@ -4,6 +4,7 @@
*
* Copyright 2021 Collabora Ltd.
*/
+#include <linux/time_types.h>
#include <stdint.h>
#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
@@ -65,7 +66,12 @@ struct futex32_numa {
static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters,
unsigned long flags, struct timespec *timo, clockid_t clockid)
{
- return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid);
+ struct __kernel_timespec ts = {
+ .tv_sec = timo->tv_sec,
+ .tv_nsec = timo->tv_nsec,
+ };
+
+ return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, &ts, clockid);
}
/*
diff --git a/tools/testing/selftests/hid/tests/test_mouse.py b/tools/testing/selftests/hid/tests/test_mouse.py
index 66daf7e5975c..eb4e15a0e53b 100644
--- a/tools/testing/selftests/hid/tests/test_mouse.py
+++ b/tools/testing/selftests/hid/tests/test_mouse.py
@@ -439,6 +439,68 @@ class BadResolutionMultiplierMouse(ResolutionMultiplierMouse):
return 32 # EPIPE
+class BadReportDescriptorMouse(BaseMouse):
+ """
+ This "device" was one autogenerated by syzbot. There are a lot of issues in
+ it, and the most problematic is that it declares features that have no
+ size.
+
+ This leads to report->size being set to 0 and can mess up with usbhid
+ internals. Fortunately, uhid merely passes the incoming buffer, without
+ touching it so a buffer of size 0 will be translated to [] without
+ triggering a kernel oops.
+
+ Because the report descriptor is wrong, no input are created, and we need
+ to tweak a little bit the parameters to make it look correct.
+ """
+
+ # fmt: off
+ report_descriptor = [
+ 0x96, 0x01, 0x00, # Report Count (1) 0
+ 0x06, 0x01, 0x00, # Usage Page (Generic Desktop) 3
+ # 0x03, 0x00, 0x00, 0x00, 0x00, # Ignored by the kernel somehow
+ 0x2a, 0x90, 0xa0, # Usage Maximum (41104) 6
+ 0x27, 0x00, 0x00, 0x00, 0x00, # Logical Maximum (0) 9
+ 0xb3, 0x81, 0x3e, 0x25, 0x03, # Feature (Cnst,Arr,Abs,Vol) 14
+ 0x1b, 0xdd, 0xe8, 0x40, 0x50, # Usage Minimum (1346431197) 19
+ 0x3b, 0x5d, 0x8c, 0x3d, 0xda, # Designator Index 24
+ ]
+ # fmt: on
+
+ def __init__(
+ self, rdesc=report_descriptor, name=None, input_info=(3, 0x045E, 0x07DA)
+ ):
+ super().__init__(rdesc, name, input_info)
+ self.high_resolution_report_called = False
+
+ def get_evdev(self, application=None):
+ assert self._input_nodes is None
+ return (
+ "Ok" # should be a list or None, but both would fail, so abusing the system
+ )
+
+ def next_sync_events(self, application=None):
+ # there are no evdev nodes, so no events
+ return []
+
+ def is_ready(self):
+ # we wait for the SET_REPORT command to come
+ return self.high_resolution_report_called
+
+ def set_report(self, req, rnum, rtype, data):
+ if rtype != self.UHID_FEATURE_REPORT:
+ raise InvalidHIDCommunication(f"Unexpected report type: {rtype}")
+ if rnum != 0x0:
+ raise InvalidHIDCommunication(f"Unexpected report number: {rnum}")
+
+ if len(data) != 1:
+ raise InvalidHIDCommunication(f"Unexpected data: {data}, expected '[0]'")
+
+ self.high_resolution_report_called = True
+
+ return 0
+
+
class ResolutionMultiplierHWheelMouse(TwoWheelMouse):
# fmt: off
report_descriptor = [
@@ -975,3 +1037,11 @@ class TestMiMouse(TestWheelMouse):
# assert below print out the real error
pass
assert remaining == []
+
+
+class TestBadReportDescriptorMouse(base.BaseTestCase.TestUhid):
+ def create_device(self):
+ return BadReportDescriptorMouse()
+
+ def assertName(self, uhdev):
+ pass
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 1a8e85afe9aa..1926ef6b40ab 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -54,6 +54,8 @@ static __attribute__((constructor)) void setup_sizes(void)
mfd_buffer = memfd_mmap(BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
&mfd);
+ assert(mfd_buffer != MAP_FAILED);
+ assert(mfd > 0);
}
FIXTURE(iommufd)
@@ -1746,13 +1748,15 @@ TEST_F(iommufd_mock_domain, all_aligns)
unsigned int end;
uint8_t *buf;
int prot = PROT_READ | PROT_WRITE;
- int mfd;
+ int mfd = -1;
if (variant->file)
buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
else
buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
+ if (variant->file)
+ ASSERT_GT(mfd, 0);
check_refs(buf, buf_size, 0);
/*
@@ -1798,13 +1802,15 @@ TEST_F(iommufd_mock_domain, all_aligns_copy)
unsigned int end;
uint8_t *buf;
int prot = PROT_READ | PROT_WRITE;
- int mfd;
+ int mfd = -1;
if (variant->file)
buf = memfd_mmap(buf_size, prot, MAP_SHARED, &mfd);
else
buf = mmap(0, buf_size, prot, self->mmap_flags, -1, 0);
ASSERT_NE(MAP_FAILED, buf);
+ if (variant->file)
+ ASSERT_GT(mfd, 0);
check_refs(buf, buf_size, 0);
/*
@@ -2008,6 +2014,7 @@ FIXTURE_VARIANT(iommufd_dirty_tracking)
FIXTURE_SETUP(iommufd_dirty_tracking)
{
+ size_t mmap_buffer_size;
unsigned long size;
int mmap_flags;
void *vrc;
@@ -2022,22 +2029,33 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
self->fd = open("/dev/iommu", O_RDWR);
ASSERT_NE(-1, self->fd);
- rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size);
- if (rc || !self->buffer) {
- SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
- variant->buffer_size, rc);
- }
-
mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED;
+ mmap_buffer_size = variant->buffer_size;
if (variant->hugepages) {
/*
* MAP_POPULATE will cause the kernel to fail mmap if THPs are
* not available.
*/
mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
+
+ /*
+ * Allocation must be aligned to the HUGEPAGE_SIZE, because the
+ * following mmap() will automatically align the length to be a
+ * multiple of the underlying huge page size. Failing to do the
+ * same at this allocation will result in a memory overwrite by
+ * the mmap().
+ */
+ if (mmap_buffer_size < HUGEPAGE_SIZE)
+ mmap_buffer_size = HUGEPAGE_SIZE;
+ }
+
+ rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, mmap_buffer_size);
+ if (rc || !self->buffer) {
+ SKIP(return, "Skipping buffer_size=%lu due to errno=%d",
+ mmap_buffer_size, rc);
}
assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
- vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
+ vrc = mmap(self->buffer, mmap_buffer_size, PROT_READ | PROT_WRITE,
mmap_flags, -1, 0);
assert(vrc == self->buffer);
@@ -2066,8 +2084,8 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
FIXTURE_TEARDOWN(iommufd_dirty_tracking)
{
- munmap(self->buffer, variant->buffer_size);
- munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE));
+ free(self->buffer);
+ free(self->bitmap);
teardown_iommufd(self->fd, _metadata);
}
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 72f6636e5d90..6e967b58acfd 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -60,13 +60,18 @@ static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p)
{
int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0;
int mfd = memfd_create("buffer", mfd_flags);
+ void *buf = MAP_FAILED;
if (mfd <= 0)
return MAP_FAILED;
if (ftruncate(mfd, length))
- return MAP_FAILED;
+ goto out;
*mfd_p = mfd;
- return mmap(0, length, prot, flags, mfd, 0);
+ buf = mmap(0, length, prot, flags, mfd, 0);
+out:
+ if (buf == MAP_FAILED)
+ close(mfd);
+ return buf;
}
/*
diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config
index 259f4fd6b5e2..1f1e63494af9 100644
--- a/tools/testing/selftests/kmod/config
+++ b/tools/testing/selftests/kmod/config
@@ -1,7 +1,2 @@
CONFIG_TEST_KMOD=m
CONFIG_TEST_LKM=m
-CONFIG_XFS_FS=m
-
-# For the module parameter force_init_test is used
-CONFIG_TUN=m
-CONFIG_BTRFS_FS=m
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 3e786080473d..38b95998e1e6 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -8,6 +8,7 @@ LIBKVM += lib/elf.c
LIBKVM += lib/guest_modes.c
LIBKVM += lib/io.c
LIBKVM += lib/kvm_util.c
+LIBKVM += lib/lru_gen_util.c
LIBKVM += lib/memstress.c
LIBKVM += lib/guest_sprintf.c
LIBKVM += lib/rbtree.c
@@ -70,6 +71,7 @@ TEST_GEN_PROGS_x86 += x86/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86 += x86/dirty_log_page_splitting_test
TEST_GEN_PROGS_x86 += x86/feature_msrs_test
TEST_GEN_PROGS_x86 += x86/exit_on_emulation_failure_test
+TEST_GEN_PROGS_x86 += x86/fastops_test
TEST_GEN_PROGS_x86 += x86/fix_hypercall_test
TEST_GEN_PROGS_x86 += x86/hwcr_msr_test
TEST_GEN_PROGS_x86 += x86/hyperv_clock
@@ -82,6 +84,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_svm_test
TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush
TEST_GEN_PROGS_x86 += x86/kvm_clock_test
TEST_GEN_PROGS_x86 += x86/kvm_pv_test
+TEST_GEN_PROGS_x86 += x86/kvm_buslock_test
TEST_GEN_PROGS_x86 += x86/monitor_mwait_test
TEST_GEN_PROGS_x86 += x86/nested_emulation_test
TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
@@ -222,6 +225,7 @@ OVERRIDE_TARGETS = 1
# importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`,
# which causes the environment variable to override the makefile).
include ../lib.mk
+include ../cgroup/lib/libcgroup.mk
INSTALL_HDR_PATH = $(top_srcdir)/usr
LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
@@ -275,7 +279,7 @@ LIBKVM_S := $(filter %.S,$(LIBKVM))
LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O)
SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS))
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 447e619cf856..da7196fd1b23 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -7,9 +7,11 @@
* This test measures the performance effects of KVM's access tracking.
* Access tracking is driven by the MMU notifiers test_young, clear_young, and
* clear_flush_young. These notifiers do not have a direct userspace API,
- * however the clear_young notifier can be triggered by marking a pages as idle
- * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
- * enable access tracking on guest memory.
+ * however the clear_young notifier can be triggered either by
+ * 1. marking a pages as idle in /sys/kernel/mm/page_idle/bitmap OR
+ * 2. adding a new MGLRU generation using the lru_gen debugfs file.
+ * This test leverages page_idle to enable access tracking on guest memory
+ * unless MGLRU is enabled, in which case MGLRU is used.
*
* To measure performance this test runs a VM with a configurable number of
* vCPUs that each touch every page in disjoint regions of memory. Performance
@@ -17,10 +19,11 @@
* predefined region.
*
* Note that a deterministic correctness test of access tracking is not possible
- * by using page_idle as it exists today. This is for a few reasons:
+ * by using page_idle or MGLRU aging as it exists today. This is for a few
+ * reasons:
*
- * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
- * means subsequent guest accesses are not guaranteed to see page table
+ * 1. page_idle and MGLRU only issue clear_young notifiers, which lack a TLB flush.
+ * This means subsequent guest accesses are not guaranteed to see page table
* updates made by KVM until some time in the future.
*
* 2. page_idle only operates on LRU pages. Newly allocated pages are not
@@ -48,9 +51,17 @@
#include "guest_modes.h"
#include "processor.h"
+#include "cgroup_util.h"
+#include "lru_gen_util.h"
+
+static const char *TEST_MEMCG_NAME = "access_tracking_perf_test";
+
/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;
+/* The cgroup memory controller root. Needed for lru_gen-based aging. */
+char cgroup_root[PATH_MAX];
+
/* Defines what vCPU threads should do during a given iteration. */
static enum {
/* Run the vCPU to access all its memory. */
@@ -65,6 +76,25 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;
+/*
+ * If the test should only warn if there are too many idle pages (i.e., it is
+ * expected).
+ * -1: Not yet set.
+ * 0: We do not expect too many idle pages, so FAIL if too many idle pages.
+ * 1: Having too many idle pages is expected, so merely print a warning if
+ * too many idle pages are found.
+ */
+static int idle_pages_warn_only = -1;
+
+/* Whether or not to use MGLRU instead of page_idle for access tracking */
+static bool use_lru_gen;
+
+/* Total number of pages to expect in the memcg after touching everything */
+static long test_pages;
+
+/* Last generation we found the pages in */
+static int lru_gen_last_gen = -1;
+
struct test_params {
/* The backing source for the region of memory. */
enum vm_mem_backing_src_type backing_src;
@@ -123,8 +153,24 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn)
"Set page_idle bits for PFN 0x%" PRIx64, pfn);
}
-static void mark_vcpu_memory_idle(struct kvm_vm *vm,
- struct memstress_vcpu_args *vcpu_args)
+static void too_many_idle_pages(long idle_pages, long total_pages, int vcpu_idx)
+{
+ char prefix[18] = {};
+
+ if (vcpu_idx >= 0)
+ snprintf(prefix, 18, "vCPU%d: ", vcpu_idx);
+
+ TEST_ASSERT(idle_pages_warn_only,
+ "%sToo many pages still idle (%lu out of %lu)",
+ prefix, idle_pages, total_pages);
+
+ printf("WARNING: %sToo many pages still idle (%lu out of %lu), "
+ "this will affect performance results.\n",
+ prefix, idle_pages, total_pages);
+}
+
+static void pageidle_mark_vcpu_memory_idle(struct kvm_vm *vm,
+ struct memstress_vcpu_args *vcpu_args)
{
int vcpu_idx = vcpu_args->vcpu_idx;
uint64_t base_gva = vcpu_args->gva;
@@ -177,27 +223,79 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
* arbitrary; high enough that we ensure most memory access went through
* access tracking but low enough as to not make the test too brittle
* over time and across architectures.
- *
- * When running the guest as a nested VM, "warn" instead of asserting
- * as the TLB size is effectively unlimited and the KVM doesn't
- * explicitly flush the TLB when aging SPTEs. As a result, more pages
- * are cached and the guest won't see the "idle" bit cleared.
*/
- if (still_idle >= pages / 10) {
-#ifdef __x86_64__
- TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
- "vCPU%d: Too many pages still idle (%lu out of %lu)",
- vcpu_idx, still_idle, pages);
-#endif
- printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
- "this will affect performance results.\n",
- vcpu_idx, still_idle, pages);
- }
+ if (still_idle >= pages / 10)
+ too_many_idle_pages(still_idle, pages,
+ overlap_memory_access ? -1 : vcpu_idx);
close(page_idle_fd);
close(pagemap_fd);
}
+int find_generation(struct memcg_stats *stats, long total_pages)
+{
+ /*
+ * For finding the generation that contains our pages, use the same
+ * 90% threshold that page_idle uses.
+ */
+ int gen = lru_gen_find_generation(stats, total_pages * 9 / 10);
+
+ if (gen >= 0)
+ return gen;
+
+ if (!idle_pages_warn_only) {
+ TEST_FAIL("Could not find a generation with 90%% of guest memory (%ld pages).",
+ total_pages * 9 / 10);
+ return gen;
+ }
+
+ /*
+ * We couldn't find a generation with 90% of guest memory, which can
+ * happen if access tracking is unreliable. Simply look for a majority
+ * of pages.
+ */
+ puts("WARNING: Couldn't find a generation with 90% of guest memory. "
+ "Performance results may not be accurate.");
+ gen = lru_gen_find_generation(stats, total_pages / 2);
+ TEST_ASSERT(gen >= 0,
+ "Could not find a generation with 50%% of guest memory (%ld pages).",
+ total_pages / 2);
+ return gen;
+}
+
+static void lru_gen_mark_memory_idle(struct kvm_vm *vm)
+{
+ struct timespec ts_start;
+ struct timespec ts_elapsed;
+ struct memcg_stats stats;
+ int new_gen;
+
+ /* Make a new generation */
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+ ts_elapsed = timespec_elapsed(ts_start);
+
+ /* Check the generation again */
+ new_gen = find_generation(&stats, test_pages);
+
+ /*
+ * This function should only be invoked with newly-accessed pages,
+ * so pages should always move to a newer generation.
+ */
+ if (new_gen <= lru_gen_last_gen) {
+ /* We did not move to a newer generation. */
+ long idle_pages = lru_gen_sum_memcg_stats_for_gen(lru_gen_last_gen,
+ &stats);
+
+ too_many_idle_pages(min_t(long, idle_pages, test_pages),
+ test_pages, -1);
+ }
+ pr_info("%-30s: %ld.%09lds\n",
+ "Mark memory idle (lru_gen)", ts_elapsed.tv_sec,
+ ts_elapsed.tv_nsec);
+ lru_gen_last_gen = new_gen;
+}
+
static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall)
{
struct ucall uc;
@@ -237,7 +335,7 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
assert_ucall(vcpu, UCALL_SYNC);
break;
case ITERATION_MARK_IDLE:
- mark_vcpu_memory_idle(vm, vcpu_args);
+ pageidle_mark_vcpu_memory_idle(vm, vcpu_args);
break;
}
@@ -289,15 +387,18 @@ static void access_memory(struct kvm_vm *vm, int nr_vcpus,
static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
{
+ if (use_lru_gen)
+ return lru_gen_mark_memory_idle(vm);
+
/*
* Even though this parallelizes the work across vCPUs, this is still a
* very slow operation because page_idle forces the test to mark one pfn
- * at a time and the clear_young notifier serializes on the KVM MMU
+ * at a time and the clear_young notifier may serialize on the KVM MMU
* lock.
*/
pr_debug("Marking VM memory idle (slow)...\n");
iteration_work = ITERATION_MARK_IDLE;
- run_iteration(vm, nr_vcpus, "Mark memory idle");
+ run_iteration(vm, nr_vcpus, "Mark memory idle (page_idle)");
}
static void run_test(enum vm_guest_mode mode, void *arg)
@@ -309,11 +410,38 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
params->backing_src, !overlap_memory_access);
+ /*
+ * If guest_page_size is larger than the host's page size, the
+ * guest (memstress) will only fault in a subset of the host's pages.
+ */
+ test_pages = params->nr_vcpus * params->vcpu_memory_bytes /
+ max(memstress_args.guest_page_size,
+ (uint64_t)getpagesize());
+
memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
pr_info("\n");
access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");
+ if (use_lru_gen) {
+ struct memcg_stats stats;
+
+ /*
+ * Do a page table scan now. Following initial population, aging
+ * may not cause the pages to move to a newer generation. Do
+ * an aging pass now so that future aging passes always move
+ * pages to a newer generation.
+ */
+ printf("Initial aging pass (lru_gen)\n");
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+ TEST_ASSERT(lru_gen_sum_memcg_stats(&stats) >= test_pages,
+ "Not all pages accounted for (looking for %ld). "
+ "Was the memcg set up correctly?", test_pages);
+ access_memory(vm, nr_vcpus, ACCESS_WRITE, "Re-populating memory");
+ lru_gen_read_memcg_stats(&stats, TEST_MEMCG_NAME);
+ lru_gen_last_gen = find_generation(&stats, test_pages);
+ }
+
/* As a control, read and write to the populated memory first. */
access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory");
access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");
@@ -328,6 +456,37 @@ static void run_test(enum vm_guest_mode mode, void *arg)
memstress_destroy_vm(vm);
}
+static int access_tracking_unreliable(void)
+{
+#ifdef __x86_64__
+ /*
+ * When running nested, the TLB size may be effectively unlimited (for
+ * example, this is the case when running on KVM L0), and KVM doesn't
+ * explicitly flush the TLB when aging SPTEs. As a result, more pages
+ * are cached and the guest won't see the "idle" bit cleared.
+ */
+ if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ puts("Skipping idle page count sanity check, because the test is run nested");
+ return 1;
+ }
+#endif
+ /*
+ * When NUMA balancing is enabled, guest memory will be unmapped to get
+ * NUMA faults, dropping the Accessed bits.
+ */
+ if (is_numa_balancing_enabled()) {
+ puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
+ return 1;
+ }
+ return 0;
+}
+
+static int run_test_for_each_guest_mode(const char *cgroup, void *arg)
+{
+ for_each_guest_mode(run_test, arg);
+ return 0;
+}
+
static void help(char *name)
{
puts("");
@@ -342,11 +501,22 @@ static void help(char *name)
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
+ printf(" -w: Control whether the test warns or fails if more than 10%%\n"
+ " of pages are still seen as idle/old after accessing guest\n"
+ " memory. >0 == warn only, 0 == fail, <0 == auto. For auto\n"
+ " mode, the test fails by default, but switches to warn only\n"
+ " if NUMA balancing is enabled or the test detects it's running\n"
+ " in a VM.\n");
backing_src_help("-s");
puts("");
exit(0);
}
+void destroy_cgroup(char *cg)
+{
+ printf("Destroying cgroup: %s\n", cg);
+}
+
int main(int argc, char *argv[])
{
struct test_params params = {
@@ -354,12 +524,13 @@ int main(int argc, char *argv[])
.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
.nr_vcpus = 1,
};
+ char *new_cg = NULL;
int page_idle_fd;
int opt;
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -376,6 +547,11 @@ int main(int argc, char *argv[])
case 's':
params.backing_src = parse_backing_src_type(optarg);
break;
+ case 'w':
+ idle_pages_warn_only =
+ atoi_non_negative("Idle pages warning",
+ optarg);
+ break;
case 'h':
default:
help(argv[0]);
@@ -383,12 +559,53 @@ int main(int argc, char *argv[])
}
}
- page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
- __TEST_REQUIRE(page_idle_fd >= 0,
- "CONFIG_IDLE_PAGE_TRACKING is not enabled");
- close(page_idle_fd);
+ if (idle_pages_warn_only == -1)
+ idle_pages_warn_only = access_tracking_unreliable();
+
+ if (lru_gen_usable()) {
+ bool cg_created = true;
+ int ret;
- for_each_guest_mode(run_test, &params);
+ puts("Using lru_gen for aging");
+ use_lru_gen = true;
+
+ if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory"))
+ ksft_exit_skip("Cannot find memory cgroup controller\n");
+
+ new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME);
+ printf("Creating cgroup: %s\n", new_cg);
+ if (cg_create(new_cg)) {
+ if (errno == EEXIST) {
+ printf("Found existing cgroup");
+ cg_created = false;
+ } else {
+ ksft_exit_skip("could not create new cgroup: %s\n", new_cg);
+ }
+ }
+
+ /*
+ * This will fork off a new process to run the test within
+ * a new memcg, so we need to properly propagate the return
+ * value up.
+ */
+ ret = cg_run(new_cg, &run_test_for_each_guest_mode, &params);
+ if (cg_created)
+ cg_destroy(new_cg);
+ if (ret < 0)
+ TEST_FAIL("child did not spawn or was abnormally killed");
+ if (ret)
+ return ret;
+ } else {
+ page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
+ __TEST_REQUIRE(page_idle_fd >= 0,
+ "Couldn't open /sys/kernel/mm/page_idle/bitmap. "
+ "Is CONFIG_IDLE_PAGE_TRACKING enabled?");
+
+ close(page_idle_fd);
+
+ puts("Using page_idle for aging");
+ run_test_for_each_guest_mode(NULL, &params);
+ }
return 0;
}
diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index a36a7e2db434..4e71740a098b 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -22,7 +22,8 @@
#include "gic.h"
#include "vgic.h"
-static const uint64_t CVAL_MAX = ~0ULL;
+/* Depends on counter width. */
+static uint64_t CVAL_MAX;
/* tval is a signed 32-bit int. */
static const int32_t TVAL_MAX = INT32_MAX;
static const int32_t TVAL_MIN = INT32_MIN;
@@ -30,8 +31,8 @@ static const int32_t TVAL_MIN = INT32_MIN;
/* After how much time we say there is no IRQ. */
static const uint32_t TIMEOUT_NO_IRQ_US = 50000;
-/* A nice counter value to use as the starting one for most tests. */
-static const uint64_t DEF_CNT = (CVAL_MAX / 2);
+/* Counter value to use as the starting one for most tests. Set to CVAL_MAX/2 */
+static uint64_t DEF_CNT;
/* Number of runs. */
static const uint32_t NR_TEST_ITERS_DEF = 5;
@@ -191,8 +192,8 @@ static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles,
{
atomic_set(&shared_data.handled, 0);
atomic_set(&shared_data.spurious, 0);
- timer_set_ctl(timer, ctl);
timer_set_tval(timer, tval_cycles);
+ timer_set_ctl(timer, ctl);
}
static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl,
@@ -732,12 +733,6 @@ static void test_move_counters_ahead_of_timers(enum arch_timer timer)
test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1,
wm);
}
-
- for (i = 0; i < ARRAY_SIZE(sleep_method); i++) {
- sleep_method_t sm = sleep_method[i];
-
- test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm);
- }
}
/*
@@ -849,17 +844,17 @@ static void guest_code(enum arch_timer timer)
GUEST_DONE();
}
+static cpu_set_t default_cpuset;
+
static uint32_t next_pcpu(void)
{
uint32_t max = get_nprocs();
uint32_t cur = sched_getcpu();
uint32_t next = cur;
- cpu_set_t cpuset;
+ cpu_set_t cpuset = default_cpuset;
TEST_ASSERT(max > 1, "Need at least two physical cpus");
- sched_getaffinity(0, sizeof(cpuset), &cpuset);
-
do {
next = (next + 1) % CPU_SETSIZE;
} while (!CPU_ISSET(next, &cpuset));
@@ -959,6 +954,8 @@ static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq);
}
+static int gic_fd;
+
static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
enum arch_timer timer)
{
@@ -973,8 +970,18 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
vcpu_args_set(*vcpu, 1, timer);
test_init_timer_irq(*vm, *vcpu);
- vgic_v3_setup(*vm, 1, 64);
+ gic_fd = vgic_v3_setup(*vm, 1, 64);
+ __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3");
+
sync_global_to_guest(*vm, test_args);
+ sync_global_to_guest(*vm, CVAL_MAX);
+ sync_global_to_guest(*vm, DEF_CNT);
+}
+
+static void test_vm_cleanup(struct kvm_vm *vm)
+{
+ close(gic_fd);
+ kvm_vm_free(vm);
}
static void test_print_help(char *name)
@@ -986,7 +993,7 @@ static void test_print_help(char *name)
pr_info("\t-b: Test both physical and virtual timers (default: true)\n");
pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n",
LONG_WAIT_TEST_MS);
- pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n",
+ pr_info("\t-w: Delta (in ms) used for wait times (default: %u)\n",
WAIT_TEST_MS);
pr_info("\t-p: Test physical timer (default: true)\n");
pr_info("\t-v: Test virtual timer (default: true)\n");
@@ -1035,6 +1042,17 @@ static bool parse_args(int argc, char *argv[])
return false;
}
+static void set_counter_defaults(void)
+{
+ const uint64_t MIN_ROLLOVER_SECS = 40ULL * 365 * 24 * 3600;
+ uint64_t freq = read_sysreg(CNTFRQ_EL0);
+ uint64_t width = ilog2(MIN_ROLLOVER_SECS * freq);
+
+ width = clamp(width, 56, 64);
+ CVAL_MAX = GENMASK_ULL(width - 1, 0);
+ DEF_CNT = CVAL_MAX / 2;
+}
+
int main(int argc, char *argv[])
{
struct kvm_vcpu *vcpu;
@@ -1046,16 +1064,19 @@ int main(int argc, char *argv[])
if (!parse_args(argc, argv))
exit(KSFT_SKIP);
+ sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset);
+ set_counter_defaults();
+
if (test_args.test_virtual) {
test_vm_create(&vm, &vcpu, VIRTUAL);
test_run(vm, vcpu);
- kvm_vm_free(vm);
+ test_vm_cleanup(vm);
}
if (test_args.test_physical) {
test_vm_create(&vm, &vcpu, PHYSICAL);
test_run(vm, vcpu);
- kvm_vm_free(vm);
+ test_vm_cleanup(vm);
}
return 0;
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 93013564428b..bee65ca08721 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -555,6 +555,41 @@ void kvm_get_stat(struct kvm_binary_stats *stats, const char *name,
#define vm_get_stat(vm, stat) __get_stat(&(vm)->stats, stat)
#define vcpu_get_stat(vcpu, stat) __get_stat(&(vcpu)->stats, stat)
+static inline bool read_smt_control(char *buf, size_t buf_size)
+{
+ FILE *f = fopen("/sys/devices/system/cpu/smt/control", "r");
+ bool ret;
+
+ if (!f)
+ return false;
+
+ ret = fread(buf, sizeof(*buf), buf_size, f) > 0;
+ fclose(f);
+
+ return ret;
+}
+
+static inline bool is_smt_possible(void)
+{
+ char buf[16];
+
+ if (read_smt_control(buf, sizeof(buf)) &&
+ (!strncmp(buf, "forceoff", 8) || !strncmp(buf, "notsupported", 12)))
+ return false;
+
+ return true;
+}
+
+static inline bool is_smt_on(void)
+{
+ char buf[16];
+
+ if (read_smt_control(buf, sizeof(buf)) && !strncmp(buf, "on", 2))
+ return true;
+
+ return false;
+}
+
void vm_create_irqchip(struct kvm_vm *vm);
static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
diff --git a/tools/testing/selftests/kvm/include/lru_gen_util.h b/tools/testing/selftests/kvm/include/lru_gen_util.h
new file mode 100644
index 000000000000..d32ff5d8ffd0
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/lru_gen_util.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Tools for integrating with lru_gen, like parsing the lru_gen debugfs output.
+ *
+ * Copyright (C) 2025, Google LLC.
+ */
+#ifndef SELFTEST_KVM_LRU_GEN_UTIL_H
+#define SELFTEST_KVM_LRU_GEN_UTIL_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "test_util.h"
+
+#define MAX_NR_GENS 16 /* MAX_NR_GENS in include/linux/mmzone.h */
+#define MAX_NR_NODES 4 /* Maximum number of nodes supported by the test */
+
+#define LRU_GEN_DEBUGFS "/sys/kernel/debug/lru_gen"
+#define LRU_GEN_ENABLED_PATH "/sys/kernel/mm/lru_gen/enabled"
+#define LRU_GEN_ENABLED 1
+#define LRU_GEN_MM_WALK 2
+
+struct generation_stats {
+ int gen;
+ long age_ms;
+ long nr_anon;
+ long nr_file;
+};
+
+struct node_stats {
+ int node;
+ int nr_gens; /* Number of populated gens entries. */
+ struct generation_stats gens[MAX_NR_GENS];
+};
+
+struct memcg_stats {
+ unsigned long memcg_id;
+ int nr_nodes; /* Number of populated nodes entries. */
+ struct node_stats nodes[MAX_NR_NODES];
+};
+
+void lru_gen_read_memcg_stats(struct memcg_stats *stats, const char *memcg);
+long lru_gen_sum_memcg_stats(const struct memcg_stats *stats);
+long lru_gen_sum_memcg_stats_for_gen(int gen, const struct memcg_stats *stats);
+void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg);
+int lru_gen_find_generation(const struct memcg_stats *stats,
+ unsigned long total_pages);
+bool lru_gen_usable(void);
+
+#endif /* SELFTEST_KVM_LRU_GEN_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 77d13d7920cb..c6ef895fbd9a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -153,6 +153,7 @@ bool is_backing_src_hugetlb(uint32_t i);
void backing_src_help(const char *flag);
enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
long get_run_delay(void);
+bool is_numa_balancing_enabled(void);
/*
* Whether or not the given source type is shared memory (as opposed to
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 32ab6ca7ec32..b11b5a53ebd5 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -203,6 +203,7 @@ struct kvm_x86_cpu_feature {
#define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30)
#define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1)
#define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3)
+#define X86_FEATURE_SEV_SNP KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 4)
#define X86_FEATURE_PERFMON_V2 KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 0)
#define X86_FEATURE_LBR_PMC_FREEZE KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 2)
diff --git a/tools/testing/selftests/kvm/include/x86/sev.h b/tools/testing/selftests/kvm/include/x86/sev.h
index 82c11c81a956..008b4169f5e2 100644
--- a/tools/testing/selftests/kvm/include/x86/sev.h
+++ b/tools/testing/selftests/kvm/include/x86/sev.h
@@ -25,19 +25,51 @@ enum sev_guest_state {
#define SEV_POLICY_NO_DBG (1UL << 0)
#define SEV_POLICY_ES (1UL << 2)
+#define SNP_POLICY_SMT (1ULL << 16)
+#define SNP_POLICY_RSVD_MBO (1ULL << 17)
+#define SNP_POLICY_DBG (1ULL << 19)
+
#define GHCB_MSR_TERM_REQ 0x100
+static inline bool is_sev_snp_vm(struct kvm_vm *vm)
+{
+ return vm->type == KVM_X86_SNP_VM;
+}
+
+static inline bool is_sev_es_vm(struct kvm_vm *vm)
+{
+ return is_sev_snp_vm(vm) || vm->type == KVM_X86_SEV_ES_VM;
+}
+
+static inline bool is_sev_vm(struct kvm_vm *vm)
+{
+ return is_sev_es_vm(vm) || vm->type == KVM_X86_SEV_VM;
+}
+
void sev_vm_launch(struct kvm_vm *vm, uint32_t policy);
void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement);
void sev_vm_launch_finish(struct kvm_vm *vm);
+void snp_vm_launch_start(struct kvm_vm *vm, uint64_t policy);
+void snp_vm_launch_update(struct kvm_vm *vm);
+void snp_vm_launch_finish(struct kvm_vm *vm);
struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
struct kvm_vcpu **cpu);
-void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement);
+void vm_sev_launch(struct kvm_vm *vm, uint64_t policy, uint8_t *measurement);
kvm_static_assert(SEV_RET_SUCCESS == 0);
/*
+ * A SEV-SNP VM requires the policy reserved bit to always be set.
+ * The SMT policy bit is also required to be set based on SMT being
+ * available and active on the system.
+ */
+static inline u64 snp_default_policy(void)
+{
+ return SNP_POLICY_RSVD_MBO | (is_smt_on() ? SNP_POLICY_SMT : 0);
+}
+
+/*
* The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long"
* instead of a proper struct. The size of the parameter is embedded in the
* ioctl number, i.e. is ABI and thus immutable. Hack around the mess by
@@ -70,6 +102,12 @@ kvm_static_assert(SEV_RET_SUCCESS == 0);
void sev_vm_init(struct kvm_vm *vm);
void sev_es_vm_init(struct kvm_vm *vm);
+void snp_vm_init(struct kvm_vm *vm);
+
+static inline void vmgexit(void)
+{
+ __asm__ __volatile__("rep; vmmcall");
+}
static inline void sev_register_encrypted_memory(struct kvm_vm *vm,
struct userspace_mem_region *region)
@@ -93,4 +131,17 @@ static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa,
vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data);
}
+static inline void snp_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa,
+ uint64_t hva, uint64_t size, uint8_t type)
+{
+ struct kvm_sev_snp_launch_update update_data = {
+ .uaddr = hva,
+ .gfn_start = gpa >> PAGE_SHIFT,
+ .len = size,
+ .type = type,
+ };
+
+ vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_UPDATE, &update_data);
+}
+
#endif /* SELFTEST_KVM_SEV_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 5649cf2f40e8..a055343a7bf7 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -447,6 +447,15 @@ void kvm_set_files_rlimit(uint32_t nr_vcpus)
}
+static bool is_guest_memfd_required(struct vm_shape shape)
+{
+#ifdef __x86_64__
+ return shape.type == KVM_X86_SNP_VM;
+#else
+ return false;
+#endif
+}
+
struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
uint64_t nr_extra_pages)
{
@@ -454,7 +463,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
nr_extra_pages);
struct userspace_mem_region *slot0;
struct kvm_vm *vm;
- int i;
+ int i, flags;
kvm_set_files_rlimit(nr_runnable_vcpus);
@@ -463,7 +472,15 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
vm = ____vm_create(shape);
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+ /*
+ * Force GUEST_MEMFD for the primary memory region if necessary, e.g.
+ * for CoCo VMs that require GUEST_MEMFD backed private memory.
+ */
+ flags = 0;
+ if (is_guest_memfd_required(shape))
+ flags |= KVM_MEM_GUEST_MEMFD;
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags);
for (i = 0; i < NR_MEM_REGIONS; i++)
vm->memslots[i] = 0;
diff --git a/tools/testing/selftests/kvm/lib/lru_gen_util.c b/tools/testing/selftests/kvm/lib/lru_gen_util.c
new file mode 100644
index 000000000000..46a14fd63d9e
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/lru_gen_util.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, Google LLC.
+ */
+
+#include <time.h>
+
+#include "lru_gen_util.h"
+
+/*
+ * Tracks state while we parse memcg lru_gen stats. The file we're parsing is
+ * structured like this (some extra whitespace elided):
+ *
+ * memcg (id) (path)
+ * node (id)
+ * (gen_nr) (age_in_ms) (nr_anon_pages) (nr_file_pages)
+ */
+struct memcg_stats_parse_context {
+ bool consumed; /* Whether or not this line was consumed */
+ /* Next parse handler to invoke */
+ void (*next_handler)(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+ int current_node_idx; /* Current index in nodes array */
+ const char *name; /* The name of the memcg we're looking for */
+};
+
+static void memcg_stats_handle_searching(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+static void memcg_stats_handle_in_memcg(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+static void memcg_stats_handle_in_node(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+
+struct split_iterator {
+ char *str;
+ char *save;
+};
+
+static char *split_next(struct split_iterator *it)
+{
+ char *ret = strtok_r(it->str, " \t\n\r", &it->save);
+
+ it->str = NULL;
+ return ret;
+}
+
+static void memcg_stats_handle_searching(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ struct split_iterator it = { .str = line };
+ char *prefix = split_next(&it);
+ char *memcg_id = split_next(&it);
+ char *memcg_name = split_next(&it);
+ char *end;
+
+ ctx->consumed = true;
+
+ if (!prefix || strcmp("memcg", prefix))
+ return; /* Not a memcg line (maybe empty), skip */
+
+ TEST_ASSERT(memcg_id && memcg_name,
+ "malformed memcg line; no memcg id or memcg_name");
+
+ if (strcmp(memcg_name + 1, ctx->name))
+ return; /* Wrong memcg, skip */
+
+ /* Found it! */
+
+ stats->memcg_id = strtoul(memcg_id, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed memcg id '%s'", memcg_id);
+ if (!stats->memcg_id)
+ return; /* Removed memcg? */
+
+ ctx->next_handler = memcg_stats_handle_in_memcg;
+}
+
+static void memcg_stats_handle_in_memcg(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ struct split_iterator it = { .str = line };
+ char *prefix = split_next(&it);
+ char *id = split_next(&it);
+ long found_node_id;
+ char *end;
+
+ ctx->consumed = true;
+ ctx->current_node_idx = -1;
+
+ if (!prefix)
+ return; /* Skip empty lines */
+
+ if (!strcmp("memcg", prefix)) {
+ /* Memcg done, found next one; stop. */
+ ctx->next_handler = NULL;
+ return;
+ } else if (strcmp("node", prefix))
+ TEST_ASSERT(false, "found malformed line after 'memcg ...',"
+ "token: '%s'", prefix);
+
+ /* At this point we know we have a node line. Parse the ID. */
+
+ TEST_ASSERT(id, "malformed node line; no node id");
+
+ found_node_id = strtol(id, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed node id '%s'", id);
+
+ ctx->current_node_idx = stats->nr_nodes++;
+ TEST_ASSERT(ctx->current_node_idx < MAX_NR_NODES,
+ "memcg has stats for too many nodes, max is %d",
+ MAX_NR_NODES);
+ stats->nodes[ctx->current_node_idx].node = found_node_id;
+
+ ctx->next_handler = memcg_stats_handle_in_node;
+}
+
+static void memcg_stats_handle_in_node(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ char *my_line = strdup(line);
+ struct split_iterator it = { .str = my_line };
+ char *gen, *age, *nr_anon, *nr_file;
+ struct node_stats *node_stats;
+ struct generation_stats *gen_stats;
+ char *end;
+
+ TEST_ASSERT(it.str, "failed to copy input line");
+
+ gen = split_next(&it);
+
+ if (!gen)
+ goto out_consume; /* Skip empty lines */
+
+ if (!strcmp("memcg", gen) || !strcmp("node", gen)) {
+ /*
+ * Reached next memcg or node section. Don't consume, let the
+ * other handler deal with this.
+ */
+ ctx->next_handler = memcg_stats_handle_in_memcg;
+ goto out;
+ }
+
+ node_stats = &stats->nodes[ctx->current_node_idx];
+ TEST_ASSERT(node_stats->nr_gens < MAX_NR_GENS,
+ "found too many generation lines; max is %d",
+ MAX_NR_GENS);
+ gen_stats = &node_stats->gens[node_stats->nr_gens++];
+
+ age = split_next(&it);
+ nr_anon = split_next(&it);
+ nr_file = split_next(&it);
+
+ TEST_ASSERT(age && nr_anon && nr_file,
+ "malformed generation line; not enough tokens");
+
+ gen_stats->gen = (int)strtol(gen, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed generation number '%s'", gen);
+
+ gen_stats->age_ms = strtol(age, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed generation age '%s'", age);
+
+ gen_stats->nr_anon = strtol(nr_anon, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed anonymous page count '%s'",
+ nr_anon);
+
+ gen_stats->nr_file = strtol(nr_file, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed file page count '%s'", nr_file);
+
+out_consume:
+ ctx->consumed = true;
+out:
+ free(my_line);
+}
+
+static void print_memcg_stats(const struct memcg_stats *stats, const char *name)
+{
+ int node, gen;
+
+ pr_debug("stats for memcg %s (id %lu):\n", name, stats->memcg_id);
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ pr_debug("\tnode %d\n", stats->nodes[node].node);
+ for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) {
+ const struct generation_stats *gstats =
+ &stats->nodes[node].gens[gen];
+
+ pr_debug("\t\tgen %d\tage_ms %ld"
+ "\tnr_anon %ld\tnr_file %ld\n",
+ gstats->gen, gstats->age_ms, gstats->nr_anon,
+ gstats->nr_file);
+ }
+ }
+}
+
+/* Re-read lru_gen debugfs information for @memcg into @stats. */
+void lru_gen_read_memcg_stats(struct memcg_stats *stats, const char *memcg)
+{
+ FILE *f;
+ ssize_t read = 0;
+ char *line = NULL;
+ size_t bufsz;
+ struct memcg_stats_parse_context ctx = {
+ .next_handler = memcg_stats_handle_searching,
+ .name = memcg,
+ };
+
+ memset(stats, 0, sizeof(struct memcg_stats));
+
+ f = fopen(LRU_GEN_DEBUGFS, "r");
+ TEST_ASSERT(f, "fopen(%s) failed", LRU_GEN_DEBUGFS);
+
+ while (ctx.next_handler && (read = getline(&line, &bufsz, f)) > 0) {
+ ctx.consumed = false;
+
+ do {
+ ctx.next_handler(stats, &ctx, line);
+ if (!ctx.next_handler)
+ break;
+ } while (!ctx.consumed);
+ }
+
+ if (read < 0 && !feof(f))
+ TEST_ASSERT(false, "getline(%s) failed", LRU_GEN_DEBUGFS);
+
+ TEST_ASSERT(stats->memcg_id > 0, "Couldn't find memcg: %s\n"
+ "Did the memcg get created in the proper mount?",
+ memcg);
+ if (line)
+ free(line);
+ TEST_ASSERT(!fclose(f), "fclose(%s) failed", LRU_GEN_DEBUGFS);
+
+ print_memcg_stats(stats, memcg);
+}
+
+/*
+ * Find all pages tracked by lru_gen for this memcg in generation @target_gen.
+ *
+ * If @target_gen is negative, look for all generations.
+ */
+long lru_gen_sum_memcg_stats_for_gen(int target_gen,
+ const struct memcg_stats *stats)
+{
+ int node, gen;
+ long total_nr = 0;
+
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ const struct node_stats *node_stats = &stats->nodes[node];
+
+ for (gen = 0; gen < node_stats->nr_gens; ++gen) {
+ const struct generation_stats *gen_stats =
+ &node_stats->gens[gen];
+
+ if (target_gen >= 0 && gen_stats->gen != target_gen)
+ continue;
+
+ total_nr += gen_stats->nr_anon + gen_stats->nr_file;
+ }
+ }
+
+ return total_nr;
+}
+
+/* Find all pages tracked by lru_gen for this memcg. */
+long lru_gen_sum_memcg_stats(const struct memcg_stats *stats)
+{
+ return lru_gen_sum_memcg_stats_for_gen(-1, stats);
+}
+
+/*
+ * If lru_gen aging should force page table scanning.
+ *
+ * If you want to set this to false, you will need to do eviction
+ * before doing extra aging passes.
+ */
+static const bool force_scan = true;
+
+static void run_aging_impl(unsigned long memcg_id, int node_id, int max_gen)
+{
+ FILE *f = fopen(LRU_GEN_DEBUGFS, "w");
+ char *command;
+ size_t sz;
+
+ TEST_ASSERT(f, "fopen(%s) failed", LRU_GEN_DEBUGFS);
+ sz = asprintf(&command, "+ %lu %d %d 1 %d\n",
+ memcg_id, node_id, max_gen, force_scan);
+ TEST_ASSERT(sz > 0, "creating aging command failed");
+
+ pr_debug("Running aging command: %s", command);
+ if (fwrite(command, sizeof(char), sz, f) < sz) {
+ TEST_ASSERT(false, "writing aging command %s to %s failed",
+ command, LRU_GEN_DEBUGFS);
+ }
+
+ TEST_ASSERT(!fclose(f), "fclose(%s) failed", LRU_GEN_DEBUGFS);
+}
+
+void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg)
+{
+ int node, gen;
+
+ pr_debug("lru_gen: invoking aging...\n");
+
+ /* Must read memcg stats to construct the proper aging command. */
+ lru_gen_read_memcg_stats(stats, memcg);
+
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ int max_gen = 0;
+
+ for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) {
+ int this_gen = stats->nodes[node].gens[gen].gen;
+
+ max_gen = max_gen > this_gen ? max_gen : this_gen;
+ }
+
+ run_aging_impl(stats->memcg_id, stats->nodes[node].node,
+ max_gen);
+ }
+
+ /* Re-read so callers get updated information */
+ lru_gen_read_memcg_stats(stats, memcg);
+}
+
+/*
+ * Find which generation contains at least @pages pages, assuming that
+ * such a generation exists.
+ */
+int lru_gen_find_generation(const struct memcg_stats *stats,
+ unsigned long pages)
+{
+ int node, gen, gen_idx, min_gen = INT_MAX, max_gen = -1;
+
+ for (node = 0; node < stats->nr_nodes; ++node)
+ for (gen_idx = 0; gen_idx < stats->nodes[node].nr_gens;
+ ++gen_idx) {
+ gen = stats->nodes[node].gens[gen_idx].gen;
+ max_gen = gen > max_gen ? gen : max_gen;
+ min_gen = gen < min_gen ? gen : min_gen;
+ }
+
+ for (gen = min_gen; gen <= max_gen; ++gen)
+ /* See if this generation has enough pages. */
+ if (lru_gen_sum_memcg_stats_for_gen(gen, stats) > pages)
+ return gen;
+
+ return -1;
+}
+
+bool lru_gen_usable(void)
+{
+ long required_features = LRU_GEN_ENABLED | LRU_GEN_MM_WALK;
+ int lru_gen_fd, lru_gen_debug_fd;
+ char mglru_feature_str[8] = {};
+ long mglru_features;
+
+ lru_gen_fd = open(LRU_GEN_ENABLED_PATH, O_RDONLY);
+ if (lru_gen_fd < 0) {
+ puts("lru_gen: Could not open " LRU_GEN_ENABLED_PATH);
+ return false;
+ }
+ if (read(lru_gen_fd, &mglru_feature_str, 7) < 7) {
+ puts("lru_gen: Could not read from " LRU_GEN_ENABLED_PATH);
+ close(lru_gen_fd);
+ return false;
+ }
+ close(lru_gen_fd);
+
+ mglru_features = strtol(mglru_feature_str, NULL, 16);
+ if ((mglru_features & required_features) != required_features) {
+ printf("lru_gen: missing features, got: 0x%lx, expected: 0x%lx\n",
+ mglru_features, required_features);
+ printf("lru_gen: Try 'echo 0x%lx > /sys/kernel/mm/lru_gen/enabled'\n",
+ required_features);
+ return false;
+ }
+
+ lru_gen_debug_fd = open(LRU_GEN_DEBUGFS, O_RDWR);
+ __TEST_REQUIRE(lru_gen_debug_fd >= 0,
+ "lru_gen: Could not open " LRU_GEN_DEBUGFS ", "
+ "but lru_gen is enabled, so cannot use page_idle.");
+ close(lru_gen_debug_fd);
+ return true;
+}
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 8ed0b74ae837..03eb99af9b8d 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -132,37 +132,57 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
}
-bool thp_configured(void)
+static bool test_sysfs_path(const char *path)
{
- int ret;
struct stat statbuf;
+ int ret;
- ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+ ret = stat(path, &statbuf);
TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
- "Error in stating /sys/kernel/mm/transparent_hugepage");
+ "Error in stat()ing '%s'", path);
return ret == 0;
}
-size_t get_trans_hugepagesz(void)
+bool thp_configured(void)
+{
+ return test_sysfs_path("/sys/kernel/mm/transparent_hugepage");
+}
+
+static size_t get_sysfs_val(const char *path)
{
size_t size;
FILE *f;
int ret;
- TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
-
- f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
- TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+ f = fopen(path, "r");
+ TEST_ASSERT(f, "Error opening '%s'", path);
ret = fscanf(f, "%ld", &size);
+ TEST_ASSERT(ret > 0, "Error reading '%s'", path);
+
+ /* Re-scan the input stream to verify the entire file was read. */
ret = fscanf(f, "%ld", &size);
- TEST_ASSERT(ret < 1, "Error reading transparent_hugepage/hpage_pmd_size");
- fclose(f);
+ TEST_ASSERT(ret < 1, "Error reading '%s'", path);
+ fclose(f);
return size;
}
+size_t get_trans_hugepagesz(void)
+{
+ TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+ return get_sysfs_val("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
+}
+
+bool is_numa_balancing_enabled(void)
+{
+ if (!test_sysfs_path("/proc/sys/kernel/numa_balancing"))
+ return false;
+ return get_sysfs_val("/proc/sys/kernel/numa_balancing") == 1;
+}
+
size_t get_def_hugetlb_pagesz(void)
{
char buf[64];
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index bd5a802fa7a5..a92dc1dad085 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -639,7 +639,7 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
sync_global_to_guest(vm, host_cpu_is_amd);
sync_global_to_guest(vm, is_forced_emulation_enabled);
- if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
+ if (is_sev_vm(vm)) {
struct kvm_sev_init init = { 0 };
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
@@ -1156,7 +1156,7 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
void kvm_init_vm_address_properties(struct kvm_vm *vm)
{
- if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
+ if (is_sev_vm(vm)) {
vm->arch.sev_fd = open_sev_dev_path_or_exit();
vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
vm->gpa_tag_mask = vm->arch.c_bit;
diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c
index e9535ee20b7f..c3a9838f4806 100644
--- a/tools/testing/selftests/kvm/lib/x86/sev.c
+++ b/tools/testing/selftests/kvm/lib/x86/sev.c
@@ -14,7 +14,8 @@
* and find the first range, but that's correct because the condition
* expression would cause us to quit the loop.
*/
-static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region)
+static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region,
+ uint8_t page_type, bool private)
{
const struct sparsebit *protected_phy_pages = region->protected_phy_pages;
const vm_paddr_t gpa_base = region->region.guest_phys_addr;
@@ -24,25 +25,35 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio
if (!sparsebit_any_set(protected_phy_pages))
return;
- sev_register_encrypted_memory(vm, region);
+ if (!is_sev_snp_vm(vm))
+ sev_register_encrypted_memory(vm, region);
sparsebit_for_each_set_range(protected_phy_pages, i, j) {
const uint64_t size = (j - i + 1) * vm->page_size;
const uint64_t offset = (i - lowest_page_in_region) * vm->page_size;
- sev_launch_update_data(vm, gpa_base + offset, size);
+ if (private)
+ vm_mem_set_private(vm, gpa_base + offset, size);
+
+ if (is_sev_snp_vm(vm))
+ snp_launch_update_data(vm, gpa_base + offset,
+ (uint64_t)addr_gpa2hva(vm, gpa_base + offset),
+ size, page_type);
+ else
+ sev_launch_update_data(vm, gpa_base + offset, size);
+
}
}
void sev_vm_init(struct kvm_vm *vm)
{
if (vm->type == KVM_X86_DEFAULT_VM) {
- assert(vm->arch.sev_fd == -1);
+ TEST_ASSERT_EQ(vm->arch.sev_fd, -1);
vm->arch.sev_fd = open_sev_dev_path_or_exit();
vm_sev_ioctl(vm, KVM_SEV_INIT, NULL);
} else {
struct kvm_sev_init init = { 0 };
- assert(vm->type == KVM_X86_SEV_VM);
+ TEST_ASSERT_EQ(vm->type, KVM_X86_SEV_VM);
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
}
}
@@ -50,16 +61,24 @@ void sev_vm_init(struct kvm_vm *vm)
void sev_es_vm_init(struct kvm_vm *vm)
{
if (vm->type == KVM_X86_DEFAULT_VM) {
- assert(vm->arch.sev_fd == -1);
+ TEST_ASSERT_EQ(vm->arch.sev_fd, -1);
vm->arch.sev_fd = open_sev_dev_path_or_exit();
vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL);
} else {
struct kvm_sev_init init = { 0 };
- assert(vm->type == KVM_X86_SEV_ES_VM);
+ TEST_ASSERT_EQ(vm->type, KVM_X86_SEV_ES_VM);
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
}
}
+void snp_vm_init(struct kvm_vm *vm)
+{
+ struct kvm_sev_init init = { 0 };
+
+ TEST_ASSERT_EQ(vm->type, KVM_X86_SNP_VM);
+ vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
+}
+
void sev_vm_launch(struct kvm_vm *vm, uint32_t policy)
{
struct kvm_sev_launch_start launch_start = {
@@ -76,7 +95,7 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy)
TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE);
hash_for_each(vm->regions.slot_hash, ctr, region, slot_node)
- encrypt_region(vm, region);
+ encrypt_region(vm, region, KVM_SEV_PAGE_TYPE_INVALID, false);
if (policy & SEV_POLICY_ES)
vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
@@ -112,6 +131,33 @@ void sev_vm_launch_finish(struct kvm_vm *vm)
TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING);
}
+void snp_vm_launch_start(struct kvm_vm *vm, uint64_t policy)
+{
+ struct kvm_sev_snp_launch_start launch_start = {
+ .policy = policy,
+ };
+
+ vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_START, &launch_start);
+}
+
+void snp_vm_launch_update(struct kvm_vm *vm)
+{
+ struct userspace_mem_region *region;
+ int ctr;
+
+ hash_for_each(vm->regions.slot_hash, ctr, region, slot_node)
+ encrypt_region(vm, region, KVM_SEV_SNP_PAGE_TYPE_NORMAL, true);
+
+ vm->arch.is_pt_protected = true;
+}
+
+void snp_vm_launch_finish(struct kvm_vm *vm)
+{
+ struct kvm_sev_snp_launch_finish launch_finish = { 0 };
+
+ vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_FINISH, &launch_finish);
+}
+
struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
struct kvm_vcpu **cpu)
{
@@ -128,8 +174,20 @@ struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
return vm;
}
-void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement)
+void vm_sev_launch(struct kvm_vm *vm, uint64_t policy, uint8_t *measurement)
{
+ if (is_sev_snp_vm(vm)) {
+ vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, BIT(KVM_HC_MAP_GPA_RANGE));
+
+ snp_vm_launch_start(vm, policy);
+
+ snp_vm_launch_update(vm);
+
+ snp_vm_launch_finish(vm);
+
+ return;
+ }
+
sev_vm_launch(vm, policy);
if (!measurement)
diff --git a/tools/testing/selftests/kvm/x86/fastops_test.c b/tools/testing/selftests/kvm/x86/fastops_test.c
new file mode 100644
index 000000000000..2ac89d6c1e46
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/fastops_test.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * Execute a fastop() instruction, with or without forced emulation. BT bit 0
+ * to set RFLAGS.CF based on whether or not the input is even or odd, so that
+ * instructions like ADC and SBB are deterministic.
+ */
+#define guest_execute_fastop_1(FEP, insn, __val, __flags) \
+({ \
+ __asm__ __volatile__("bt $0, %[val]\n\t" \
+ FEP insn " %[val]\n\t" \
+ "pushfq\n\t" \
+ "pop %[flags]\n\t" \
+ : [val]"+r"(__val), [flags]"=r"(__flags) \
+ : : "cc", "memory"); \
+})
+
+#define guest_test_fastop_1(insn, type_t, __val) \
+({ \
+ type_t val = __val, ex_val = __val, input = __val; \
+ uint64_t flags, ex_flags; \
+ \
+ guest_execute_fastop_1("", insn, ex_val, ex_flags); \
+ guest_execute_fastop_1(KVM_FEP, insn, val, flags); \
+ \
+ __GUEST_ASSERT(val == ex_val, \
+ "Wanted 0x%lx for '%s 0x%lx', got 0x%lx", \
+ (uint64_t)ex_val, insn, (uint64_t)input, (uint64_t)val); \
+ __GUEST_ASSERT(flags == ex_flags, \
+ "Wanted flags 0x%lx for '%s 0x%lx', got 0x%lx", \
+ ex_flags, insn, (uint64_t)input, flags); \
+})
+
+#define guest_execute_fastop_2(FEP, insn, __input, __output, __flags) \
+({ \
+ __asm__ __volatile__("bt $0, %[output]\n\t" \
+ FEP insn " %[input], %[output]\n\t" \
+ "pushfq\n\t" \
+ "pop %[flags]\n\t" \
+ : [output]"+r"(__output), [flags]"=r"(__flags) \
+ : [input]"r"(__input) : "cc", "memory"); \
+})
+
+#define guest_test_fastop_2(insn, type_t, __val1, __val2) \
+({ \
+ type_t input = __val1, input2 = __val2, output = __val2, ex_output = __val2; \
+ uint64_t flags, ex_flags; \
+ \
+ guest_execute_fastop_2("", insn, input, ex_output, ex_flags); \
+ guest_execute_fastop_2(KVM_FEP, insn, input, output, flags); \
+ \
+ __GUEST_ASSERT(output == ex_output, \
+ "Wanted 0x%lx for '%s 0x%lx 0x%lx', got 0x%lx", \
+ (uint64_t)ex_output, insn, (uint64_t)input, \
+ (uint64_t)input2, (uint64_t)output); \
+ __GUEST_ASSERT(flags == ex_flags, \
+ "Wanted flags 0x%lx for '%s 0x%lx, 0x%lx', got 0x%lx", \
+ ex_flags, insn, (uint64_t)input, (uint64_t)input2, flags); \
+})
+
+#define guest_execute_fastop_cl(FEP, insn, __shift, __output, __flags) \
+({ \
+ __asm__ __volatile__("bt $0, %[output]\n\t" \
+ FEP insn " %%cl, %[output]\n\t" \
+ "pushfq\n\t" \
+ "pop %[flags]\n\t" \
+ : [output]"+r"(__output), [flags]"=r"(__flags) \
+ : "c"(__shift) : "cc", "memory"); \
+})
+
+#define guest_test_fastop_cl(insn, type_t, __val1, __val2) \
+({ \
+ type_t output = __val2, ex_output = __val2, input = __val2; \
+ uint8_t shift = __val1; \
+ uint64_t flags, ex_flags; \
+ \
+ guest_execute_fastop_cl("", insn, shift, ex_output, ex_flags); \
+ guest_execute_fastop_cl(KVM_FEP, insn, shift, output, flags); \
+ \
+ __GUEST_ASSERT(output == ex_output, \
+ "Wanted 0x%lx for '%s 0x%x, 0x%lx', got 0x%lx", \
+ (uint64_t)ex_output, insn, shift, (uint64_t)input, \
+ (uint64_t)output); \
+ __GUEST_ASSERT(flags == ex_flags, \
+ "Wanted flags 0x%lx for '%s 0x%x, 0x%lx', got 0x%lx", \
+ ex_flags, insn, shift, (uint64_t)input, flags); \
+})
+
+static const uint64_t vals[] = {
+ 0,
+ 1,
+ 2,
+ 4,
+ 7,
+ 0x5555555555555555,
+ 0xaaaaaaaaaaaaaaaa,
+ 0xfefefefefefefefe,
+ 0xffffffffffffffff,
+};
+
+#define guest_test_fastops(type_t, suffix) \
+do { \
+ int i, j; \
+ \
+ for (i = 0; i < ARRAY_SIZE(vals); i++) { \
+ guest_test_fastop_1("dec" suffix, type_t, vals[i]); \
+ guest_test_fastop_1("inc" suffix, type_t, vals[i]); \
+ guest_test_fastop_1("neg" suffix, type_t, vals[i]); \
+ guest_test_fastop_1("not" suffix, type_t, vals[i]); \
+ \
+ for (j = 0; j < ARRAY_SIZE(vals); j++) { \
+ guest_test_fastop_2("add" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("adc" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("and" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("bsf" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("bsr" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("bt" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("btc" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("btr" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("bts" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("cmp" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("imul" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("or" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("sbb" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("sub" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("test" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_2("xor" suffix, type_t, vals[i], vals[j]); \
+ \
+ guest_test_fastop_cl("rol" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("ror" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("rcl" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("rcr" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("sar" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("shl" suffix, type_t, vals[i], vals[j]); \
+ guest_test_fastop_cl("shr" suffix, type_t, vals[i], vals[j]); \
+ } \
+ } \
+} while (0)
+
+static void guest_code(void)
+{
+ guest_test_fastops(uint16_t, "w");
+ guest_test_fastops(uint32_t, "l");
+ guest_test_fastops(uint64_t, "q");
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(is_forced_emulation_enabled);
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c
index 4e920705681a..c863a689aa98 100644
--- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c
@@ -22,25 +22,6 @@ static void guest_code(void)
{
}
-static bool smt_possible(void)
-{
- char buf[16];
- FILE *f;
- bool res = true;
-
- f = fopen("/sys/devices/system/cpu/smt/control", "r");
- if (f) {
- if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) {
- if (!strncmp(buf, "forceoff", 8) ||
- !strncmp(buf, "notsupported", 12))
- res = false;
- }
- fclose(f);
- }
-
- return res;
-}
-
static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected)
{
const bool has_irqchip = !vcpu || vcpu->vm->has_irqchip;
@@ -93,7 +74,7 @@ static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected)
case 0x40000004:
test_val = entry->eax & (1UL << 18);
- TEST_ASSERT(!!test_val == !smt_possible(),
+ TEST_ASSERT(!!test_val == !is_smt_possible(),
"NoNonArchitecturalCoreSharing bit"
" doesn't reflect SMT setting");
diff --git a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c
new file mode 100644
index 000000000000..d88500c118eb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ */
+#include <linux/atomic.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "vmx.h"
+#include "test_util.h"
+
+#define NR_BUS_LOCKS_PER_LEVEL 100
+#define CACHE_LINE_SIZE 64
+
+/*
+ * To generate a bus lock, carve out a buffer that precisely occupies two cache
+ * lines and perform an atomic access that splits the two lines.
+ */
+static u8 buffer[CACHE_LINE_SIZE * 2] __aligned(CACHE_LINE_SIZE);
+static atomic_t *val = (void *)&buffer[CACHE_LINE_SIZE - (sizeof(*val) / 2)];
+
+static void guest_generate_buslocks(void)
+{
+ for (int i = 0; i < NR_BUS_LOCKS_PER_LEVEL; i++)
+ atomic_inc(val);
+}
+
+#define L2_GUEST_STACK_SIZE 64
+
+static void l2_guest_code(void)
+{
+ guest_generate_buslocks();
+ GUEST_DONE();
+}
+
+static void l1_svm_code(struct svm_test_data *svm)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ run_guest(vmcb, svm->vmcb_gpa);
+}
+
+static void l1_vmx_code(struct vmx_pages *vmx)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
+ GUEST_ASSERT_EQ(load_vmcs(vmx), true);
+
+ prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code));
+ GUEST_ASSERT(!vmlaunch());
+}
+
+static void guest_code(void *test_data)
+{
+ guest_generate_buslocks();
+
+ if (this_cpu_has(X86_FEATURE_SVM))
+ l1_svm_code(test_data);
+ else if (this_cpu_has(X86_FEATURE_VMX))
+ l1_vmx_code(test_data);
+ else
+ GUEST_DONE();
+
+ TEST_FAIL("L2 should have signaled 'done'");
+}
+
+int main(int argc, char *argv[])
+{
+ const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX);
+ vm_vaddr_t nested_test_data_gva;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ int i, bus_locks = 0;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_BUS_LOCK_EXIT));
+
+ vm = vm_create(1);
+ vm_enable_cap(vm, KVM_CAP_X86_BUS_LOCK_EXIT, KVM_BUS_LOCK_DETECTION_EXIT);
+ vcpu = vm_vcpu_add(vm, 0, guest_code);
+
+ if (kvm_cpu_has(X86_FEATURE_SVM))
+ vcpu_alloc_svm(vm, &nested_test_data_gva);
+ else
+ vcpu_alloc_vmx(vm, &nested_test_data_gva);
+
+ vcpu_args_set(vcpu, 1, nested_test_data_gva);
+
+ run = vcpu->run;
+
+ for (i = 0; i <= NR_BUS_LOCKS_PER_LEVEL * (1 + has_nested); i++) {
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+
+ if (run->exit_reason == KVM_EXIT_IO) {
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ goto done;
+ case UCALL_SYNC:
+ continue;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_BUS_LOCK);
+
+ /*
+ * Verify the counter is actually getting incremented, e.g. that
+ * KVM isn't skipping the instruction. On Intel, the exit is
+ * trap-like, i.e. the counter should already have been
+ * incremented. On AMD, it's fault-like, i.e. the counter will
+ * be incremented when the guest re-executes the instruction.
+ */
+ sync_global_from_guest(vm, *val);
+ TEST_ASSERT_EQ(atomic_read(val), bus_locks + host_cpu_is_intel);
+
+ bus_locks++;
+ }
+ TEST_FAIL("Didn't receive UCALL_DONE, took %u bus lock exits\n", bus_locks);
+done:
+ TEST_ASSERT_EQ(i, bus_locks);
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c
index 390ae2d87493..0eb371c62ab8 100644
--- a/tools/testing/selftests/kvm/x86/monitor_mwait_test.c
+++ b/tools/testing/selftests/kvm/x86/monitor_mwait_test.c
@@ -74,6 +74,7 @@ int main(int argc, char *argv[])
int testcase;
char test[80];
+ TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT));
TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2));
ksft_print_header();
diff --git a/tools/testing/selftests/kvm/x86/sev_init2_tests.c b/tools/testing/selftests/kvm/x86/sev_init2_tests.c
index 3fb967f40c6a..b238615196ad 100644
--- a/tools/testing/selftests/kvm/x86/sev_init2_tests.c
+++ b/tools/testing/selftests/kvm/x86/sev_init2_tests.c
@@ -28,6 +28,7 @@
int kvm_fd;
u64 supported_vmsa_features;
bool have_sev_es;
+bool have_snp;
static int __sev_ioctl(int vm_fd, int cmd_id, void *data)
{
@@ -83,6 +84,9 @@ void test_vm_types(void)
if (have_sev_es)
test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){});
+ if (have_snp)
+ test_init2(KVM_X86_SNP_VM, &(struct kvm_sev_init){});
+
test_init2_invalid(0, &(struct kvm_sev_init){},
"VM type is KVM_X86_DEFAULT_VM");
if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))
@@ -138,15 +142,24 @@ int main(int argc, char *argv[])
"sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)",
kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM);
+ have_snp = kvm_cpu_has(X86_FEATURE_SEV_SNP);
+ TEST_ASSERT(have_snp == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SNP_VM)),
+ "sev-snp: KVM_CAP_VM_TYPES (%x) indicates SNP support (bit %d), but CPUID does not",
+ kvm_check_cap(KVM_CAP_VM_TYPES), KVM_X86_SNP_VM);
+
test_vm_types();
test_flags(KVM_X86_SEV_VM);
if (have_sev_es)
test_flags(KVM_X86_SEV_ES_VM);
+ if (have_snp)
+ test_flags(KVM_X86_SNP_VM);
test_features(KVM_X86_SEV_VM, 0);
if (have_sev_es)
test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features);
+ if (have_snp)
+ test_features(KVM_X86_SNP_VM, supported_vmsa_features);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
index d97816dc476a..77256c89bb8d 100644
--- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -16,6 +16,18 @@
#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)
+static void guest_snp_code(void)
+{
+ uint64_t sev_msr = rdmsr(MSR_AMD64_SEV);
+
+ GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_ENABLED);
+ GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_ES_ENABLED);
+ GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_SNP_ENABLED);
+
+ wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
+ vmgexit();
+}
+
static void guest_sev_es_code(void)
{
/* TODO: Check CPUID after GHCB-based hypercall support is added. */
@@ -27,7 +39,7 @@ static void guest_sev_es_code(void)
* force "termination" to signal "done" via the GHCB MSR protocol.
*/
wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
- __asm__ __volatile__("rep; vmmcall");
+ vmgexit();
}
static void guest_sev_code(void)
@@ -62,7 +74,7 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
abort();
}
-static void test_sync_vmsa(uint32_t policy)
+static void test_sync_vmsa(uint32_t type, uint64_t policy)
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
@@ -72,7 +84,7 @@ static void test_sync_vmsa(uint32_t policy)
double x87val = M_PI;
struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 };
- vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu);
+ vm = vm_sev_create_with_one_vcpu(type, guest_code_xsave, &vcpu);
gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
MEM_REGION_TEST_DATA);
hva = addr_gva2hva(vm, gva);
@@ -89,7 +101,7 @@ static void test_sync_vmsa(uint32_t policy)
: "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)");
vcpu_xsave_set(vcpu, &xsave);
- vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL);
+ vm_sev_launch(vm, policy, NULL);
/* This page is shared, so make it decrypted. */
memset(hva, 0, 4096);
@@ -108,14 +120,12 @@ static void test_sync_vmsa(uint32_t policy)
kvm_vm_free(vm);
}
-static void test_sev(void *guest_code, uint64_t policy)
+static void test_sev(void *guest_code, uint32_t type, uint64_t policy)
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct ucall uc;
- uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM;
-
vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu);
/* TODO: Validate the measurement is as expected. */
@@ -124,7 +134,7 @@ static void test_sev(void *guest_code, uint64_t policy)
for (;;) {
vcpu_run(vcpu);
- if (policy & SEV_POLICY_ES) {
+ if (is_sev_es_vm(vm)) {
TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
"Wanted SYSTEM_EVENT, got %s",
exit_reason_str(vcpu->run->exit_reason));
@@ -161,16 +171,14 @@ static void guest_shutdown_code(void)
__asm__ __volatile__("ud2");
}
-static void test_sev_es_shutdown(void)
+static void test_sev_shutdown(uint32_t type, uint64_t policy)
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
- uint32_t type = KVM_X86_SEV_ES_VM;
-
vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu);
- vm_sev_launch(vm, SEV_POLICY_ES, NULL);
+ vm_sev_launch(vm, policy, NULL);
vcpu_run(vcpu);
TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN,
@@ -180,27 +188,42 @@ static void test_sev_es_shutdown(void)
kvm_vm_free(vm);
}
-int main(int argc, char *argv[])
+static void test_sev_smoke(void *guest, uint32_t type, uint64_t policy)
{
const u64 xf_mask = XFEATURE_MASK_X87_AVX;
- TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
-
- test_sev(guest_sev_code, SEV_POLICY_NO_DBG);
- test_sev(guest_sev_code, 0);
+ if (type == KVM_X86_SNP_VM)
+ test_sev(guest, type, policy | SNP_POLICY_DBG);
+ else
+ test_sev(guest, type, policy | SEV_POLICY_NO_DBG);
+ test_sev(guest, type, policy);
- if (kvm_cpu_has(X86_FEATURE_SEV_ES)) {
- test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
- test_sev(guest_sev_es_code, SEV_POLICY_ES);
+ if (type == KVM_X86_SEV_VM)
+ return;
- test_sev_es_shutdown();
+ test_sev_shutdown(type, policy);
- if (kvm_has_cap(KVM_CAP_XCRS) &&
- (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) {
- test_sync_vmsa(0);
- test_sync_vmsa(SEV_POLICY_NO_DBG);
- }
+ if (kvm_has_cap(KVM_CAP_XCRS) &&
+ (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) {
+ test_sync_vmsa(type, policy);
+ if (type == KVM_X86_SNP_VM)
+ test_sync_vmsa(type, policy | SNP_POLICY_DBG);
+ else
+ test_sync_vmsa(type, policy | SEV_POLICY_NO_DBG);
}
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
+
+ test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0);
+
+ if (kvm_cpu_has(X86_FEATURE_SEV_ES))
+ test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES);
+
+ if (kvm_cpu_has(X86_FEATURE_SEV_SNP))
+ test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy());
return 0;
}
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index c5241b193db8..824266982aa3 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -20,6 +20,7 @@ mremap_test
on-fault-limit
transhuge-stress
pagemap_ioctl
+pfnmap
*.tmp*
protection_keys
protection_keys_32
@@ -58,3 +59,4 @@ hugetlb_dio
pkey_sighandler_tests_32
pkey_sighandler_tests_64
guard-regions
+merge
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 8270895039d1..ae6f994d3add 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -84,6 +84,7 @@ TEST_GEN_FILES += mremap_test
TEST_GEN_FILES += mseal_test
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
+TEST_GEN_FILES += pfnmap
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
TEST_GEN_FILES += uffd-stress
@@ -98,6 +99,7 @@ TEST_GEN_FILES += hugetlb_madv_vs_map
TEST_GEN_FILES += hugetlb_dio
TEST_GEN_FILES += droppable
TEST_GEN_FILES += guard-regions
+TEST_GEN_FILES += merge
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
index a28baa536332..deba93379c80 100644
--- a/tools/testing/selftests/mm/config
+++ b/tools/testing/selftests/mm/config
@@ -8,3 +8,6 @@ CONFIG_GUP_TEST=y
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_MEM_SOFT_DIRTY=y
CONFIG_ANON_VMA_NAME=y
+CONFIG_FTRACE=y
+CONFIG_PROFILING=y
+CONFIG_UPROBES=y
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index b6cfe0a4b7df..dbbcc5eb3dce 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -112,9 +112,12 @@ struct comm_pipes {
static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
- if (pipe(comm_pipes->child_ready) < 0)
+ if (pipe(comm_pipes->child_ready) < 0) {
+ ksft_perror("pipe()");
return -errno;
+ }
if (pipe(comm_pipes->parent_ready) < 0) {
+ ksft_perror("pipe()");
close(comm_pipes->child_ready[0]);
close(comm_pipes->child_ready[1]);
return -errno;
@@ -207,13 +210,14 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
- ksft_test_result_fail("pipe() failed\n");
+ log_test_result(KSFT_FAIL);
return;
}
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
exit(fn(mem, size, &comm_pipes));
@@ -228,9 +232,18 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
* write-faults by directly mapping pages writable.
*/
ret = mprotect(mem, size, PROT_READ);
- ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+
+ ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@@ -248,16 +261,16 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
ret = -EINVAL;
if (!ret) {
- ksft_test_result_pass("No leak from parent into child\n");
+ log_test_result(KSFT_PASS);
} else if (xfail) {
/*
* With hugetlb, some vmsplice() tests are currently expected to
* fail because (a) harder to fix and (b) nobody really cares.
* Flag them as expected failure for now.
*/
- ksft_test_result_xfail("Leak from parent into child\n");
+ log_test_result(KSFT_XFAIL);
} else {
- ksft_test_result_fail("Leak from parent into child\n");
+ log_test_result(KSFT_FAIL);
}
close_comm_pipes:
close_comm_pipes(&comm_pipes);
@@ -306,26 +319,29 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
- ksft_test_result_fail("pipe() failed\n");
+ log_test_result(KSFT_FAIL);
goto free;
}
if (pipe(fds) < 0) {
- ksft_test_result_fail("pipe() failed\n");
+ ksft_perror("pipe() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
if (before_fork) {
transferred = vmsplice(fds[1], &iov, 1, 0);
if (transferred <= 0) {
- ksft_test_result_fail("vmsplice() failed\n");
+ ksft_print_msg("vmsplice() failed\n");
+ log_test_result(KSFT_FAIL);
goto close_pipe;
}
}
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed\n");
+ log_test_result(KSFT_FAIL);
goto close_pipe;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@@ -339,7 +355,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
if (!before_fork) {
transferred = vmsplice(fds[1], &iov, 1, 0);
if (transferred <= 0) {
- ksft_test_result_fail("vmsplice() failed\n");
+ ksft_perror("vmsplice() failed");
+ log_test_result(KSFT_FAIL);
wait(&ret);
goto close_pipe;
}
@@ -348,7 +365,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
;
if (munmap(mem, size) < 0) {
- ksft_test_result_fail("munmap() failed\n");
+ ksft_perror("munmap() failed");
+ log_test_result(KSFT_FAIL);
goto close_pipe;
}
write(comm_pipes.parent_ready[1], "0", 1);
@@ -356,7 +374,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
/* Wait until the child is done writing. */
wait(&ret);
if (!WIFEXITED(ret)) {
- ksft_test_result_fail("wait() failed\n");
+ ksft_perror("wait() failed");
+ log_test_result(KSFT_FAIL);
goto close_pipe;
}
@@ -364,22 +383,23 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
for (total = 0; total < transferred; total += cur) {
cur = read(fds[0], new + total, transferred - total);
if (cur < 0) {
- ksft_test_result_fail("read() failed\n");
+ ksft_perror("read() failed");
+ log_test_result(KSFT_FAIL);
goto close_pipe;
}
}
if (!memcmp(old, new, transferred)) {
- ksft_test_result_pass("No leak from child into parent\n");
+ log_test_result(KSFT_PASS);
} else if (xfail) {
/*
* With hugetlb, some vmsplice() tests are currently expected to
* fail because (a) harder to fix and (b) nobody really cares.
* Flag them as expected failure for now.
*/
- ksft_test_result_xfail("Leak from child into parent\n");
+ log_test_result(KSFT_XFAIL);
} else {
- ksft_test_result_fail("Leak from child into parent\n");
+ log_test_result(KSFT_FAIL);
}
close_pipe:
close(fds[0]);
@@ -416,13 +436,14 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
- ksft_test_result_fail("pipe() failed\n");
+ log_test_result(KSFT_FAIL);
return;
}
file = tmpfile();
if (!file) {
- ksft_test_result_fail("tmpfile() failed\n");
+ ksft_perror("tmpfile() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
fd = fileno(file);
@@ -430,14 +451,16 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
tmp = malloc(size);
if (!tmp) {
- ksft_test_result_fail("malloc() failed\n");
+ ksft_print_msg("malloc() failed\n");
+ log_test_result(KSFT_FAIL);
goto close_file;
}
/* Skip on errors, as we might just lack kernel support. */
ret = io_uring_queue_init(1, &ring, 0);
if (ret < 0) {
- ksft_test_result_skip("io_uring_queue_init() failed\n");
+ ksft_print_msg("io_uring_queue_init() failed\n");
+ log_test_result(KSFT_SKIP);
goto free_tmp;
}
@@ -452,7 +475,8 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
iov.iov_len = size;
ret = io_uring_register_buffers(&ring, &iov, 1);
if (ret) {
- ksft_test_result_skip("io_uring_register_buffers() failed\n");
+ ksft_print_msg("io_uring_register_buffers() failed\n");
+ log_test_result(KSFT_SKIP);
goto queue_exit;
}
@@ -463,7 +487,8 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
*/
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
goto unregister_buffers;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@@ -483,10 +508,17 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
* if the page is mapped R/O vs. R/W).
*/
ret = mprotect(mem, size, PROT_READ);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto unregister_buffers;
+ }
+
clear_softdirty();
- ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+ ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto unregister_buffers;
}
}
@@ -498,25 +530,29 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
memset(mem, 0xff, size);
sqe = io_uring_get_sqe(&ring);
if (!sqe) {
- ksft_test_result_fail("io_uring_get_sqe() failed\n");
+ ksft_print_msg("io_uring_get_sqe() failed\n");
+ log_test_result(KSFT_FAIL);
goto quit_child;
}
io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
ret = io_uring_submit(&ring);
if (ret < 0) {
- ksft_test_result_fail("io_uring_submit() failed\n");
+ ksft_print_msg("io_uring_submit() failed\n");
+ log_test_result(KSFT_FAIL);
goto quit_child;
}
ret = io_uring_wait_cqe(&ring, &cqe);
if (ret < 0) {
- ksft_test_result_fail("io_uring_wait_cqe() failed\n");
+ ksft_print_msg("io_uring_wait_cqe() failed\n");
+ log_test_result(KSFT_FAIL);
goto quit_child;
}
if (cqe->res != size) {
- ksft_test_result_fail("write_fixed failed\n");
+ ksft_print_msg("write_fixed failed\n");
+ log_test_result(KSFT_FAIL);
goto quit_child;
}
io_uring_cqe_seen(&ring, cqe);
@@ -526,15 +562,18 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
while (total < size) {
cur = pread(fd, tmp + total, size - total, total);
if (cur < 0) {
- ksft_test_result_fail("pread() failed\n");
+ ksft_print_msg("pread() failed\n");
+ log_test_result(KSFT_FAIL);
goto quit_child;
}
total += cur;
}
/* Finally, check if we read what we expected. */
- ksft_test_result(!memcmp(mem, tmp, size),
- "Longterm R/W pin is reliable\n");
+ if (!memcmp(mem, tmp, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
quit_child:
if (use_fork) {
@@ -582,19 +621,21 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
int ret;
if (gup_fd < 0) {
- ksft_test_result_skip("gup_test not available\n");
+ ksft_print_msg("gup_test not available\n");
+ log_test_result(KSFT_SKIP);
return;
}
tmp = malloc(size);
if (!tmp) {
- ksft_test_result_fail("malloc() failed\n");
+ ksft_print_msg("malloc() failed\n");
+ log_test_result(KSFT_FAIL);
return;
}
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
- ksft_test_result_fail("pipe() failed\n");
+ log_test_result(KSFT_FAIL);
goto free_tmp;
}
@@ -609,7 +650,8 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
*/
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@@ -646,7 +688,8 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
clear_softdirty();
ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@@ -661,9 +704,11 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
if (ret) {
if (errno == EINVAL)
- ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
+ ret = KSFT_SKIP;
else
- ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
+ ret = KSFT_FAIL;
+ ksft_perror("PIN_LONGTERM_TEST_START failed");
+ log_test_result(ret);
goto wait;
}
@@ -676,22 +721,26 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
*/
tmp_val = (__u64)(uintptr_t)tmp;
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
- if (ret)
- ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
- else
- ksft_test_result(!memcmp(mem, tmp, size),
- "Longterm R/O pin is reliable\n");
+ if (ret) {
+ ksft_perror("PIN_LONGTERM_TEST_READ failed");
+ log_test_result(KSFT_FAIL);
+ } else {
+ if (!memcmp(mem, tmp, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
+ }
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
if (ret)
- ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
+ ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
switch (test) {
case RO_PIN_TEST_SHARED:
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
if (!WIFEXITED(ret))
- ksft_print_msg("[INFO] wait() failed\n");
+ ksft_perror("wait() failed");
break;
default:
break;
@@ -746,14 +795,16 @@ static void do_run_with_base_page(test_fn fn, bool swapout)
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
return;
}
ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
/* Ignore if not around on a kernel. */
if (ret && errno != EINVAL) {
- ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ ksft_perror("MADV_NOHUGEPAGE failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -763,7 +814,8 @@ static void do_run_with_base_page(test_fn fn, bool swapout)
if (swapout) {
madvise(mem, pagesize, MADV_PAGEOUT);
if (!pagemap_is_swapped(pagemap_fd, mem)) {
- ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
+ log_test_result(KSFT_SKIP);
goto munmap;
}
}
@@ -775,13 +827,13 @@ munmap:
static void run_with_base_page(test_fn fn, const char *desc)
{
- ksft_print_msg("[RUN] %s ... with base page\n", desc);
+ log_test_start("%s ... with base page", desc);
do_run_with_base_page(fn, false);
}
static void run_with_base_page_swap(test_fn fn, const char *desc)
{
- ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
+ log_test_start("%s ... with swapped out base page", desc);
do_run_with_base_page(fn, true);
}
@@ -807,7 +859,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
return;
}
@@ -816,7 +869,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
ret = madvise(mem, thpsize, MADV_HUGEPAGE);
if (ret) {
- ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+ ksft_perror("MADV_HUGEPAGE failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -826,7 +880,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
mem[0] = 1;
if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
- ksft_test_result_skip("Did not get a THP populated\n");
+ ksft_print_msg("Did not get a THP populated\n");
+ log_test_result(KSFT_SKIP);
goto munmap;
}
memset(mem, 1, thpsize);
@@ -846,12 +901,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = mprotect(mem + pagesize, pagesize, PROT_READ);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
break;
@@ -863,7 +920,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
if (ret) {
- ksft_test_result_fail("MADV_DONTNEED failed\n");
+ ksft_perror("MADV_DONTNEED failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
size = pagesize;
@@ -877,13 +935,15 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mremap_mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
if (tmp != mremap_mem) {
- ksft_test_result_fail("mremap() failed\n");
+ ksft_perror("mremap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
size = mremap_size;
@@ -896,12 +956,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
if (ret) {
- ksft_test_result_fail("MADV_DONTFORK failed\n");
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
} else if (!ret) {
exit(0);
@@ -910,7 +972,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
/* Allow for sharing all pages again. */
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
if (ret) {
- ksft_test_result_fail("MADV_DOFORK failed\n");
+ ksft_perror("MADV_DOFORK failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
break;
@@ -924,7 +987,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
case THP_RUN_SINGLE_PTE_SWAPOUT:
madvise(mem, size, MADV_PAGEOUT);
if (!range_is_swapped(mem, size)) {
- ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
+ log_test_result(KSFT_SKIP);
goto munmap;
}
break;
@@ -941,56 +1005,56 @@ munmap:
static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
+ log_test_start("%s ... with THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PMD, size);
}
static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
+ log_test_start("%s ... with swapped-out THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}
static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
+ log_test_start("%s ... with PTE-mapped THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PTE, size);
}
static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
+ log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}
static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
+ log_test_start("%s ... with single PTE of THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}
static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
+ log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}
static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
+ log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}
static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
- ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
+ log_test_start("%s ... with partially shared THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}
@@ -1000,14 +1064,15 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
char *mem, *dummy;
- ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
+ log_test_start("%s ... with hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
if (mem == MAP_FAILED) {
- ksft_test_result_skip("need more free huge pages\n");
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
return;
}
@@ -1020,7 +1085,8 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
*/
dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
if (dummy == MAP_FAILED) {
- ksft_test_result_skip("need more free huge pages\n");
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
goto munmap;
}
munmap(dummy, hugetlbsize);
@@ -1226,7 +1292,7 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
- ksft_test_result_fail("pipe() failed\n");
+ log_test_result(KSFT_FAIL);
return;
}
@@ -1236,12 +1302,14 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
*/
ret = mprotect(mem + pagesize, pagesize, PROT_READ);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
if (ret) {
- ksft_test_result_fail("mprotect() failed\n");
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
@@ -1250,8 +1318,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Collapse before actually COW-sharing the page. */
ret = madvise(mem, size, MADV_COLLAPSE);
if (ret) {
- ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
- strerror(errno));
+ ksft_perror("MADV_COLLAPSE failed");
+ log_test_result(KSFT_SKIP);
goto close_comm_pipes;
}
break;
@@ -1262,7 +1330,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Don't COW-share the upper part of the THP. */
ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
if (ret) {
- ksft_test_result_fail("MADV_DONTFORK failed\n");
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@@ -1270,7 +1339,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Don't COW-share the lower part of the THP. */
ret = madvise(mem, size / 2, MADV_DONTFORK);
if (ret) {
- ksft_test_result_fail("MADV_DONTFORK failed\n");
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@@ -1280,7 +1350,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
ret = fork();
if (ret < 0) {
- ksft_test_result_fail("fork() failed\n");
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
switch (test) {
@@ -1314,7 +1385,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
*/
ret = madvise(mem, size, MADV_DOFORK);
if (ret) {
- ksft_test_result_fail("MADV_DOFORK failed\n");
+ ksft_perror("MADV_DOFORK failed");
+ log_test_result(KSFT_FAIL);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@@ -1324,8 +1396,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Collapse before anyone modified the COW-shared page. */
ret = madvise(mem, size, MADV_COLLAPSE);
if (ret) {
- ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
- strerror(errno));
+ ksft_perror("MADV_COLLAPSE failed");
+ log_test_result(KSFT_SKIP);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@@ -1345,7 +1417,10 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
else
ret = -EINVAL;
- ksft_test_result(!ret, "No leak from parent into child\n");
+ if (!ret)
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
close_comm_pipes:
close_comm_pipes(&comm_pipes);
}
@@ -1430,7 +1505,7 @@ static void run_anon_thp_test_cases(void)
for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
struct test_case const *test_case = &anon_thp_test_cases[i];
- ksft_print_msg("[RUN] %s\n", test_case->desc);
+ log_test_start("%s", test_case->desc);
do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
}
}
@@ -1453,8 +1528,10 @@ static void test_cow(char *mem, const char *smem, size_t size)
memset(mem, 0xff, size);
/* See if we still read the old values via the other mapping. */
- ksft_test_result(!memcmp(smem, old, size),
- "Other mapping not modified\n");
+ if (!memcmp(smem, old, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
free(old);
}
@@ -1472,18 +1549,20 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
char *mem, *smem, tmp;
- ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
+ log_test_start("%s ... with shared zeropage", desc);
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
if (mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
return;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
if (smem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1504,10 +1583,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
size_t mmap_size;
int ret;
- ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
+ log_test_start("%s ... with huge zeropage", desc);
if (!has_huge_zeropage) {
- ksft_test_result_skip("Huge zeropage not enabled\n");
+ ksft_print_msg("Huge zeropage not enabled\n");
+ log_test_result(KSFT_SKIP);
return;
}
@@ -1516,13 +1596,15 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
return;
}
mmap_smem = mmap(NULL, mmap_size, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_smem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1531,9 +1613,15 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
+ if (ret != 0) {
+ ksft_perror("madvise()");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
- if (ret) {
- ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+ if (ret != 0) {
+ ksft_perror("madvise()");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1562,29 +1650,33 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc)
char *mem, *smem, tmp;
int fd;
- ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+ log_test_start("%s ... with memfd", desc);
fd = memfd_create("test", 0);
if (fd < 0) {
- ksft_test_result_fail("memfd_create() failed\n");
+ ksft_perror("memfd_create() failed");
+ log_test_result(KSFT_FAIL);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, pagesize)) {
- ksft_test_result_fail("fallocate() failed\n");
+ ksft_perror("fallocate() failed");
+ log_test_result(KSFT_FAIL);
goto close;
}
/* Create a private mapping of the memfd. */
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto close;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1607,35 +1699,40 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
FILE *file;
int fd;
- ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+ log_test_start("%s ... with tmpfile", desc);
file = tmpfile();
if (!file) {
- ksft_test_result_fail("tmpfile() failed\n");
+ ksft_perror("tmpfile() failed");
+ log_test_result(KSFT_FAIL);
return;
}
fd = fileno(file);
if (fd < 0) {
- ksft_test_result_skip("fileno() failed\n");
+ ksft_perror("fileno() failed");
+ log_test_result(KSFT_SKIP);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, pagesize)) {
- ksft_test_result_fail("fallocate() failed\n");
+ ksft_perror("fallocate() failed");
+ log_test_result(KSFT_FAIL);
goto close;
}
/* Create a private mapping of the memfd. */
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto close;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1659,20 +1756,22 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
char *mem, *smem, tmp;
int fd;
- ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+ log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
fd = memfd_create("test", flags);
if (fd < 0) {
- ksft_test_result_skip("memfd_create() failed\n");
+ ksft_perror("memfd_create() failed");
+ log_test_result(KSFT_SKIP);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, hugetlbsize)) {
- ksft_test_result_skip("need more free huge pages\n");
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
goto close;
}
@@ -1680,12 +1779,14 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
0);
if (mem == MAP_FAILED) {
- ksft_test_result_skip("need more free huge pages\n");
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
goto close;
}
smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
goto munmap;
}
@@ -1771,7 +1872,6 @@ static int tests_per_non_anon_test_case(void)
int main(int argc, char **argv)
{
- int err;
struct thp_settings default_settings;
ksft_print_header();
@@ -1811,9 +1911,5 @@ int main(int argc, char **argv)
thp_restore_settings();
}
- err = ksft_get_fail_cnt();
- if (err)
- ksft_exit_fail_msg("%d out of %d tests failed\n",
- err, ksft_test_num());
- ksft_exit_pass();
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
index eba43ead13ae..93af3d3760f9 100644
--- a/tools/testing/selftests/mm/guard-regions.c
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -8,6 +8,7 @@
#include <fcntl.h>
#include <linux/limits.h>
#include <linux/userfaultfd.h>
+#include <linux/fs.h>
#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>
@@ -1452,8 +1453,21 @@ TEST_F(guard_regions, uffd)
/* Set up uffd. */
uffd = userfaultfd(0);
- if (uffd == -1 && errno == EPERM)
- ksft_exit_skip("No userfaultfd permissions, try running as root.\n");
+ if (uffd == -1) {
+ switch (errno) {
+ case EPERM:
+ SKIP(return, "No userfaultfd permissions, try running as root.");
+ break;
+ case ENOSYS:
+ SKIP(return, "userfaultfd is not supported/not enabled.");
+ break;
+ default:
+ ksft_exit_fail_msg("userfaultfd failed with %s\n",
+ strerror(errno));
+ break;
+ }
+ }
+
ASSERT_NE(uffd, -1);
ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0);
@@ -2075,4 +2089,60 @@ TEST_F(guard_regions, pagemap)
ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
}
+/*
+ * Assert that PAGEMAP_SCAN correctly reports guard region ranges.
+ */
+TEST_F(guard_regions, pagemap_scan)
+{
+ const unsigned long page_size = self->page_size;
+ struct page_region pm_regs[10];
+ struct pm_scan_arg pm_scan_args = {
+ .size = sizeof(struct pm_scan_arg),
+ .category_anyof_mask = PAGE_IS_GUARD,
+ .return_mask = PAGE_IS_GUARD,
+ .vec = (long)&pm_regs,
+ .vec_len = ARRAY_SIZE(pm_regs),
+ };
+ int proc_fd, i;
+ char *ptr;
+
+ proc_fd = open("/proc/self/pagemap", O_RDONLY);
+ ASSERT_NE(proc_fd, -1);
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ pm_scan_args.start = (long)ptr;
+ pm_scan_args.end = (long)ptr + 10 * page_size;
+ ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 0);
+ ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size);
+
+ /* Install a guard region in every other page. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(syscall(__NR_madvise, ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /*
+ * Assert ioctl() returns the count of located regions, where each
+ * region spans every other page within the range of 10 pages.
+ */
+ ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 5);
+ ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size);
+
+ /* Re-read from pagemap, and assert guard regions are detected. */
+ for (i = 0; i < 5; i++) {
+ long ptr_p = (long)&ptr[2 * i * page_size];
+
+ ASSERT_EQ(pm_regs[i].start, ptr_p);
+ ASSERT_EQ(pm_regs[i].end, ptr_p + page_size);
+ ASSERT_EQ(pm_regs[i].categories, PAGE_IS_GUARD);
+ }
+
+ ASSERT_EQ(close(proc_fd), 0);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c
index 21595b20bbc3..29047d2e0c49 100644
--- a/tools/testing/selftests/mm/gup_longterm.c
+++ b/tools/testing/selftests/mm/gup_longterm.c
@@ -93,33 +93,48 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
__fsword_t fs_type = get_fs_type(fd);
bool should_work;
char *mem;
+ int result = KSFT_PASS;
int ret;
+ if (fd < 0) {
+ result = KSFT_FAIL;
+ goto report;
+ }
+
if (ftruncate(fd, size)) {
if (errno == ENOENT) {
skip_test_dodgy_fs("ftruncate()");
} else {
- ksft_test_result_fail("ftruncate() failed (%s)\n", strerror(errno));
+ ksft_print_msg("ftruncate() failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
+ goto report;
}
return;
}
if (fallocate(fd, 0, 0, size)) {
- if (size == pagesize)
- ksft_test_result_fail("fallocate() failed (%s)\n", strerror(errno));
- else
- ksft_test_result_skip("need more free huge pages\n");
- return;
+ if (size == pagesize) {
+ ksft_print_msg("fallocate() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ ksft_print_msg("need more free huge pages\n");
+ result = KSFT_SKIP;
+ }
+ goto report;
}
mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
shared ? MAP_SHARED : MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
- if (size == pagesize || shared)
- ksft_test_result_fail("mmap() failed (%s)\n", strerror(errno));
- else
- ksft_test_result_skip("need more free huge pages\n");
- return;
+ if (size == pagesize || shared) {
+ ksft_print_msg("mmap() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ ksft_print_msg("need more free huge pages\n");
+ result = KSFT_SKIP;
+ }
+ goto report;
}
/* Fault in the page such that GUP-fast can pin it directly. */
@@ -134,7 +149,8 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
*/
ret = mprotect(mem, size, PROT_READ);
if (ret) {
- ksft_test_result_fail("mprotect() failed (%s)\n", strerror(errno));
+ ksft_print_msg("mprotect() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
goto munmap;
}
/* FALLTHROUGH */
@@ -147,18 +163,20 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
type == TEST_TYPE_RW_FAST;
if (gup_fd < 0) {
- ksft_test_result_skip("gup_test not available\n");
+ ksft_print_msg("gup_test not available\n");
+ result = KSFT_SKIP;
break;
}
if (rw && shared && fs_is_unknown(fs_type)) {
- ksft_test_result_skip("Unknown filesystem\n");
+ ksft_print_msg("Unknown filesystem\n");
+ result = KSFT_SKIP;
return;
}
/*
* R/O pinning or pinning in a private mapping is always
* expected to work. Otherwise, we expect long-term R/W pinning
- * to only succeed for special fielesystems.
+ * to only succeed for special filesystems.
*/
should_work = !shared || !rw ||
fs_supports_writable_longterm_pinning(fs_type);
@@ -169,14 +187,19 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
args.flags |= rw ? PIN_LONGTERM_TEST_FLAG_USE_WRITE : 0;
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
if (ret && errno == EINVAL) {
- ksft_test_result_skip("PIN_LONGTERM_TEST_START failed (EINVAL)n");
+ ksft_print_msg("PIN_LONGTERM_TEST_START failed (EINVAL)n");
+ result = KSFT_SKIP;
break;
} else if (ret && errno == EFAULT) {
- ksft_test_result(!should_work, "Should have failed\n");
+ if (should_work)
+ result = KSFT_FAIL;
+ else
+ result = KSFT_PASS;
break;
} else if (ret) {
- ksft_test_result_fail("PIN_LONGTERM_TEST_START failed (%s)\n",
- strerror(errno));
+ ksft_print_msg("PIN_LONGTERM_TEST_START failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
break;
}
@@ -189,7 +212,10 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
* some previously unsupported filesystems, we might want to
* perform some additional tests for possible data corruptions.
*/
- ksft_test_result(should_work, "Should have worked\n");
+ if (should_work)
+ result = KSFT_PASS;
+ else
+ result = KSFT_FAIL;
break;
}
#ifdef LOCAL_CONFIG_HAVE_LIBURING
@@ -199,8 +225,9 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* io_uring always pins pages writable. */
if (shared && fs_is_unknown(fs_type)) {
- ksft_test_result_skip("Unknown filesystem\n");
- return;
+ ksft_print_msg("Unknown filesystem\n");
+ result = KSFT_SKIP;
+ goto report;
}
should_work = !shared ||
fs_supports_writable_longterm_pinning(fs_type);
@@ -208,8 +235,9 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* Skip on errors, as we might just lack kernel support. */
ret = io_uring_queue_init(1, &ring, 0);
if (ret < 0) {
- ksft_test_result_skip("io_uring_queue_init() failed (%s)\n",
- strerror(-ret));
+ ksft_print_msg("io_uring_queue_init() failed (%s)\n",
+ strerror(-ret));
+ result = KSFT_SKIP;
break;
}
/*
@@ -222,17 +250,28 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* Only new kernels return EFAULT. */
if (ret && (errno == ENOSPC || errno == EOPNOTSUPP ||
errno == EFAULT)) {
- ksft_test_result(!should_work, "Should have failed (%s)\n",
- strerror(errno));
+ if (should_work) {
+ ksft_print_msg("Should have failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ result = KSFT_PASS;
+ }
} else if (ret) {
/*
* We might just lack support or have insufficient
* MEMLOCK limits.
*/
- ksft_test_result_skip("io_uring_register_buffers() failed (%s)\n",
- strerror(-ret));
+ ksft_print_msg("io_uring_register_buffers() failed (%s)\n",
+ strerror(-ret));
+ result = KSFT_SKIP;
} else {
- ksft_test_result(should_work, "Should have worked\n");
+ if (should_work) {
+ result = KSFT_PASS;
+ } else {
+ ksft_print_msg("Should have worked\n");
+ result = KSFT_FAIL;
+ }
io_uring_unregister_buffers(&ring);
}
@@ -246,6 +285,8 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
munmap:
munmap(mem, size);
+report:
+ log_test_result(result);
}
typedef void (*test_fn)(int fd, size_t size);
@@ -254,11 +295,12 @@ static void run_with_memfd(test_fn fn, const char *desc)
{
int fd;
- ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+ log_test_start("%s ... with memfd", desc);
fd = memfd_create("test", 0);
if (fd < 0) {
- ksft_test_result_fail("memfd_create() failed (%s)\n", strerror(errno));
+ ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+ log_test_result(KSFT_SKIP);
return;
}
@@ -271,23 +313,23 @@ static void run_with_tmpfile(test_fn fn, const char *desc)
FILE *file;
int fd;
- ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+ log_test_start("%s ... with tmpfile", desc);
file = tmpfile();
if (!file) {
- ksft_test_result_fail("tmpfile() failed (%s)\n", strerror(errno));
- return;
- }
-
- fd = fileno(file);
- if (fd < 0) {
- ksft_test_result_fail("fileno() failed (%s)\n", strerror(errno));
- goto close;
+ ksft_print_msg("tmpfile() failed (%s)\n", strerror(errno));
+ fd = -1;
+ } else {
+ fd = fileno(file);
+ if (fd < 0) {
+ ksft_print_msg("fileno() failed (%s)\n", strerror(errno));
+ }
}
fn(fd, pagesize);
-close:
- fclose(file);
+
+ if (file)
+ fclose(file);
}
static void run_with_local_tmpfile(test_fn fn, const char *desc)
@@ -295,22 +337,22 @@ static void run_with_local_tmpfile(test_fn fn, const char *desc)
char filename[] = __FILE__"_tmpfile_XXXXXX";
int fd;
- ksft_print_msg("[RUN] %s ... with local tmpfile\n", desc);
+ log_test_start("%s ... with local tmpfile", desc);
fd = mkstemp(filename);
- if (fd < 0) {
- ksft_test_result_fail("mkstemp() failed (%s)\n", strerror(errno));
- return;
- }
+ if (fd < 0)
+ ksft_print_msg("mkstemp() failed (%s)\n", strerror(errno));
if (unlink(filename)) {
- ksft_test_result_fail("unlink() failed (%s)\n", strerror(errno));
- goto close;
+ ksft_print_msg("unlink() failed (%s)\n", strerror(errno));
+ close(fd);
+ fd = -1;
}
fn(fd, pagesize);
-close:
- close(fd);
+
+ if (fd >= 0)
+ close(fd);
}
static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
@@ -319,14 +361,15 @@ static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
int flags = MFD_HUGETLB;
int fd;
- ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+ log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
fd = memfd_create("test", flags);
if (fd < 0) {
- ksft_test_result_skip("memfd_create() failed (%s)\n", strerror(errno));
+ ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+ log_test_result(KSFT_SKIP);
return;
}
@@ -455,7 +498,7 @@ static int tests_per_test_case(void)
int main(int argc, char **argv)
{
- int i, err;
+ int i;
pagesize = getpagesize();
nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
@@ -469,9 +512,5 @@ int main(int argc, char **argv)
for (i = 0; i < ARRAY_SIZE(test_cases); i++)
run_test_case(&test_cases[i]);
- err = ksft_get_fail_cnt();
- if (err)
- ksft_exit_fail_msg("%d out of %d tests failed\n",
- err, ksft_test_num());
- ksft_exit_pass();
+ ksft_finished();
}
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 0b0d4ba1af27..0dd31892ff67 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -36,7 +36,7 @@ else
do_umount=1
fi
fi
-MNT='/mnt/huge/'
+MNT='/mnt/huge'
function get_machine_hugepage_size() {
hpz=$(grep -i hugepagesize /proc/meminfo)
@@ -56,10 +56,45 @@ function cleanup() {
rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
rmdir "$CGROUP_ROOT"/a 2>/dev/null
rmdir "$CGROUP_ROOT"/test1 2>/dev/null
- echo 0 >/proc/sys/vm/nr_hugepages
+ echo $nr_hugepgs >/proc/sys/vm/nr_hugepages
set -e
}
+function assert_with_retry() {
+ local actual_path="$1"
+ local expected="$2"
+ local tolerance=$((7 * 1024 * 1024))
+ local timeout=20
+ local interval=1
+ local start_time
+ local now
+ local elapsed
+ local actual
+
+ start_time=$(date +%s)
+
+ while true; do
+ actual="$(cat "$actual_path")"
+
+ if [[ $actual -ge $(($expected - $tolerance)) ]] &&
+ [[ $actual -le $(($expected + $tolerance)) ]]; then
+ return 0
+ fi
+
+ now=$(date +%s)
+ elapsed=$((now - start_time))
+
+ if [[ $elapsed -ge $timeout ]]; then
+ echo "actual = $((${actual%% *} / 1024 / 1024)) MB"
+ echo "expected = $((${expected%% *} / 1024 / 1024)) MB"
+ cleanup
+ exit 1
+ fi
+
+ sleep $interval
+ done
+}
+
function assert_state() {
local expected_a="$1"
local expected_a_hugetlb="$2"
@@ -70,58 +105,13 @@ function assert_state() {
expected_b="$3"
expected_b_hugetlb="$4"
fi
- local tolerance=$((5 * 1024 * 1024))
-
- local actual_a
- actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
- if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
- [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
- echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
- echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
- echo fail
-
- cleanup
- exit 1
- fi
-
- local actual_a_hugetlb
- actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
- if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
- [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
- echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
- echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
- echo fail
-
- cleanup
- exit 1
- fi
-
- if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
- return
- fi
-
- local actual_b
- actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
- if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
- [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
- echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
- echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
- echo fail
-
- cleanup
- exit 1
- fi
- local actual_b_hugetlb
- actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
- if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
- [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
- echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
- echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
- echo fail
+ assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
+ assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb"
- cleanup
- exit 1
+ if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then
+ assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
+ assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb"
fi
}
@@ -175,7 +165,6 @@ size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
cleanup
echo
-echo
echo Test charge, rmdir, uncharge
setup
echo mkdir
@@ -195,7 +184,6 @@ cleanup
echo done
echo
-echo
if [[ ! $cgroup2 ]]; then
echo "Test parent and child hugetlb usage"
setup
@@ -212,7 +200,6 @@ if [[ ! $cgroup2 ]]; then
assert_state 0 $(($size * 2)) 0 $size
rmdir "$CGROUP_ROOT"/a/b
- sleep 5
echo Assert memory reparent correctly.
assert_state 0 $(($size * 2))
@@ -225,7 +212,6 @@ if [[ ! $cgroup2 ]]; then
fi
echo
-echo
echo "Test child only hugetlb usage"
echo setup
setup
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index dcdd5bb20f3d..e80deac1436b 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -58,40 +58,12 @@ int debug;
static int ksm_write_sysfs(const char *file_path, unsigned long val)
{
- FILE *f = fopen(file_path, "w");
-
- if (!f) {
- fprintf(stderr, "f %s\n", file_path);
- perror("fopen");
- return 1;
- }
- if (fprintf(f, "%lu", val) < 0) {
- perror("fprintf");
- fclose(f);
- return 1;
- }
- fclose(f);
-
- return 0;
+ return write_sysfs(file_path, val);
}
static int ksm_read_sysfs(const char *file_path, unsigned long *val)
{
- FILE *f = fopen(file_path, "r");
-
- if (!f) {
- fprintf(stderr, "f %s\n", file_path);
- perror("fopen");
- return 1;
- }
- if (fscanf(f, "%lu", val) != 1) {
- perror("fscanf");
- fclose(f);
- return 1;
- }
- fclose(f);
-
- return 0;
+ return read_sysfs(file_path, val);
}
static void ksm_print_sysfs(void)
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
index ef7d911da13e..b6fabd5c27ed 100644
--- a/tools/testing/selftests/mm/madv_populate.c
+++ b/tools/testing/selftests/mm/madv_populate.c
@@ -172,12 +172,12 @@ static void test_populate_read(void)
if (addr == MAP_FAILED)
ksft_exit_fail_msg("mmap failed\n");
ksft_test_result(range_is_not_populated(addr, SIZE),
- "range initially not populated\n");
+ "read range initially not populated\n");
ret = madvise(addr, SIZE, MADV_POPULATE_READ);
ksft_test_result(!ret, "MADV_POPULATE_READ\n");
ksft_test_result(range_is_populated(addr, SIZE),
- "range is populated\n");
+ "read range is populated\n");
munmap(addr, SIZE);
}
@@ -194,12 +194,12 @@ static void test_populate_write(void)
if (addr == MAP_FAILED)
ksft_exit_fail_msg("mmap failed\n");
ksft_test_result(range_is_not_populated(addr, SIZE),
- "range initially not populated\n");
+ "write range initially not populated\n");
ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
ksft_test_result(range_is_populated(addr, SIZE),
- "range is populated\n");
+ "write range is populated\n");
munmap(addr, SIZE);
}
@@ -247,19 +247,19 @@ static void test_softdirty(void)
/* Clear any softdirty bits. */
clear_softdirty();
ksft_test_result(range_is_not_softdirty(addr, SIZE),
- "range is not softdirty\n");
+ "cleared range is not softdirty\n");
/* Populating READ should set softdirty. */
ret = madvise(addr, SIZE, MADV_POPULATE_READ);
- ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+ ksft_test_result(!ret, "softdirty MADV_POPULATE_READ\n");
ksft_test_result(range_is_not_softdirty(addr, SIZE),
- "range is not softdirty\n");
+ "range is not softdirty after MADV_POPULATE_READ\n");
/* Populating WRITE should set softdirty. */
ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
- ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+ ksft_test_result(!ret, "softdirty MADV_POPULATE_WRITE\n");
ksft_test_result(range_is_softdirty(addr, SIZE),
- "range is softdirty\n");
+ "range is softdirty after MADV_POPULATE_WRITE \n");
munmap(addr, SIZE);
}
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
index d53de2486080..1e9980b8993c 100644
--- a/tools/testing/selftests/mm/map_fixed_noreplace.c
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
@@ -96,7 +96,7 @@ int main(void)
ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n");
}
ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
- ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n");
+ ksft_test_result_pass("Second mmap() 5*PAGE_SIZE at base\n");
/*
* Second mapping contained within first:
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
new file mode 100644
index 000000000000..cc26480098ae
--- /dev/null
+++ b/tools/testing/selftests/mm/merge.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <linux/perf_event.h>
+#include "vm_util.h"
+
+FIXTURE(merge)
+{
+ unsigned int page_size;
+ char *carveout;
+ struct procmap_fd procmap;
+};
+
+FIXTURE_SETUP(merge)
+{
+ self->page_size = psize();
+ /* Carve out PROT_NONE region to map over. */
+ self->carveout = mmap(NULL, 12 * self->page_size, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(self->carveout, MAP_FAILED);
+ /* Setup PROCMAP_QUERY interface. */
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+}
+
+FIXTURE_TEARDOWN(merge)
+{
+ ASSERT_EQ(munmap(self->carveout, 12 * self->page_size), 0);
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+}
+
+TEST_F(merge, mprotect_unfaulted_left)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * Map 10 pages of R/W memory within. MAP_NORESERVE so we don't hit
+ * merge failure due to lack of VM_ACCOUNT flag by mistake.
+ *
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first 5 pages read-only, splitting the VMA:
+ *
+ * RO RW
+ * |-----------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the last 5 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW
+ * |-----------|-----------|
+ * | unfaulted | faulted |
+ * |-----------|-----------|
+ */
+ ptr[5 * page_size] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge (though for
+ * ~15 years we did not! :):
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the last 5 pages read-only, splitting the VMA:
+ *
+ * RW RO
+ * |-----------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the first 5 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RW RO
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ptr[0] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_both)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first and last 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | unfaulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the middle 3 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | faulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ptr[3 * page_size] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, mprotect_faulted_left_unfaulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the last 3 pages read-only, splitting the VMA:
+ *
+ * RW RO
+ * |-----------------------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------------------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the first 6 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RW RO
+ * |-----------------------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------------------|-----------|
+ */
+ ptr[0] = 'x';
+ /*
+ * Now make the first 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | faulted | faulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_left_faulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first 3 pages read-only, splitting the VMA:
+ *
+ * RO RW
+ * |-----------|-----------------------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------------------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the last 6 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW
+ * |-----------|-----------------------|
+ * | unfaulted | faulted |
+ * |-----------|-----------------------|
+ */
+ ptr[3 * page_size] = 'x';
+ /*
+ * Now make the last 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | faulted | faulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, forked_target_vma)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ pid_t pid;
+ char *ptr, *ptr2;
+ int i;
+
+ /*
+ * |-----------|
+ * | unfaulted |
+ * |-----------|
+ */
+ ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Fault in process.
+ *
+ * |-----------|
+ * | faulted |
+ * |-----------|
+ */
+ ptr[0] = 'x';
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+
+ if (pid != 0) {
+ wait(NULL);
+ return;
+ }
+
+ /* Child process below: */
+
+ /* Reopen for child. */
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+
+ /* unCOWing everything does not cause the AVC to go away. */
+ for (i = 0; i < 5 * page_size; i += page_size)
+ ptr[i] = 'x';
+
+ /*
+ * Map in adjacent VMA in child.
+ *
+ * forked
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ */
+ ptr2 = mmap(&ptr[5 * page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 5 * page_size);
+}
+
+TEST_F(merge, forked_source_vma)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ pid_t pid;
+ char *ptr, *ptr2;
+ int i;
+
+ /*
+ * |-----------|------------|
+ * | unfaulted | <unmapped> |
+ * |-----------|------------|
+ */
+ ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Fault in process.
+ *
+ * |-----------|------------|
+ * | faulted | <unmapped> |
+ * |-----------|------------|
+ */
+ ptr[0] = 'x';
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+
+ if (pid != 0) {
+ wait(NULL);
+ return;
+ }
+
+ /* Child process below: */
+
+ /* Reopen for child. */
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+
+ /* unCOWing everything does not cause the AVC to go away. */
+ for (i = 0; i < 5 * page_size; i += page_size)
+ ptr[i] = 'x';
+
+ /*
+ * Map in adjacent VMA in child, ptr2 after ptr, but incompatible.
+ *
+ * forked RW RWX
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ */
+ ptr2 = mmap(&carveout[6 * page_size], 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr2));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
+
+ /*
+ * Now mprotect forked region to RWX so it becomes the source for the
+ * merge to unfaulted region:
+ *
+ * forked RWX RWX
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ *
+ * This should NOT result in a merge, as ptr was forked.
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC), 0);
+ /* Again, make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr2));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
+}
+
+TEST_F(merge, handle_uprobe_upon_merged_vma)
+{
+ const size_t attr_sz = sizeof(struct perf_event_attr);
+ unsigned int page_size = self->page_size;
+ const char *probe_file = "./foo";
+ char *carveout = self->carveout;
+ struct perf_event_attr attr;
+ unsigned long type;
+ void *ptr1, *ptr2;
+ int fd;
+
+ fd = open(probe_file, O_RDWR|O_CREAT, 0600);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_EQ(ftruncate(fd, page_size), 0);
+ if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) {
+ SKIP(goto out, "Failed to read uprobe sysfs file, skipping");
+ }
+
+ memset(&attr, 0, attr_sz);
+ attr.size = attr_sz;
+ attr.type = type;
+ attr.config1 = (__u64)(long)probe_file;
+ attr.config2 = 0x0;
+
+ ASSERT_GE(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0), 0);
+
+ ptr1 = mmap(&carveout[page_size], 10 * page_size, PROT_EXEC,
+ MAP_PRIVATE | MAP_FIXED, fd, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+
+ ptr2 = mremap(ptr1, page_size, 2 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1 + 5 * page_size);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ ASSERT_NE(mremap(ptr2, page_size, page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
+
+out:
+ close(fd);
+ remove(probe_file);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 7f0d50fa361d..3e90ff37e336 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -196,7 +196,7 @@ static void test_mlock_lock(void)
ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
- ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__);
+ ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
munmap(map, 2 * page_size);
}
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index 57b4bba2b45f..b07acc86f4f0 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -34,7 +34,7 @@
#define PAGEMAP "/proc/self/pagemap"
int pagemap_fd;
int uffd;
-unsigned int page_size;
+unsigned long page_size;
unsigned int hpage_size;
const char *progname;
@@ -112,7 +112,7 @@ int init_uffd(void)
return 0;
}
-int wp_init(void *lpBaseAddress, int dwRegionSize)
+int wp_init(void *lpBaseAddress, long dwRegionSize)
{
struct uffdio_register uffdio_register;
struct uffdio_writeprotect wp;
@@ -136,7 +136,7 @@ int wp_init(void *lpBaseAddress, int dwRegionSize)
return 0;
}
-int wp_free(void *lpBaseAddress, int dwRegionSize)
+int wp_free(void *lpBaseAddress, long dwRegionSize)
{
struct uffdio_register uffdio_register;
@@ -184,7 +184,7 @@ void *gethugetlb_mem(int size, int *shmid)
int userfaultfd_tests(void)
{
- int mem_size, vec_size, written, num_pages = 16;
+ long mem_size, vec_size, written, num_pages = 16;
char *mem, *vec;
mem_size = num_pages * page_size;
@@ -213,7 +213,7 @@ int userfaultfd_tests(void)
written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
if (written < 0)
- ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+ ksft_exit_fail_msg("error %ld %d %s\n", written, errno, strerror(errno));
ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__);
@@ -995,7 +995,7 @@ int unmapped_region_tests(void)
{
void *start = (void *)0x10000000;
int written, len = 0x00040000;
- int vec_size = len / page_size;
+ long vec_size = len / page_size;
struct page_region *vec = malloc(sizeof(struct page_region) * vec_size);
/* 1. Get written pages */
@@ -1051,7 +1051,7 @@ static void test_simple(void)
int sanity_tests(void)
{
unsigned long long mem_size, vec_size;
- int ret, fd, i, buf_size;
+ long ret, fd, i, buf_size;
struct page_region *vec;
char *mem, *fmem;
struct stat sbuf;
@@ -1160,7 +1160,7 @@ int sanity_tests(void)
ret = stat(progname, &sbuf);
if (ret < 0)
- ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (fmem == MAP_FAILED)
diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c
new file mode 100644
index 000000000000..866ac023baf5
--- /dev/null
+++ b/tools/testing/selftests/mm/pfnmap.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Basic VM_PFNMAP tests relying on mmap() of '/dev/mem'
+ *
+ * Copyright 2025, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "vm_util.h"
+
+static sigjmp_buf sigjmp_buf_env;
+
+static void signal_handler(int sig)
+{
+ siglongjmp(sigjmp_buf_env, -EFAULT);
+}
+
+static int test_read_access(char *addr, size_t size, size_t pagesize)
+{
+ size_t offs;
+ int ret;
+
+ if (signal(SIGSEGV, signal_handler) == SIG_ERR)
+ return -EINVAL;
+
+ ret = sigsetjmp(sigjmp_buf_env, 1);
+ if (!ret) {
+ for (offs = 0; offs < size; offs += pagesize)
+ /* Force a read that the compiler cannot optimize out. */
+ *((volatile char *)(addr + offs));
+ }
+ if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int find_ram_target(off_t *phys_addr,
+ unsigned long long pagesize)
+{
+ unsigned long long start, end;
+ char line[80], *end_ptr;
+ FILE *file;
+
+ /* Search /proc/iomem for the first suitable "System RAM" range. */
+ file = fopen("/proc/iomem", "r");
+ if (!file)
+ return -errno;
+
+ while (fgets(line, sizeof(line), file)) {
+ /* Ignore any child nodes. */
+ if (!isalnum(line[0]))
+ continue;
+
+ if (!strstr(line, "System RAM\n"))
+ continue;
+
+ start = strtoull(line, &end_ptr, 16);
+ /* Skip over the "-" */
+ end_ptr++;
+ /* Make end "exclusive". */
+ end = strtoull(end_ptr, NULL, 16) + 1;
+
+ /* Actual addresses are not exported */
+ if (!start && !end)
+ break;
+
+ /* We need full pages. */
+ start = (start + pagesize - 1) & ~(pagesize - 1);
+ end &= ~(pagesize - 1);
+
+ if (start != (off_t)start)
+ break;
+
+ /* We need two pages. */
+ if (end > start + 2 * pagesize) {
+ fclose(file);
+ *phys_addr = start;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+FIXTURE(pfnmap)
+{
+ off_t phys_addr;
+ size_t pagesize;
+ int dev_mem_fd;
+ char *addr1;
+ size_t size1;
+ char *addr2;
+ size_t size2;
+};
+
+FIXTURE_SETUP(pfnmap)
+{
+ self->pagesize = getpagesize();
+
+ /* We'll require two physical pages throughout our tests ... */
+ if (find_ram_target(&self->phys_addr, self->pagesize))
+ SKIP(return, "Cannot find ram target in '/proc/iomem'\n");
+
+ self->dev_mem_fd = open("/dev/mem", O_RDONLY);
+ if (self->dev_mem_fd < 0)
+ SKIP(return, "Cannot open '/dev/mem'\n");
+
+ self->size1 = self->pagesize * 2;
+ self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED,
+ self->dev_mem_fd, self->phys_addr);
+ if (self->addr1 == MAP_FAILED)
+ SKIP(return, "Cannot mmap '/dev/mem'\n");
+
+ /* ... and want to be able to read from them. */
+ if (test_read_access(self->addr1, self->size1, self->pagesize))
+ SKIP(return, "Cannot read-access mmap'ed '/dev/mem'\n");
+
+ self->size2 = 0;
+ self->addr2 = MAP_FAILED;
+}
+
+FIXTURE_TEARDOWN(pfnmap)
+{
+ if (self->addr2 != MAP_FAILED)
+ munmap(self->addr2, self->size2);
+ if (self->addr1 != MAP_FAILED)
+ munmap(self->addr1, self->size1);
+ if (self->dev_mem_fd >= 0)
+ close(self->dev_mem_fd);
+}
+
+TEST_F(pfnmap, madvise_disallowed)
+{
+ int advices[] = {
+ MADV_DONTNEED,
+ MADV_DONTNEED_LOCKED,
+ MADV_FREE,
+ MADV_WIPEONFORK,
+ MADV_COLD,
+ MADV_PAGEOUT,
+ MADV_POPULATE_READ,
+ MADV_POPULATE_WRITE,
+ };
+ int i;
+
+ /* All these advices must be rejected. */
+ for (i = 0; i < ARRAY_SIZE(advices); i++) {
+ EXPECT_LT(madvise(self->addr1, self->pagesize, advices[i]), 0);
+ EXPECT_EQ(errno, EINVAL);
+ }
+}
+
+TEST_F(pfnmap, munmap_split)
+{
+ /*
+ * Unmap the first page. This munmap() call is not really expected to
+ * fail, but we might be able to trigger other internal issues.
+ */
+ ASSERT_EQ(munmap(self->addr1, self->pagesize), 0);
+
+ /*
+ * Remap the first page while the second page is still mapped. This
+ * makes sure that any PAT tracking on x86 will allow for mmap()'ing
+ * a page again while some parts of the first mmap() are still
+ * around.
+ */
+ self->size2 = self->pagesize;
+ self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED,
+ self->dev_mem_fd, self->phys_addr);
+ ASSERT_NE(self->addr2, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_fixed)
+{
+ char *ret;
+
+ /* Reserve a destination area. */
+ self->size2 = self->size1;
+ self->addr2 = mmap(NULL, self->size2, PROT_READ, MAP_ANON | MAP_PRIVATE,
+ -1, 0);
+ ASSERT_NE(self->addr2, MAP_FAILED);
+
+ /* mremap() over our destination. */
+ ret = mremap(self->addr1, self->size1, self->size2,
+ MREMAP_FIXED | MREMAP_MAYMOVE, self->addr2);
+ ASSERT_NE(ret, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_shrink)
+{
+ char *ret;
+
+ /* Shrinking is expected to work. */
+ ret = mremap(self->addr1, self->size1, self->size1 - self->pagesize, 0);
+ ASSERT_NE(ret, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_expand)
+{
+ /*
+ * Growing is not expected to work, and getting it right would
+ * be challenging. So this test primarily serves as an early warning
+ * that something that probably should never work suddenly works.
+ */
+ self->size2 = self->size1 + self->pagesize;
+ self->addr2 = mremap(self->addr1, self->size1, self->size2, MREMAP_MAYMOVE);
+ ASSERT_EQ(self->addr2, MAP_FAILED);
+}
+
+TEST_F(pfnmap, fork)
+{
+ pid_t pid;
+ int ret;
+
+ /* fork() a child and test if the child can access the pages. */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (!pid) {
+ EXPECT_EQ(test_read_access(self->addr1, self->size1,
+ self->pagesize), 0);
+ exit(0);
+ }
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 9aff33b10999..dddd1dd8af14 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -63,6 +63,8 @@ separated by spaces:
test soft dirty page bit semantics
- pagemap
test pagemap_scan IOCTL
+- pfnmap
+ tests for VM_PFNMAP handling
- cow
test copy-on-write semantics
- thp
@@ -79,6 +81,8 @@ separated by spaces:
test prctl(PR_SET_MDWE, ...)
- page_frag
test handling of page fragment allocation and freeing
+- vma_merge
+ test VMA merge cases behave as expected
example: ./run_vmtests.sh -t "hmm mmap ksm"
EOF
@@ -421,6 +425,8 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+CATEGORY="vma_merge" run_test ./merge
+
if [ -x ./memfd_secret ]
then
(echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix
@@ -468,6 +474,8 @@ fi
CATEGORY="pagemap" run_test ./pagemap_ioctl
+CATEGORY="pfnmap" run_test ./pfnmap
+
# COW tests
CATEGORY="cow" run_test ./cow
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
index a953c96aa16e..e2206265f67c 100644
--- a/tools/testing/selftests/mm/settings
+++ b/tools/testing/selftests/mm/settings
@@ -1 +1 @@
-timeout=180
+timeout=900
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index cd5174d735be..95b6f043a3cb 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -77,7 +77,7 @@ void show(unsigned long ps)
system(buf);
}
-unsigned long read_sysfs(int warn, char *fmt, ...)
+unsigned long thuge_read_sysfs(int warn, char *fmt, ...)
{
char *line = NULL;
size_t linelen = 0;
@@ -106,7 +106,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...)
unsigned long read_free(unsigned long ps)
{
- return read_sysfs(ps != getpagesize(),
+ return thuge_read_sysfs(ps != getpagesize(),
"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
ps >> 10);
}
@@ -127,7 +127,7 @@ void test_mmap(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
- "%s mmap %lu\n", __func__, size);
+ "%s mmap %lu %x\n", __func__, size, flags);
if (munmap(map, size * NUM_PAGES))
ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
@@ -165,7 +165,7 @@ void test_shmget(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
- "%s: mmap %lu\n", __func__, size);
+ "%s: mmap %lu %x\n", __func__, size, flags);
if (shmdt(map))
ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
}
@@ -195,7 +195,7 @@ void find_pagesizes(void)
}
globfree(&g);
- if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+ if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
largest * NUM_PAGES);
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index e8fd9011c2a3..c73fd5d455c8 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1231,6 +1231,182 @@ static void uffd_move_pmd_split_test(uffd_test_args_t *targs)
uffd_move_pmd_handle_fault);
}
+static bool
+uffdio_verify_results(const char *name, int ret, int error, long result)
+{
+ /*
+ * Should always return -1 with errno=EAGAIN, with corresponding
+ * result field updated in ioctl() args to be -EAGAIN too
+ * (e.g. copy.copy field for UFFDIO_COPY).
+ */
+ if (ret != -1) {
+ uffd_test_fail("%s should have returned -1", name);
+ return false;
+ }
+
+ if (error != EAGAIN) {
+ uffd_test_fail("%s should have errno==EAGAIN", name);
+ return false;
+ }
+
+ if (result != -EAGAIN) {
+ uffd_test_fail("%s should have been updated for -EAGAIN",
+ name);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * This defines a function to test one ioctl. Note that here "field" can
+ * be 1 or anything not -EAGAIN. With that initial value set, we can
+ * verify later that it should be updated by kernel (when -EAGAIN
+ * returned), by checking whether it is also updated to -EAGAIN.
+ */
+#define DEFINE_MMAP_CHANGING_TEST(name, ioctl_name, field) \
+ static bool uffdio_mmap_changing_test_##name(int fd) \
+ { \
+ int ret; \
+ struct uffdio_##name args = { \
+ .field = 1, \
+ }; \
+ ret = ioctl(fd, ioctl_name, &args); \
+ return uffdio_verify_results(#ioctl_name, ret, errno, args.field); \
+ }
+
+DEFINE_MMAP_CHANGING_TEST(zeropage, UFFDIO_ZEROPAGE, zeropage)
+DEFINE_MMAP_CHANGING_TEST(copy, UFFDIO_COPY, copy)
+DEFINE_MMAP_CHANGING_TEST(move, UFFDIO_MOVE, move)
+DEFINE_MMAP_CHANGING_TEST(poison, UFFDIO_POISON, updated)
+DEFINE_MMAP_CHANGING_TEST(continue, UFFDIO_CONTINUE, mapped)
+
+typedef enum {
+ /* We actually do not care about any state except UNINTERRUPTIBLE.. */
+ THR_STATE_UNKNOWN = 0,
+ THR_STATE_UNINTERRUPTIBLE,
+} thread_state;
+
+static void sleep_short(void)
+{
+ usleep(1000);
+}
+
+static thread_state thread_state_get(pid_t tid)
+{
+ const char *header = "State:\t";
+ char tmp[256], *p, c;
+ FILE *fp;
+
+ snprintf(tmp, sizeof(tmp), "/proc/%d/status", tid);
+ fp = fopen(tmp, "r");
+
+ if (!fp)
+ return THR_STATE_UNKNOWN;
+
+ while (fgets(tmp, sizeof(tmp), fp)) {
+ p = strstr(tmp, header);
+ if (p) {
+ /* For example, "State:\tD (disk sleep)" */
+ c = *(p + sizeof(header) - 1);
+ return c == 'D' ?
+ THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
+ }
+ }
+
+ return THR_STATE_UNKNOWN;
+}
+
+static void thread_state_until(pid_t tid, thread_state state)
+{
+ thread_state s;
+
+ do {
+ s = thread_state_get(tid);
+ sleep_short();
+ } while (s != state);
+}
+
+static void *uffd_mmap_changing_thread(void *opaque)
+{
+ volatile pid_t *pid = opaque;
+ int ret;
+
+ /* Unfortunately, it's only fetch-able from the thread itself.. */
+ assert(*pid == 0);
+ *pid = syscall(SYS_gettid);
+
+ /* Inject an event, this will hang solid until the event read */
+ ret = madvise(area_dst, page_size, MADV_REMOVE);
+ if (ret)
+ err("madvise(MADV_REMOVE) failed");
+
+ return NULL;
+}
+
+static void uffd_consume_message(int fd)
+{
+ struct uffd_msg msg = { 0 };
+
+ while (uffd_read_msg(fd, &msg));
+}
+
+static void uffd_mmap_changing_test(uffd_test_args_t *targs)
+{
+ /*
+ * This stores the real PID (which can be different from how tid is
+ * defined..) for the child thread, 0 means not initialized.
+ */
+ pid_t pid = 0;
+ pthread_t tid;
+ int ret;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, false, false))
+ err("uffd_register() failed");
+
+ /* Create a thread to generate the racy event */
+ ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &pid);
+ if (ret)
+ err("pthread_create() failed");
+
+ /*
+ * Wait until the thread setup the pid. Use volatile to make sure
+ * it reads from RAM not regs.
+ */
+ while (!(volatile pid_t)pid)
+ sleep_short();
+
+ /* Wait until the thread hangs at REMOVE event */
+ thread_state_until(pid, THR_STATE_UNINTERRUPTIBLE);
+
+ if (!uffdio_mmap_changing_test_copy(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_zeropage(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_move(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_poison(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_continue(uffd))
+ return;
+
+ /*
+ * All succeeded above! Recycle everything. Start by reading the
+ * event so as to kick the thread roll again..
+ */
+ uffd_consume_message(uffd);
+
+ ret = pthread_join(tid, NULL);
+ assert(ret == 0);
+
+ uffd_test_pass();
+}
+
static int prevent_hugepages(const char **errmsg)
{
/* This should be done before source area is populated */
@@ -1470,6 +1646,32 @@ uffd_test_case_t uffd_tests[] = {
.mem_targets = MEM_ALL,
.uffd_feature_required = UFFD_FEATURE_POISON,
},
+ {
+ .name = "mmap-changing",
+ .uffd_fn = uffd_mmap_changing_test,
+ /*
+ * There's no point running this test over all mem types as
+ * they share the same code paths.
+ *
+ * Choose shmem for simplicity, because (1) shmem supports
+ * MINOR mode to cover UFFDIO_CONTINUE, and (2) shmem is
+ * almost always available (unlike hugetlb). Here we
+ * abused SHMEM for UFFDIO_MOVE, but the test we want to
+ * cover doesn't yet need the correct memory type..
+ */
+ .mem_targets = MEM_SHMEM,
+ /*
+ * Any UFFD_FEATURE_EVENT_* should work to trigger the
+ * race logically, but choose the simplest (REMOVE).
+ *
+ * Meanwhile, since we'll cover quite a few new ioctl()s
+ * (CONTINUE, POISON, MOVE), skip this test for old kernels
+ * by choosing all of them.
+ */
+ .uffd_feature_required = UFFD_FEATURE_EVENT_REMOVE |
+ UFFD_FEATURE_MOVE | UFFD_FEATURE_POISON |
+ UFFD_FEATURE_MINOR_SHMEM,
+ },
};
static void usage(const char *prog)
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index 1f92e8caceac..325de53966b6 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -7,23 +7,20 @@
# real test to check that the kernel is configured to support at least 5
# pagetable levels.
-# 1 means the test failed
-exitcode=1
-
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
-fail()
+skip()
{
echo "$1"
- exit $exitcode
+ exit $ksft_skip
}
check_supported_x86_64()
{
local config="/proc/config.gz"
[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
- [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
+ [[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
# gzip -dcfq automatically handles both compressed and plaintext input.
# See man 1 gzip under '-f'.
@@ -33,11 +30,9 @@ check_supported_x86_64()
else {print 1}; exit}' /proc/cpuinfo 2>/dev/null)
if [[ "${pg_table_levels}" -lt 5 ]]; then
- echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
- exit $ksft_skip
+ skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then
- echo "$0: CPU does not have the necessary la57 flag to support page table level 5"
- exit $ksft_skip
+ skip "$0: CPU does not have the necessary la57 flag to support page table level 5"
fi
}
@@ -45,24 +40,21 @@ check_supported_ppc64()
{
local config="/proc/config.gz"
[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
- [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
+ [[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
if [[ "${pg_table_levels}" -lt 5 ]]; then
- echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
- exit $ksft_skip
+ skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
fi
local mmu_support=$(grep -m1 "mmu" /proc/cpuinfo | awk '{print $3}')
if [[ "$mmu_support" != "radix" ]]; then
- echo "$0: System does not use Radix MMU, required for 5-level paging"
- exit $ksft_skip
+ skip "$0: System does not use Radix MMU, required for 5-level paging"
fi
local hugepages_total=$(awk '/HugePages_Total/ {print $2}' /proc/meminfo)
if [[ "${hugepages_total}" -eq 0 ]]; then
- echo "$0: HugePages are not enabled, required for some tests"
- exit $ksft_skip
+ skip "$0: HugePages are not enabled, required for some tests"
fi
}
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index b380e102b22f..169dbd692bf5 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -77,8 +77,11 @@ static void validate_addr(char *ptr, int high_addr)
{
unsigned long addr = (unsigned long) ptr;
- if (high_addr && addr < HIGH_ADDR_MARK)
- ksft_exit_fail_msg("Bad address %lx\n", addr);
+ if (high_addr) {
+ if (addr < HIGH_ADDR_MARK)
+ ksft_exit_fail_msg("Bad address %lx\n", addr);
+ return;
+ }
if (addr > HIGH_ADDR_MARK)
ksft_exit_fail_msg("Bad address %lx\n", addr);
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index a36734fb62f3..5492e3f784df 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <string.h>
+#include <errno.h>
#include <fcntl.h>
#include <dirent.h>
#include <inttypes.h>
@@ -424,3 +425,102 @@ bool check_vmflag_io(void *addr)
flags += flaglen;
}
}
+
+/*
+ * Open an fd at /proc/$pid/maps and configure procmap_out ready for
+ * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise.
+ */
+int open_procmap(pid_t pid, struct procmap_fd *procmap_out)
+{
+ char path[256];
+ int ret = 0;
+
+ memset(procmap_out, '\0', sizeof(*procmap_out));
+ sprintf(path, "/proc/%d/maps", pid);
+ procmap_out->query.size = sizeof(procmap_out->query);
+ procmap_out->fd = open(path, O_RDONLY);
+ if (procmap_out->fd < 0)
+ ret = -errno;
+
+ return ret;
+}
+
+/* Perform PROCMAP_QUERY. Returns 0 on success, or an error code otherwise. */
+int query_procmap(struct procmap_fd *procmap)
+{
+ int ret = 0;
+
+ if (ioctl(procmap->fd, PROCMAP_QUERY, &procmap->query) == -1)
+ ret = -errno;
+
+ return ret;
+}
+
+/*
+ * Try to find the VMA at specified address, returns true if found, false if not
+ * found, and the test is failed if any other error occurs.
+ *
+ * On success, procmap->query is populated with the results.
+ */
+bool find_vma_procmap(struct procmap_fd *procmap, void *address)
+{
+ int err;
+
+ procmap->query.query_flags = 0;
+ procmap->query.query_addr = (unsigned long)address;
+ err = query_procmap(procmap);
+ if (!err)
+ return true;
+
+ if (err != -ENOENT)
+ ksft_exit_fail_msg("%s: Error %d on ioctl(PROCMAP_QUERY)\n",
+ __func__, err);
+ return false;
+}
+
+/*
+ * Close fd used by PROCMAP_QUERY mechanism. Returns 0 on success, or an error
+ * code otherwise.
+ */
+int close_procmap(struct procmap_fd *procmap)
+{
+ return close(procmap->fd);
+}
+
+int write_sysfs(const char *file_path, unsigned long val)
+{
+ FILE *f = fopen(file_path, "w");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fprintf(f, "%lu", val) < 0) {
+ perror("fprintf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+int read_sysfs(const char *file_path, unsigned long *val)
+{
+ FILE *f = fopen(file_path, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fscanf(f, "%lu", val) != 1) {
+ perror("fscanf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 6effafdc4d8a..b8136d12a0f8 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -3,9 +3,11 @@
#include <stdbool.h>
#include <sys/mman.h>
#include <err.h>
+#include <stdarg.h>
#include <strings.h> /* ffsl() */
#include <unistd.h> /* _SC_PAGESIZE */
#include "../kselftest.h"
+#include <linux/fs.h>
#define BIT_ULL(nr) (1ULL << (nr))
#define PM_SOFT_DIRTY BIT_ULL(55)
@@ -19,6 +21,15 @@
extern unsigned int __page_size;
extern unsigned int __page_shift;
+/*
+ * Represents an open fd and PROCMAP_QUERY state for binary (via ioctl)
+ * /proc/$pid/[s]maps lookup.
+ */
+struct procmap_fd {
+ int fd;
+ struct procmap_query query;
+};
+
static inline unsigned int psize(void)
{
if (!__page_size)
@@ -73,6 +84,38 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
bool miss, bool wp, bool minor, uint64_t *ioctls);
unsigned long get_free_hugepages(void);
bool check_vmflag_io(void *addr);
+int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
+int query_procmap(struct procmap_fd *procmap);
+bool find_vma_procmap(struct procmap_fd *procmap, void *address);
+int close_procmap(struct procmap_fd *procmap);
+int write_sysfs(const char *file_path, unsigned long val);
+int read_sysfs(const char *file_path, unsigned long *val);
+
+static inline int open_self_procmap(struct procmap_fd *procmap_out)
+{
+ pid_t pid = getpid();
+
+ return open_procmap(pid, procmap_out);
+}
+
+/* These helpers need to be inline to match the kselftest.h idiom. */
+static char test_name[1024];
+
+static inline void log_test_start(const char *name, ...)
+{
+ va_list args;
+ va_start(args, name);
+
+ vsnprintf(test_name, sizeof(test_name), name, args);
+ ksft_print_msg("[RUN] %s\n", test_name);
+
+ va_end(args);
+}
+
+static inline void log_test_result(int result)
+{
+ ksft_test_result_report(result, "%s\n", test_name);
+}
/*
* On ppc64 this will only work with radix 2M hugepage size
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 8b378c91debf..b1e4618399be 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -2079,24 +2079,9 @@ TEST_F(mount_setattr, detached_tree_propagation)
* means that the device information will be different for any
* statx() that was taken from /mnt/A before the mount compared
* to one after the mount.
- *
- * Since we already now that the device information between the
- * stx1 and stx2 samples are identical we also now that stx2 and
- * stx3 device information will necessarily differ.
*/
ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor);
-
- /*
- * If mount propagation worked correctly then the tmpfs mount
- * that was created after the mount namespace was unshared will
- * have propagated onto /mnt/A in the detached mount tree.
- *
- * Verify that the device information for stx3 and stx4 are
- * identical. It is already established that stx3 is different
- * from both stx1 and stx2 sampled before the tmpfs mount was
- * done so if stx3 and stx4 are identical the proof is done.
- */
- ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor);
+ ASSERT_EQ(stx1.stx_dev_minor, stx4.stx_dev_minor);
EXPECT_EQ(close(fd_tree), 0);
}
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 532bb732bc6d..c6dd2a335cf4 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -50,6 +50,7 @@ tap
tcp_fastopen_backup_key
tcp_inq
tcp_mmap
+tfo
timestamping
tls
toeplitz
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ea84b88bcb30..332f387615d7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -27,6 +27,7 @@ TEST_PROGS += amt.sh
TEST_PROGS += unicast_extensions.sh
TEST_PROGS += udpgro_fwd.sh
TEST_PROGS += udpgro_frglist.sh
+TEST_PROGS += nat6to4.sh
TEST_PROGS += veth.sh
TEST_PROGS += ioam6.sh
TEST_PROGS += gro.sh
@@ -109,6 +110,8 @@ TEST_GEN_PROGS += proc_net_pktgen
TEST_PROGS += lwt_dst_cache_ref_loop.sh
TEST_PROGS += skf_net_off.sh
TEST_GEN_FILES += skf_net_off
+TEST_GEN_FILES += tfo
+TEST_PROGS += tfo_passive.sh
# YNL files, must be before "include ..lib.mk"
YNL_GEN_FILES := busy_poller netlink-dumps
diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c
index 3ed3882a93b8..b5f474969917 100644
--- a/tools/testing/selftests/net/af_unix/msg_oob.c
+++ b/tools/testing/selftests/net/af_unix/msg_oob.c
@@ -210,7 +210,7 @@ static void __sendpair(struct __test_metadata *_metadata,
static void __recvpair(struct __test_metadata *_metadata,
FIXTURE_DATA(msg_oob) *self,
const char *expected_buf, int expected_len,
- int buf_len, int flags)
+ int buf_len, int flags, bool is_sender)
{
int i, ret[2], recv_errno[2], expected_errno = 0;
char recv_buf[2][BUF_SZ] = {};
@@ -221,7 +221,9 @@ static void __recvpair(struct __test_metadata *_metadata,
errno = 0;
for (i = 0; i < 2; i++) {
- ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags);
+ int index = is_sender ? i * 2 : i * 2 + 1;
+
+ ret[i] = recv(self->fd[index], recv_buf[i], buf_len, flags);
recv_errno[i] = errno;
}
@@ -308,6 +310,20 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
ASSERT_EQ(answ[0], answ[1]);
}
+static void __resetpair(struct __test_metadata *_metadata,
+ FIXTURE_DATA(msg_oob) *self,
+ const FIXTURE_VARIANT(msg_oob) *variant,
+ bool reset)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ close(self->fd[i * 2 + 1]);
+
+ __recvpair(_metadata, self, "", reset ? -ECONNRESET : 0, 1,
+ variant->peek ? MSG_PEEK : 0, true);
+}
+
#define sendpair(buf, len, flags) \
__sendpair(_metadata, self, buf, len, flags)
@@ -316,9 +332,10 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
if (variant->peek) \
__recvpair(_metadata, self, \
expected_buf, expected_len, \
- buf_len, (flags) | MSG_PEEK); \
+ buf_len, (flags) | MSG_PEEK, false); \
__recvpair(_metadata, self, \
- expected_buf, expected_len, buf_len, flags); \
+ expected_buf, expected_len, \
+ buf_len, flags, false); \
} while (0)
#define epollpair(oob_remaining) \
@@ -330,6 +347,9 @@ static void __siocatmarkpair(struct __test_metadata *_metadata,
#define setinlinepair() \
__setinlinepair(_metadata, self)
+#define resetpair(reset) \
+ __resetpair(_metadata, self, variant, reset)
+
#define tcp_incompliant \
for (self->tcp_compliant = false; \
self->tcp_compliant == false; \
@@ -344,6 +364,21 @@ TEST_F(msg_oob, non_oob)
recvpair("", -EINVAL, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(true);
+}
+
+TEST_F(msg_oob, non_oob_no_reset)
+{
+ sendpair("x", 1, 0);
+ epollpair(false);
+ siocatmarkpair(false);
+
+ recvpair("x", 1, 1, 0);
+ epollpair(false);
+ siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, oob)
@@ -355,6 +390,19 @@ TEST_F(msg_oob, oob)
recvpair("x", 1, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(true);
+
+ tcp_incompliant {
+ resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */
+ }
+}
+
+TEST_F(msg_oob, oob_reset)
+{
+ sendpair("x", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
+
+ resetpair(true);
}
TEST_F(msg_oob, oob_drop)
@@ -370,6 +418,8 @@ TEST_F(msg_oob, oob_drop)
recvpair("", -EINVAL, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, oob_ahead)
@@ -385,6 +435,10 @@ TEST_F(msg_oob, oob_ahead)
recvpair("hell", 4, 4, 0);
epollpair(false);
siocatmarkpair(true);
+
+ tcp_incompliant {
+ resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */
+ }
}
TEST_F(msg_oob, oob_break)
@@ -403,6 +457,8 @@ TEST_F(msg_oob, oob_break)
recvpair("", -EAGAIN, 1, 0);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, oob_ahead_break)
@@ -426,6 +482,8 @@ TEST_F(msg_oob, oob_ahead_break)
recvpair("world", 5, 5, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, oob_break_drop)
@@ -449,6 +507,8 @@ TEST_F(msg_oob, oob_break_drop)
recvpair("", -EINVAL, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, ex_oob_break)
@@ -476,6 +536,8 @@ TEST_F(msg_oob, ex_oob_break)
recvpair("ld", 2, 2, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, ex_oob_drop)
@@ -498,6 +560,8 @@ TEST_F(msg_oob, ex_oob_drop)
epollpair(false);
siocatmarkpair(true);
}
+
+ resetpair(false);
}
TEST_F(msg_oob, ex_oob_drop_2)
@@ -523,6 +587,8 @@ TEST_F(msg_oob, ex_oob_drop_2)
epollpair(false);
siocatmarkpair(true);
}
+
+ resetpair(false);
}
TEST_F(msg_oob, ex_oob_oob)
@@ -546,6 +612,54 @@ TEST_F(msg_oob, ex_oob_oob)
recvpair("", -EINVAL, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
+}
+
+TEST_F(msg_oob, ex_oob_ex_oob)
+{
+ sendpair("x", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
+
+ recvpair("x", 1, 1, MSG_OOB);
+ epollpair(false);
+ siocatmarkpair(true);
+
+ sendpair("y", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
+
+ recvpair("y", 1, 1, MSG_OOB);
+ epollpair(false);
+ siocatmarkpair(true);
+
+ tcp_incompliant {
+ resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */
+ }
+}
+
+TEST_F(msg_oob, ex_oob_ex_oob_oob)
+{
+ sendpair("x", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
+
+ recvpair("x", 1, 1, MSG_OOB);
+ epollpair(false);
+ siocatmarkpair(true);
+
+ sendpair("y", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
+
+ recvpair("y", 1, 1, MSG_OOB);
+ epollpair(false);
+ siocatmarkpair(true);
+
+ sendpair("z", 1, MSG_OOB);
+ epollpair(true);
+ siocatmarkpair(true);
}
TEST_F(msg_oob, ex_oob_ahead_break)
@@ -576,6 +690,10 @@ TEST_F(msg_oob, ex_oob_ahead_break)
recvpair("d", 1, 1, MSG_OOB);
epollpair(false);
siocatmarkpair(true);
+
+ tcp_incompliant {
+ resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */
+ }
}
TEST_F(msg_oob, ex_oob_siocatmark)
@@ -595,6 +713,8 @@ TEST_F(msg_oob, ex_oob_siocatmark)
recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */
epollpair(true);
siocatmarkpair(false);
+
+ resetpair(true);
}
TEST_F(msg_oob, inline_oob)
@@ -612,6 +732,8 @@ TEST_F(msg_oob, inline_oob)
recvpair("x", 1, 1, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_oob_break)
@@ -633,6 +755,8 @@ TEST_F(msg_oob, inline_oob_break)
recvpair("o", 1, 1, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_oob_ahead_break)
@@ -661,6 +785,8 @@ TEST_F(msg_oob, inline_oob_ahead_break)
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_ex_oob_break)
@@ -686,6 +812,8 @@ TEST_F(msg_oob, inline_ex_oob_break)
recvpair("rld", 3, 3, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_ex_oob_no_drop)
@@ -707,6 +835,8 @@ TEST_F(msg_oob, inline_ex_oob_no_drop)
recvpair("y", 1, 1, 0);
epollpair(false);
siocatmarkpair(false);
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_ex_oob_drop)
@@ -731,6 +861,8 @@ TEST_F(msg_oob, inline_ex_oob_drop)
epollpair(false);
siocatmarkpair(false);
}
+
+ resetpair(false);
}
TEST_F(msg_oob, inline_ex_oob_siocatmark)
@@ -752,6 +884,8 @@ TEST_F(msg_oob, inline_ex_oob_siocatmark)
recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */
epollpair(true);
siocatmarkpair(false);
+
+ resetpair(true);
}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/gre_ipv6_lladdr.sh b/tools/testing/selftests/net/gre_ipv6_lladdr.sh
index 5b34f6e1f831..48eb999a3120 100755
--- a/tools/testing/selftests/net/gre_ipv6_lladdr.sh
+++ b/tools/testing/selftests/net/gre_ipv6_lladdr.sh
@@ -24,7 +24,10 @@ setup_basenet()
ip -netns "${NS0}" address add dev lo 2001:db8::10/64 nodad
}
-# Check if network device has an IPv6 link-local address assigned.
+# Check the IPv6 configuration of a network device.
+#
+# We currently check the generation of the link-local IPv6 address and the
+# creation of the ff00::/8 multicast route.
#
# Parameters:
#
@@ -35,7 +38,7 @@ setup_basenet()
# a link-local address)
# * $4: The user visible name for the scenario being tested
#
-check_ipv6_ll_addr()
+check_ipv6_device_config()
{
local DEV="$1"
local EXTRA_MATCH="$2"
@@ -45,7 +48,11 @@ check_ipv6_ll_addr()
RET=0
set +e
ip -netns "${NS0}" -6 address show dev "${DEV}" scope link | grep "fe80::" | grep -q "${EXTRA_MATCH}"
- check_err_fail "${XRET}" $? ""
+ check_err_fail "${XRET}" $? "IPv6 link-local address generation"
+
+ ip -netns "${NS0}" -6 route show table local type multicast ff00::/8 proto kernel | grep -q "${DEV}"
+ check_err_fail 0 $? "IPv6 multicast route creation"
+
log_test "${MSG}"
set -e
}
@@ -102,20 +109,20 @@ test_gre_device()
;;
esac
- # Check that IPv6 link-local address is generated when device goes up
+ # Check the IPv6 device configuration when it goes up
ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}"
ip -netns "${NS0}" link set dev gretest up
- check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}"
+ check_ipv6_device_config gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}"
# Now disable link-local address generation
ip -netns "${NS0}" link set dev gretest down
ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode=1
ip -netns "${NS0}" link set dev gretest up
- # Check that link-local address generation works when re-enabled while
- # the device is already up
+ # Check the IPv6 device configuration when link-local address
+ # generation is re-enabled while the device is already up
ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}"
- check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}"
+ check_ipv6_device_config gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}"
ip -netns "${NS0}" link del dev gretest
}
@@ -126,7 +133,7 @@ test_gre4()
local MODE
for GRE_TYPE in "gre" "gretap"; do
- printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n"
+ printf "\n####\nTesting IPv6 configuration of ${GRE_TYPE} devices\n####\n\n"
for MODE in "eui64" "none" "stable-privacy" "random"; do
test_gre_device "${GRE_TYPE}" 192.0.2.10 192.0.2.11 "${MODE}"
@@ -142,7 +149,7 @@ test_gre6()
local MODE
for GRE_TYPE in "ip6gre" "ip6gretap"; do
- printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n"
+ printf "\n####\nTesting IPv6 configuration of ${GRE_TYPE} devices\n####\n\n"
for MODE in "eui64" "none" "stable-privacy" "random"; do
test_gre_device "${GRE_TYPE}" 2001:db8::10 2001:db8::11 "${MODE}"
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 006fdadcc4b9..86a216e9aca8 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -312,7 +312,7 @@ log_test_result()
local test_name=$1; shift
local opt_str=$1; shift
local result=$1; shift
- local retmsg=$1; shift
+ local retmsg=$1
printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result"
if [[ $retmsg ]]; then
diff --git a/tools/testing/selftests/net/nat6to4.sh b/tools/testing/selftests/net/nat6to4.sh
new file mode 100755
index 000000000000..0ee859b622a4
--- /dev/null
+++ b/tools/testing/selftests/net/nat6to4.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NS="ns-peer-$(mktemp -u XXXXXX)"
+
+ip netns add "${NS}"
+ip -netns "${NS}" link set lo up
+ip -netns "${NS}" route add default via 127.0.0.2 dev lo
+
+tc -n "${NS}" qdisc add dev lo ingress
+tc -n "${NS}" filter add dev lo ingress prio 4 protocol ip \
+ bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action
+
+ip netns exec "${NS}" \
+ bash -c 'echo 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789abc | socat - UDP4-DATAGRAM:224.1.0.1:6666,ip-multicast-loop=1'
diff --git a/tools/testing/selftests/net/netfilter/.gitignore b/tools/testing/selftests/net/netfilter/.gitignore
index 64c4f8d9aa6c..5d2be9a00627 100644
--- a/tools/testing/selftests/net/netfilter/.gitignore
+++ b/tools/testing/selftests/net/netfilter/.gitignore
@@ -5,3 +5,4 @@ conntrack_dump_flush
conntrack_reverse_clash
sctp_collision
nf_queue
+udpclash
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index e9b2f553588d..a98ed892f55f 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -15,6 +15,7 @@ TEST_PROGS += conntrack_tcp_unreplied.sh
TEST_PROGS += conntrack_resize.sh
TEST_PROGS += conntrack_sctp_collision.sh
TEST_PROGS += conntrack_vrf.sh
+TEST_PROGS += conntrack_clash.sh
TEST_PROGS += conntrack_reverse_clash.sh
TEST_PROGS += ipvs.sh
TEST_PROGS += nf_conntrack_packetdrill.sh
@@ -44,6 +45,7 @@ TEST_GEN_FILES += connect_close nf_queue
TEST_GEN_FILES += conntrack_dump_flush
TEST_GEN_FILES += conntrack_reverse_clash
TEST_GEN_FILES += sctp_collision
+TEST_GEN_FILES += udpclash
include ../../lib.mk
@@ -52,6 +54,7 @@ $(OUTPUT)/nf_queue: LDLIBS += $(MNL_LDLIBS)
$(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS)
$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS)
+$(OUTPUT)/udpclash: LDLIBS += -lpthread
TEST_FILES := lib.sh
TEST_FILES += packetdrill
diff --git a/tools/testing/selftests/net/netfilter/conntrack_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_clash.sh
new file mode 100755
index 000000000000..3712c1b9b38b
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_clash.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+clash_resolution_active=0
+dport=22111
+ret=0
+
+cleanup()
+{
+ # netns cleanup also zaps any remaining socat echo server.
+ cleanup_all_ns
+}
+
+checktool "nft --version" "run test without nft"
+checktool "conntrack --version" "run test without conntrack"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+
+setup_ns nsclient1 nsclient2 nsrouter
+
+ip netns exec "$nsrouter" nft -f -<<EOF
+table ip t {
+ chain lb {
+ meta l4proto udp dnat to numgen random mod 3 map { 0 : 10.0.2.1 . 9000, 1 : 10.0.2.1 . 9001, 2 : 10.0.2.1 . 9002 }
+ }
+
+ chain prerouting {
+ type nat hook prerouting priority dstnat
+
+ udp dport $dport counter jump lb
+ }
+
+ chain output {
+ type nat hook output priority dstnat
+
+ udp dport $dport counter jump lb
+ }
+}
+EOF
+
+load_simple_ruleset()
+{
+ip netns exec "$1" nft -f -<<EOF
+table ip t {
+ chain forward {
+ type filter hook forward priority 0
+
+ ct state new counter
+ }
+}
+EOF
+}
+
+spawn_servers()
+{
+ local ns="$1"
+ local ports="9000 9001 9002"
+
+ for port in $ports; do
+ ip netns exec "$ns" socat UDP-RECVFROM:$port,fork PIPE 2>/dev/null &
+ done
+
+ for port in $ports; do
+ wait_local_port_listen "$ns" $port udp
+ done
+}
+
+add_addr()
+{
+ local ns="$1"
+ local dev="$2"
+ local i="$3"
+ local j="$4"
+
+ ip -net "$ns" link set "$dev" up
+ ip -net "$ns" addr add "10.0.$i.$j/24" dev "$dev"
+}
+
+ping_test()
+{
+ local ns="$1"
+ local daddr="$2"
+
+ if ! ip netns exec "$ns" ping -q -c 1 $daddr > /dev/null;then
+ echo "FAIL: ping from $ns to $daddr"
+ exit 1
+ fi
+}
+
+run_one_clash_test()
+{
+ local ns="$1"
+ local daddr="$2"
+ local dport="$3"
+ local entries
+ local cre
+
+ if ! ip netns exec "$ns" ./udpclash $daddr $dport;then
+ echo "FAIL: did not receive expected number of replies for $daddr:$dport"
+ ret=1
+ return 1
+ fi
+
+ entries=$(conntrack -S | wc -l)
+ cre=$(conntrack -S | grep -v "clash_resolve=0" | wc -l)
+
+ if [ "$cre" -ne "$entries" ] ;then
+ clash_resolution_active=1
+ return 0
+ fi
+
+ # 1 cpu -> parallel insertion impossible
+ if [ "$entries" -eq 1 ]; then
+ return 0
+ fi
+
+ # not a failure: clash resolution logic did not trigger, but all replies
+ # were received. With right timing, xmit completed sequentially and
+ # no parallel insertion occurs.
+ return $ksft_skip
+}
+
+run_clash_test()
+{
+ local ns="$1"
+ local daddr="$2"
+ local dport="$3"
+
+ for i in $(seq 1 10);do
+ run_one_clash_test "$ns" "$daddr" "$dport"
+ local rv=$?
+ if [ $rv -eq 0 ];then
+ echo "PASS: clash resolution test for $daddr:$dport on attempt $i"
+ return 0
+ elif [ $rv -eq 1 ];then
+ echo "FAIL: clash resolution test for $daddr:$dport on attempt $i"
+ return 1
+ fi
+ done
+}
+
+ip link add veth0 netns "$nsclient1" type veth peer name veth0 netns "$nsrouter"
+ip link add veth0 netns "$nsclient2" type veth peer name veth1 netns "$nsrouter"
+add_addr "$nsclient1" veth0 1 1
+add_addr "$nsclient2" veth0 2 1
+add_addr "$nsrouter" veth0 1 99
+add_addr "$nsrouter" veth1 2 99
+
+ip -net "$nsclient1" route add default via 10.0.1.99
+ip -net "$nsclient2" route add default via 10.0.2.99
+ip netns exec "$nsrouter" sysctl -q net.ipv4.ip_forward=1
+
+ping_test "$nsclient1" 10.0.1.99
+ping_test "$nsclient1" 10.0.2.1
+ping_test "$nsclient2" 10.0.1.1
+
+spawn_servers "$nsclient2"
+
+# exercise clash resolution with nat:
+# nsrouter is supposed to dnat to 10.0.2.1:900{0,1,2,3}.
+run_clash_test "$nsclient1" 10.0.1.99 "$dport"
+
+# exercise clash resolution without nat.
+load_simple_ruleset "$nsclient2"
+run_clash_test "$nsclient2" 127.0.0.1 9001
+
+if [ $clash_resolution_active -eq 0 ];then
+ [ "$ret" -eq 0 ] && ret=$ksft_skip
+ echo "SKIP: Clash resolution did not trigger"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/conntrack_resize.sh b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
index 9e033e80219e..788cd56ea4a0 100755
--- a/tools/testing/selftests/net/netfilter/conntrack_resize.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
@@ -12,6 +12,9 @@ tmpfile=""
tmpfile_proc=""
tmpfile_uniq=""
ret=0
+have_socat=0
+
+socat -h > /dev/null && have_socat=1
insert_count=2000
[ "$KSFT_MACHINE_SLOW" = "yes" ] && insert_count=400
@@ -123,7 +126,7 @@ ctflush() {
done
}
-ctflood()
+ct_pingflood()
{
local ns="$1"
local duration="$2"
@@ -152,6 +155,44 @@ ctflood()
wait
}
+ct_udpflood()
+{
+ local ns="$1"
+ local duration="$2"
+ local now=$(date +%s)
+ local end=$((now + duration))
+
+ [ $have_socat -ne "1" ] && return
+
+ while [ $now -lt $end ]; do
+ip netns exec "$ns" bash<<"EOF"
+ for i in $(seq 1 100);do
+ dport=$(((RANDOM%65536)+1))
+
+ echo bar | socat -u STDIN UDP:"127.0.0.1:$dport" &
+ done > /dev/null 2>&1
+ wait
+EOF
+ now=$(date +%s)
+ done
+}
+
+ct_udpclash()
+{
+ local ns="$1"
+ local duration="$2"
+ local now=$(date +%s)
+ local end=$((now + duration))
+
+ [ -x udpclash ] || return
+
+ while [ $now -lt $end ]; do
+ ip netns exec "$ns" ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1
+
+ now=$(date +%s)
+ done
+}
+
# dump to /dev/null. We don't want dumps to cause infinite loops
# or use-after-free even when conntrack table is altered while dumps
# are in progress.
@@ -169,6 +210,48 @@ ct_nulldump()
wait
}
+ct_nulldump_loop()
+{
+ local ns="$1"
+ local duration="$2"
+ local now=$(date +%s)
+ local end=$((now + duration))
+
+ while [ $now -lt $end ]; do
+ ct_nulldump "$ns"
+ sleep $((RANDOM%2))
+ now=$(date +%s)
+ done
+}
+
+change_timeouts()
+{
+ local ns="$1"
+ local r1=$((RANDOM%2))
+ local r2=$((RANDOM%2))
+
+ [ "$r1" -eq 1 ] && ip netns exec "$ns" sysctl -q net.netfilter.nf_conntrack_icmp_timeout=$((RANDOM%5))
+ [ "$r2" -eq 1 ] && ip netns exec "$ns" sysctl -q net.netfilter.nf_conntrack_udp_timeout=$((RANDOM%5))
+}
+
+ct_change_timeouts_loop()
+{
+ local ns="$1"
+ local duration="$2"
+ local now=$(date +%s)
+ local end=$((now + duration))
+
+ while [ $now -lt $end ]; do
+ change_timeouts "$ns"
+ sleep $((RANDOM%2))
+ now=$(date +%s)
+ done
+
+ # restore defaults
+ ip netns exec "$ns" sysctl -q net.netfilter.nf_conntrack_icmp_timeout=30
+ ip netns exec "$ns" sysctl -q net.netfilter.nf_conntrack_udp_timeout=30
+}
+
check_taint()
{
local tainted_then="$1"
@@ -198,10 +281,14 @@ insert_flood()
r=$((RANDOM%$insert_count))
- ctflood "$n" "$timeout" "floodresize" &
+ ct_pingflood "$n" "$timeout" "floodresize" &
+ ct_udpflood "$n" "$timeout" &
+ ct_udpclash "$n" "$timeout" &
+
insert_ctnetlink "$n" "$r" &
ctflush "$n" "$timeout" &
- ct_nulldump "$n" &
+ ct_nulldump_loop "$n" "$timeout" &
+ ct_change_timeouts_loop "$n" "$timeout" &
wait
}
@@ -306,7 +393,7 @@ test_dump_all()
ip netns exec "$nsclient1" sysctl -q net.netfilter.nf_conntrack_icmp_timeout=3600
- ctflood "$nsclient1" $timeout "dumpall" &
+ ct_pingflood "$nsclient1" $timeout "dumpall" &
insert_ctnetlink "$nsclient2" $insert_count
wait
@@ -368,7 +455,7 @@ test_conntrack_disable()
ct_flush_once "$nsclient1"
ct_flush_once "$nsclient2"
- ctflood "$nsclient1" "$timeout" "conntrack disable"
+ ct_pingflood "$nsclient1" "$timeout" "conntrack disable"
ip netns exec "$nsclient2" ping -q -c 1 127.0.0.1 >/dev/null 2>&1
# Disabled, should not have picked up any connection.
diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
index efea93cf23d4..20e76b395c85 100755
--- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh
@@ -378,7 +378,7 @@ display net,port,proto
type_spec ipv4_addr . inet_service . inet_proto
chain_spec ip daddr . udp dport . meta l4proto
dst addr4 port proto
-src
+src
start 1
count 9
src_delta 9
@@ -419,6 +419,7 @@ table inet filter {
set test {
type ${type_spec}
+ counter
flags interval,timeout
}
@@ -1158,9 +1159,18 @@ del() {
fi
}
-# Return packet count from 'test' counter in 'inet filter' table
+# Return packet count for elem $1 from 'test' counter in 'inet filter' table
count_packets() {
found=0
+ for token in $(nft reset element inet filter test "${1}" ); do
+ [ ${found} -eq 1 ] && echo "${token}" && return
+ [ "${token}" = "packets" ] && found=1
+ done
+}
+
+# Return packet count from 'test' counter in 'inet filter' table
+count_packets_nomatch() {
+ found=0
for token in $(nft list counter inet filter test); do
[ ${found} -eq 1 ] && echo "${token}" && return
[ "${token}" = "packets" ] && found=1
@@ -1206,6 +1216,10 @@ perf() {
# Set MAC addresses, send single packet, check that it matches, reset counter
send_match() {
+ local elem="$1"
+
+ shift
+
ip link set veth_a address "$(format_mac "${1}")"
ip -n B link set veth_b address "$(format_mac "${2}")"
@@ -1216,7 +1230,7 @@ send_match() {
eval src_"$f"=\$\(format_\$f "${2}"\)
done
eval send_\$proto
- if [ "$(count_packets)" != "1" ]; then
+ if [ "$(count_packets "$elem")" != "1" ]; then
err "${proto} packet to:"
err " $(for f in ${dst}; do
eval format_\$f "${1}"; printf ' '; done)"
@@ -1242,7 +1256,7 @@ send_nomatch() {
eval src_"$f"=\$\(format_\$f "${2}"\)
done
eval send_\$proto
- if [ "$(count_packets)" != "0" ]; then
+ if [ "$(count_packets_nomatch)" != "0" ]; then
err "${proto} packet to:"
err " $(for f in ${dst}; do
eval format_\$f "${1}"; printf ' '; done)"
@@ -1255,13 +1269,54 @@ send_nomatch() {
fi
}
+maybe_send_nomatch() {
+ local elem="$1"
+ local what="$4"
+
+ [ $((RANDOM%20)) -gt 0 ] && return
+
+ dst_addr4="$2"
+ dst_port="$3"
+ send_udp
+
+ if [ "$(count_packets_nomatch)" != "0" ]; then
+ err "Packet to $dst_addr4:$dst_port did match $what"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+maybe_send_match() {
+ local elem="$1"
+ local what="$4"
+
+ [ $((RANDOM%20)) -gt 0 ] && return
+
+ dst_addr4="$2"
+ dst_port="$3"
+ send_udp
+
+ if [ "$(count_packets "{ $elem }")" != "1" ]; then
+ err "Packet to $dst_addr4:$dst_port did not match $what"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+ nft reset counter inet filter test >/dev/null
+ nft reset element inet filter test "{ $elem }" >/dev/null
+}
+
# Correctness test template:
# - add ranged element, check that packets match it
# - check that packets outside range don't match it
# - remove some elements, check that packets don't match anymore
test_correctness_main() {
range_size=1
+
+ send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
+
for i in $(seq "${start}" $((start + count))); do
+ local elem=""
+
end=$((start + range_size))
# Avoid negative or zero-sized port ranges
@@ -1272,15 +1327,16 @@ test_correctness_main() {
srcstart=$((start + src_delta))
srcend=$((end + src_delta))
- add "$(format)" || return 1
+ elem="$(format)"
+ add "$elem" || return 1
for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
- send_match "${j}" $((j + src_delta)) || return 1
+ send_match "$elem" "${j}" $((j + src_delta)) || return 1
done
send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
# Delete elements now and then
if [ $((i % 3)) -eq 0 ]; then
- del "$(format)" || return 1
+ del "$elem" || return 1
for j in $(seq "$start" \
$((range_size / 2 + 1)) ${end}); do
send_nomatch "${j}" $((j + src_delta)) \
@@ -1572,14 +1628,17 @@ test_timeout() {
range_size=1
for i in $(seq "$start" $((start + count))); do
+ local elem=""
+
end=$((start + range_size))
srcstart=$((start + src_delta))
srcend=$((end + src_delta))
- add "$(format)" || return 1
+ elem="$(format)"
+ add "$elem" || return 1
for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
- send_match "${j}" $((j + src_delta)) || return 1
+ send_match "$elem" "${j}" $((j + src_delta)) || return 1
done
range_size=$((range_size + 1))
@@ -1737,7 +1796,7 @@ test_bug_reload() {
srcend=$((end + src_delta))
for j in $(seq "$start" $((range_size / 2 + 1)) ${end}); do
- send_match "${j}" $((j + src_delta)) || return 1
+ send_match "$(format)" "${j}" $((j + src_delta)) || return 1
done
range_size=$((range_size + 1))
@@ -1756,22 +1815,34 @@ test_bug_net_port_proto_match() {
range_size=1
for i in $(seq 1 10); do
for j in $(seq 1 20) ; do
- elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+ local dport=$j
+
+ elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
+
+ # too slow, do not test all addresses
+ maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "before add" || return 1
nft "add element inet filter test { $elem }" || return 1
+
+ maybe_send_match "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "after add" || return 1
+
nft "get element inet filter test { $elem }" | grep -q "$elem"
if [ $? -ne 0 ];then
local got=$(nft "get element inet filter test { $elem }")
err "post-add: should have returned $elem but got $got"
return 1
fi
+
+ maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "out-of-range" || return 1
done
done
# recheck after set was filled
for i in $(seq 1 10); do
for j in $(seq 1 20) ; do
- elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+ local dport=$j
+
+ elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
nft "get element inet filter test { $elem }" | grep -q "$elem"
if [ $? -ne 0 ];then
@@ -1779,6 +1850,9 @@ test_bug_net_port_proto_match() {
err "post-fill: should have returned $elem but got $got"
return 1
fi
+
+ maybe_send_match "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "recheck" || return 1
+ maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d1" $((dport+1))) "recheck out-of-range" || return 1
done
done
@@ -1786,9 +1860,10 @@ test_bug_net_port_proto_match() {
for i in $(seq 1 10); do
for j in $(seq 1 20) ; do
local rnd=$((RANDOM%10))
+ local dport=$j
local got=""
- elem=$(printf "10.%d.%d.0/24 . %d1-%d0 . 6-17 " ${i} ${j} ${i} "$((i+1))")
+ elem=$(printf "10.%d.%d.0/24 . %d-%d0 . 6-17 " ${i} ${j} ${dport} "$((dport+1))")
if [ $rnd -gt 0 ];then
continue
fi
@@ -1799,6 +1874,8 @@ test_bug_net_port_proto_match() {
err "post-delete: query for $elem returned $got instead of error."
return 1
fi
+
+ maybe_send_nomatch "$elem" $(printf "10.%d.%d.1" $i $j) $(printf "%d" $dport) "match after deletion" || return 1
done
done
@@ -1817,7 +1894,7 @@ test_bug_avx2_mismatch()
dst_addr6="$a2"
send_icmp6
- if [ "$(count_packets)" -gt "0" ]; then
+ if [ "$(count_packets "{ icmpv6 . $a1 }")" -gt "0" ]; then
err "False match for $a2"
return 1
fi
diff --git a/tools/testing/selftests/net/netfilter/nft_nat.sh b/tools/testing/selftests/net/netfilter/nft_nat.sh
index 9e39de26455f..a954754b99b3 100755
--- a/tools/testing/selftests/net/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/net/netfilter/nft_nat.sh
@@ -866,6 +866,24 @@ EOF
ip netns exec "$ns0" nft delete table $family nat
}
+file_cmp()
+{
+ local infile="$1"
+ local outfile="$2"
+
+ if ! cmp "$infile" "$outfile";then
+ echo -n "Infile "
+ ls -l "$infile"
+ echo -n "Outfile "
+ ls -l "$outfile"
+ echo "ERROR: in and output file mismatch when checking $msg" 1>&1
+ ret=1
+ return 1
+ fi
+
+ return 0
+}
+
test_stateless_nat_ip()
{
local lret=0
@@ -966,11 +984,7 @@ EOF
wait
- if ! cmp "$INFILE" "$OUTFILE";then
- ls -l "$INFILE" "$OUTFILE"
- echo "ERROR: in and output file mismatch when checking udp with stateless nat" 1>&2
- lret=1
- fi
+ file_cmp "$INFILE" "$OUTFILE" "udp with stateless nat" || lret=1
:> "$OUTFILE"
@@ -991,6 +1005,62 @@ EOF
return $lret
}
+test_dnat_clash()
+{
+ local lret=0
+
+ if ! socat -h > /dev/null 2>&1;then
+ echo "SKIP: Could not run dnat clash test without socat tool"
+ [ $ret -eq 0 ] && ret=$ksft_skip
+ return $ksft_skip
+ fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+flush ruleset
+table ip dnat-test {
+ chain prerouting {
+ type nat hook prerouting priority dstnat; policy accept;
+ ip daddr 10.0.2.1 udp dport 1234 counter dnat to 10.0.1.1:1234
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add dnat rules"
+ [ $ret -eq 0 ] && ret=$ksft_skip
+ return $ksft_skip
+ fi
+
+ local udpdaddr="10.0.2.1"
+ for i in 1 2;do
+ echo "PING $udpdaddr" > "$INFILE"
+ echo "PONG 10.0.1.1 step $i" | ip netns exec "$ns0" timeout 3 socat STDIO UDP4-LISTEN:1234,bind=10.0.1.1 > "$OUTFILE" 2>/dev/null &
+ local lpid=$!
+
+ busywait $BUSYWAIT_TIMEOUT listener_ready "$ns0" 1234 "-u"
+
+ result=$(ip netns exec "$ns1" timeout 3 socat STDIO UDP4-SENDTO:"$udpdaddr:1234,sourceport=4321" < "$INFILE")
+ udpdaddr="10.0.1.1"
+
+ if [ "$result" != "PONG 10.0.1.1 step $i" ] ; then
+ echo "ERROR: failed to test udp $ns1 to $ns2 with dnat rule step $i, result: \"$result\"" 1>&2
+ lret=1
+ ret=1
+ fi
+
+ wait
+
+ file_cmp "$INFILE" "$OUTFILE" "udp dnat step $i" || lret=1
+
+ :> "$OUTFILE"
+ done
+
+ test $lret -eq 0 && echo "PASS: IP dnat clash $ns1:$ns2"
+
+ ip netns exec "$ns0" nft flush ruleset
+
+ return $lret
+}
+
# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99
for i in "$ns0" "$ns1" "$ns2" ;do
ip netns exec "$i" nft -f /dev/stdin <<EOF
@@ -1147,6 +1217,7 @@ $test_inet_nat && test_redirect6 inet
test_port_shadowing
test_stateless_nat_ip
+test_dnat_clash
if [ $ret -ne 0 ];then
echo -n "FAIL: "
diff --git a/tools/testing/selftests/net/netfilter/udpclash.c b/tools/testing/selftests/net/netfilter/udpclash.c
new file mode 100644
index 000000000000..85c7b906ad08
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/udpclash.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Usage: ./udpclash <IP> <PORT>
+ *
+ * Emit THREAD_COUNT UDP packets sharing the same saddr:daddr pair.
+ *
+ * This mimics DNS resolver libraries that emit A and AAAA requests
+ * in parallel.
+ *
+ * This exercises conntrack clash resolution logic added and later
+ * refined in
+ *
+ * 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race")
+ * ed07d9a021df ("netfilter: nf_conntrack: resolve clash for matching conntracks")
+ * 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing entries")
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <pthread.h>
+
+#define THREAD_COUNT 128
+
+struct thread_args {
+ const struct sockaddr_in *si_remote;
+ int sockfd;
+};
+
+static int wait = 1;
+
+static void *thread_main(void *varg)
+{
+ const struct sockaddr_in *si_remote;
+ const struct thread_args *args = varg;
+ static const char msg[] = "foo";
+
+ si_remote = args->si_remote;
+
+ while (wait == 1)
+ ;
+
+ if (sendto(args->sockfd, msg, strlen(msg), MSG_NOSIGNAL,
+ (struct sockaddr *)si_remote, sizeof(*si_remote)) < 0)
+ exit(111);
+
+ return varg;
+}
+
+static int run_test(int fd, const struct sockaddr_in *si_remote)
+{
+ struct thread_args thread_args = {
+ .si_remote = si_remote,
+ .sockfd = fd,
+ };
+ pthread_t *tid = calloc(THREAD_COUNT, sizeof(pthread_t));
+ unsigned int repl_count = 0, timeout = 0;
+ int i;
+
+ if (!tid) {
+ perror("calloc");
+ return 1;
+ }
+
+ for (i = 0; i < THREAD_COUNT; i++) {
+ int err = pthread_create(&tid[i], NULL, &thread_main, &thread_args);
+
+ if (err != 0) {
+ perror("pthread_create");
+ exit(1);
+ }
+ }
+
+ wait = 0;
+
+ for (i = 0; i < THREAD_COUNT; i++)
+ pthread_join(tid[i], NULL);
+
+ while (repl_count < THREAD_COUNT) {
+ struct sockaddr_in si_repl;
+ socklen_t si_repl_len = sizeof(si_repl);
+ char repl[512];
+ ssize_t ret;
+
+ ret = recvfrom(fd, repl, sizeof(repl), MSG_NOSIGNAL,
+ (struct sockaddr *) &si_repl, &si_repl_len);
+ if (ret < 0) {
+ if (timeout++ > 5000) {
+ fputs("timed out while waiting for reply from thread\n", stderr);
+ break;
+ }
+
+ /* give reply time to pass though the stack */
+ usleep(1000);
+ continue;
+ }
+
+ if (si_repl_len != sizeof(*si_remote)) {
+ fprintf(stderr, "warning: reply has unexpected repl_len %d vs %d\n",
+ (int)si_repl_len, (int)sizeof(si_repl));
+ } else if (si_remote->sin_addr.s_addr != si_repl.sin_addr.s_addr ||
+ si_remote->sin_port != si_repl.sin_port) {
+ char a[64], b[64];
+
+ inet_ntop(AF_INET, &si_remote->sin_addr, a, sizeof(a));
+ inet_ntop(AF_INET, &si_repl.sin_addr, b, sizeof(b));
+
+ fprintf(stderr, "reply from wrong source: want %s:%d got %s:%d\n",
+ a, ntohs(si_remote->sin_port), b, ntohs(si_repl.sin_port));
+ }
+
+ repl_count++;
+ }
+
+ printf("got %d of %d replies\n", repl_count, THREAD_COUNT);
+
+ free(tid);
+
+ return repl_count == THREAD_COUNT ? 0 : 1;
+}
+
+int main(int argc, char *argv[])
+{
+ struct sockaddr_in si_local = {
+ .sin_family = AF_INET,
+ };
+ struct sockaddr_in si_remote = {
+ .sin_family = AF_INET,
+ };
+ int fd, ret;
+
+ if (argc < 3) {
+ fputs("Usage: send_udp <daddr> <dport>\n", stderr);
+ return 1;
+ }
+
+ si_remote.sin_port = htons(atoi(argv[2]));
+ si_remote.sin_addr.s_addr = inet_addr(argv[1]);
+
+ fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, IPPROTO_UDP);
+ if (fd < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ if (bind(fd, (struct sockaddr *)&si_local, sizeof(si_local)) < 0) {
+ perror("bind");
+ return 1;
+ }
+
+ ret = run_test(fd, &si_remote);
+
+ close(fd);
+
+ return ret;
+}
diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c
index de9c26f98b2e..9201f2905f2c 100644
--- a/tools/testing/selftests/net/ovpn/ovpn-cli.c
+++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c
@@ -2166,6 +2166,7 @@ static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[])
ovpn->peers_file = argv[4];
+ ovpn->sa_family = AF_INET;
if (argc > 5 && !strcmp(argv[5], "ipv6"))
ovpn->sa_family = AF_INET6;
break;
diff --git a/tools/testing/selftests/net/ovpn/test-large-mtu.sh b/tools/testing/selftests/net/ovpn/test-large-mtu.sh
new file mode 100755
index 000000000000..ce2a2cb64f72
--- /dev/null
+++ b/tools/testing/selftests/net/ovpn/test-large-mtu.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2025 OpenVPN, Inc.
+#
+# Author: Antonio Quartulli <antonio@openvpn.net>
+
+MTU="1500"
+
+source test.sh
diff --git a/tools/testing/selftests/net/packetdrill/tcp_ooo-before-and-after-accept.pkt b/tools/testing/selftests/net/packetdrill/tcp_ooo-before-and-after-accept.pkt
new file mode 100644
index 000000000000..09aabc775e80
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_ooo-before-and-after-accept.pkt
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_rmem="4096 131072 $((32*1024*1024))"`
+
+// Test that a not-yet-accepted socket does not change
+// its initial sk_rcvbuf (tcp_rmem[1]) when receiving ooo packets.
+
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 10>
+ +.1 < . 1:1(0) ack 1 win 257
+ +0 < . 2001:41001(39000) ack 1 win 257
+ +0 > . 1:1(0) ack 1 <nop,nop,sack 2001:41001>
+ +0 < . 41001:101001(60000) ack 1 win 257
+ +0 > . 1:1(0) ack 1 <nop,nop,sack 2001:101001>
+ +0 < . 1:1001(1000) ack 1 win 257
+ +0 > . 1:1(0) ack 1001 <nop,nop,sack 2001:101001>
+ +0 < . 1001:2001(1000) ack 1 win 257
+ +0 > . 1:1(0) ack 101001
+
+ +0 accept(3, ..., ...) = 4
+
+ +0 %{ assert SK_MEMINFO_RCVBUF == 131072, SK_MEMINFO_RCVBUF }%
+
+ +0 close(4) = 0
+ +0 close(3) = 0
+
+// Test that ooo packets for accepted sockets do increase sk_rcvbuf
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 10>
+ +.1 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+ +0 < . 2001:41001(39000) ack 1 win 257
+ +0 > . 1:1(0) ack 1 <nop,nop,sack 2001:41001>
+ +0 < . 41001:101001(60000) ack 1 win 257
+ +0 > . 1:1(0) ack 1 <nop,nop,sack 2001:101001>
+
+ +0 %{ assert SK_MEMINFO_RCVBUF > 131072, SK_MEMINFO_RCVBUF }%
+
diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c
new file mode 100644
index 000000000000..eb3cac5e583c
--- /dev/null
+++ b/tools/testing/selftests/net/tfo.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <errno.h>
+
+static int cfg_server;
+static int cfg_client;
+static int cfg_port = 8000;
+static struct sockaddr_in6 cfg_addr;
+static char *cfg_outfile;
+
+static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
+{
+ int ret;
+
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = htons(port);
+
+ ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr);
+ if (ret != 1) {
+ /* fallback to plain IPv4 */
+ ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]);
+ if (ret != 1)
+ return -1;
+
+ /* add ::ffff prefix */
+ sin6->sin6_addr.s6_addr32[0] = 0;
+ sin6->sin6_addr.s6_addr32[1] = 0;
+ sin6->sin6_addr.s6_addr16[4] = 0;
+ sin6->sin6_addr.s6_addr16[5] = 0xffff;
+ }
+
+ return 0;
+}
+
+static void run_server(void)
+{
+ unsigned long qlen = 32;
+ int fd, opt, connfd;
+ socklen_t len;
+ char buf[64];
+ FILE *outfile;
+
+ outfile = fopen(cfg_outfile, "w");
+ if (!outfile)
+ error(1, errno, "fopen() outfile");
+
+ fd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket()");
+
+ opt = 1;
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+ error(1, errno, "setsockopt(SO_REUSEADDR)");
+
+ if (setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
+ error(1, errno, "setsockopt(TCP_FASTOPEN)");
+
+ if (bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)) < 0)
+ error(1, errno, "bind()");
+
+ if (listen(fd, 5) < 0)
+ error(1, errno, "listen()");
+
+ len = sizeof(cfg_addr);
+ connfd = accept(fd, (struct sockaddr *)&cfg_addr, &len);
+ if (connfd < 0)
+ error(1, errno, "accept()");
+
+ len = sizeof(opt);
+ if (getsockopt(connfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &opt, &len) < 0)
+ error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)");
+
+ read(connfd, buf, 64);
+ fprintf(outfile, "%d\n", opt);
+
+ fclose(outfile);
+ close(connfd);
+ close(fd);
+}
+
+static void run_client(void)
+{
+ int fd;
+ char *msg = "Hello, world!";
+
+ fd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket()");
+
+ sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr));
+
+ close(fd);
+}
+
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s (-s|-c) -h<server_ip> -p<port> -o<outfile> ", filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ struct sockaddr_in6 *addr6 = (void *) &cfg_addr;
+ char *addr = NULL;
+ int ret;
+ int c;
+
+ if (argc <= 1)
+ usage(argv[0]);
+
+ while ((c = getopt(argc, argv, "sch:p:o:")) != -1) {
+ switch (c) {
+ case 's':
+ if (cfg_client)
+ error(1, 0, "Pass one of -s or -c");
+ cfg_server = 1;
+ break;
+ case 'c':
+ if (cfg_server)
+ error(1, 0, "Pass one of -s or -c");
+ cfg_client = 1;
+ break;
+ case 'h':
+ addr = optarg;
+ break;
+ case 'p':
+ cfg_port = strtoul(optarg, NULL, 0);
+ break;
+ case 'o':
+ cfg_outfile = strdup(optarg);
+ if (!cfg_outfile)
+ error(1, 0, "outfile invalid");
+ break;
+ }
+ }
+
+ if (cfg_server && addr)
+ error(1, 0, "Server cannot have -h specified");
+
+ memset(addr6, 0, sizeof(*addr6));
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(cfg_port);
+ addr6->sin6_addr = in6addr_any;
+ if (addr) {
+ ret = parse_address(addr, cfg_port, addr6);
+ if (ret)
+ error(1, 0, "Client address parse error: %s", addr);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+
+ if (cfg_server)
+ run_server();
+ else if (cfg_client)
+ run_client();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh
new file mode 100755
index 000000000000..80bf11fdc046
--- /dev/null
+++ b/tools/testing/selftests/net/tfo_passive.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+source lib.sh
+
+NSIM_SV_ID=$((256 + RANDOM % 256))
+NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID
+NSIM_CL_ID=$((512 + RANDOM % 256))
+NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID
+
+NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
+NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
+NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
+NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device
+
+SERVER_IP=192.168.1.1
+CLIENT_IP=192.168.1.2
+SERVER_PORT=48675
+
+setup_ns()
+{
+ set -e
+ ip netns add nssv
+ ip netns add nscl
+
+ NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \
+ -path $NSIM_SV_SYS/net -exec basename {} \;)
+ NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \
+ -path $NSIM_CL_SYS/net -exec basename {} \;)
+
+ ip link set $NSIM_SV_NAME netns nssv
+ ip link set $NSIM_CL_NAME netns nscl
+
+ ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME
+ ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME
+
+ ip netns exec nssv ip link set dev $NSIM_SV_NAME up
+ ip netns exec nscl ip link set dev $NSIM_CL_NAME up
+
+ # Enable passive TFO
+ ip netns exec nssv sysctl -w net.ipv4.tcp_fastopen=519 > /dev/null
+
+ set +e
+}
+
+cleanup_ns()
+{
+ ip netns del nscl
+ ip netns del nssv
+}
+
+###
+### Code start
+###
+
+modprobe netdevsim
+
+# linking
+
+echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
+udevadm settle
+
+setup_ns
+
+NSIM_SV_FD=$((256 + RANDOM % 256))
+exec {NSIM_SV_FD}</var/run/netns/nssv
+NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)
+
+NSIM_CL_FD=$((256 + RANDOM % 256))
+exec {NSIM_CL_FD}</var/run/netns/nscl
+NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
+ $NSIM_DEV_SYS_LINK
+
+if [ $? -ne 0 ]; then
+ echo "linking netdevsim1 with netdevsim2 should succeed"
+ cleanup_ns
+ exit 1
+fi
+
+out_file=$(mktemp)
+
+timeout -k 1s 30s ip netns exec nssv ./tfo \
+ -s \
+ -p ${SERVER_PORT} \
+ -o ${out_file}&
+
+wait_local_port_listen nssv ${SERVER_PORT} tcp
+
+ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT}
+
+wait
+
+res=$(cat $out_file)
+rm $out_file
+
+if [ $res -eq 0 ]; then
+ echo "got invalid NAPI ID from passive TFO socket"
+ cleanup_ns
+ exit 1
+fi
+
+echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
+
+echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
+
+cleanup_ns
+
+modprobe -r netdevsim
+
+exit 0
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
index 1dc337c709f8..b17e032a6d75 100755
--- a/tools/testing/selftests/net/udpgro.sh
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -48,7 +48,7 @@ run_one() {
cfg_veth
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 100 ${rx_args} &
local PID1=$!
wait_local_port_listen ${PEER_NS} 8000 udp
@@ -95,7 +95,7 @@ run_one_nat() {
# will land on the 'plain' one
ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 &
local PID1=$!
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 100 ${family} -b ${addr2%/*} ${rx_args} &
local PID2=$!
wait_local_port_listen "${PEER_NS}" 8000 udp
@@ -117,9 +117,9 @@ run_one_2sock() {
cfg_veth
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 100 ${rx_args} -p 12345 &
local PID1=$!
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 100 ${rx_args} &
local PID2=$!
wait_local_port_listen "${PEER_NS}" 12345 udp
diff --git a/tools/testing/selftests/net/vlan_hw_filter.sh b/tools/testing/selftests/net/vlan_hw_filter.sh
index 7bc804ffaf7c..0fb56baf28e4 100755
--- a/tools/testing/selftests/net/vlan_hw_filter.sh
+++ b/tools/testing/selftests/net/vlan_hw_filter.sh
@@ -3,27 +3,101 @@
readonly NETNS="ns-$(mktemp -u XXXXXX)"
+ALL_TESTS="
+ test_vlan_filter_check
+ test_vlan0_del_crash_01
+ test_vlan0_del_crash_02
+ test_vlan0_del_crash_03
+ test_vid0_memleak
+"
+
ret=0
+setup() {
+ ip netns add ${NETNS}
+}
+
cleanup() {
- ip netns del $NETNS
+ ip netns del $NETNS 2>/dev/null
}
trap cleanup EXIT
fail() {
- echo "ERROR: ${1:-unexpected return code} (ret: $_)" >&2
- ret=1
+ echo "ERROR: ${1:-unexpected return code} (ret: $_)" >&2
+ ret=1
+}
+
+tests_run()
+{
+ local current_test
+ for current_test in ${TESTS:-$ALL_TESTS}; do
+ $current_test
+ done
+}
+
+test_vlan_filter_check() {
+ setup
+ ip netns exec ${NETNS} ip link add bond0 type bond mode 0
+ ip netns exec ${NETNS} ip link add bond_slave_1 type veth peer veth2
+ ip netns exec ${NETNS} ip link set bond_slave_1 master bond0
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
+ ip netns exec ${NETNS} ip link add link bond_slave_1 name bond_slave_1.0 type vlan id 0
+ ip netns exec ${NETNS} ip link add link bond0 name bond0.0 type vlan id 0
+ ip netns exec ${NETNS} ip link set bond_slave_1 nomaster
+ ip netns exec ${NETNS} ip link del veth2 || fail "Please check vlan HW filter function"
+ cleanup
}
-ip netns add ${NETNS}
-ip netns exec ${NETNS} ip link add bond0 type bond mode 0
-ip netns exec ${NETNS} ip link add bond_slave_1 type veth peer veth2
-ip netns exec ${NETNS} ip link set bond_slave_1 master bond0
-ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
-ip netns exec ${NETNS} ip link add link bond_slave_1 name bond_slave_1.0 type vlan id 0
-ip netns exec ${NETNS} ip link add link bond0 name bond0.0 type vlan id 0
-ip netns exec ${NETNS} ip link set bond_slave_1 nomaster
-ip netns exec ${NETNS} ip link del veth2 || fail "Please check vlan HW filter function"
+#enable vlan_filter feature of real_dev with vlan0 during running time
+test_vlan0_del_crash_01() {
+ setup
+ ip netns exec ${NETNS} ip link add bond0 type bond mode 0
+ ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
+ ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
+ ip netns exec ${NETNS} ifconfig bond0 down
+ ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
+ cleanup
+}
+
+#enable vlan_filter feature and add vlan0 for real_dev during running time
+test_vlan0_del_crash_02() {
+ setup
+ ip netns exec ${NETNS} ip link add bond0 type bond mode 0
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
+ ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
+ ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
+ ip netns exec ${NETNS} ifconfig bond0 down
+ ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
+ cleanup
+}
+
+#enable vlan_filter feature of real_dev during running time
+#test kernel_bug of vlan unregister
+test_vlan0_del_crash_03() {
+ setup
+ ip netns exec ${NETNS} ip link add bond0 type bond mode 0
+ ip netns exec ${NETNS} ip link add link bond0 name vlan0 type vlan id 0 protocol 802.1q
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
+ ip netns exec ${NETNS} ifconfig bond0 up
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter on
+ ip netns exec ${NETNS} ifconfig bond0 down
+ ip netns exec ${NETNS} ip link del vlan0 || fail "Please check vlan HW filter function"
+ cleanup
+}
+
+test_vid0_memleak() {
+ setup
+ ip netns exec ${NETNS} ip link add bond0 up type bond mode 0
+ ip netns exec ${NETNS} ethtool -K bond0 rx-vlan-filter off
+ ip netns exec ${NETNS} ip link del dev bond0 || fail "Please check vlan HW filter function"
+ cleanup
+}
+tests_run
exit $ret
diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
index 1c631740a730..c5e0b76ba6ac 100644
--- a/tools/testing/selftests/ptrace/Makefile
+++ b/tools/testing/selftests/ptrace/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES)
-TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess get_set_sud
+TEST_GEN_PROGS := get_syscall_info set_syscall_info peeksiginfo vmaccess get_set_sud
include ../lib.mk
diff --git a/tools/testing/selftests/ptrace/set_syscall_info.c b/tools/testing/selftests/ptrace/set_syscall_info.c
new file mode 100644
index 000000000000..4198248ef874
--- /dev/null
+++ b/tools/testing/selftests/ptrace/set_syscall_info.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2018-2025 Dmitry V. Levin <ldv@strace.io>
+ * All rights reserved.
+ *
+ * Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel
+ * matches userspace expectations.
+ */
+
+#include "../kselftest_harness.h"
+#include <err.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <asm/unistd.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+#if defined(_MIPS_SIM) && _MIPS_SIM == _MIPS_SIM_NABI32
+/*
+ * MIPS N32 is the only architecture where __kernel_ulong_t
+ * does not match the bitness of syscall arguments.
+ */
+typedef unsigned long long kernel_ulong_t;
+#else
+typedef __kernel_ulong_t kernel_ulong_t;
+#endif
+
+struct si_entry {
+ int nr;
+ kernel_ulong_t args[6];
+};
+struct si_exit {
+ unsigned int is_error;
+ int rval;
+};
+
+static unsigned int ptrace_stop;
+static pid_t tracee_pid;
+
+static int
+kill_tracee(pid_t pid)
+{
+ if (!pid)
+ return 0;
+
+ int saved_errno = errno;
+
+ int rc = kill(pid, SIGKILL);
+
+ errno = saved_errno;
+ return rc;
+}
+
+static long
+sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data)
+{
+ return syscall(__NR_ptrace, request, pid, addr, data);
+}
+
+#define LOG_KILL_TRACEE(fmt, ...) \
+ do { \
+ kill_tracee(tracee_pid); \
+ TH_LOG("wait #%d: " fmt, \
+ ptrace_stop, ##__VA_ARGS__); \
+ } while (0)
+
+static void
+check_psi_entry(struct __test_metadata *_metadata,
+ const struct ptrace_syscall_info *info,
+ const struct si_entry *exp_entry,
+ const char *text)
+{
+ unsigned int i;
+ int exp_nr = exp_entry->nr;
+#if defined __s390__ || defined __s390x__
+ /* s390 is the only architecture that has 16-bit syscall numbers */
+ exp_nr &= 0xffff;
+#endif
+
+ ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info->op) {
+ LOG_KILL_TRACEE("%s: entry stop mismatch", text);
+ }
+ ASSERT_TRUE(info->arch) {
+ LOG_KILL_TRACEE("%s: entry stop mismatch", text);
+ }
+ ASSERT_TRUE(info->instruction_pointer) {
+ LOG_KILL_TRACEE("%s: entry stop mismatch", text);
+ }
+ ASSERT_TRUE(info->stack_pointer) {
+ LOG_KILL_TRACEE("%s: entry stop mismatch", text);
+ }
+ ASSERT_EQ(exp_nr, info->entry.nr) {
+ LOG_KILL_TRACEE("%s: syscall nr mismatch", text);
+ }
+ for (i = 0; i < ARRAY_SIZE(exp_entry->args); ++i) {
+ ASSERT_EQ(exp_entry->args[i], info->entry.args[i]) {
+ LOG_KILL_TRACEE("%s: syscall arg #%u mismatch",
+ text, i);
+ }
+ }
+}
+
+static void
+check_psi_exit(struct __test_metadata *_metadata,
+ const struct ptrace_syscall_info *info,
+ const struct si_exit *exp_exit,
+ const char *text)
+{
+ ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info->op) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+ ASSERT_TRUE(info->arch) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+ ASSERT_TRUE(info->instruction_pointer) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+ ASSERT_TRUE(info->stack_pointer) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+ ASSERT_EQ(exp_exit->is_error, info->exit.is_error) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+ ASSERT_EQ(exp_exit->rval, info->exit.rval) {
+ LOG_KILL_TRACEE("%s: exit stop mismatch", text);
+ }
+}
+
+TEST(set_syscall_info)
+{
+ const pid_t tracer_pid = getpid();
+ const kernel_ulong_t dummy[] = {
+ (kernel_ulong_t) 0xdad0bef0bad0fed0ULL,
+ (kernel_ulong_t) 0xdad1bef1bad1fed1ULL,
+ (kernel_ulong_t) 0xdad2bef2bad2fed2ULL,
+ (kernel_ulong_t) 0xdad3bef3bad3fed3ULL,
+ (kernel_ulong_t) 0xdad4bef4bad4fed4ULL,
+ (kernel_ulong_t) 0xdad5bef5bad5fed5ULL,
+ };
+ int splice_in[2], splice_out[2];
+
+ ASSERT_EQ(0, pipe(splice_in));
+ ASSERT_EQ(0, pipe(splice_out));
+ ASSERT_EQ(sizeof(dummy), write(splice_in[1], dummy, sizeof(dummy)));
+
+ const struct {
+ struct si_entry entry[2];
+ struct si_exit exit[2];
+ } si[] = {
+ /* change scno, keep non-error rval */
+ {
+ {
+ {
+ __NR_gettid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_getppid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 0, tracer_pid }, { 0, tracer_pid }
+ }
+ },
+
+ /* set scno to -1, keep error rval */
+ {
+ {
+ {
+ __NR_chdir,
+ {
+ (uintptr_t) ".",
+ dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ -1,
+ {
+ (uintptr_t) ".",
+ dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 1, -ENOSYS }, { 1, -ENOSYS }
+ }
+ },
+
+ /* keep scno, change non-error rval */
+ {
+ {
+ {
+ __NR_getppid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_getppid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 0, tracer_pid }, { 0, tracer_pid + 1 }
+ }
+ },
+
+ /* change arg1, keep non-error rval */
+ {
+ {
+ {
+ __NR_chdir,
+ {
+ (uintptr_t) "",
+ dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_chdir,
+ {
+ (uintptr_t) ".",
+ dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 0, 0 }, { 0, 0 }
+ }
+ },
+
+ /* set scno to -1, change error rval to non-error */
+ {
+ {
+ {
+ __NR_gettid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ -1,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 1, -ENOSYS }, { 0, tracer_pid }
+ }
+ },
+
+ /* change scno, change non-error rval to error */
+ {
+ {
+ {
+ __NR_chdir,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_getppid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 0, tracer_pid }, { 1, -EISDIR }
+ }
+ },
+
+ /* change scno and all args, change non-error rval */
+ {
+ {
+ {
+ __NR_gettid,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_splice,
+ {
+ splice_in[0], 0, splice_out[1], 0,
+ sizeof(dummy), SPLICE_F_NONBLOCK
+ }
+ }
+ }, {
+ { 0, sizeof(dummy) }, { 0, sizeof(dummy) + 1 }
+ }
+ },
+
+ /* change arg1, no exit stop */
+ {
+ {
+ {
+ __NR_exit_group,
+ {
+ dummy[0], dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }, {
+ __NR_exit_group,
+ {
+ 0, dummy[1], dummy[2],
+ dummy[3], dummy[4], dummy[5]
+ }
+ }
+ }, {
+ { 0, 0 }, { 0, 0 }
+ }
+ },
+ };
+
+ long rc;
+ unsigned int i;
+
+ tracee_pid = fork();
+
+ ASSERT_LE(0, tracee_pid) {
+ TH_LOG("fork: %m");
+ }
+
+ if (tracee_pid == 0) {
+ /* get the pid before PTRACE_TRACEME */
+ tracee_pid = getpid();
+ ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) {
+ TH_LOG("PTRACE_TRACEME: %m");
+ }
+ ASSERT_EQ(0, kill(tracee_pid, SIGSTOP)) {
+ /* cannot happen */
+ TH_LOG("kill SIGSTOP: %m");
+ }
+ for (i = 0; i < ARRAY_SIZE(si); ++i) {
+ rc = syscall(si[i].entry[0].nr,
+ si[i].entry[0].args[0],
+ si[i].entry[0].args[1],
+ si[i].entry[0].args[2],
+ si[i].entry[0].args[3],
+ si[i].entry[0].args[4],
+ si[i].entry[0].args[5]);
+ if (si[i].exit[1].is_error) {
+ if (rc != -1 || errno != -si[i].exit[1].rval)
+ break;
+ } else {
+ if (rc != si[i].exit[1].rval)
+ break;
+ }
+ }
+ /*
+ * Something went wrong, but in this state tracee
+ * cannot reliably issue syscalls, so just crash.
+ */
+ *(volatile unsigned char *) (uintptr_t) i = 42;
+ /* unreachable */
+ _exit(i + 1);
+ }
+
+ for (ptrace_stop = 0; ; ++ptrace_stop) {
+ struct ptrace_syscall_info info = {
+ .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */
+ };
+ const size_t size = sizeof(info);
+ const int expected_entry_size =
+ (void *) &info.entry.args[6] - (void *) &info;
+ const int expected_exit_size =
+ (void *) (&info.exit.is_error + 1) -
+ (void *) &info;
+ int status;
+
+ ASSERT_EQ(tracee_pid, wait(&status)) {
+ /* cannot happen */
+ LOG_KILL_TRACEE("wait: %m");
+ }
+ if (WIFEXITED(status)) {
+ tracee_pid = 0; /* the tracee is no more */
+ ASSERT_EQ(0, WEXITSTATUS(status)) {
+ LOG_KILL_TRACEE("unexpected exit status %u",
+ WEXITSTATUS(status));
+ }
+ break;
+ }
+ ASSERT_FALSE(WIFSIGNALED(status)) {
+ tracee_pid = 0; /* the tracee is no more */
+ LOG_KILL_TRACEE("unexpected signal %u",
+ WTERMSIG(status));
+ }
+ ASSERT_TRUE(WIFSTOPPED(status)) {
+ /* cannot happen */
+ LOG_KILL_TRACEE("unexpected wait status %#x", status);
+ }
+
+ ASSERT_LT(ptrace_stop, ARRAY_SIZE(si) * 2) {
+ LOG_KILL_TRACEE("ptrace stop overflow");
+ }
+
+ switch (WSTOPSIG(status)) {
+ case SIGSTOP:
+ ASSERT_EQ(0, ptrace_stop) {
+ LOG_KILL_TRACEE("unexpected signal stop");
+ }
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, tracee_pid,
+ 0, PTRACE_O_TRACESYSGOOD)) {
+ LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m");
+ }
+ break;
+
+ case SIGTRAP | 0x80:
+ ASSERT_LT(0, ptrace_stop) {
+ LOG_KILL_TRACEE("unexpected syscall stop");
+ }
+ ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO,
+ tracee_pid, size,
+ (uintptr_t) &info))) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1: %m");
+ }
+ if (ptrace_stop & 1) {
+ /* entering syscall */
+ const struct si_entry *exp_entry =
+ &si[ptrace_stop / 2].entry[0];
+ const struct si_entry *set_entry =
+ &si[ptrace_stop / 2].entry[1];
+
+ /* check ptrace_syscall_info before the changes */
+ ASSERT_EQ(expected_entry_size, rc) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1"
+ ": entry stop mismatch");
+ }
+ check_psi_entry(_metadata, &info, exp_entry,
+ "PTRACE_GET_SYSCALL_INFO #1");
+
+ /* apply the changes */
+ info.entry.nr = set_entry->nr;
+ for (i = 0; i < ARRAY_SIZE(set_entry->args); ++i)
+ info.entry.args[i] = set_entry->args[i];
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO,
+ tracee_pid, size,
+ (uintptr_t) &info)) {
+ LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m");
+ }
+
+ /* check ptrace_syscall_info after the changes */
+ memset(&info, 0, sizeof(info));
+ info.op = 0xff;
+ ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO,
+ tracee_pid, size,
+ (uintptr_t) &info))) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m");
+ }
+ ASSERT_EQ(expected_entry_size, rc) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2"
+ ": entry stop mismatch");
+ }
+ check_psi_entry(_metadata, &info, set_entry,
+ "PTRACE_GET_SYSCALL_INFO #2");
+ } else {
+ /* exiting syscall */
+ const struct si_exit *exp_exit =
+ &si[ptrace_stop / 2 - 1].exit[0];
+ const struct si_exit *set_exit =
+ &si[ptrace_stop / 2 - 1].exit[1];
+
+ /* check ptrace_syscall_info before the changes */
+ ASSERT_EQ(expected_exit_size, rc) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1"
+ ": exit stop mismatch");
+ }
+ check_psi_exit(_metadata, &info, exp_exit,
+ "PTRACE_GET_SYSCALL_INFO #1");
+
+ /* apply the changes */
+ info.exit.is_error = set_exit->is_error;
+ info.exit.rval = set_exit->rval;
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO,
+ tracee_pid, size,
+ (uintptr_t) &info)) {
+ LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m");
+ }
+
+ /* check ptrace_syscall_info after the changes */
+ memset(&info, 0, sizeof(info));
+ info.op = 0xff;
+ ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO,
+ tracee_pid, size,
+ (uintptr_t) &info))) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2: %m");
+ }
+ ASSERT_EQ(expected_exit_size, rc) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2"
+ ": exit stop mismatch");
+ }
+ check_psi_exit(_metadata, &info, set_exit,
+ "PTRACE_GET_SYSCALL_INFO #2");
+ }
+ break;
+
+ default:
+ LOG_KILL_TRACEE("unexpected stop signal %u",
+ WSTOPSIG(status));
+ abort();
+ }
+
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, tracee_pid, 0, 0)) {
+ LOG_KILL_TRACEE("PTRACE_SYSCALL: %m");
+ }
+ }
+
+ ASSERT_EQ(ptrace_stop, ARRAY_SIZE(si) * 2);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c
index 9451782689de..ee25824b1cbe 100644
--- a/tools/testing/selftests/sched_ext/exit.c
+++ b/tools/testing/selftests/sched_ext/exit.c
@@ -22,6 +22,14 @@ static enum scx_test_status run(void *ctx)
struct bpf_link *link;
char buf[16];
+ /*
+ * On single-CPU systems, ops.select_cpu() is never
+ * invoked, so skip this test to avoid getting stuck
+ * indefinitely.
+ */
+ if (tc == EXIT_SELECT_CPU && libbpf_num_possible_cpus() == 1)
+ continue;
+
skel = exit__open();
SCX_ENUM_INIT(skel);
skel->rodata->exit_point = tc;
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 9aa44d8176d9..c6db7fa94f55 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -128,6 +128,32 @@
]
},
{
+ "id": "5456",
+ "name": "Test htb_dequeue_tree with deactivation and row emptying",
+ "category": [
+ "qdisc",
+ "htb"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: htb default 1",
+ "$TC class add dev $DUMMY parent 1: classid 1:1 htb rate 64bit ",
+ "$TC qdisc add dev $DUMMY parent 1:1 handle 2: netem",
+ "$TC qdisc add dev $DUMMY parent 2:1 handle 3: blackhole"
+ ],
+ "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.11.11",
+ "expExitCode": "1",
+ "verifyCmd": "$TC -j qdisc show dev $DUMMY",
+ "matchJSON": [],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root"
+ ]
+ },
+ {
"id": "c024",
"name": "Test TBF with SKBPRIO - catch qlen corner cases",
"category": [
@@ -635,5 +661,108 @@
"$TC qdisc del dev $DUMMY handle 1:0 root",
"$IP addr del 10.10.10.10/24 dev $DUMMY || true"
]
+ },
+ {
+ "id": "d74b",
+ "name": "Test use-after-free with DRR/NETEM/BLACKHOLE chain",
+ "category": [
+ "qdisc",
+ "hfsc",
+ "drr",
+ "netem",
+ "blackhole"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin",
+ "scapyPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: drr",
+ "$TC filter add dev $DUMMY parent 1: basic classid 1:1",
+ "$TC class add dev $DUMMY parent 1: classid 1:1 drr",
+ "$TC qdisc add dev $DUMMY parent 1:1 handle 2: hfsc def 1",
+ "$TC class add dev $DUMMY parent 2: classid 2:1 hfsc rt m1 8 d 1 m2 0",
+ "$TC qdisc add dev $DUMMY parent 2:1 handle 3: netem",
+ "$TC qdisc add dev $DUMMY parent 3:1 handle 4: blackhole",
+ "ping -c1 -W0.01 -I $DUMMY 10.10.11.11 || true",
+ "$TC class del dev $DUMMY classid 1:1"
+ ],
+ "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.11.11",
+ "expExitCode": "1",
+ "verifyCmd": "$TC -j class ls dev $DUMMY classid 1:1",
+ "matchJSON": [],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: drr"
+ ]
+ },
+ {
+ "id": "be28",
+ "name": "Try to add fq_codel qdisc as a child of an hhf qdisc",
+ "category": [
+ "qdisc",
+ "fq_codel",
+ "hhf"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY root handle a: hhf"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent a: handle b: fq_codel",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle b:",
+ "matchJSON": [],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root"
+ ]
+ },
+ {
+ "id": "fcb5",
+ "name": "Try to add pie qdisc as a child of a drr qdisc",
+ "category": [
+ "qdisc",
+ "pie",
+ "drr"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY root handle a: drr"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent a: handle b: pie",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle b:",
+ "matchJSON": [],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root"
+ ]
+ },
+ {
+ "id": "7801",
+ "name": "Try to add fq qdisc as a child of an inexistent hfsc class",
+ "category": [
+ "qdisc",
+ "sfq",
+ "hfsc"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY root handle a: hfsc"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent a:fff2 sfq limit 4",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle b:",
+ "matchJSON": [],
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root"
+ ]
}
]
diff --git a/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c b/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c
index 0326b39a11b9..30cab5d425d2 100644
--- a/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c
+++ b/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c
@@ -56,7 +56,7 @@ int main(int argc, char **argv)
}
if (write(fd, "1\n", 2) < 0) {
- perror("Can' enable power floor notifications\n");
+ perror("Can't enable power floor notifications\n");
exit(1);
}
diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index 217c3a641c53..a40097232967 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -37,7 +37,7 @@ void workload_hint_exit(int signum)
}
if (write(fd, "0\n", 2) < 0) {
- perror("Can' disable workload hints\n");
+ perror("Can't disable workload hints\n");
exit(1);
}
@@ -99,7 +99,7 @@ int main(int argc, char **argv)
}
if (write(fd, "1\n", 2) < 0) {
- perror("Can' enable workload hints\n");
+ perror("Can't enable workload hints\n");
exit(1);
}
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 4dde8838261d..5d7f4ecfb816 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -19,6 +19,7 @@ TEST_PROGS += test_generic_08.sh
TEST_PROGS += test_generic_09.sh
TEST_PROGS += test_generic_10.sh
TEST_PROGS += test_generic_11.sh
+TEST_PROGS += test_generic_12.sh
TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 5421774d7867..6e60f7d97125 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -46,9 +46,9 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag)
.tv_nsec = (long long)q->dev->private_data,
};
- ublk_queue_alloc_sqes(q, &sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), &sqe, 1);
io_uring_prep_timeout(sqe, &ts, 1, 0);
- sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, 1);
+ sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1);
ublk_queued_tgt_io(q, tag, 1);
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 509842df9bee..cfa59b631693 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -18,11 +18,11 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
unsigned ublk_op = ublksrv_get_op(iod);
struct io_uring_sqe *sqe[1];
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
- sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
return 1;
}
@@ -36,7 +36,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr;
if (!zc || auto_zc) {
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
if (!sqe[0])
return -ENOMEM;
@@ -48,26 +48,26 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de
sqe[0]->buf_index = tag;
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
- sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
return 1;
}
- ublk_queue_alloc_sqes(q, sqe, 3);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
iod->nr_sectors << 9,
iod->start_sector << 9);
sqe[1]->buf_index = tag;
sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
- sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
- sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
return 2;
}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index b5131a000795..e2d2042810d4 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -348,8 +348,8 @@ static void ublk_ctrl_dump(struct ublk_dev *dev)
for (i = 0; i < info->nr_hw_queues; i++) {
ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
- printf("\tqueue %u: tid %d affinity(%s)\n",
- i, dev->q[i].tid, buf);
+ printf("\tqueue %u: affinity(%s)\n",
+ i, buf);
}
free(affinity);
}
@@ -412,16 +412,6 @@ static void ublk_queue_deinit(struct ublk_queue *q)
int i;
int nr_ios = q->q_depth;
- io_uring_unregister_buffers(&q->ring);
-
- io_uring_unregister_ring_fd(&q->ring);
-
- if (q->ring.ring_fd > 0) {
- io_uring_unregister_files(&q->ring);
- close(q->ring.ring_fd);
- q->ring.ring_fd = -1;
- }
-
if (q->io_cmd_buf)
munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
@@ -429,20 +419,30 @@ static void ublk_queue_deinit(struct ublk_queue *q)
free(q->ios[i].buf_addr);
}
+static void ublk_thread_deinit(struct ublk_thread *t)
+{
+ io_uring_unregister_buffers(&t->ring);
+
+ io_uring_unregister_ring_fd(&t->ring);
+
+ if (t->ring.ring_fd > 0) {
+ io_uring_unregister_files(&t->ring);
+ close(t->ring.ring_fd);
+ t->ring.ring_fd = -1;
+ }
+}
+
static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
{
struct ublk_dev *dev = q->dev;
int depth = dev->dev_info.queue_depth;
- int i, ret = -1;
+ int i;
int cmd_buf_size, io_buf_size;
unsigned long off;
- int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
q->tgt_ops = dev->tgt.ops;
q->state = 0;
q->q_depth = depth;
- q->cmd_inflight = 0;
- q->tid = gettid();
if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
q->state |= UBLKSRV_NO_BUF;
@@ -467,6 +467,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
for (i = 0; i < q->q_depth; i++) {
q->ios[i].buf_addr = NULL;
q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE;
+ q->ios[i].tag = i;
if (q->state & UBLKSRV_NO_BUF)
continue;
@@ -479,39 +480,57 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
}
}
- ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth,
+ return 0;
+ fail:
+ ublk_queue_deinit(q);
+ ublk_err("ublk dev %d queue %d failed\n",
+ dev->dev_info.dev_id, q->q_id);
+ return -ENOMEM;
+}
+
+static int ublk_thread_init(struct ublk_thread *t)
+{
+ struct ublk_dev *dev = t->dev;
+ int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
+ int ret;
+
+ ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
IORING_SETUP_COOP_TASKRUN |
IORING_SETUP_SINGLE_ISSUER |
IORING_SETUP_DEFER_TASKRUN);
if (ret < 0) {
- ublk_err("ublk dev %d queue %d setup io_uring failed %d\n",
- q->dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
+ dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
- ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+ unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
+ unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
+ max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
+ ret = io_uring_register_buffers_sparse(
+ &t->ring, max_nr_ios_per_thread);
if (ret) {
- ublk_err("ublk dev %d queue %d register spare buffers failed %d",
- dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d register spare buffers failed %d",
+ dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
}
- io_uring_register_ring_fd(&q->ring);
+ io_uring_register_ring_fd(&t->ring);
- ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds);
+ ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
if (ret) {
- ublk_err("ublk dev %d queue %d register files failed %d\n",
- q->dev->dev_info.dev_id, q->q_id, ret);
+ ublk_err("ublk dev %d thread %d register files failed %d\n",
+ t->dev->dev_info.dev_id, t->idx, ret);
goto fail;
}
return 0;
- fail:
- ublk_queue_deinit(q);
- ublk_err("ublk dev %d queue %d failed\n",
- dev->dev_info.dev_id, q->q_id);
+fail:
+ ublk_thread_deinit(t);
+ ublk_err("ublk dev %d thread %d init failed\n",
+ dev->dev_info.dev_id, t->idx);
return -ENOMEM;
}
@@ -562,7 +581,7 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
if (q->tgt_ops->buf_index)
buf.index = q->tgt_ops->buf_index(q, tag);
else
- buf.index = tag;
+ buf.index = q->ios[tag].buf_index;
if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
@@ -570,8 +589,10 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}
-int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
+int ublk_queue_io_cmd(struct ublk_io *io)
{
+ struct ublk_thread *t = io->t;
+ struct ublk_queue *q = ublk_io_to_queue(io);
struct ublksrv_io_cmd *cmd;
struct io_uring_sqe *sqe[1];
unsigned int cmd_op = 0;
@@ -596,13 +617,13 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
cmd_op = UBLK_U_IO_FETCH_REQ;
- if (io_uring_sq_space_left(&q->ring) < 1)
- io_uring_submit(&q->ring);
+ if (io_uring_sq_space_left(&t->ring) < 1)
+ io_uring_submit(&t->ring);
- ublk_queue_alloc_sqes(q, sqe, 1);
+ ublk_io_alloc_sqes(io, sqe, 1);
if (!sqe[0]) {
- ublk_err("%s: run out of sqe %d, tag %d\n",
- __func__, q->q_id, tag);
+ ublk_err("%s: run out of sqe. thread %u, tag %d\n",
+ __func__, t->idx, io->tag);
return -1;
}
@@ -617,7 +638,7 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
sqe[0]->opcode = IORING_OP_URING_CMD;
sqe[0]->flags = IOSQE_FIXED_FILE;
sqe[0]->rw_flags = 0;
- cmd->tag = tag;
+ cmd->tag = io->tag;
cmd->q_id = q->q_id;
if (!(q->state & UBLKSRV_NO_BUF))
cmd->addr = (__u64) (uintptr_t) io->buf_addr;
@@ -625,37 +646,72 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
cmd->addr = 0;
if (q->state & UBLKSRV_AUTO_BUF_REG)
- ublk_set_auto_buf_reg(q, sqe[0], tag);
+ ublk_set_auto_buf_reg(q, sqe[0], io->tag);
- user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
+ user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
io_uring_sqe_set_data64(sqe[0], user_data);
io->flags = 0;
- q->cmd_inflight += 1;
+ t->cmd_inflight += 1;
- ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n",
- __func__, q->q_id, tag, cmd_op,
- io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING));
+ ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
+ __func__, t->idx, q->q_id, io->tag, cmd_op,
+ io->flags, !!(t->state & UBLKSRV_THREAD_STOPPING));
return 1;
}
-static void ublk_submit_fetch_commands(struct ublk_queue *q)
+static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
- int i = 0;
+ struct ublk_queue *q;
+ struct ublk_io *io;
+ int i = 0, j = 0;
- for (i = 0; i < q->q_depth; i++)
- ublk_queue_io_cmd(q, &q->ios[i], i);
+ if (t->dev->per_io_tasks) {
+ /*
+ * Lexicographically order all the (qid,tag) pairs, with
+ * qid taking priority (so (1,0) > (0,1)). Then make
+ * this thread the daemon for every Nth entry in this
+ * list (N is the number of threads), starting at this
+ * thread's index. This ensures that each queue is
+ * handled by as many ublk server threads as possible,
+ * so that load that is concentrated on one or a few
+ * queues can make use of all ublk server threads.
+ */
+ const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
+ int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
+ for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
+ int q_id = i / dinfo->queue_depth;
+ int tag = i % dinfo->queue_depth;
+ q = &t->dev->q[q_id];
+ io = &q->ios[tag];
+ io->t = t;
+ io->buf_index = j++;
+ ublk_queue_io_cmd(io);
+ }
+ } else {
+ /*
+ * Service exclusively the queue whose q_id matches our
+ * thread index.
+ */
+ struct ublk_queue *q = &t->dev->q[t->idx];
+ for (i = 0; i < q->q_depth; i++) {
+ io = &q->ios[i];
+ io->t = t;
+ io->buf_index = i;
+ ublk_queue_io_cmd(io);
+ }
+ }
}
-static int ublk_queue_is_idle(struct ublk_queue *q)
+static int ublk_thread_is_idle(struct ublk_thread *t)
{
- return !io_uring_sq_ready(&q->ring) && !q->io_inflight;
+ return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}
-static int ublk_queue_is_done(struct ublk_queue *q)
+static int ublk_thread_is_done(struct ublk_thread *t)
{
- return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q);
+ return (t->state & UBLKSRV_THREAD_STOPPING) && ublk_thread_is_idle(t);
}
static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
@@ -673,14 +729,16 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q,
q->tgt_ops->tgt_io_done(q, tag, cqe);
}
-static void ublk_handle_cqe(struct io_uring *r,
+static void ublk_handle_cqe(struct ublk_thread *t,
struct io_uring_cqe *cqe, void *data)
{
- struct ublk_queue *q = container_of(r, struct ublk_queue, ring);
+ struct ublk_dev *dev = t->dev;
+ unsigned q_id = user_data_to_q_id(cqe->user_data);
+ struct ublk_queue *q = &dev->q[q_id];
unsigned tag = user_data_to_tag(cqe->user_data);
unsigned cmd_op = user_data_to_op(cqe->user_data);
int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
- !(q->state & UBLKSRV_QUEUE_STOPPING);
+ !(t->state & UBLKSRV_THREAD_STOPPING);
struct ublk_io *io;
if (cqe->res < 0 && cqe->res != -ENODEV)
@@ -691,7 +749,7 @@ static void ublk_handle_cqe(struct io_uring *r,
__func__, cqe->res, q->q_id, tag, cmd_op,
is_target_io(cqe->user_data),
user_data_to_tgt_data(cqe->user_data),
- (q->state & UBLKSRV_QUEUE_STOPPING));
+ (t->state & UBLKSRV_THREAD_STOPPING));
/* Don't retrieve io in case of target io */
if (is_target_io(cqe->user_data)) {
@@ -700,10 +758,10 @@ static void ublk_handle_cqe(struct io_uring *r,
}
io = &q->ios[tag];
- q->cmd_inflight--;
+ t->cmd_inflight--;
if (!fetch) {
- q->state |= UBLKSRV_QUEUE_STOPPING;
+ t->state |= UBLKSRV_THREAD_STOPPING;
io->flags &= ~UBLKSRV_NEED_FETCH_RQ;
}
@@ -713,7 +771,7 @@ static void ublk_handle_cqe(struct io_uring *r,
q->tgt_ops->queue_io(q, tag);
} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE;
- ublk_queue_io_cmd(q, io, tag);
+ ublk_queue_io_cmd(io);
} else {
/*
* COMMIT_REQ will be completed immediately since no fetching
@@ -727,92 +785,93 @@ static void ublk_handle_cqe(struct io_uring *r,
}
}
-static int ublk_reap_events_uring(struct io_uring *r)
+static int ublk_reap_events_uring(struct ublk_thread *t)
{
struct io_uring_cqe *cqe;
unsigned head;
int count = 0;
- io_uring_for_each_cqe(r, head, cqe) {
- ublk_handle_cqe(r, cqe, NULL);
+ io_uring_for_each_cqe(&t->ring, head, cqe) {
+ ublk_handle_cqe(t, cqe, NULL);
count += 1;
}
- io_uring_cq_advance(r, count);
+ io_uring_cq_advance(&t->ring, count);
return count;
}
-static int ublk_process_io(struct ublk_queue *q)
+static int ublk_process_io(struct ublk_thread *t)
{
int ret, reapped;
- ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n",
- q->dev->dev_info.dev_id,
- q->q_id, io_uring_sq_ready(&q->ring),
- q->cmd_inflight,
- (q->state & UBLKSRV_QUEUE_STOPPING));
+ ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
+ t->dev->dev_info.dev_id,
+ t->idx, io_uring_sq_ready(&t->ring),
+ t->cmd_inflight,
+ (t->state & UBLKSRV_THREAD_STOPPING));
- if (ublk_queue_is_done(q))
+ if (ublk_thread_is_done(t))
return -ENODEV;
- ret = io_uring_submit_and_wait(&q->ring, 1);
- reapped = ublk_reap_events_uring(&q->ring);
+ ret = io_uring_submit_and_wait(&t->ring, 1);
+ reapped = ublk_reap_events_uring(t);
- ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n",
- ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING),
- (q->state & UBLKSRV_QUEUE_IDLE));
+ ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
+ ret, reapped, (t->state & UBLKSRV_THREAD_STOPPING),
+ (t->state & UBLKSRV_THREAD_IDLE));
return reapped;
}
-static void ublk_queue_set_sched_affinity(const struct ublk_queue *q,
+static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
cpu_set_t *cpuset)
{
if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
- ublk_err("ublk dev %u queue %u set affinity failed",
- q->dev->dev_info.dev_id, q->q_id);
+ ublk_err("ublk dev %u thread %u set affinity failed",
+ t->dev->dev_info.dev_id, t->idx);
}
-struct ublk_queue_info {
- struct ublk_queue *q;
- sem_t *queue_sem;
+struct ublk_thread_info {
+ struct ublk_dev *dev;
+ unsigned idx;
+ sem_t *ready;
cpu_set_t *affinity;
- unsigned char auto_zc_fallback;
};
static void *ublk_io_handler_fn(void *data)
{
- struct ublk_queue_info *info = data;
- struct ublk_queue *q = info->q;
- int dev_id = q->dev->dev_info.dev_id;
- unsigned extra_flags = 0;
+ struct ublk_thread_info *info = data;
+ struct ublk_thread *t = &info->dev->threads[info->idx];
+ int dev_id = info->dev->dev_info.dev_id;
int ret;
- if (info->auto_zc_fallback)
- extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+ t->dev = info->dev;
+ t->idx = info->idx;
- ret = ublk_queue_init(q, extra_flags);
+ ret = ublk_thread_init(t);
if (ret) {
- ublk_err("ublk dev %d queue %d init queue failed\n",
- dev_id, q->q_id);
+ ublk_err("ublk dev %d thread %u init failed\n",
+ dev_id, t->idx);
return NULL;
}
/* IO perf is sensitive with queue pthread affinity on NUMA machine*/
- ublk_queue_set_sched_affinity(q, info->affinity);
- sem_post(info->queue_sem);
+ if (info->affinity)
+ ublk_thread_set_sched_affinity(t, info->affinity);
+ sem_post(info->ready);
- ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n",
- q->tid, dev_id, q->q_id);
+ ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
+ gettid(), dev_id, t->idx);
/* submit all io commands to ublk driver */
- ublk_submit_fetch_commands(q);
+ ublk_submit_fetch_commands(t);
do {
- if (ublk_process_io(q) < 0)
+ if (ublk_process_io(t) < 0)
break;
} while (1);
- ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id);
- ublk_queue_deinit(q);
+ ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
+ gettid(), dev_id, t->idx);
+ ublk_thread_deinit(t);
return NULL;
}
@@ -855,20 +914,20 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
- struct ublk_queue_info *qinfo;
+ struct ublk_thread_info *tinfo;
+ unsigned extra_flags = 0;
cpu_set_t *affinity_buf;
void *thread_ret;
- sem_t queue_sem;
+ sem_t ready;
int ret, i;
ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
- qinfo = (struct ublk_queue_info *)calloc(sizeof(struct ublk_queue_info),
- dinfo->nr_hw_queues);
- if (!qinfo)
+ tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
+ if (!tinfo)
return -ENOMEM;
- sem_init(&queue_sem, 0, 0);
+ sem_init(&ready, 0, 0);
ret = ublk_dev_prep(ctx, dev);
if (ret)
return ret;
@@ -877,22 +936,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
if (ret)
return ret;
+ if (ctx->auto_zc_fallback)
+ extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+
for (i = 0; i < dinfo->nr_hw_queues; i++) {
dev->q[i].dev = dev;
dev->q[i].q_id = i;
- qinfo[i].q = &dev->q[i];
- qinfo[i].queue_sem = &queue_sem;
- qinfo[i].affinity = &affinity_buf[i];
- qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
- pthread_create(&dev->q[i].thread, NULL,
+ ret = ublk_queue_init(&dev->q[i], extra_flags);
+ if (ret) {
+ ublk_err("ublk dev %d queue %d init queue failed\n",
+ dinfo->dev_id, i);
+ goto fail;
+ }
+ }
+
+ for (i = 0; i < dev->nthreads; i++) {
+ tinfo[i].dev = dev;
+ tinfo[i].idx = i;
+ tinfo[i].ready = &ready;
+
+ /*
+ * If threads are not tied 1:1 to queues, setting thread
+ * affinity based on queue affinity makes little sense.
+ * However, thread CPU affinity has significant impact
+ * on performance, so to compare fairly, we'll still set
+ * thread CPU affinity based on queue affinity where
+ * possible.
+ */
+ if (dev->nthreads == dinfo->nr_hw_queues)
+ tinfo[i].affinity = &affinity_buf[i];
+ pthread_create(&dev->threads[i].thread, NULL,
ublk_io_handler_fn,
- &qinfo[i]);
+ &tinfo[i]);
}
- for (i = 0; i < dinfo->nr_hw_queues; i++)
- sem_wait(&queue_sem);
- free(qinfo);
+ for (i = 0; i < dev->nthreads; i++)
+ sem_wait(&ready);
+ free(tinfo);
free(affinity_buf);
/* everything is fine now, start us */
@@ -914,9 +995,11 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
/* wait until we are terminated */
- for (i = 0; i < dinfo->nr_hw_queues; i++)
- pthread_join(dev->q[i].thread, &thread_ret);
+ for (i = 0; i < dev->nthreads; i++)
+ pthread_join(dev->threads[i].thread, &thread_ret);
fail:
+ for (i = 0; i < dinfo->nr_hw_queues; i++)
+ ublk_queue_deinit(&dev->q[i]);
ublk_dev_unprep(dev);
ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
@@ -1022,13 +1105,14 @@ wait:
static int __cmd_dev_add(const struct dev_ctx *ctx)
{
+ unsigned nthreads = ctx->nthreads;
unsigned nr_queues = ctx->nr_hw_queues;
const char *tgt_type = ctx->tgt_type;
unsigned depth = ctx->queue_depth;
__u64 features;
const struct ublk_tgt_ops *ops;
struct ublksrv_ctrl_dev_info *info;
- struct ublk_dev *dev;
+ struct ublk_dev *dev = NULL;
int dev_id = ctx->dev_id;
int ret, i;
@@ -1036,29 +1120,55 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
if (!ops) {
ublk_err("%s: no such tgt type, type %s\n",
__func__, tgt_type);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
__func__, nr_queues, depth);
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* default to 1:1 threads:queues if nthreads is unspecified */
+ if (!nthreads)
+ nthreads = nr_queues;
+
+ if (nthreads > UBLK_MAX_THREADS) {
+ ublk_err("%s: %u is too many threads (max %u)\n",
+ __func__, nthreads, UBLK_MAX_THREADS);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (nthreads != nr_queues && !ctx->per_io_tasks) {
+ ublk_err("%s: threads %u must be same as queues %u if "
+ "not using per_io_tasks\n",
+ __func__, nthreads, nr_queues);
+ ret = -EINVAL;
+ goto fail;
}
dev = ublk_ctrl_init();
if (!dev) {
ublk_err("%s: can't alloc dev id %d, type %s\n",
__func__, dev_id, tgt_type);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail;
}
/* kernel doesn't support get_features */
ret = ublk_ctrl_get_features(dev, &features);
- if (ret < 0)
- return -EINVAL;
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto fail;
+ }
- if (!(features & UBLK_F_CMD_IOCTL_ENCODE))
- return -ENOTSUP;
+ if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
+ ret = -ENOTSUP;
+ goto fail;
+ }
info = &dev->dev_info;
info->dev_id = ctx->dev_id;
@@ -1068,6 +1178,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
if ((features & UBLK_F_QUIESCE) &&
(info->flags & UBLK_F_USER_RECOVERY))
info->flags |= UBLK_F_QUIESCE;
+ dev->nthreads = nthreads;
+ dev->per_io_tasks = ctx->per_io_tasks;
dev->tgt.ops = ops;
dev->tgt.sq_depth = depth;
dev->tgt.cq_depth = depth;
@@ -1097,7 +1209,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
fail:
if (ret < 0)
ublk_send_dev_event(ctx, dev, -1);
- ublk_ctrl_deinit(dev);
+ if (dev)
+ ublk_ctrl_deinit(dev);
return ret;
}
@@ -1159,6 +1272,8 @@ run:
shmctl(ctx->_shmid, IPC_RMID, NULL);
/* wait for child and detach from it */
wait(NULL);
+ if (exit_code == EXIT_FAILURE)
+ ublk_err("%s: command failed\n", __func__);
exit(exit_code);
} else {
exit(EXIT_FAILURE);
@@ -1266,6 +1381,7 @@ static int cmd_dev_get_features(void)
[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
+ [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
};
struct ublk_dev *dev;
__u64 features = 0;
@@ -1360,8 +1476,10 @@ static void __cmd_create_help(char *exe, bool recovery)
exe, recovery ? "recover" : "add");
printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
printf("\t[-e 0|1 ] [-i 0|1]\n");
+ printf("\t[--nthreads threads] [--per_io_tasks]\n");
printf("\t[target options] [backfile1] [backfile2] ...\n");
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
+ printf("\tdefault: nthreads=nr_queues");
for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
const struct ublk_tgt_ops *ops = tgt_ops_list[i];
@@ -1418,6 +1536,8 @@ int main(int argc, char *argv[])
{ "auto_zc", 0, NULL, 0 },
{ "auto_zc_fallback", 0, NULL, 0 },
{ "size", 1, NULL, 's'},
+ { "nthreads", 1, NULL, 0 },
+ { "per_io_tasks", 0, NULL, 0 },
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
@@ -1493,6 +1613,10 @@ int main(int argc, char *argv[])
ctx.flags |= UBLK_F_AUTO_BUF_REG;
if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
ctx.auto_zc_fallback = 1;
+ if (!strcmp(longopts[option_idx].name, "nthreads"))
+ ctx.nthreads = strtol(optarg, NULL, 10);
+ if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
+ ctx.per_io_tasks = 1;
break;
case '?':
/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index e34508bf5798..6be601536b3d 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -49,11 +49,14 @@
#define UBLKSRV_IO_IDLE_SECS 20
#define UBLK_IO_MAX_BYTES (1 << 20)
-#define UBLK_MAX_QUEUES 32
+#define UBLK_MAX_QUEUES_SHIFT 5
+#define UBLK_MAX_QUEUES (1 << UBLK_MAX_QUEUES_SHIFT)
+#define UBLK_MAX_THREADS_SHIFT 5
+#define UBLK_MAX_THREADS (1 << UBLK_MAX_THREADS_SHIFT)
#define UBLK_QUEUE_DEPTH 1024
#define UBLK_DBG_DEV (1U << 0)
-#define UBLK_DBG_QUEUE (1U << 1)
+#define UBLK_DBG_THREAD (1U << 1)
#define UBLK_DBG_IO_CMD (1U << 2)
#define UBLK_DBG_IO (1U << 3)
#define UBLK_DBG_CTRL_CMD (1U << 4)
@@ -61,6 +64,7 @@
struct ublk_dev;
struct ublk_queue;
+struct ublk_thread;
struct stripe_ctx {
/* stripe */
@@ -76,6 +80,7 @@ struct dev_ctx {
char tgt_type[16];
unsigned long flags;
unsigned nr_hw_queues;
+ unsigned short nthreads;
unsigned queue_depth;
int dev_id;
int nr_files;
@@ -85,6 +90,7 @@ struct dev_ctx {
unsigned int fg:1;
unsigned int recovery:1;
unsigned int auto_zc_fallback:1;
+ unsigned int per_io_tasks:1;
int _evtfd;
int _shmid;
@@ -123,10 +129,14 @@ struct ublk_io {
unsigned short flags;
unsigned short refs; /* used by target code only */
+ int tag;
+
int result;
+ unsigned short buf_index;
unsigned short tgt_ios;
void *private_data;
+ struct ublk_thread *t;
};
struct ublk_tgt_ops {
@@ -165,28 +175,39 @@ struct ublk_tgt {
struct ublk_queue {
int q_id;
int q_depth;
- unsigned int cmd_inflight;
- unsigned int io_inflight;
struct ublk_dev *dev;
const struct ublk_tgt_ops *tgt_ops;
struct ublksrv_io_desc *io_cmd_buf;
- struct io_uring ring;
+
struct ublk_io ios[UBLK_QUEUE_DEPTH];
-#define UBLKSRV_QUEUE_STOPPING (1U << 0)
-#define UBLKSRV_QUEUE_IDLE (1U << 1)
#define UBLKSRV_NO_BUF (1U << 2)
#define UBLKSRV_ZC (1U << 3)
#define UBLKSRV_AUTO_BUF_REG (1U << 4)
#define UBLKSRV_AUTO_BUF_REG_FALLBACK (1U << 5)
unsigned state;
- pid_t tid;
+};
+
+struct ublk_thread {
+ struct ublk_dev *dev;
+ struct io_uring ring;
+ unsigned int cmd_inflight;
+ unsigned int io_inflight;
+
pthread_t thread;
+ unsigned idx;
+
+#define UBLKSRV_THREAD_STOPPING (1U << 0)
+#define UBLKSRV_THREAD_IDLE (1U << 1)
+ unsigned state;
};
struct ublk_dev {
struct ublk_tgt tgt;
struct ublksrv_ctrl_dev_info dev_info;
struct ublk_queue q[UBLK_MAX_QUEUES];
+ struct ublk_thread threads[UBLK_MAX_THREADS];
+ unsigned nthreads;
+ unsigned per_io_tasks;
int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */
int nr_fds;
@@ -211,7 +232,7 @@ struct ublk_dev {
extern unsigned int ublk_dbg_mask;
-extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag);
+extern int ublk_queue_io_cmd(struct ublk_io *io);
static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
@@ -225,11 +246,14 @@ static inline int is_target_io(__u64 user_data)
}
static inline __u64 build_user_data(unsigned tag, unsigned op,
- unsigned tgt_data, unsigned is_target_io)
+ unsigned tgt_data, unsigned q_id, unsigned is_target_io)
{
- assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16));
+ /* we only have 7 bits to encode q_id */
+ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+ assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
- return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63;
+ return tag | (op << 16) | (tgt_data << 24) |
+ (__u64)q_id << 56 | (__u64)is_target_io << 63;
}
static inline unsigned int user_data_to_tag(__u64 user_data)
@@ -247,6 +271,11 @@ static inline unsigned int user_data_to_tgt_data(__u64 user_data)
return (user_data >> 24) & 0xffff;
}
+static inline unsigned int user_data_to_q_id(__u64 user_data)
+{
+ return (user_data >> 56) & 0x7f;
+}
+
static inline unsigned short ublk_cmd_op_nr(unsigned int op)
{
return _IOC_NR(op);
@@ -280,17 +309,23 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
}
}
-static inline int ublk_queue_alloc_sqes(struct ublk_queue *q,
+static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
+{
+ return container_of(io, struct ublk_queue, ios[io->tag]);
+}
+
+static inline int ublk_io_alloc_sqes(struct ublk_io *io,
struct io_uring_sqe *sqes[], int nr_sqes)
{
- unsigned left = io_uring_sq_space_left(&q->ring);
+ struct io_uring *ring = &io->t->ring;
+ unsigned left = io_uring_sq_space_left(ring);
int i;
if (left < nr_sqes)
- io_uring_submit(&q->ring);
+ io_uring_submit(ring);
for (i = 0; i < nr_sqes; i++) {
- sqes[i] = io_uring_get_sqe(&q->ring);
+ sqes[i] = io_uring_get_sqe(ring);
if (!sqes[i])
return i;
}
@@ -373,7 +408,7 @@ static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res)
ublk_mark_io_done(io, res);
- return ublk_queue_io_cmd(q, io, tag);
+ return ublk_queue_io_cmd(io);
}
static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued)
@@ -383,7 +418,7 @@ static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int qu
else {
struct ublk_io *io = ublk_get_io(q, tag);
- q->io_inflight += queued;
+ io->t->io_inflight += queued;
io->tgt_ios = queued;
io->result = 0;
}
@@ -393,7 +428,7 @@ static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag)
{
struct ublk_io *io = ublk_get_io(q, tag);
- q->io_inflight--;
+ io->t->io_inflight--;
return --io->tgt_ios == 0;
}
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 44aca31cf2b0..afe0b99d77ee 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -43,7 +43,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
}
static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
- struct io_uring_sqe *sqe)
+ struct io_uring_sqe *sqe, int q_id)
{
unsigned ublk_op = ublksrv_get_op(iod);
@@ -52,7 +52,7 @@ static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
sqe->flags |= IOSQE_FIXED_FILE;
sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
sqe->len = iod->nr_sectors << 9; /* injected result */
- sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
+ sqe->user_data = build_user_data(tag, ublk_op, 0, q_id, 1);
}
static int null_queue_zc_io(struct ublk_queue *q, int tag)
@@ -60,18 +60,18 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe[3];
- ublk_queue_alloc_sqes(q, sqe, 3);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
- __setup_nop_io(tag, iod, sqe[1]);
+ __setup_nop_io(tag, iod, sqe[1], q->q_id);
sqe[1]->flags |= IOSQE_IO_HARDLINK;
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
- sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
return 2;
@@ -82,8 +82,8 @@ static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe[1];
- ublk_queue_alloc_sqes(q, sqe, 1);
- __setup_nop_io(tag, iod, sqe[0]);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1);
+ __setup_nop_io(tag, iod, sqe[0], q->q_id);
return 1;
}
@@ -136,7 +136,7 @@ static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
{
if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
return (unsigned short)-1;
- return tag;
+ return q->ios[tag].buf_index;
}
const struct ublk_tgt_ops null_tgt_ops = {
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 404a143bf3d6..37d50bbf5f5e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -138,13 +138,13 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
io->private_data = s;
calculate_stripe_array(conf, iod, s, base);
- ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra);
if (zc) {
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
+ io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
- ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
+ ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
}
for (i = zc; i < s->nr + extra - zc; i++) {
@@ -162,13 +162,14 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
sqe[i]->flags |= IOSQE_IO_HARDLINK;
}
/* bit63 marks us as tgt io */
- sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
+ sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, q->q_id, 1);
}
if (zc) {
struct io_uring_sqe *unreg = sqe[s->nr + 1];
- io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
- unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1);
+ io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
+ unreg->user_data = build_user_data(
+ tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
}
/* register buffer is skip_success */
@@ -181,11 +182,11 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod,
struct io_uring_sqe *sqe[NR_STRIPE];
int i;
- ublk_queue_alloc_sqes(q, sqe, conf->nr_files);
+ ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, conf->nr_files);
for (i = 0; i < conf->nr_files; i++) {
io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
- sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1);
+ sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, q->q_id, 1);
}
return conf->nr_files;
}
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 0145569ee7e9..8a4dbd09feb0 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -278,6 +278,11 @@ __run_io_and_remove()
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
--rw=randrw --norandommap --iodepth=256 --size="${size}" --numjobs="$(nproc)" \
--runtime=20 --time_based > /dev/null 2>&1 &
+ fio --name=batchjob --filename=/dev/ublkb"${dev_id}" --ioengine=io_uring \
+ --rw=randrw --norandommap --iodepth=256 --size="${size}" \
+ --numjobs="$(nproc)" --runtime=20 --time_based \
+ --iodepth_batch_submit=32 --iodepth_batch_complete_min=32 \
+ --force_async=7 > /dev/null 2>&1 &
sleep 2
if [ "${kill_server}" = "yes" ]; then
local state
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
new file mode 100755
index 000000000000..7abbb00d251d
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_12"
+ERR_CODE=0
+
+if ! _have_program bpftrace; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "do imbalanced load, it should be balanced over I/O threads"
+
+NTHREADS=6
+dev_id=$(_add_ublk_dev -t null -q 4 -d 16 --nthreads $NTHREADS --per_io_tasks)
+_check_add_dev $TID $?
+
+dev_t=$(_get_disk_dev_t "$dev_id")
+bpftrace trace/count_ios_per_tid.bt "$dev_t" > "$UBLK_TMP" 2>&1 &
+btrace_pid=$!
+sleep 2
+
+if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
+ _cleanup_test "null"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# do imbalanced I/O on the ublk device
+# pin to cpu 0 to prevent migration/only target one queue
+fio --name=write_seq \
+ --filename=/dev/ublkb"${dev_id}" \
+ --ioengine=libaio --iodepth=16 \
+ --rw=write \
+ --size=512M \
+ --direct=1 \
+ --bs=4k \
+ --cpus_allowed=0 > /dev/null 2>&1
+ERR_CODE=$?
+kill "$btrace_pid"
+wait
+
+# check that every task handles some I/O, even though all I/O was issued
+# from a single CPU. when ublk gets support for round-robin tag
+# allocation, this check can be strengthened to assert that every thread
+# handles the same number of I/Os
+NR_THREADS_THAT_HANDLED_IO=$(grep -c '@' ${UBLK_TMP})
+if [[ $NR_THREADS_THAT_HANDLED_IO -ne $NTHREADS ]]; then
+ echo "only $NR_THREADS_THAT_HANDLED_IO handled I/O! expected $NTHREADS"
+ cat "$UBLK_TMP"
+ ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index 7d728ce50774..3ed4c9b2d8c0 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -32,14 +32,23 @@ _create_backfile 2 128M
ublk_io_and_remove 8G -t null -q 4 -z &
ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
if _have_feature "AUTO_BUF_REG"; then
ublk_io_and_remove 8G -t null -q 4 --auto_zc &
ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
+ wait
+fi
+
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
+ ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
+ wait
fi
-wait
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 9bcfa64ea1f0..40d1437ca298 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -38,6 +38,13 @@ if _have_feature "AUTO_BUF_REG"; then
ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
fi
+
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+ ublk_io_and_kill_daemon 256M -t loop -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_kill_daemon 256M -t stripe -q 4 --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_kill_daemon 8G -t null -q 4 --nthreads 8 --per_io_tasks &
+fi
wait
_cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index bcfc904cefc6..566cfd90d192 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -69,5 +69,12 @@ if _have_feature "AUTO_BUF_REG"; then
done
fi
+if _have_feature "PER_IO_DAEMON"; then
+ ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+ ublk_io_and_remove 256M -t loop -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_remove 8G -t null -q 4 --nthreads 8 --per_io_tasks -r 1 -i "$reissue" &
+fi
+wait
+
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
new file mode 100644
index 000000000000..f4aa63ff2938
--- /dev/null
+++ b/tools/testing/selftests/ublk/trace/count_ios_per_tid.bt
@@ -0,0 +1,11 @@
+/*
+ * Tabulates and prints I/O completions per thread for the given device
+ *
+ * $1: dev_t
+*/
+tracepoint:block:block_rq_complete
+{
+ if (args.dev == $1) {
+ @[tid] = count();
+ }
+}
diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index d6e09af7c0a9..a4a82e1c28a9 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -11,6 +11,8 @@
#include "../../../../arch/loongarch/vdso/vgetrandom-chacha.S"
#elif defined(__powerpc__) || defined(__powerpc64__)
#include "../../../../arch/powerpc/kernel/vdso/vgetrandom-chacha.S"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S"
#elif defined(__s390x__)
#include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
#elif defined(__x86_64__)
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index f703fcfe9f7c..83148875a12c 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" trivial_program.c -no-pie)
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
- test_vsyscall mov_ss_trap \
+ test_vsyscall mov_ss_trap sigtrap_loop \
syscall_arg_fault fsgsbase_restore sigaltstack
TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
diff --git a/tools/testing/selftests/x86/sigtrap_loop.c b/tools/testing/selftests/x86/sigtrap_loop.c
new file mode 100644
index 000000000000..9d065479e89f
--- /dev/null
+++ b/tools/testing/selftests/x86/sigtrap_loop.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Intel Corporation
+ */
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ucontext.h>
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags)
+{
+ struct sigaction sa;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+
+ return;
+}
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t *)ctx_void;
+ static unsigned int loop_count_on_same_ip;
+ static unsigned long last_trap_ip;
+
+ if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) {
+ printf("\tTrapped at %016lx\n", last_trap_ip);
+
+ /*
+ * If the same IP is hit more than 10 times in a row, it is
+ * _considered_ an infinite loop.
+ */
+ if (++loop_count_on_same_ip > 10) {
+ printf("[FAIL]\tDetected SIGTRAP infinite loop\n");
+ exit(1);
+ }
+
+ return;
+ }
+
+ loop_count_on_same_ip = 0;
+ last_trap_ip = ctx->uc_mcontext.gregs[REG_IP];
+ printf("\tTrapped at %016lx\n", last_trap_ip);
+}
+
+int main(int argc, char *argv[])
+{
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ /*
+ * Set the Trap Flag (TF) to single-step the test code, therefore to
+ * trigger a SIGTRAP signal after each instruction until the TF is
+ * cleared.
+ *
+ * Because the arithmetic flags are not significant here, the TF is
+ * set by pushing 0x302 onto the stack and then popping it into the
+ * flags register.
+ *
+ * Four instructions in the following asm code are executed with the
+ * TF set, thus the SIGTRAP handler is expected to run four times.
+ */
+ printf("[RUN]\tSIGTRAP infinite loop detection\n");
+ asm volatile(
+#ifdef __x86_64__
+ /*
+ * Avoid clobbering the redzone
+ *
+ * Equivalent to "sub $128, %rsp", however -128 can be encoded
+ * in a single byte immediate while 128 uses 4 bytes.
+ */
+ "add $-128, %rsp\n\t"
+#endif
+ "push $0x302\n\t"
+ "popf\n\t"
+ "nop\n\t"
+ "nop\n\t"
+ "push $0x202\n\t"
+ "popf\n\t"
+#ifdef __x86_64__
+ "sub $-128, %rsp\n\t"
+#endif
+ );
+
+ printf("[OK]\tNo SIGTRAP infinite loop detected\n");
+ return 0;
+}