summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-27 19:42:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-27 19:42:02 -0700
commit7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
treecc8fd6b4f936ec01e73238643757451e20478c07 /tools
parent91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of switching from a user process to a kernel thread. - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav. - zsmalloc performance improvements from Sergey Senozhatsky. - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables. - VFS rationalizations from Christoph Hellwig: - removal of most of the callers of write_one_page() - make __filemap_get_folio()'s return value more useful - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'. - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits. - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n). - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd, permitting userspace to wr-protect anon memory unpopulated ptes. - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning. - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte. - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness. - Cleanups to do_fault_around() by Lorenzo Stoakes. - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c. - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more. - Lorenzo has also coverted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases. - Lorenzo has also contributed further cleanups of vma_merge(). - Chaitanya Prakash provides some fixes to the mmap selftesting code. - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults. - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking. - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu isolated workloads. - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic. - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used. - Yosry Ahmed has improved the performance of memcg statistics flushing. - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem. - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths. - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing. - Pankaj Raghav has made page_endio() unneeded and has removed it. - Peter Xu contributed some rationalizations of the userfaultfd selftests. - Yosry Ahmed has fixed an issue around memcg's page recalim accounting. - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code. - Longlong Xia has improved the way in which KSM handles hwpoisoned pages. - Peter Xu fixes a few issues with uffd-wp at fork() time. - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis. * tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits) mm,unmap: avoid flushing TLB in batch if PTE is inaccessible shmem: restrict noswap option to initial user namespace mm/khugepaged: fix conflicting mods to collapse_file() sparse: remove unnecessary 0 values from rc mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() maple_tree: fix allocation in mas_sparse_area() mm: do not increment pgfault stats when page fault handler retries zsmalloc: allow only one active pool compaction context selftests/mm: add new selftests for KSM mm: add new KSM process and sysfs knobs mm: add new api to enable ksm per process mm: shrinkers: fix debugfs file permissions mm: don't check VMA write permissions if the PTE/PMD indicates write permissions migrate_pages_batch: fix statistics for longterm pin retry userfaultfd: use helper function range_in_vma() lib/show_mem.c: use for_each_populated_zone() simplify code mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer: add folio_create_empty_buffers helper ...
Diffstat (limited to 'tools')
-rw-r--r--tools/include/uapi/linux/prctl.h2
-rw-r--r--tools/testing/memblock/linux/mmzone.h6
-rw-r--r--tools/testing/radix-tree/maple.c24
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c14
-rw-r--r--tools/testing/selftests/mm/.gitignore5
-rw-r--r--tools/testing/selftests/mm/Makefile92
-rw-r--r--tools/testing/selftests/mm/check_config.sh4
-rw-r--r--tools/testing/selftests/mm/cow.c33
-rw-r--r--tools/testing/selftests/mm/gup_test.c5
-rw-r--r--tools/testing/selftests/mm/hugepage-mremap.c9
-rw-r--r--tools/testing/selftests/mm/hugetlb-madvise.c25
-rw-r--r--tools/testing/selftests/mm/khugepaged.c4
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c97
-rw-r--r--tools/testing/selftests/mm/ksm_tests.c174
-rw-r--r--tools/testing/selftests/mm/mkdirty.c379
-rw-r--r--tools/testing/selftests/mm/mrelease_test.c11
-rw-r--r--tools/testing/selftests/mm/run_vmtests.sh46
-rw-r--r--tools/testing/selftests/mm/soft-dirty.c3
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c10
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c19
-rw-r--r--tools/testing/selftests/mm/transhuge-stress.c12
-rw-r--r--tools/testing/selftests/mm/uffd-common.c618
-rw-r--r--tools/testing/selftests/mm/uffd-common.h117
-rw-r--r--tools/testing/selftests/mm/uffd-stress.c481
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c1228
-rw-r--r--tools/testing/selftests/mm/userfaultfd.c1858
-rw-r--r--tools/testing/selftests/mm/util.h69
-rw-r--r--tools/testing/selftests/mm/va_high_addr_switch.c (renamed from tools/testing/selftests/mm/va_128TBswitch.c)41
-rw-r--r--tools/testing/selftests/mm/va_high_addr_switch.sh (renamed from tools/testing/selftests/mm/va_128TBswitch.sh)6
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c24
-rw-r--r--tools/testing/selftests/mm/vm_util.c180
-rw-r--r--tools/testing/selftests/mm/vm_util.h50
32 files changed, 3480 insertions, 2166 deletions
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 1312a137f7fb..759b3f53e53f 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -290,4 +290,6 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0
+#define PR_SET_MEMORY_MERGE 67
+#define PR_GET_MEMORY_MERGE 68
#endif /* _LINUX_PRCTL_H */
diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h
index e65f89b12f1c..134f8eab0768 100644
--- a/tools/testing/memblock/linux/mmzone.h
+++ b/tools/testing/memblock/linux/mmzone.h
@@ -17,10 +17,10 @@ enum zone_type {
};
#define MAX_NR_ZONES __MAX_NR_ZONES
-#define MAX_ORDER 11
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER 10
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
-#define pageblock_order (MAX_ORDER - 1)
+#define pageblock_order MAX_ORDER
#define pageblock_nr_pages BIT(pageblock_order)
#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages)
#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages)
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 4c89ff333f6f..9286d3baa12d 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -55,6 +55,28 @@ struct rcu_reader_struct {
struct rcu_test_struct2 *test;
};
+static int get_alloc_node_count(struct ma_state *mas)
+{
+ int count = 1;
+ struct maple_alloc *node = mas->alloc;
+
+ if (!node || ((unsigned long)node & 0x1))
+ return 0;
+ while (node->node_count) {
+ count += node->node_count;
+ node = node->slot[0];
+ }
+ return count;
+}
+
+static void check_mas_alloc_node_count(struct ma_state *mas)
+{
+ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL);
+ mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL);
+ MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total);
+ mas_destroy(mas);
+}
+
/*
* check_new_node() - Check the creation of new nodes and error path
* verification.
@@ -69,6 +91,8 @@ static noinline void check_new_node(struct maple_tree *mt)
MA_STATE(mas, mt, 0, 0);
+ check_mas_alloc_node_count(&mas);
+
/* Try allocating 3 nodes */
mtree_lock(mt);
mt_set_non_kernel(0);
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index ae71f15f790d..dba0e8ba002f 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -43,6 +43,9 @@
*/
static size_t mfd_def_size = MFD_DEF_SIZE;
static const char *memfd_str = MEMFD_STR;
+static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *));
+static int newpid_thread_fn2(void *arg);
+static void join_newpid_thread(pid_t pid);
static ssize_t fd2name(int fd, char *buf, size_t bufsize)
{
@@ -1111,6 +1114,7 @@ static void test_noexec_seal(void)
static void test_sysctl_child(void)
{
int fd;
+ int pid;
printf("%s sysctl 0\n", memfd_str);
sysctl_assert_write("0");
@@ -1129,6 +1133,10 @@ static void test_sysctl_child(void)
mfd_def_size,
MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ printf("%s child ns\n", memfd_str);
+ pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
+ join_newpid_thread(pid);
+
mfd_assert_mode(fd, 0666);
mfd_assert_has_seals(fd, F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
@@ -1206,12 +1214,6 @@ static void test_sysctl(void)
int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn);
join_newpid_thread(pid);
-
- printf("%s child ns\n", memfd_str);
- sysctl_assert_write("1");
-
- pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2);
- join_newpid_thread(pid);
}
/*
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 1f8c36a9fa10..8917455f4f51 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,7 +21,8 @@ protection_keys
protection_keys_32
protection_keys_64
madv_populate
-userfaultfd
+uffd-stress
+uffd-unit-tests
mlock-intersect-test
mlock-random-test
virtual_address_range
@@ -36,3 +37,5 @@ split_huge_page_test
ksm_tests
local_config.h
local_config.mk
+ksm_functional_tests
+mdwe_test
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index fc35050b5542..23af4633f0f4 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -20,7 +20,7 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p
# Avoid accidental wrong builds, due to built-in rules working just a little
# bit too well--but not quite as well as required for our situation here.
#
-# In other words, "make userfaultfd" is supposed to fail to build at all,
+# In other words, "make $SOME_TEST" is supposed to fail to build at all,
# because this Makefile only supports either "make" (all), or "make /full/path".
# However, the built-in rules, if not suppressed, will pick up CFLAGS and the
# initial LDLIBS (but not the target-specific LDLIBS, because those are only
@@ -29,36 +29,39 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/p
# LDLIBS.
MAKEFLAGS += --no-builtin-rules
-CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/tools/include/uapi $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
LDLIBS = -lrt -lpthread
-TEST_GEN_FILES = cow
-TEST_GEN_FILES += compaction_test
-TEST_GEN_FILES += gup_test
-TEST_GEN_FILES += hmm-tests
-TEST_GEN_FILES += hugetlb-madvise
-TEST_GEN_FILES += hugepage-mmap
-TEST_GEN_FILES += hugepage-mremap
-TEST_GEN_FILES += hugepage-shm
-TEST_GEN_FILES += hugepage-vmemmap
-TEST_GEN_FILES += khugepaged
-TEST_GEN_PROGS = madv_populate
-TEST_GEN_FILES += map_fixed_noreplace
-TEST_GEN_FILES += map_hugetlb
-TEST_GEN_FILES += map_populate
-TEST_GEN_FILES += memfd_secret
-TEST_GEN_FILES += migration
-TEST_GEN_FILES += mlock-random-test
-TEST_GEN_FILES += mlock2-tests
-TEST_GEN_FILES += mrelease_test
-TEST_GEN_FILES += mremap_dontunmap
-TEST_GEN_FILES += mremap_test
-TEST_GEN_FILES += on-fault-limit
-TEST_GEN_FILES += thuge-gen
-TEST_GEN_FILES += transhuge-stress
-TEST_GEN_FILES += userfaultfd
+
+TEST_GEN_PROGS = cow
+TEST_GEN_PROGS += compaction_test
+TEST_GEN_PROGS += gup_test
+TEST_GEN_PROGS += hmm-tests
+TEST_GEN_PROGS += hugetlb-madvise
+TEST_GEN_PROGS += hugepage-mmap
+TEST_GEN_PROGS += hugepage-mremap
+TEST_GEN_PROGS += hugepage-shm
+TEST_GEN_PROGS += hugepage-vmemmap
+TEST_GEN_PROGS += khugepaged
+TEST_GEN_PROGS += madv_populate
+TEST_GEN_PROGS += map_fixed_noreplace
+TEST_GEN_PROGS += map_hugetlb
+TEST_GEN_PROGS += map_populate
+TEST_GEN_PROGS += memfd_secret
+TEST_GEN_PROGS += migration
+TEST_GEN_PROGS += mkdirty
+TEST_GEN_PROGS += mlock-random-test
+TEST_GEN_PROGS += mlock2-tests
+TEST_GEN_PROGS += mrelease_test
+TEST_GEN_PROGS += mremap_dontunmap
+TEST_GEN_PROGS += mremap_test
+TEST_GEN_PROGS += on-fault-limit
+TEST_GEN_PROGS += thuge-gen
+TEST_GEN_PROGS += transhuge-stress
+TEST_GEN_PROGS += uffd-stress
+TEST_GEN_PROGS += uffd-unit-tests
TEST_GEN_PROGS += soft-dirty
TEST_GEN_PROGS += split_huge_page_test
-TEST_GEN_FILES += ksm_tests
+TEST_GEN_PROGS += ksm_tests
TEST_GEN_PROGS += ksm_functional_tests
TEST_GEN_PROGS += mdwe_test
@@ -76,41 +79,38 @@ CFLAGS += -no-pie
endif
ifeq ($(CAN_BUILD_I386),1)
-TEST_GEN_FILES += $(BINARIES_32)
+TEST_GEN_PROGS += $(BINARIES_32)
endif
ifeq ($(CAN_BUILD_X86_64),1)
-TEST_GEN_FILES += $(BINARIES_64)
+TEST_GEN_PROGS += $(BINARIES_64)
endif
else
ifneq (,$(findstring $(MACHINE),ppc64))
-TEST_GEN_FILES += protection_keys
+TEST_GEN_PROGS += protection_keys
endif
endif
ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64))
-TEST_GEN_FILES += va_128TBswitch
-TEST_GEN_FILES += virtual_address_range
-TEST_GEN_FILES += write_to_hugetlbfs
+TEST_GEN_PROGS += va_high_addr_switch
+TEST_GEN_PROGS += virtual_address_range
+TEST_GEN_PROGS += write_to_hugetlbfs
endif
TEST_PROGS := run_vmtests.sh
TEST_FILES := test_vmalloc.sh
TEST_FILES += test_hmm.sh
-TEST_FILES += va_128TBswitch.sh
+TEST_FILES += va_high_addr_switch.sh
include ../lib.mk
-$(OUTPUT)/cow: vm_util.c
-$(OUTPUT)/khugepaged: vm_util.c
-$(OUTPUT)/ksm_functional_tests: vm_util.c
-$(OUTPUT)/madv_populate: vm_util.c
-$(OUTPUT)/soft-dirty: vm_util.c
-$(OUTPUT)/split_huge_page_test: vm_util.c
-$(OUTPUT)/userfaultfd: vm_util.c
+$(TEST_GEN_PROGS): vm_util.c
+
+$(OUTPUT)/uffd-stress: uffd-common.c
+$(OUTPUT)/uffd-unit-tests: uffd-common.c
ifeq ($(MACHINE),x86_64)
BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
@@ -161,8 +161,8 @@ warn_32bit_failure:
endif
endif
-# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
-$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS)
+# IOURING_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/cow: LDLIBS += $(IOURING_EXTRA_LIBS)
$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
@@ -175,11 +175,11 @@ local_config.mk local_config.h: check_config.sh
EXTRA_CLEAN += local_config.mk local_config.h
-ifeq ($(COW_EXTRA_LIBS),)
+ifeq ($(IOURING_EXTRA_LIBS),)
all: warn_missing_liburing
warn_missing_liburing:
@echo ; \
- echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
+ echo "Warning: missing liburing support. Some tests will be skipped." ; \
echo
endif
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
index bcba3af0acea..3954f4746161 100644
--- a/tools/testing/selftests/mm/check_config.sh
+++ b/tools/testing/selftests/mm/check_config.sh
@@ -21,11 +21,11 @@ $CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
if [ -f $tmpfile_o ]; then
echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
- echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
+ echo "IOURING_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
else
echo "// No liburing support found" > $OUTPUT_H_FILE
echo "# No liburing support found, so:" > $OUTPUT_MKFILE
- echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE
+ echo "IOURING_EXTRA_LIBS = " >> $OUTPUT_MKFILE
fi
rm ${tmpname}.*
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index 0eb2e8180aa5..dc9d6fe86028 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -45,34 +45,6 @@ static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;
-static void detect_thpsize(void)
-{
- int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
- O_RDONLY);
- size_t size = 0;
- char buf[15];
- int ret;
-
- if (fd < 0)
- return;
-
- ret = pread(fd, buf, sizeof(buf), 0);
- if (ret > 0 && ret < sizeof(buf)) {
- buf[ret] = 0;
-
- size = strtoul(buf, NULL, 10);
- if (size < pagesize)
- size = 0;
- if (size > 0) {
- thpsize = size;
- ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
- thpsize / 1024);
- }
- }
-
- close(fd);
-}
-
static void detect_huge_zeropage(void)
{
int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
@@ -1741,7 +1713,10 @@ int main(int argc, char **argv)
int err;
pagesize = getpagesize();
- detect_thpsize();
+ thpsize = read_pmd_pagesize();
+ if (thpsize)
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+ thpsize / 1024);
detect_hugetlbsizes();
detect_huge_zeropage();
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
index e43879291dac..ec2229136384 100644
--- a/tools/testing/selftests/mm/gup_test.c
+++ b/tools/testing/selftests/mm/gup_test.c
@@ -12,8 +12,7 @@
#include <assert.h>
#include <mm/gup_test.h>
#include "../kselftest.h"
-
-#include "util.h"
+#include "vm_util.h"
#define MB (1UL << 20)
@@ -251,7 +250,7 @@ int main(int argc, char **argv)
if (touch) {
gup.gup_flags |= FOLL_TOUCH;
} else {
- for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+ for (; (unsigned long)p < gup.addr + size; p += psize())
p[0] = 0;
}
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index e53b5eaa8fce..cabd0084f57b 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -23,6 +23,8 @@
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>
+#include <stdbool.h>
+#include "vm_util.h"
#define DEFAULT_LENGTH_MB 10UL
#define MB_TO_BYTES(x) (x * 1024 * 1024)
@@ -60,7 +62,6 @@ static void register_region_with_uffd(char *addr, size_t len)
{
long uffd; /* userfaultfd file descriptor */
struct uffdio_api uffdio_api;
- struct uffdio_register uffdio_register;
/* Create and enable userfaultfd object. */
@@ -96,11 +97,7 @@ static void register_region_with_uffd(char *addr, size_t len)
* handling by the userfaultfd object. In mode, we request to track
* missing pages (i.e., pages that have not yet been faulted in).
*/
-
- uffdio_register.range.start = (unsigned long)addr;
- uffdio_register.range.len = len;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+ if (uffd_register(uffd, addr, len, true, false, false)) {
perror("ioctl-UFFDIO_REGISTER");
exit(1);
}
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index 9a127a8fe176..28426e30d9bc 100644
--- a/tools/testing/selftests/mm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -18,6 +18,7 @@
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
+#include "vm_util.h"
#define MIN_FREE_PAGES 20
#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */
@@ -35,30 +36,6 @@
unsigned long huge_page_size;
unsigned long base_page_size;
-/*
- * default_huge_page_size copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
- unsigned long hps = 0;
- char *line = NULL;
- size_t linelen = 0;
- FILE *f = fopen("/proc/meminfo", "r");
-
- if (!f)
- return 0;
- while (getline(&line, &linelen, f) > 0) {
- if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
- hps <<= 10;
- break;
- }
- }
-
- free(line);
- fclose(f);
- return hps;
-}
-
unsigned long get_free_hugepages(void)
{
unsigned long fhp = 0;
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
index 64126c8cd561..97adc0f34f9c 100644
--- a/tools/testing/selftests/mm/khugepaged.c
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -1476,6 +1476,10 @@ int main(int argc, const char **argv)
page_size = getpagesize();
hpage_pmd_size = read_pmd_pagesize();
+ if (!hpage_pmd_size) {
+ printf("Reading PMD pagesize failed");
+ exit(EXIT_FAILURE);
+ }
hpage_pmd_nr = hpage_pmd_size / page_size;
default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index d8b5b4930412..7bc9fc17c9f0 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -15,8 +15,10 @@
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
+#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
+#include <sys/wait.h>
#include <linux/userfaultfd.h>
#include "../kselftest.h"
@@ -178,7 +180,6 @@ unmap:
static void test_unmerge_uffd_wp(void)
{
struct uffdio_writeprotect uffd_writeprotect;
- struct uffdio_register uffdio_register;
const unsigned int size = 2 * MiB;
struct uffdio_api uffdio_api;
char *map;
@@ -210,10 +211,7 @@ static void test_unmerge_uffd_wp(void)
}
/* Register UFFD-WP, no need for an actual handler. */
- uffdio_register.range.start = (unsigned long) map;
- uffdio_register.range.len = size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
+ if (uffd_register(uffd, map, size, false, true, false)) {
ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
goto close_uffd;
}
@@ -241,9 +239,93 @@ unmap:
}
#endif
+/* Verify that KSM can be enabled / queried with prctl. */
+static void test_prctl(void)
+{
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ return;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ return;
+ }
+
+ ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret < 0) {
+ ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
+ return;
+ } else if (ret != 1) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 not effective\n");
+ return;
+ }
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ return;
+ }
+
+ ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret < 0) {
+ ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
+ return;
+ } else if (ret != 0) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 not effective\n");
+ return;
+ }
+
+ ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n");
+}
+
+/* Verify that prctl ksm flag is inherited. */
+static void test_prctl_fork(void)
+{
+ int ret, status;
+ pid_t child_pid;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ return;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ return;
+ }
+
+ child_pid = fork();
+ if (!child_pid) {
+ exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
+ } else if (child_pid < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ return;
+ }
+
+ if (waitpid(child_pid, &status, 0) < 0) {
+ ksft_test_result_fail("waitpid() failed\n");
+ return;
+ } else if (WEXITSTATUS(status) != 1) {
+ ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
+ return;
+ }
+
+ if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ return;
+ }
+
+ ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
+}
+
int main(int argc, char **argv)
{
- unsigned int tests = 2;
+ unsigned int tests = 4;
int err;
#ifdef __NR_userfaultfd
@@ -271,6 +353,9 @@ int main(int argc, char **argv)
test_unmerge_uffd_wp();
#endif
+ test_prctl();
+ test_prctl_fork();
+
err = ksft_get_fail_cnt();
if (err)
ksft_exit_fail_msg("%d out of %d tests failed\n",
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
index f9eb4d67e0dd..435acebdc325 100644
--- a/tools/testing/selftests/mm/ksm_tests.c
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
#include <stdbool.h>
#include <time.h>
#include <string.h>
@@ -12,7 +14,7 @@
#include "../kselftest.h"
#include <include/vdso/time64.h>
-#include "util.h"
+#include "vm_util.h"
#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
#define KSM_FP(s) (KSM_SYSFS_PATH s)
@@ -21,6 +23,7 @@
#define KSM_PROT_STR_DEFAULT "rw"
#define KSM_USE_ZERO_PAGES_DEFAULT false
#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+#define KSM_MERGE_TYPE_DEFAULT 0
#define MB (1ul << 20)
struct ksm_sysfs {
@@ -33,9 +36,16 @@ struct ksm_sysfs {
unsigned long use_zero_pages;
};
+enum ksm_merge_type {
+ KSM_MERGE_MADVISE,
+ KSM_MERGE_PRCTL,
+ KSM_MERGE_LAST = KSM_MERGE_PRCTL
+};
+
enum ksm_test_name {
CHECK_KSM_MERGE,
CHECK_KSM_UNMERGE,
+ CHECK_KSM_GET_MERGE_TYPE,
CHECK_KSM_ZERO_PAGE_MERGE,
CHECK_KSM_NUMA_MERGE,
KSM_MERGE_TIME,
@@ -44,6 +54,8 @@ enum ksm_test_name {
KSM_COW_TIME
};
+int debug;
+
static int ksm_write_sysfs(const char *file_path, unsigned long val)
{
FILE *f = fopen(file_path, "w");
@@ -82,6 +94,53 @@ static int ksm_read_sysfs(const char *file_path, unsigned long *val)
return 0;
}
+static void ksm_print_sysfs(void)
+{
+ unsigned long max_page_sharing, pages_sharing, pages_shared;
+ unsigned long full_scans, pages_unshared, pages_volatile;
+ unsigned long stable_node_chains, stable_node_dups;
+ long general_profit;
+
+ if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+ ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+ ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing) ||
+ ksm_read_sysfs(KSM_FP("full_scans"), &full_scans) ||
+ ksm_read_sysfs(KSM_FP("pages_unshared"), &pages_unshared) ||
+ ksm_read_sysfs(KSM_FP("pages_volatile"), &pages_volatile) ||
+ ksm_read_sysfs(KSM_FP("stable_node_chains"), &stable_node_chains) ||
+ ksm_read_sysfs(KSM_FP("stable_node_dups"), &stable_node_dups) ||
+ ksm_read_sysfs(KSM_FP("general_profit"), (unsigned long *)&general_profit))
+ return;
+
+ printf("pages_shared : %lu\n", pages_shared);
+ printf("pages_sharing : %lu\n", pages_sharing);
+ printf("max_page_sharing : %lu\n", max_page_sharing);
+ printf("full_scans : %lu\n", full_scans);
+ printf("pages_unshared : %lu\n", pages_unshared);
+ printf("pages_volatile : %lu\n", pages_volatile);
+ printf("stable_node_chains: %lu\n", stable_node_chains);
+ printf("stable_node_dups : %lu\n", stable_node_dups);
+ printf("general_profit : %ld\n", general_profit);
+}
+
+static void ksm_print_procfs(void)
+{
+ const char *file_name = "/proc/self/ksm_stat";
+ char buffer[512];
+ FILE *f = fopen(file_name, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_name);
+ perror("fopen");
+ return;
+ }
+
+ while (fgets(buffer, sizeof(buffer), f))
+ printf("%s", buffer);
+
+ fclose(f);
+}
+
static int str_to_prot(char *prot_str)
{
int prot = 0;
@@ -128,7 +187,12 @@ static void print_help(void)
" Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
printf(" -m: change merge_across_nodes tunable\n"
" Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+ printf(" -d: turn debugging output on\n");
printf(" -s: the size of duplicated memory area (in MiB)\n");
+ printf(" -t: KSM merge type\n"
+ " Default: 0\n"
+ " 0: madvise merging\n"
+ " 1: prctl merging\n");
exit(0);
}
@@ -176,12 +240,21 @@ static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
return 0;
}
-static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+static int ksm_merge_pages(int merge_type, void *addr, size_t size,
+ struct timespec start_time, int timeout)
{
- if (madvise(addr, size, MADV_MERGEABLE)) {
- perror("madvise");
- return 1;
+ if (merge_type == KSM_MERGE_MADVISE) {
+ if (madvise(addr, size, MADV_MERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ } else if (merge_type == KSM_MERGE_PRCTL) {
+ if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) {
+ perror("prctl");
+ return 1;
+ }
}
+
if (ksm_write_sysfs(KSM_FP("run"), 1))
return 1;
@@ -211,6 +284,11 @@ static bool assert_ksm_pages_count(long dupl_page_count)
ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
return false;
+ if (debug) {
+ ksm_print_sysfs();
+ ksm_print_procfs();
+ }
+
/*
* Since there must be at least 2 pages for merging and 1 page can be
* shared with the limited number of pages (max_page_sharing), sometimes
@@ -266,7 +344,8 @@ static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
return 0;
}
-static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+static int check_ksm_merge(int merge_type, int mapping, int prot,
+ long page_count, int timeout, size_t page_size)
{
void *map_ptr;
struct timespec start_time;
@@ -281,13 +360,15 @@ static int check_ksm_merge(int mapping, int prot, long page_count, int timeout,
if (!map_ptr)
return KSFT_FAIL;
- if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
goto err_out;
/* verify that the right number of pages are merged */
if (assert_ksm_pages_count(page_count)) {
printf("OK\n");
munmap(map_ptr, page_size * page_count);
+ if (merge_type == KSM_MERGE_PRCTL)
+ prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
return KSFT_PASS;
}
@@ -297,7 +378,7 @@ err_out:
return KSFT_FAIL;
}
-static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
+static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, size_t page_size)
{
void *map_ptr;
struct timespec start_time;
@@ -313,7 +394,7 @@ static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_siz
if (!map_ptr)
return KSFT_FAIL;
- if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
goto err_out;
/* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
@@ -337,8 +418,8 @@ err_out:
return KSFT_FAIL;
}
-static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
- bool use_zero_pages, size_t page_size)
+static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long page_count,
+ int timeout, bool use_zero_pages, size_t page_size)
{
void *map_ptr;
struct timespec start_time;
@@ -356,7 +437,7 @@ static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int
if (!map_ptr)
return KSFT_FAIL;
- if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
goto err_out;
/*
@@ -402,8 +483,8 @@ static int get_first_mem_node(void)
return get_next_mem_node(numa_max_node());
}
-static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
- size_t page_size)
+static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout,
+ bool merge_across_nodes, size_t page_size)
{
void *numa1_map_ptr, *numa2_map_ptr;
struct timespec start_time;
@@ -439,8 +520,8 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a
memset(numa2_map_ptr, '*', page_size);
/* try to merge the pages */
- if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
- ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
+ if (ksm_merge_pages(merge_type, numa1_map_ptr, page_size, start_time, timeout) ||
+ ksm_merge_pages(merge_type, numa2_map_ptr, page_size, start_time, timeout))
goto err_out;
/*
@@ -466,7 +547,8 @@ err_out:
return KSFT_FAIL;
}
-static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
+static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot,
+ int timeout, size_t map_size)
{
void *map_ptr, *map_ptr_orig;
struct timespec start_time, end_time;
@@ -508,7 +590,7 @@ static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t m
perror("clock_gettime");
goto err_out;
}
- if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
goto err_out;
if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
perror("clock_gettime");
@@ -533,7 +615,7 @@ err_out:
return KSFT_FAIL;
}
-static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
+static int ksm_merge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size)
{
void *map_ptr;
struct timespec start_time, end_time;
@@ -549,7 +631,7 @@ static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
perror("clock_gettime");
goto err_out;
}
- if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
goto err_out;
if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
perror("clock_gettime");
@@ -574,7 +656,7 @@ err_out:
return KSFT_FAIL;
}
-static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
+static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size)
{
void *map_ptr;
struct timespec start_time, end_time;
@@ -589,7 +671,7 @@ static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
perror("clock_gettime");
goto err_out;
}
- if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
goto err_out;
if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
@@ -621,7 +703,7 @@ err_out:
return KSFT_FAIL;
}
-static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
+static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size_t page_size)
{
void *map_ptr;
struct timespec start_time, end_time;
@@ -660,7 +742,7 @@ static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
memset(map_ptr + page_size * i, '+', i / 2 + 1);
memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
}
- if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
goto err_out;
if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
@@ -697,6 +779,7 @@ int main(int argc, char *argv[])
int ret, opt;
int prot = 0;
int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+ int merge_type = KSM_MERGE_TYPE_DEFAULT;
long page_count = KSM_PAGE_COUNT_DEFAULT;
size_t page_size = sysconf(_SC_PAGESIZE);
struct ksm_sysfs ksm_sysfs_old;
@@ -705,7 +788,7 @@ int main(int argc, char *argv[])
bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
long size_MB = 0;
- while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
+ while ((opt = getopt(argc, argv, "dha:p:l:z:m:s:t:MUZNPCHD")) != -1) {
switch (opt) {
case 'a':
prot = str_to_prot(optarg);
@@ -739,12 +822,26 @@ int main(int argc, char *argv[])
else
merge_across_nodes = 1;
break;
+ case 'd':
+ debug = 1;
+ break;
case 's':
size_MB = atoi(optarg);
if (size_MB <= 0) {
printf("Size must be greater than 0\n");
return KSFT_FAIL;
}
+ case 't':
+ {
+ int tmp = atoi(optarg);
+
+ if (tmp < 0 || tmp > KSM_MERGE_LAST) {
+ printf("Invalid merge type\n");
+ return KSFT_FAIL;
+ }
+ merge_type = tmp;
+ }
+ break;
case 'M':
break;
case 'U':
@@ -795,35 +892,36 @@ int main(int argc, char *argv[])
switch (test_name) {
case CHECK_KSM_MERGE:
- ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ret = check_ksm_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
ksm_scan_limit_sec, page_size);
break;
case CHECK_KSM_UNMERGE:
- ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
- page_size);
+ ret = check_ksm_unmerge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, page_size);
break;
case CHECK_KSM_ZERO_PAGE_MERGE:
- ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
- ksm_scan_limit_sec, use_zero_pages, page_size);
+ ret = check_ksm_zero_page_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ page_count, ksm_scan_limit_sec, use_zero_pages,
+ page_size);
break;
case CHECK_KSM_NUMA_MERGE:
- ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
- merge_across_nodes, page_size);
+ ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, merge_across_nodes, page_size);
break;
case KSM_MERGE_TIME:
if (size_MB == 0) {
printf("Option '-s' is required.\n");
return KSFT_FAIL;
}
- ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
- size_MB);
+ ret = ksm_merge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
break;
case KSM_MERGE_TIME_HUGE_PAGES:
if (size_MB == 0) {
printf("Option '-s' is required.\n");
return KSFT_FAIL;
}
- ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
ksm_scan_limit_sec, size_MB);
break;
case KSM_UNMERGE_TIME:
@@ -831,12 +929,12 @@ int main(int argc, char *argv[])
printf("Option '-s' is required.\n");
return KSFT_FAIL;
}
- ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ret = ksm_unmerge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
ksm_scan_limit_sec, size_MB);
break;
case KSM_COW_TIME:
- ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
- page_size);
+ ret = ksm_cow_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, page_size);
break;
}
diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c
new file mode 100644
index 000000000000..6d71d972997b
--- /dev/null
+++ b/tools/testing/selftests/mm/mkdirty.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test handling of code that might set PTE/PMD dirty in read-only VMAs.
+ * Setting a PTE/PMD dirty must not accidentally set the PTE/PMD writable.
+ *
+ * Copyright 2023, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <setjmp.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <linux/mempolicy.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+static size_t pagesize;
+static size_t thpsize;
+static int mem_fd;
+static int pagemap_fd;
+static sigjmp_buf env;
+
+static void signal_handler(int sig)
+{
+ if (sig == SIGSEGV)
+ siglongjmp(env, 1);
+ siglongjmp(env, 2);
+}
+
+static void do_test_write_sigsegv(char *mem)
+{
+ char orig = *mem;
+ int ret;
+
+ if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
+ ksft_test_result_fail("signal() failed\n");
+ return;
+ }
+
+ ret = sigsetjmp(env, 1);
+ if (!ret)
+ *mem = orig + 1;
+
+ if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
+ ksft_test_result_fail("signal() failed\n");
+
+ ksft_test_result(ret == 1 && *mem == orig,
+ "SIGSEGV generated, page not modified\n");
+}
+
+static char *mmap_thp_range(int prot, char **_mmap_mem, size_t *_mmap_size)
+{
+ const size_t mmap_size = 2 * thpsize;
+ char *mem, *mmap_mem;
+
+ mmap_mem = mmap(NULL, mmap_size, prot, MAP_PRIVATE|MAP_ANON,
+ -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return MAP_FAILED;
+ }
+ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+ if (madvise(mem, thpsize, MADV_HUGEPAGE)) {
+ ksft_test_result_skip("MADV_HUGEPAGE failed\n");
+ munmap(mmap_mem, mmap_size);
+ return MAP_FAILED;
+ }
+
+ *_mmap_mem = mmap_mem;
+ *_mmap_size = mmap_size;
+ return mem;
+}
+
+static void test_ptrace_write(void)
+{
+ char data = 1;
+ char *mem;
+ int ret;
+
+ ksft_print_msg("[INFO] PTRACE write access\n");
+
+ mem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* Fault in the shared zeropage. */
+ if (*mem != 0) {
+ ksft_test_result_fail("Memory not zero\n");
+ goto munmap;
+ }
+
+ /*
+ * Unshare the page (populating a fresh anon page that might be set
+ * dirty in the PTE) in the read-only VMA using ptrace (FOLL_FORCE).
+ */
+ lseek(mem_fd, (uintptr_t) mem, SEEK_SET);
+ ret = write(mem_fd, &data, 1);
+ if (ret != 1 || *mem != data) {
+ ksft_test_result_fail("write() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void test_ptrace_write_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+ char data = 1;
+ int ret;
+
+ ksft_print_msg("[INFO] PTRACE write access to THP\n");
+
+ mem = mmap_thp_range(PROT_READ, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first subpage in the read-only VMA using
+ * ptrace(FOLL_FORCE), eventually placing a fresh THP that is marked
+ * dirty in the PMD.
+ */
+ lseek(mem_fd, (uintptr_t) mem, SEEK_SET);
+ ret = write(mem_fd, &data, 1);
+ if (ret != 1 || *mem != data) {
+ ksft_test_result_fail("write() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+static void test_page_migration(void)
+{
+ char *mem;
+
+ ksft_print_msg("[INFO] Page migration\n");
+
+ mem = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON,
+ -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* Populate a fresh page and dirty it. */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, pagesize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* Trigger page migration. Might not be available or fail. */
+ if (syscall(__NR_mbind, mem, pagesize, MPOL_LOCAL, NULL, 0x7fful,
+ MPOL_MF_MOVE)) {
+ ksft_test_result_skip("mbind() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void test_page_migration_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+
+ ksft_print_msg("[INFO] Page migration of THP\n");
+
+ mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first page, which might populate a fresh anon THP
+ * and dirty it.
+ */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, thpsize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ /* Trigger page migration. Might not be available or fail. */
+ if (syscall(__NR_mbind, mem, thpsize, MPOL_LOCAL, NULL, 0x7fful,
+ MPOL_MF_MOVE)) {
+ ksft_test_result_skip("mbind() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+static void test_pte_mapped_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+
+ ksft_print_msg("[INFO] PTE-mapping a THP\n");
+
+ mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first page, which might populate a fresh anon THP
+ * and dirty it.
+ */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, thpsize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ /* Trigger PTE-mapping the THP by mprotect'ing the last subpage. */
+ if (mprotect(mem + thpsize - pagesize, pagesize,
+ PROT_READ|PROT_WRITE)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_uffdio_copy(void)
+{
+ struct uffdio_register uffdio_register;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_api uffdio_api;
+ char *dst, *src;
+ int uffd;
+
+ ksft_print_msg("[INFO] UFFDIO_COPY\n");
+
+ src = malloc(pagesize);
+ memset(src, 1, pagesize);
+ dst = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (dst == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ ksft_test_result_skip("__NR_userfaultfd failed\n");
+ goto munmap;
+ }
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+ ksft_test_result_fail("UFFDIO_API failed\n");
+ goto close_uffd;
+ }
+
+ uffdio_register.range.start = (unsigned long) dst;
+ uffdio_register.range.len = pagesize;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+ ksft_test_result_fail("UFFDIO_REGISTER failed\n");
+ goto close_uffd;
+ }
+
+ /* Place a page in a read-only VMA, which might set the PTE dirty. */
+ uffdio_copy.dst = (unsigned long) dst;
+ uffdio_copy.src = (unsigned long) src;
+ uffdio_copy.len = pagesize;
+ uffdio_copy.mode = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
+ ksft_test_result_fail("UFFDIO_COPY failed\n");
+ goto close_uffd;
+ }
+
+ do_test_write_sigsegv(dst);
+close_uffd:
+ close(uffd);
+munmap:
+ munmap(dst, pagesize);
+ free(src);
+#endif /* __NR_userfaultfd */
+}
+
+int main(void)
+{
+ int err, tests = 2;
+
+ pagesize = getpagesize();
+ thpsize = read_pmd_pagesize();
+ if (thpsize) {
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+ thpsize / 1024);
+ tests += 3;
+ }
+#ifdef __NR_userfaultfd
+ tests += 1;
+#endif /* __NR_userfaultfd */
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ mem_fd = open("/proc/self/mem", O_RDWR);
+ if (mem_fd < 0)
+ ksft_exit_fail_msg("opening /proc/self/mem failed\n");
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening /proc/self/pagemap failed\n");
+
+ /*
+ * On some ptrace(FOLL_FORCE) write access via /proc/self/mem in
+ * read-only VMAs, the kernel may set the PTE/PMD dirty.
+ */
+ test_ptrace_write();
+ if (thpsize)
+ test_ptrace_write_thp();
+ /*
+ * On page migration, the kernel may set the PTE/PMD dirty when
+ * remapping the page.
+ */
+ test_page_migration();
+ if (thpsize)
+ test_page_migration_thp();
+ /* PTE-mapping a THP might propagate the dirty PMD bit to the PTEs. */
+ if (thpsize)
+ test_pte_mapped_thp();
+ /* Placing a fresh page via userfaultfd may set the PTE dirty. */
+#ifdef __NR_userfaultfd
+ test_uffdio_copy();
+#endif /* __NR_userfaultfd */
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
index 6c62966ab5db..37b6d33b9e84 100644
--- a/tools/testing/selftests/mm/mrelease_test.c
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -9,8 +9,7 @@
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
-
-#include "util.h"
+#include "vm_util.h"
#include "../kselftest.h"
@@ -32,7 +31,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
unsigned long i;
char *buf;
- buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
+ buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, 0, 0);
if (buf == MAP_FAILED) {
perror("mmap failed, halting the test");
@@ -40,7 +39,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
}
for (i = 0; i < nr_pages; i++)
- *((unsigned long *)(buf + (i * PAGE_SIZE))) = i;
+ *((unsigned long *)(buf + (i * psize()))) = i;
/* Signal the parent that the child is ready */
if (write(pipefd, "", 1) < 0) {
@@ -54,7 +53,7 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd)
timeout--;
}
- munmap(buf, nr_pages * PAGE_SIZE);
+ munmap(buf, nr_pages * psize());
return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
}
@@ -87,7 +86,7 @@ static int child_main(int pipefd[], size_t size)
/* Allocate and fault-in memory and wait to be killed */
close(pipefd[0]);
- res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
+ res = alloc_noexit(MB(size) / psize(), pipefd[1]);
close(pipefd[1]);
return res;
}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a9d6d52596f2..4893eb60d96d 100644
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -5,6 +5,9 @@
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
+count_pass=0
+count_fail=0
+count_skip=0
exitcode=0
usage() {
@@ -149,11 +152,14 @@ run_test() {
"$@"
local ret=$?
if [ $ret -eq 0 ]; then
+ count_pass=$(( count_pass + 1 ))
echo "[PASS]"
elif [ $ret -eq $ksft_skip ]; then
+ count_skip=$(( count_skip + 1 ))
echo "[SKIP]"
exitcode=$ksft_skip
else
+ count_fail=$(( count_fail + 1 ))
echo "[FAIL]"
exitcode=1
fi
@@ -190,15 +196,15 @@ CATEGORY="gup_test" run_test ./gup_test -a
# Dump pages 0, 19, and 4096, using pin_user_pages:
CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
-uffd_mods=("" ":dev")
-for mod in "${uffd_mods[@]}"; do
- CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
- # Hugetlb tests require source and destination huge pages. Pass in half
- # the size ($half_ufd_size_MB), which is used for *each*.
- CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
- CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
- CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
-done
+CATEGORY="userfaultfd" run_test ./uffd-unit-tests
+uffd_stress_bin=./uffd-stress
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16
+# Hugetlb tests require source and destination huge pages. Pass in half
+# the size ($half_ufd_size_MB), which is used for *each*.
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16
#cleanup
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
@@ -220,10 +226,26 @@ CATEGORY="mremap" run_test ./mremap_test
CATEGORY="hugetlb" run_test ./thuge-gen
if [ $VADDR64 -ne 0 ]; then
+
+ # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel
+ # allows high virtual address allocation requests independent
+ # of platform's physical memory.
+
+ prev_policy=$(cat /proc/sys/vm/overcommit_memory)
+ echo 1 > /proc/sys/vm/overcommit_memory
CATEGORY="hugevm" run_test ./virtual_address_range
+ echo $prev_policy > /proc/sys/vm/overcommit_memory
- # virtual address 128TB switch test
- CATEGORY="hugevm" run_test ./va_128TBswitch.sh
+ # va high address boundary switch test
+ ARCH_ARM64="arm64"
+ prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages)
+ if [ "$ARCH" == "$ARCH_ARM64" ]; then
+ echo 6 > /proc/sys/vm/nr_hugepages
+ fi
+ CATEGORY="hugevm" run_test ./va_high_addr_switch.sh
+ if [ "$ARCH" == "$ARCH_ARM64" ]; then
+ echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages
+ fi
fi # VADDR64
# vmalloc stability smoke test
@@ -271,4 +293,6 @@ CATEGORY="soft_dirty" run_test ./soft-dirty
# COW tests
CATEGORY="cow" run_test ./cow
+echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}"
+
exit $exitcode
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 21d8830c5f24..cc5f144430d4 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -80,6 +80,9 @@ static void test_hugepage(int pagemap_fd, int pagesize)
int i, ret;
size_t hpage_len = read_pmd_pagesize();
+ if (!hpage_len)
+ ksft_exit_fail_msg("Reading PMD pagesize failed");
+
map = memalign(hpage_len, hpage_len);
if (!map)
ksft_exit_fail_msg("memalign failed\n");
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 76e1c36dd9e5..0e74635c8c3d 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -106,7 +106,7 @@ void split_pmd_thp(void)
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}
@@ -122,7 +122,7 @@ void split_pmd_thp(void)
}
- if (check_huge_anon(one_page, 0, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
printf("Still AnonHugePages not split\n");
exit(EXIT_FAILURE);
}
@@ -169,7 +169,7 @@ void split_pte_mapped_thp(void)
for (i = 0; i < len; i++)
one_page[i] = (char)i;
- if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}
@@ -300,6 +300,10 @@ int main(int argc, char **argv)
pagesize = getpagesize();
pageshift = ffs(pagesize) - 1;
pmd_pagesize = read_pmd_pagesize();
+ if (!pmd_pagesize) {
+ printf("Reading PMD pagesize failed\n");
+ exit(EXIT_FAILURE);
+ }
split_pmd_thp();
split_pte_mapped_thp();
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 361ef7192cc6..380ab5f0a534 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -24,6 +24,7 @@
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
+#include "vm_util.h"
#define err(x) perror(x), exit(1)
@@ -74,24 +75,6 @@ void find_pagesizes(void)
globfree(&g);
}
-unsigned long default_huge_page_size(void)
-{
- unsigned long hps = 0;
- char *line = NULL;
- size_t linelen = 0;
- FILE *f = fopen("/proc/meminfo", "r");
- if (!f)
- return 0;
- while (getline(&line, &linelen, f) > 0) {
- if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
- hps <<= 10;
- break;
- }
- }
- free(line);
- return hps;
-}
-
void show(unsigned long ps)
{
char buf[100];
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index e3f00adb1b82..ba9d37ad3a89 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -15,7 +15,7 @@
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
-#include "util.h"
+#include "vm_util.h"
int backing_fd = -1;
int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
@@ -34,10 +34,10 @@ int main(int argc, char **argv)
int pagemap_fd;
ram = sysconf(_SC_PHYS_PAGES);
- if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+ if (ram > SIZE_MAX / psize() / 4)
ram = SIZE_MAX / 4;
else
- ram *= sysconf(_SC_PAGESIZE);
+ ram *= psize();
len = ram;
while (++i < argc) {
@@ -58,7 +58,7 @@ int main(int argc, char **argv)
warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
" and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
- ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+ ram >> (20 + HPAGE_SHIFT - pshift() - 1));
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0)
@@ -92,7 +92,7 @@ int main(int argc, char **argv)
if (pfn < 0) {
nr_failed++;
} else {
- size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+ size_t idx = pfn >> (HPAGE_SHIFT - pshift());
nr_succeed++;
if (idx >= map_len) {
@@ -108,7 +108,7 @@ int main(int argc, char **argv)
}
/* split transhuge page, keep last page */
- if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+ if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED))
err(2, "MADV_DONTNEED");
}
clock_gettime(CLOCK_MONOTONIC, &b);
diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
new file mode 100644
index 000000000000..61c6250adf93
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -0,0 +1,618 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd tests util functions
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+
+#include "uffd-common.h"
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+volatile bool test_uffdio_copy_eexist = true;
+unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+int uffd = -1, uffd_flags, finished, *pipefd, test_type;
+bool map_shared;
+bool test_uffdio_wp = true;
+unsigned long long *count_verify;
+uffd_test_ops_t *uffd_test_ops;
+
+static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
+{
+ unsigned int memfd_flags = 0;
+ int mem_fd;
+
+ if (hugetlb)
+ memfd_flags = MFD_HUGETLB;
+ mem_fd = memfd_create("uffd-test", memfd_flags);
+ if (mem_fd < 0)
+ err("memfd_create");
+ if (ftruncate(mem_fd, mem_size))
+ err("ftruncate");
+ if (fallocate(mem_fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+ mem_size))
+ err("fallocate");
+
+ return mem_fd;
+}
+
+static void anon_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+}
+
+static int anon_allocate_area(void **alloc_area, bool is_src)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+ return 0;
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+static void hugetlb_release_pages(char *rel_area)
+{
+ if (!map_shared) {
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ } else {
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+ }
+}
+
+static int hugetlb_allocate_area(void **alloc_area, bool is_src)
+{
+ off_t size = nr_pages * page_size;
+ off_t offset = is_src ? 0 : size;
+ void *area_alias = NULL;
+ char **alloc_area_alias;
+ int mem_fd = uffd_mem_fd_create(size * 2, true);
+
+ *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ (is_src ? 0 : MAP_NORESERVE),
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+
+ if (map_shared) {
+ area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, mem_fd, offset);
+ if (area_alias == MAP_FAILED)
+ return -errno;
+ }
+
+ if (is_src) {
+ alloc_area_alias = &area_src_alias;
+ } else {
+ alloc_area_alias = &area_dst_alias;
+ }
+ if (area_alias)
+ *alloc_area_alias = area_alias;
+
+ close(mem_fd);
+ return 0;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ if (!map_shared)
+ return;
+
+ *start = (unsigned long) area_dst_alias + offset;
+}
+
+static void shmem_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+}
+
+static int shmem_allocate_area(void **alloc_area, bool is_src)
+{
+ void *area_alias = NULL;
+ size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
+ unsigned long offset = is_src ? 0 : bytes;
+ char *p = NULL, *p_alias = NULL;
+ int mem_fd = uffd_mem_fd_create(bytes * 2, false);
+
+ /* TODO: clean this up. Use a static addr is ugly */
+ p = BASE_PMD_ADDR;
+ if (!is_src)
+ /* src map + alias + interleaved hpages */
+ p += 2 * (bytes + hpage_size);
+ p_alias = p;
+ p_alias += bytes;
+ p_alias += hpage_size; /* Prevent src/dst VMA merge */
+
+ *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+ if (*alloc_area != p)
+ err("mmap of memfd failed at %p", p);
+
+ area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (area_alias == MAP_FAILED) {
+ munmap(*alloc_area, bytes);
+ *alloc_area = NULL;
+ return -errno;
+ }
+ if (area_alias != p_alias)
+ err("mmap of anonymous memory failed at %p", p_alias);
+
+ if (is_src)
+ area_src_alias = area_alias;
+ else
+ area_dst_alias = area_alias;
+
+ close(mem_fd);
+ return 0;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ *start = (unsigned long)area_dst_alias + offset;
+}
+
+static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
+{
+ if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
+ read_pmd_pagesize()))
+ err("Did not find expected %d number of hugepages",
+ expect_nr_hpages);
+}
+
+struct uffd_test_ops anon_uffd_test_ops = {
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+ .alias_mapping = noop_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+struct uffd_test_ops shmem_uffd_test_ops = {
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+ .alias_mapping = shmem_alias_mapping,
+ .check_pmd_mapping = shmem_check_pmd_mapping,
+};
+
+struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+ .alias_mapping = hugetlb_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+void uffd_stats_report(struct uffd_args *args, int n_cpus)
+{
+ int i;
+ unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
+
+ for (i = 0; i < n_cpus; i++) {
+ miss_total += args[i].missing_faults;
+ wp_total += args[i].wp_faults;
+ minor_total += args[i].minor_faults;
+ }
+
+ printf("userfaults: ");
+ if (miss_total) {
+ printf("%llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].missing_faults);
+ printf("\b) ");
+ }
+ if (wp_total) {
+ printf("%llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].wp_faults);
+ printf("\b) ");
+ }
+ if (minor_total) {
+ printf("%llu minor (", minor_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].minor_faults);
+ printf("\b)");
+ }
+ printf("\n");
+}
+
+int userfaultfd_open(uint64_t *features)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = uffd_open(UFFD_FLAGS);
+ if (uffd < 0)
+ return -1;
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = *features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ /* Probably lack of CAP_PTRACE? */
+ return -1;
+ if (uffdio_api.api != UFFD_API)
+ err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
+
+ *features = uffdio_api.features;
+ return 0;
+}
+
+static inline void munmap_area(void **area)
+{
+ if (*area)
+ if (munmap(*area, nr_pages * page_size))
+ err("munmap");
+
+ *area = NULL;
+}
+
+static void uffd_test_ctx_clear(void)
+{
+ size_t i;
+
+ if (pipefd) {
+ for (i = 0; i < nr_cpus * 2; ++i) {
+ if (close(pipefd[i]))
+ err("close pipefd");
+ }
+ free(pipefd);
+ pipefd = NULL;
+ }
+
+ if (count_verify) {
+ free(count_verify);
+ count_verify = NULL;
+ }
+
+ if (uffd != -1) {
+ if (close(uffd))
+ err("close uffd");
+ uffd = -1;
+ }
+
+ munmap_area((void **)&area_src);
+ munmap_area((void **)&area_src_alias);
+ munmap_area((void **)&area_dst);
+ munmap_area((void **)&area_dst_alias);
+ munmap_area((void **)&area_remap);
+}
+
+int uffd_test_ctx_init(uint64_t features, const char **errmsg)
+{
+ unsigned long nr, cpu;
+ int ret;
+
+ uffd_test_ctx_clear();
+
+ ret = uffd_test_ops->allocate_area((void **)&area_src, true);
+ ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
+ if (ret) {
+ if (errmsg)
+ *errmsg = "memory allocation failed";
+ return ret;
+ }
+
+ ret = userfaultfd_open(&features);
+ if (ret) {
+ if (errmsg)
+ *errmsg = "possible lack of priviledge";
+ return ret;
+ }
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify)
+ err("count_verify");
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) =
+ (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ /*
+ * After initialization of area_src, we must explicitly release pages
+ * for area_dst to make sure it's fully empty. Otherwise we could have
+ * some area_dst pages be errornously initialized with zero pages,
+ * hence we could hit memory corruption later in the test.
+ *
+ * One example is when THP is globally enabled, above allocate_area()
+ * calls could have the two areas merged into a single VMA (as they
+ * will have the same VMA flags so they're mergeable). When we
+ * initialize the area_src above, it's possible that some part of
+ * area_dst could have been faulted in via one huge THP that will be
+ * shared between area_src and area_dst. It could cause some of the
+ * area_dst won't be trapped by missing userfaults.
+ *
+ * This release_pages() will guarantee even if that happened, we'll
+ * proactively split the thp and drop any accidentally initialized
+ * pages within area_dst.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd)
+ err("pipefd");
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
+ err("pipe");
+
+ return 0;
+}
+
+void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_writeprotect prms;
+
+ /* Write protection page faults */
+ prms.range.start = start;
+ prms.range.len = len;
+ /* Undo write-protect, do wakeup after that */
+ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+ if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+ err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
+}
+
+static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_continue req;
+ int ret;
+
+ req.range.start = start;
+ req.range.len = len;
+ req.mode = 0;
+ if (wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;
+
+ if (ioctl(ufd, UFFDIO_CONTINUE, &req))
+ err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
+ (uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, (int64_t) req.mapped);
+}
+
+int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+ int ret = read(uffd, msg, sizeof(*msg));
+
+ if (ret != sizeof(*msg)) {
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 1;
+ err("blocking read error");
+ } else {
+ err("short read");
+ }
+ }
+
+ return 0;
+}
+
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+ /* Write protect page faults */
+ wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+ args->wp_faults++;
+ } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+ uint8_t *area;
+ int b;
+
+ /*
+ * Minor page faults
+ *
+ * To prove we can modify the original range for testing
+ * purposes, we're going to bit flip this range before
+ * continuing.
+ *
+ * Note that this requires all minor page fault tests operate on
+ * area_dst (non-UFFD-registered) and area_dst_alias
+ * (UFFD-registered).
+ */
+
+ area = (uint8_t *)(area_dst +
+ ((char *)msg->arg.pagefault.address -
+ area_dst_alias));
+ for (b = 0; b < page_size; ++b)
+ area[b] = ~area[b];
+ continue_range(uffd, msg->arg.pagefault.address, page_size,
+ args->apply_wp);
+ args->minor_faults++;
+ } else {
+ /*
+ * Missing page faults.
+ *
+ * Here we force a write check for each of the missing mode
+ * faults. It's guaranteed because the only threads that
+ * will trigger uffd faults are the locking threads, and
+ * their first instruction to touch the missing page will
+ * always be pthread_mutex_lock().
+ *
+ * Note that here we relied on an NPTL glibc impl detail to
+ * always read the lock type at the entry of the lock op
+ * (pthread_mutex_t.__data.__type, offset 0x10) before
+ * doing any locking operations to guarantee that. It's
+ * actually not good to rely on this impl detail because
+ * logically a pthread-compatible lib can implement the
+ * locks without types and we can fail when linking with
+ * them. However since we used to find bugs with this
+ * strict check we still keep it around. Hopefully this
+ * could be a good hint when it fails again. If one day
+ * it'll break on some other impl of glibc we'll revisit.
+ */
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ err("unexpected write fault");
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ if (copy_page(uffd, offset, args->apply_wp))
+ args->missing_faults++;
+ }
+}
+
+void *uffd_poll_thread(void *arg)
+{
+ struct uffd_args *args = (struct uffd_args *)arg;
+ unsigned long cpu = args->cpu;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ struct uffdio_register uffd_reg;
+ int ret;
+ char tmp_chr;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ for (;;) {
+ ret = poll(pollfd, 2, -1);
+ if (ret <= 0) {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ err("poll error: %d", ret);
+ }
+ if (pollfd[1].revents) {
+ if (!(pollfd[1].revents & POLLIN))
+ err("pollfd[1].revents %d", pollfd[1].revents);
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ err("read pipefd error");
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN))
+ err("pollfd[0].revents %d", pollfd[0].revents);
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ switch (msg.event) {
+ default:
+ err("unexpected msg event %u\n", msg.event);
+ break;
+ case UFFD_EVENT_PAGEFAULT:
+ uffd_handle_page_fault(&msg, args);
+ break;
+ case UFFD_EVENT_FORK:
+ close(uffd);
+ uffd = msg.arg.fork.ufd;
+ pollfd[0].fd = uffd;
+ break;
+ case UFFD_EVENT_REMOVE:
+ uffd_reg.range.start = msg.arg.remove.start;
+ uffd_reg.range.len = msg.arg.remove.end -
+ msg.arg.remove.start;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+ err("remove failure");
+ break;
+ case UFFD_EVENT_REMAP:
+ area_remap = area_dst; /* save for later unmap */
+ area_dst = (char *)(unsigned long)msg.arg.remap.to;
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+ unsigned long offset)
+{
+ uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+ uffdio_copy->len,
+ offset);
+ if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy->copy != -EEXIST)
+ err("UFFDIO_COPY retry error: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ } else {
+ err("UFFDIO_COPY retry unexpected: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ }
+}
+
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+ struct uffdio_range uffdio_wake;
+
+ uffdio_wake.start = addr;
+ uffdio_wake.len = len;
+
+ if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
+ fprintf(stderr, "error waking %lu\n",
+ addr), exit(1);
+}
+
+int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
+{
+ struct uffdio_copy uffdio_copy;
+
+ if (offset >= nr_pages * page_size)
+ err("unexpected offset %lu\n", offset);
+ uffdio_copy.dst = (unsigned long) area_dst + offset;
+ uffdio_copy.src = (unsigned long) area_src + offset;
+ uffdio_copy.len = page_size;
+ if (wp)
+ uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+ else
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy.copy != -EEXIST)
+ err("UFFDIO_COPY error: %"PRId64,
+ (int64_t)uffdio_copy.copy);
+ wake_range(ufd, uffdio_copy.dst, page_size);
+ } else if (uffdio_copy.copy != page_size) {
+ err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
+ } else {
+ if (test_uffdio_copy_eexist && retry) {
+ test_uffdio_copy_eexist = false;
+ retry_copy_page(ufd, &uffdio_copy, offset);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int copy_page(int ufd, unsigned long offset, bool wp)
+{
+ return __copy_page(ufd, offset, false, wp);
+}
diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h
new file mode 100644
index 000000000000..6068f2346b86
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-common.h
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd tests common header
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+#ifndef __UFFD_COMMON_H__
+#define __UFFD_COMMON_H__
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <sys/random.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+#define _err(fmt, ...) \
+ do { \
+ int ret = errno; \
+ fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
+ fprintf(stderr, " (errno=%d, @%s:%d)\n", \
+ ret, __FILE__, __LINE__); \
+ } while (0)
+
+#define errexit(exitcode, fmt, ...) \
+ do { \
+ _err(fmt, ##__VA_ARGS__); \
+ exit(exitcode); \
+ } while (0)
+
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+/* Userfaultfd test statistics */
+struct uffd_args {
+ int cpu;
+ /* Whether apply wr-protects when installing pages */
+ bool apply_wp;
+ unsigned long missing_faults;
+ unsigned long wp_faults;
+ unsigned long minor_faults;
+};
+
+struct uffd_test_ops {
+ int (*allocate_area)(void **alloc_area, bool is_src);
+ void (*release_pages)(char *rel_area);
+ void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+ void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
+};
+typedef struct uffd_test_ops uffd_test_ops_t;
+
+extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+extern int uffd, uffd_flags, finished, *pipefd, test_type;
+extern bool map_shared;
+extern bool test_uffdio_wp;
+extern unsigned long long *count_verify;
+extern volatile bool test_uffdio_copy_eexist;
+
+extern uffd_test_ops_t anon_uffd_test_ops;
+extern uffd_test_ops_t shmem_uffd_test_ops;
+extern uffd_test_ops_t hugetlb_uffd_test_ops;
+extern uffd_test_ops_t *uffd_test_ops;
+
+void uffd_stats_report(struct uffd_args *args, int n_cpus);
+int uffd_test_ctx_init(uint64_t features, const char **errmsg);
+int userfaultfd_open(uint64_t *features);
+int uffd_read_msg(int ufd, struct uffd_msg *msg);
+void wp_range(int ufd, __u64 start, __u64 len, bool wp);
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args);
+int __copy_page(int ufd, unsigned long offset, bool retry, bool wp);
+int copy_page(int ufd, unsigned long offset, bool wp);
+void *uffd_poll_thread(void *arg);
+
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+
+#endif
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
new file mode 100644
index 000000000000..f1ad9eef1c3a
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes cares of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ */
+
+#include "uffd-common.h"
+
+#ifdef __NR_userfaultfd
+
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+static int bounces;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static char *zeropage;
+pthread_attr_t attr;
+
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+const char *examples =
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./userfaultfd anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./userfaultfd shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+ "./userfaultfd hugetlb 256 50\n\n"
+ "# Run the same hugetlb test but using private file:\n"
+ "./userfaultfd hugetlb-private 256 50\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
+static void usage(void)
+{
+ fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces>\n\n");
+ fprintf(stderr, "Supported <test type>: anon, hugetlb, "
+ "hugetlb-private, shmem, shmem-private\n\n");
+ fprintf(stderr, "Examples:\n\n");
+ fprintf(stderr, "%s", examples);
+ exit(1);
+}
+
+static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus)
+{
+ int i;
+
+ for (i = 0; i < n_cpus; i++) {
+ args[i].cpu = i;
+ args[i].apply_wp = test_uffdio_wp;
+ args[i].missing_faults = 0;
+ args[i].wp_faults = 0;
+ args[i].minor_faults = 0;
+ }
+}
+
+static inline uint64_t uffd_minor_feature(void)
+{
+ if (test_type == TEST_HUGETLB && map_shared)
+ return UFFD_FEATURE_MINOR_HUGETLBFS;
+ else if (test_type == TEST_SHMEM)
+ return UFFD_FEATURE_MINOR_SHMEM;
+ else
+ return 0;
+}
+
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr;
+ unsigned long long count;
+
+ if (!(bounces & BOUNCE_RANDOM)) {
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
+ err("getrandom failed");
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr])
+ err("page_nr %lu memory corruption %llu %llu",
+ page_nr, count, count_verify[page_nr]);
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+ }
+
+ return NULL;
+}
+
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+ return __copy_page(ufd, offset, true, test_uffdio_wp);
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+ struct uffd_args *args = (struct uffd_args *)arg;
+ struct uffd_msg msg;
+
+ pthread_mutex_unlock(&uffd_read_mutex);
+ /* from here cancellation is ok */
+
+ for (;;) {
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ uffd_handle_page_fault(&msg, args);
+ }
+
+ return NULL;
+}
+
+static void *background_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+ start_nr = cpu * nr_pages_per_cpu;
+ end_nr = (cpu+1) * nr_pages_per_cpu;
+ mid_nr = (start_nr + end_nr) / 2;
+
+ /* Copy the first half of the pages */
+ for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ /*
+ * If we need to test uffd-wp, set it up now. Then we'll have
+ * at least the first half of the pages mapped already which
+ * can be write-protected for testing
+ */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+ nr_pages_per_cpu * page_size, true);
+
+ /*
+ * Continue the 2nd half of the page copying, handling write
+ * protection faults if any
+ */
+ for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ return NULL;
+}
+
+static int stress(struct uffd_args *args)
+{
+ unsigned long cpu;
+ pthread_t locking_threads[nr_cpus];
+ pthread_t uffd_threads[nr_cpus];
+ pthread_t background_threads[nr_cpus];
+
+ finished = 0;
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pthread_create(&locking_threads[cpu], &attr,
+ locking_thread, (void *)cpu))
+ return 1;
+ if (bounces & BOUNCE_POLL) {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_poll_thread,
+ (void *)&args[cpu]))
+ return 1;
+ } else {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_read_thread,
+ (void *)&args[cpu]))
+ return 1;
+ pthread_mutex_lock(&uffd_read_mutex);
+ }
+ if (pthread_create(&background_threads[cpu], &attr,
+ background_thread, (void *)cpu))
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(background_threads[cpu], NULL))
+ return 1;
+
+ /*
+ * Be strict and immediately zap area_src, the whole area has
+ * been transferred already by the background treads. The
+ * area_src could then be faulted in a racy way by still
+ * running uffdio_threads reading zeropages after we zapped
+ * area_src (but they're guaranteed to get -EEXIST from
+ * UFFDIO_COPY without writing zero pages into area_dst
+ * because the background threads already completed).
+ */
+ uffd_test_ops->release_pages(area_src);
+
+ finished = 1;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pthread_join(locking_threads[cpu], NULL))
+ return 1;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ char c;
+ if (bounces & BOUNCE_POLL) {
+ if (write(pipefd[cpu*2+1], &c, 1) != 1)
+ err("pipefd write error");
+ if (pthread_join(uffd_threads[cpu],
+ (void *)&args[cpu]))
+ return 1;
+ } else {
+ if (pthread_cancel(uffd_threads[cpu]))
+ return 1;
+ if (pthread_join(uffd_threads[cpu], NULL))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int userfaultfd_stress(void)
+{
+ void *area;
+ unsigned long nr;
+ struct uffd_args args[nr_cpus];
+ uint64_t mem_size = nr_pages * page_size;
+
+ if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL))
+ err("context init failed");
+
+ if (posix_memalign(&area, page_size, page_size))
+ err("out of memory");
+ zeropage = area;
+ bzero(zeropage, page_size);
+
+ pthread_mutex_lock(&uffd_read_mutex);
+
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+ while (bounces--) {
+ printf("bounces: %d, mode:", bounces);
+ if (bounces & BOUNCE_RANDOM)
+ printf(" rnd");
+ if (bounces & BOUNCE_RACINGFAULTS)
+ printf(" racing");
+ if (bounces & BOUNCE_VERIFY)
+ printf(" ver");
+ if (bounces & BOUNCE_POLL)
+ printf(" poll");
+ else
+ printf(" read");
+ printf(", ");
+ fflush(stdout);
+
+ if (bounces & BOUNCE_POLL)
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ else
+ fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+ /* register */
+ if (uffd_register(uffd, area_dst, mem_size,
+ true, test_uffdio_wp, false))
+ err("register failure");
+
+ if (area_dst_alias) {
+ if (uffd_register(uffd, area_dst_alias, mem_size,
+ true, test_uffdio_wp, false))
+ err("register failure alias");
+ }
+
+ /*
+ * The madvise done previously isn't enough: some
+ * uffd_thread could have read userfaults (one of
+ * those already resolved by the background thread)
+ * and it may be in the process of calling
+ * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+ * area_src and it would map a zero page in it (of
+ * course such a UFFDIO_COPY is perfectly safe as it'd
+ * return -EEXIST). The problem comes at the next
+ * bounce though: that racing UFFDIO_COPY would
+ * generate zeropages in the area_src, so invalidating
+ * the previous MADV_DONTNEED. Without this additional
+ * MADV_DONTNEED those zeropages leftovers in the
+ * area_src would lead to -EEXIST failure during the
+ * next bounce, effectively leaving a zeropage in the
+ * area_dst.
+ *
+ * Try to comment this out madvise to see the memory
+ * corruption being caught pretty quick.
+ *
+ * khugepaged is also inhibited to collapse THP after
+ * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+ * required to MADV_DONTNEED here.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
+ uffd_stats_reset(args, nr_cpus);
+
+ /* bounce pass */
+ if (stress(args))
+ return 1;
+
+ /* Clear all the write protections if there is any */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst,
+ nr_pages * page_size, false);
+
+ /* unregister */
+ if (uffd_unregister(uffd, area_dst, mem_size))
+ err("unregister failure");
+ if (area_dst_alias) {
+ if (uffd_unregister(uffd, area_dst_alias, mem_size))
+ err("unregister failure alias");
+ }
+
+ /* verification */
+ if (bounces & BOUNCE_VERIFY)
+ for (nr = 0; nr < nr_pages; nr++)
+ if (*area_count(area_dst, nr) != count_verify[nr])
+ err("error area_count %llu %llu %lu\n",
+ *area_count(area_src, nr),
+ count_verify[nr], nr);
+
+ /* prepare next bounce */
+ swap(area_src, area_dst);
+
+ swap(area_src_alias, area_dst_alias);
+
+ uffd_stats_report(args, nr_cpus);
+ }
+
+ return 0;
+}
+
+static void set_test_type(const char *type)
+{
+ if (!strcmp(type, "anon")) {
+ test_type = TEST_ANON;
+ uffd_test_ops = &anon_uffd_test_ops;
+ } else if (!strcmp(type, "hugetlb")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ map_shared = true;
+ } else if (!strcmp(type, "hugetlb-private")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ } else if (!strcmp(type, "shmem")) {
+ map_shared = true;
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ } else if (!strcmp(type, "shmem-private")) {
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ }
+}
+
+static void parse_test_type_arg(const char *raw_type)
+{
+ uint64_t features = UFFD_API_FEATURES;
+
+ set_test_type(raw_type);
+
+ if (!test_type)
+ err("failed to parse test type argument: '%s'", raw_type);
+
+ if (test_type == TEST_HUGETLB)
+ page_size = default_huge_page_size();
+ else
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ if (!page_size)
+ err("Unable to determine page size");
+ if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+ > page_size)
+ err("Impossible to run this test");
+
+ /*
+ * Whether we can test certain features depends not just on test type,
+ * but also on whether or not this particular kernel supports the
+ * feature.
+ */
+
+ if (userfaultfd_open(&features))
+ err("Userfaultfd open failed");
+
+ test_uffdio_wp = test_uffdio_wp &&
+ (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+
+ close(uffd);
+ uffd = -1;
+}
+
+static void sigalrm(int sig)
+{
+ if (sig != SIGALRM)
+ abort();
+ test_uffdio_copy_eexist = true;
+ alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+ size_t bytes;
+
+ if (argc < 4)
+ usage();
+
+ if (signal(SIGALRM, sigalrm) == SIG_ERR)
+ err("failed to arm SIGALRM");
+ alarm(ALARM_INTERVAL_SECS);
+
+ parse_test_type_arg(argv[1]);
+ bytes = atol(argv[2]) * 1024 * 1024;
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ nr_pages_per_cpu = bytes / page_size / nr_cpus;
+ if (!nr_pages_per_cpu) {
+ _err("invalid MiB");
+ usage();
+ }
+
+ bounces = atoi(argv[3]);
+ if (bounces <= 0) {
+ _err("invalid bounces");
+ usage();
+ }
+ nr_pages = nr_pages_per_cpu * nr_cpus;
+
+ printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+ nr_pages, nr_pages_per_cpu);
+ return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
new file mode 100644
index 000000000000..269c86768a02
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -0,0 +1,1228 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd unit tests.
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+
+#include "uffd-common.h"
+
+#include "../../../../mm/gup_test.h"
+
+#ifdef __NR_userfaultfd
+
+/* The unit test doesn't need a large or random size, make it 32MB for now */
+#define UFFD_TEST_MEM_SIZE (32UL << 20)
+
+#define MEM_ANON BIT_ULL(0)
+#define MEM_SHMEM BIT_ULL(1)
+#define MEM_SHMEM_PRIVATE BIT_ULL(2)
+#define MEM_HUGETLB BIT_ULL(3)
+#define MEM_HUGETLB_PRIVATE BIT_ULL(4)
+
+#define MEM_ALL (MEM_ANON | MEM_SHMEM | MEM_SHMEM_PRIVATE | \
+ MEM_HUGETLB | MEM_HUGETLB_PRIVATE)
+
+struct mem_type {
+ const char *name;
+ unsigned int mem_flag;
+ uffd_test_ops_t *mem_ops;
+ bool shared;
+};
+typedef struct mem_type mem_type_t;
+
+mem_type_t mem_types[] = {
+ {
+ .name = "anon",
+ .mem_flag = MEM_ANON,
+ .mem_ops = &anon_uffd_test_ops,
+ .shared = false,
+ },
+ {
+ .name = "shmem",
+ .mem_flag = MEM_SHMEM,
+ .mem_ops = &shmem_uffd_test_ops,
+ .shared = true,
+ },
+ {
+ .name = "shmem-private",
+ .mem_flag = MEM_SHMEM_PRIVATE,
+ .mem_ops = &shmem_uffd_test_ops,
+ .shared = false,
+ },
+ {
+ .name = "hugetlb",
+ .mem_flag = MEM_HUGETLB,
+ .mem_ops = &hugetlb_uffd_test_ops,
+ .shared = true,
+ },
+ {
+ .name = "hugetlb-private",
+ .mem_flag = MEM_HUGETLB_PRIVATE,
+ .mem_ops = &hugetlb_uffd_test_ops,
+ .shared = false,
+ },
+};
+
+/* Arguments to be passed over to each uffd unit test */
+struct uffd_test_args {
+ mem_type_t *mem_type;
+};
+typedef struct uffd_test_args uffd_test_args_t;
+
+/* Returns: UFFD_TEST_* */
+typedef void (*uffd_test_fn)(uffd_test_args_t *);
+
+typedef struct {
+ const char *name;
+ uffd_test_fn uffd_fn;
+ unsigned int mem_targets;
+ uint64_t uffd_feature_required;
+} uffd_test_case_t;
+
+static void uffd_test_report(void)
+{
+ printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n",
+ ksft_get_pass_cnt(),
+ ksft_get_xskip_cnt(),
+ ksft_get_fail_cnt(),
+ ksft_test_num());
+}
+
+static void uffd_test_pass(void)
+{
+ printf("done\n");
+ ksft_inc_pass_cnt();
+}
+
+#define uffd_test_start(...) do { \
+ printf("Testing "); \
+ printf(__VA_ARGS__); \
+ printf("... "); \
+ fflush(stdout); \
+ } while (0)
+
+#define uffd_test_fail(...) do { \
+ printf("failed [reason: "); \
+ printf(__VA_ARGS__); \
+ printf("]\n"); \
+ ksft_inc_fail_cnt(); \
+ } while (0)
+
+#define uffd_test_skip(...) do { \
+ printf("skipped [reason: "); \
+ printf(__VA_ARGS__); \
+ printf("]\n"); \
+ ksft_inc_xskip_cnt(); \
+ } while (0)
+
+/*
+ * Returns 1 if specific userfaultfd supported, 0 otherwise. Note, we'll
+ * return 1 even if some test failed as long as uffd supported, because in
+ * that case we still want to proceed with the rest uffd unit tests.
+ */
+static int test_uffd_api(bool use_dev)
+{
+ struct uffdio_api uffdio_api;
+ int uffd;
+
+ uffd_test_start("UFFDIO_API (with %s)",
+ use_dev ? "/dev/userfaultfd" : "syscall");
+
+ if (use_dev)
+ uffd = uffd_open_dev(UFFD_FLAGS);
+ else
+ uffd = uffd_open_sys(UFFD_FLAGS);
+ if (uffd < 0) {
+ uffd_test_skip("cannot open userfaultfd handle");
+ return 0;
+ }
+
+ /* Test wrong UFFD_API */
+ uffdio_api.api = 0xab;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should fail with wrong api but didn't");
+ goto out;
+ }
+
+ /* Test wrong feature bit */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = BIT_ULL(63);
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should fail with wrong feature but didn't");
+ goto out;
+ }
+
+ /* Test normal UFFDIO_API */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+ uffd_test_fail("UFFDIO_API should succeed but failed");
+ goto out;
+ }
+
+ /* Test double requests of UFFDIO_API with a random feature set */
+ uffdio_api.features = BIT_ULL(0);
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should reject initialized uffd");
+ goto out;
+ }
+
+ uffd_test_pass();
+out:
+ close(uffd);
+ /* We have a valid uffd handle */
+ return 1;
+}
+
+/*
+ * This function initializes the global variables. TODO: remove global
+ * vars and then remove this.
+ */
+static int
+uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test,
+ mem_type_t *mem_type, const char **errmsg)
+{
+ map_shared = mem_type->shared;
+ uffd_test_ops = mem_type->mem_ops;
+
+ if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB))
+ page_size = default_huge_page_size();
+ else
+ page_size = psize();
+
+ nr_pages = UFFD_TEST_MEM_SIZE / page_size;
+ /* TODO: remove this global var.. it's so ugly */
+ nr_cpus = 1;
+
+ /* Initialize test arguments */
+ args->mem_type = mem_type;
+
+ return uffd_test_ctx_init(test->uffd_feature_required, errmsg);
+}
+
+static bool uffd_feature_supported(uffd_test_case_t *test)
+{
+ uint64_t features;
+
+ if (uffd_get_features(&features))
+ return false;
+
+ return (features & test->uffd_feature_required) ==
+ test->uffd_feature_required;
+}
+
+static int pagemap_open(void)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+
+ if (fd < 0)
+ err("open pagemap");
+
+ return fd;
+}
+
+/* This macro let __LINE__ works in err() */
+#define pagemap_check_wp(value, wp) do { \
+ if (!!(value & PM_UFFD_WP) != wp) \
+ err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
+ } while (0)
+
+typedef struct {
+ int parent_uffd, child_uffd;
+} fork_event_args;
+
+static void *fork_event_consumer(void *data)
+{
+ fork_event_args *args = data;
+ struct uffd_msg msg = { 0 };
+
+ /* Read until a full msg received */
+ while (uffd_read_msg(args->parent_uffd, &msg));
+
+ if (msg.event != UFFD_EVENT_FORK)
+ err("wrong message: %u\n", msg.event);
+
+ /* Just to be properly freed later */
+ args->child_uffd = msg.arg.fork.ufd;
+ return NULL;
+}
+
+typedef struct {
+ int gup_fd;
+ bool pinned;
+} pin_args;
+
+/*
+ * Returns 0 if succeed, <0 for errors. pin_pages() needs to be paired
+ * with unpin_pages(). Currently it needs to be RO longterm pin to satisfy
+ * all needs of the test cases (e.g., trigger unshare, trigger fork() early
+ * CoW, etc.).
+ */
+static int pin_pages(pin_args *args, void *buffer, size_t size)
+{
+ struct pin_longterm_test test = {
+ .addr = (uintptr_t)buffer,
+ .size = size,
+ /* Read-only pins */
+ .flags = 0,
+ };
+
+ if (args->pinned)
+ err("already pinned");
+
+ args->gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ if (args->gup_fd < 0)
+ return -errno;
+
+ if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_START, &test)) {
+ /* Even if gup_test existed, can be an old gup_test / kernel */
+ close(args->gup_fd);
+ return -errno;
+ }
+ args->pinned = true;
+ return 0;
+}
+
+static void unpin_pages(pin_args *args)
+{
+ if (!args->pinned)
+ err("unpin without pin first");
+ if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_STOP))
+ err("PIN_LONGTERM_TEST_STOP");
+ close(args->gup_fd);
+ args->pinned = false;
+}
+
+static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
+{
+ fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 };
+ pthread_t thread;
+ pid_t child;
+ uint64_t value;
+ int fd, result;
+
+ /* Prepare a thread to resolve EVENT_FORK */
+ if (with_event) {
+ if (pthread_create(&thread, NULL, fork_event_consumer, &args))
+ err("pthread_create()");
+ }
+
+ child = fork();
+ if (!child) {
+ /* Open the pagemap fd of the child itself */
+ pin_args args = {};
+
+ fd = pagemap_open();
+
+ if (test_pin && pin_pages(&args, area_dst, page_size))
+ /*
+ * Normally when reach here we have pinned in
+ * previous tests, so shouldn't fail anymore
+ */
+ err("pin page failed in child");
+
+ value = pagemap_get_entry(fd, area_dst);
+ /*
+ * After fork(), we should handle uffd-wp bit differently:
+ *
+ * (1) when with EVENT_FORK, it should persist
+ * (2) when without EVENT_FORK, it should be dropped
+ */
+ pagemap_check_wp(value, with_event);
+ if (test_pin)
+ unpin_pages(&args);
+ /* Succeed */
+ exit(0);
+ }
+ waitpid(child, &result, 0);
+
+ if (with_event) {
+ if (pthread_join(thread, NULL))
+ err("pthread_join()");
+ if (args.child_uffd < 0)
+ err("Didn't receive child uffd");
+ close(args.child_uffd);
+ }
+
+ return result;
+}
+
+static void uffd_wp_unpopulated_test(uffd_test_args_t *args)
+{
+ uint64_t value;
+ int pagemap_fd;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Test applying pte marker to anon unpopulated */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+
+ /* Test unprotect on anon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test zap on anon marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test fault in after marker removed */
+ *area_dst = 1;
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+
+ /* Test read-zero-page upon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ *(volatile char *)area_dst;
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+
+ uffd_test_pass();
+}
+
+static void uffd_wp_fork_test_common(uffd_test_args_t *args,
+ bool with_event)
+{
+ int pagemap_fd;
+ uint64_t value;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Touch the page */
+ *area_dst = 1;
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in child in present pte",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+
+ /*
+ * This is an attempt for zapping the pgtable so as to test the
+ * markers.
+ *
+ * For private mappings, PAGEOUT will only work on exclusive ptes
+ * (PM_MMAP_EXCLUSIVE) which we should satisfy.
+ *
+ * For shared, PAGEOUT may not work. Use DONTNEED instead which
+ * plays a similar role of zapping (rather than freeing the page)
+ * to expose pte markers.
+ */
+ if (args->mem_type->shared) {
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("MADV_DONTNEED");
+ } else {
+ /*
+ * NOTE: ignore retval because private-hugetlb doesn't yet
+ * support swapping, so it could fail.
+ */
+ madvise(area_dst, page_size, MADV_PAGEOUT);
+ }
+
+ /* Uffd-wp should persist even swapped out */
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+
+ /* Unprotect; this tests swap pte modifications */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Fault in the page from disk */
+ *area_dst = 2;
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+ uffd_test_pass();
+out:
+ if (uffd_unregister(uffd, area_dst, nr_pages * page_size))
+ err("unregister failed");
+ close(pagemap_fd);
+}
+
+static void uffd_wp_fork_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_test_common(args, false);
+}
+
+static void uffd_wp_fork_with_event_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_test_common(args, true);
+}
+
+static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args,
+ bool with_event)
+{
+ int pagemap_fd;
+ pin_args pin_args = {};
+
+ if (uffd_register(uffd, area_dst, page_size, false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Touch the page */
+ *area_dst = 1;
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+
+ /*
+ * 1. First pin, then fork(). This tests fork() special path when
+ * doing early CoW if the page is private.
+ */
+ if (pin_pages(&pin_args, area_dst, page_size)) {
+ uffd_test_skip("Possibly CONFIG_GUP_TEST missing "
+ "or unprivileged");
+ close(pagemap_fd);
+ uffd_unregister(uffd, area_dst, page_size);
+ return;
+ }
+
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()",
+ with_event ? "missing" : "stall");
+ unpin_pages(&pin_args);
+ goto out;
+ }
+
+ unpin_pages(&pin_args);
+
+ /*
+ * 2. First fork(), then pin (in the child, where test_pin==true).
+ * This tests COR, aka, page unsharing on private memories.
+ */
+ if (pagemap_test_fork(uffd, with_event, true)) {
+ uffd_test_fail("Detected %s uffd-wp bit when RO pin",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+ uffd_test_pass();
+out:
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("register failed");
+ close(pagemap_fd);
+}
+
+static void uffd_wp_fork_pin_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_pin_test_common(args, false);
+}
+
+static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_pin_test_common(args, true);
+}
+
+static void check_memory_contents(char *p)
+{
+ unsigned long i, j;
+ uint8_t expected_byte;
+
+ for (i = 0; i < nr_pages; ++i) {
+ expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
+ for (j = 0; j < page_size; j++) {
+ uint8_t v = *(uint8_t *)(p + (i * page_size) + j);
+ if (v != expected_byte)
+ err("unexpected page contents");
+ }
+ }
+}
+
+static void uffd_minor_test_common(bool test_collapse, bool test_wp)
+{
+ unsigned long p;
+ pthread_t uffd_mon;
+ char c;
+ struct uffd_args args = { 0 };
+
+ /*
+ * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing
+ * both do not make much sense.
+ */
+ assert(!(test_collapse && test_wp));
+
+ if (uffd_register(uffd, area_dst_alias, nr_pages * page_size,
+ /* NOTE! MADV_COLLAPSE may not work with uffd-wp */
+ false, test_wp, true))
+ err("register failure");
+
+ /*
+ * After registering with UFFD, populate the non-UFFD-registered side of
+ * the shared mapping. This should *not* trigger any UFFD minor faults.
+ */
+ for (p = 0; p < nr_pages; ++p)
+ memset(area_dst + (p * page_size), p % ((uint8_t)-1),
+ page_size);
+
+ args.apply_wp = test_wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ /*
+ * Read each of the pages back using the UFFD-registered mapping. We
+ * expect that the first time we touch a page, it will result in a minor
+ * fault. uffd_poll_thread will resolve the fault by bit-flipping the
+ * page's contents, and then issuing a CONTINUE ioctl.
+ */
+ check_memory_contents(area_dst_alias);
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+
+ if (test_collapse) {
+ if (madvise(area_dst_alias, nr_pages * page_size,
+ MADV_COLLAPSE)) {
+ /* It's fine to fail for this one... */
+ uffd_test_skip("MADV_COLLAPSE failed");
+ return;
+ }
+
+ uffd_test_ops->check_pmd_mapping(area_dst,
+ nr_pages * page_size /
+ read_pmd_pagesize());
+ /*
+ * This won't cause uffd-fault - it purely just makes sure there
+ * was no corruption.
+ */
+ check_memory_contents(area_dst_alias);
+ }
+
+ if (args.missing_faults != 0 || args.minor_faults != nr_pages)
+ uffd_test_fail("stats check error");
+ else
+ uffd_test_pass();
+}
+
+void uffd_minor_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(false, false);
+}
+
+void uffd_minor_wp_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(false, true);
+}
+
+void uffd_minor_collapse_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(true, false);
+}
+
+static sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+ if (sig == SIGBUS) {
+ if (sigbuf)
+ siglongjmp(*sigbuf, 1);
+ abort();
+ }
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+static int faulting_process(int signal_test, bool wp)
+{
+ unsigned long nr, i;
+ unsigned long long count;
+ unsigned long split_nr_pages;
+ unsigned long lastnr;
+ struct sigaction act;
+ volatile unsigned long signalled = 0;
+
+ split_nr_pages = (nr_pages + 1) / 2;
+
+ if (signal_test) {
+ sigbuf = &jbuf;
+ memset(&act, 0, sizeof(act));
+ act.sa_sigaction = sighndl;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &act, 0))
+ err("sigaction");
+ lastnr = (unsigned long)-1;
+ }
+
+ for (nr = 0; nr < split_nr_pages; nr++) {
+ volatile int steps = 1;
+ unsigned long offset = nr * page_size;
+
+ if (signal_test) {
+ if (sigsetjmp(*sigbuf, 1) != 0) {
+ if (steps == 1 && nr == lastnr)
+ err("Signal repeated");
+
+ lastnr = nr;
+ if (signal_test == 1) {
+ if (steps == 1) {
+ /* This is a MISSING request */
+ steps++;
+ if (copy_page(uffd, offset, wp))
+ signalled++;
+ } else {
+ /* This is a WP request */
+ assert(steps == 2);
+ wp_range(uffd,
+ (__u64)area_dst +
+ offset,
+ page_size, false);
+ }
+ } else {
+ signalled++;
+ continue;
+ }
+ }
+ }
+
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr])
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
+ /*
+ * Trigger write protection if there is by writing
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
+ }
+
+ if (signal_test)
+ return signalled != split_nr_pages;
+
+ area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+ if (area_dst == MAP_FAILED)
+ err("mremap");
+ /* Reset area_src since we just clobbered it */
+ area_src = NULL;
+
+ for (; nr < nr_pages; nr++) {
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr]) {
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
+ }
+ /*
+ * Trigger write protection if there is by writing
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
+ }
+
+ uffd_test_ops->release_pages(area_dst);
+
+ for (nr = 0; nr < nr_pages; nr++)
+ for (i = 0; i < page_size; i++)
+ if (*(area_dst + nr * page_size + i) != 0)
+ err("page %lu offset %lu is not zero", nr, i);
+
+ return 0;
+}
+
+static void uffd_sigbus_test_common(bool wp)
+{
+ unsigned long userfaults;
+ pthread_t uffd_mon;
+ pid_t pid;
+ int err;
+ char c;
+ struct uffd_args args = { 0 };
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, wp, false))
+ err("register failure");
+
+ if (faulting_process(1, wp))
+ err("faulting process failed");
+
+ uffd_test_ops->release_pages(area_dst);
+
+ args.apply_wp = wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ pid = fork();
+ if (pid < 0)
+ err("fork");
+
+ if (!pid)
+ exit(faulting_process(2, wp));
+
+ waitpid(pid, &err, 0);
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, (void **)&userfaults))
+ err("pthread_join()");
+
+ if (userfaults)
+ uffd_test_fail("Signal test failed, userfaults: %ld", userfaults);
+ else
+ uffd_test_pass();
+}
+
+static void uffd_sigbus_test(uffd_test_args_t *args)
+{
+ uffd_sigbus_test_common(false);
+}
+
+static void uffd_sigbus_wp_test(uffd_test_args_t *args)
+{
+ uffd_sigbus_test_common(true);
+}
+
+static void uffd_events_test_common(bool wp)
+{
+ pthread_t uffd_mon;
+ pid_t pid;
+ int err;
+ char c;
+ struct uffd_args args = { 0 };
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, wp, false))
+ err("register failure");
+
+ args.apply_wp = wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ pid = fork();
+ if (pid < 0)
+ err("fork");
+
+ if (!pid)
+ exit(faulting_process(0, wp));
+
+ waitpid(pid, &err, 0);
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("pthread_join()");
+
+ if (args.missing_faults != nr_pages)
+ uffd_test_fail("Fault counts wrong");
+ else
+ uffd_test_pass();
+}
+
+static void uffd_events_test(uffd_test_args_t *args)
+{
+ uffd_events_test_common(false);
+}
+
+static void uffd_events_wp_test(uffd_test_args_t *args)
+{
+ uffd_events_test_common(true);
+}
+
+static void retry_uffdio_zeropage(int ufd,
+ struct uffdio_zeropage *uffdio_zeropage)
+{
+ uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+ uffdio_zeropage->range.len,
+ 0);
+ if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+ if (uffdio_zeropage->zeropage != -EEXIST)
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
+ } else {
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
+ }
+}
+
+static bool do_uffdio_zeropage(int ufd, bool has_zeropage)
+{
+ struct uffdio_zeropage uffdio_zeropage = { 0 };
+ int ret;
+ __s64 res;
+
+ uffdio_zeropage.range.start = (unsigned long) area_dst;
+ uffdio_zeropage.range.len = page_size;
+ uffdio_zeropage.mode = 0;
+ ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+ res = uffdio_zeropage.zeropage;
+ if (ret) {
+ /* real retval in ufdio_zeropage.zeropage */
+ if (has_zeropage)
+ err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
+ else if (res != -EINVAL)
+ err("UFFDIO_ZEROPAGE not -EINVAL");
+ } else if (has_zeropage) {
+ if (res != page_size)
+ err("UFFDIO_ZEROPAGE unexpected size");
+ else
+ retry_uffdio_zeropage(ufd, &uffdio_zeropage);
+ return true;
+ } else
+ err("UFFDIO_ZEROPAGE succeeded");
+
+ return false;
+}
+
+/*
+ * Registers a range with MISSING mode only for zeropage test. Return true
+ * if UFFDIO_ZEROPAGE supported, false otherwise. Can't use uffd_register()
+ * because we want to detect .ioctls along the way.
+ */
+static bool
+uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len)
+{
+ uint64_t ioctls = 0;
+
+ if (uffd_register_with_ioctls(uffd, addr, len, true,
+ false, false, &ioctls))
+ err("zeropage register fail");
+
+ return ioctls & (1 << _UFFDIO_ZEROPAGE);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+static void uffd_zeropage_test(uffd_test_args_t *args)
+{
+ bool has_zeropage;
+ int i;
+
+ has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size);
+ if (area_dst_alias)
+ /* Ignore the retval; we already have it */
+ uffd_register_detect_zeropage(uffd, area_dst_alias, page_size);
+
+ if (do_uffdio_zeropage(uffd, has_zeropage))
+ for (i = 0; i < page_size; i++)
+ if (area_dst[i] != 0)
+ err("data non-zero at offset %d\n", i);
+
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("unregister");
+
+ if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size))
+ err("unregister");
+
+ uffd_test_pass();
+}
+
+/*
+ * Test the returned uffdio_register.ioctls with different register modes.
+ * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test.
+ */
+static void
+do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor)
+{
+ uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE);
+ mem_type_t *mem_type = args->mem_type;
+ int ret;
+
+ ret = uffd_register_with_ioctls(uffd, area_dst, page_size,
+ miss, wp, minor, &ioctls);
+
+ /*
+ * Handle special cases of UFFDIO_REGISTER here where it should
+ * just fail with -EINVAL first..
+ *
+ * Case 1: register MINOR on anon
+ * Case 2: register with no mode selected
+ */
+ if ((minor && (mem_type->mem_flag == MEM_ANON)) ||
+ (!miss && !wp && !minor)) {
+ if (ret != -EINVAL)
+ err("register (miss=%d, wp=%d, minor=%d) failed "
+ "with wrong errno=%d", miss, wp, minor, ret);
+ return;
+ }
+
+ /* UFFDIO_REGISTER should succeed, then check ioctls returned */
+ if (miss)
+ expected |= BIT_ULL(_UFFDIO_COPY);
+ if (wp)
+ expected |= BIT_ULL(_UFFDIO_WRITEPROTECT);
+ if (minor)
+ expected |= BIT_ULL(_UFFDIO_CONTINUE);
+
+ if ((ioctls & expected) != expected)
+ err("unexpected uffdio_register.ioctls "
+ "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", "
+ "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls);
+
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("unregister");
+}
+
+static void uffd_register_ioctls_test(uffd_test_args_t *args)
+{
+ int miss, wp, minor;
+
+ for (miss = 0; miss <= 1; miss++)
+ for (wp = 0; wp <= 1; wp++)
+ for (minor = 0; minor <= 1; minor++)
+ do_register_ioctls_test(args, miss, wp, minor);
+
+ uffd_test_pass();
+}
+
+uffd_test_case_t uffd_tests[] = {
+ {
+ /* Test returned uffdio_register.ioctls. */
+ .name = "register-ioctls",
+ .uffd_fn = uffd_register_ioctls_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_MISSING_HUGETLBFS |
+ UFFD_FEATURE_MISSING_SHMEM |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ UFFD_FEATURE_MINOR_HUGETLBFS |
+ UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "zeropage",
+ .uffd_fn = uffd_zeropage_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = 0,
+ },
+ {
+ .name = "wp-fork",
+ .uffd_fn = uffd_wp_fork_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "wp-fork-with-event",
+ .uffd_fn = uffd_wp_fork_with_event_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ /* when set, child process should inherit uffd-wp bits */
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "wp-fork-pin",
+ .uffd_fn = uffd_wp_fork_pin_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "wp-fork-pin-with-event",
+ .uffd_fn = uffd_wp_fork_pin_with_event_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ /* when set, child process should inherit uffd-wp bits */
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "wp-unpopulated",
+ .uffd_fn = uffd_wp_unpopulated_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required =
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED,
+ },
+ {
+ .name = "minor",
+ .uffd_fn = uffd_minor_test,
+ .mem_targets = MEM_SHMEM | MEM_HUGETLB,
+ .uffd_feature_required =
+ UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "minor-wp",
+ .uffd_fn = uffd_minor_wp_test,
+ .mem_targets = MEM_SHMEM | MEM_HUGETLB,
+ .uffd_feature_required =
+ UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ /*
+ * HACK: here we leveraged WP_UNPOPULATED to detect whether
+ * minor mode supports wr-protect. There's no feature flag
+ * for it so this is the best we can test against.
+ */
+ UFFD_FEATURE_WP_UNPOPULATED,
+ },
+ {
+ .name = "minor-collapse",
+ .uffd_fn = uffd_minor_collapse_test,
+ /* MADV_COLLAPSE only works with shmem */
+ .mem_targets = MEM_SHMEM,
+ /* We can't test MADV_COLLAPSE, so try our luck */
+ .uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "sigbus",
+ .uffd_fn = uffd_sigbus_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_SIGBUS |
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "sigbus-wp",
+ .uffd_fn = uffd_sigbus_wp_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_SIGBUS |
+ UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP,
+ },
+ {
+ .name = "events",
+ .uffd_fn = uffd_events_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
+ UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE,
+ },
+ {
+ .name = "events-wp",
+ .uffd_fn = uffd_events_wp_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
+ UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+};
+
+static void usage(const char *prog)
+{
+ printf("usage: %s [-f TESTNAME]\n", prog);
+ puts("");
+ puts(" -f: test name to filter (e.g., event)");
+ puts(" -h: show the help msg");
+ puts(" -l: list tests only");
+ puts("");
+ exit(KSFT_FAIL);
+}
+
+int main(int argc, char *argv[])
+{
+ int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t);
+ int n_mems = sizeof(mem_types) / sizeof(mem_type_t);
+ const char *test_filter = NULL;
+ bool list_only = false;
+ uffd_test_case_t *test;
+ mem_type_t *mem_type;
+ uffd_test_args_t args;
+ char test_name[128];
+ const char *errmsg;
+ int has_uffd, opt;
+ int i, j;
+
+ while ((opt = getopt(argc, argv, "f:hl")) != -1) {
+ switch (opt) {
+ case 'f':
+ test_filter = optarg;
+ break;
+ case 'l':
+ list_only = true;
+ break;
+ case 'h':
+ default:
+ /* Unknown */
+ usage(argv[0]);
+ break;
+ }
+ }
+
+ if (!test_filter && !list_only) {
+ has_uffd = test_uffd_api(false);
+ has_uffd |= test_uffd_api(true);
+
+ if (!has_uffd) {
+ printf("Userfaultfd not supported or unprivileged, skip all tests\n");
+ exit(KSFT_SKIP);
+ }
+ }
+
+ for (i = 0; i < n_tests; i++) {
+ test = &uffd_tests[i];
+ if (test_filter && !strstr(test->name, test_filter))
+ continue;
+ if (list_only) {
+ printf("%s\n", test->name);
+ continue;
+ }
+ for (j = 0; j < n_mems; j++) {
+ mem_type = &mem_types[j];
+ if (!(test->mem_targets & mem_type->mem_flag))
+ continue;
+ snprintf(test_name, sizeof(test_name),
+ "%s on %s", test->name, mem_type->name);
+
+ uffd_test_start(test_name);
+ if (!uffd_feature_supported(test)) {
+ uffd_test_skip("feature missing");
+ continue;
+ }
+ if (uffd_setup_environment(&args, test, mem_type,
+ &errmsg)) {
+ uffd_test_skip(errmsg);
+ continue;
+ }
+ test->uffd_fn(&args);
+ }
+ }
+
+ if (!list_only)
+ uffd_test_report();
+
+ return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS;
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ printf("Skipping %s (missing __NR_userfaultfd)\n", __file__);
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
deleted file mode 100644
index 7f22844ed704..000000000000
--- a/tools/testing/selftests/mm/userfaultfd.c
+++ /dev/null
@@ -1,1858 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Stress userfaultfd syscall.
- *
- * Copyright (C) 2015 Red Hat, Inc.
- *
- * This test allocates two virtual areas and bounces the physical
- * memory across the two virtual areas (from area_src to area_dst)
- * using userfaultfd.
- *
- * There are three threads running per CPU:
- *
- * 1) one per-CPU thread takes a per-page pthread_mutex in a random
- * page of the area_dst (while the physical page may still be in
- * area_src), and increments a per-page counter in the same page,
- * and checks its value against a verification region.
- *
- * 2) another per-CPU thread handles the userfaults generated by
- * thread 1 above. userfaultfd blocking reads or poll() modes are
- * exercised interleaved.
- *
- * 3) one last per-CPU thread transfers the memory in the background
- * at maximum bandwidth (if not already transferred by thread
- * 2). Each cpu thread takes cares of transferring a portion of the
- * area.
- *
- * When all threads of type 3 completed the transfer, one bounce is
- * complete. area_src and area_dst are then swapped. All threads are
- * respawned and so the bounce is immediately restarted in the
- * opposite direction.
- *
- * per-CPU threads 1 by triggering userfaults inside
- * pthread_mutex_lock will also verify the atomicity of the memory
- * transfer (UFFDIO_COPY).
- */
-
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <time.h>
-#include <signal.h>
-#include <poll.h>
-#include <string.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <pthread.h>
-#include <linux/userfaultfd.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <stdint.h>
-#include <sys/random.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifdef __NR_userfaultfd
-
-static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
-
-#define BOUNCE_RANDOM (1<<0)
-#define BOUNCE_RACINGFAULTS (1<<1)
-#define BOUNCE_VERIFY (1<<2)
-#define BOUNCE_POLL (1<<3)
-static int bounces;
-
-#define TEST_ANON 1
-#define TEST_HUGETLB 2
-#define TEST_SHMEM 3
-static int test_type;
-
-#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
-
-#define BASE_PMD_ADDR ((void *)(1UL << 30))
-
-/* test using /dev/userfaultfd, instead of userfaultfd(2) */
-static bool test_dev_userfaultfd;
-
-/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
-#define ALARM_INTERVAL_SECS 10
-static volatile bool test_uffdio_copy_eexist = true;
-static volatile bool test_uffdio_zeropage_eexist = true;
-/* Whether to test uffd write-protection */
-static bool test_uffdio_wp = true;
-/* Whether to test uffd minor faults */
-static bool test_uffdio_minor = false;
-static bool map_shared;
-static int mem_fd;
-static unsigned long long *count_verify;
-static int uffd = -1;
-static int uffd_flags, finished, *pipefd;
-static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
-static char *zeropage;
-pthread_attr_t attr;
-static bool test_collapse;
-
-/* Userfaultfd test statistics */
-struct uffd_stats {
- int cpu;
- unsigned long missing_faults;
- unsigned long wp_faults;
- unsigned long minor_faults;
-};
-
-/* pthread_mutex_t starts at page offset 0 */
-#define area_mutex(___area, ___nr) \
- ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
-/*
- * count is placed in the page after pthread_mutex_t naturally aligned
- * to avoid non alignment faults on non-x86 archs.
- */
-#define area_count(___area, ___nr) \
- ((volatile unsigned long long *) ((unsigned long) \
- ((___area) + (___nr)*page_size + \
- sizeof(pthread_mutex_t) + \
- sizeof(unsigned long long) - 1) & \
- ~(unsigned long)(sizeof(unsigned long long) \
- - 1)))
-
-#define swap(a, b) \
- do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
-#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))
-
-const char *examples =
- "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
- "./userfaultfd anon 100 99999\n\n"
- "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
- "./userfaultfd anon:dev 100 99999\n\n"
- "# Run share memory test on 1GiB region with 99 bounces:\n"
- "./userfaultfd shmem 1000 99\n\n"
- "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
- "./userfaultfd hugetlb 256 50\n\n"
- "# Run the same hugetlb test but using shared file:\n"
- "./userfaultfd hugetlb_shared 256 50\n\n"
- "# 10MiB-~6GiB 999 bounces anonymous test, "
- "continue forever unless an error triggers\n"
- "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
-
-static void usage(void)
-{
- fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
- "[hugetlbfs_file]\n\n");
- fprintf(stderr, "Supported <test type>: anon, hugetlb, "
- "hugetlb_shared, shmem\n\n");
- fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
- "Supported mods:\n");
- fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
- fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
- fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n"
- "memory\n");
- fprintf(stderr, "\nExample test mod usage:\n");
- fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
- fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
-
- fprintf(stderr, "Examples:\n\n");
- fprintf(stderr, "%s", examples);
- exit(1);
-}
-
-#define _err(fmt, ...) \
- do { \
- int ret = errno; \
- fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
- fprintf(stderr, " (errno=%d, line=%d)\n", \
- ret, __LINE__); \
- } while (0)
-
-#define errexit(exitcode, fmt, ...) \
- do { \
- _err(fmt, ##__VA_ARGS__); \
- exit(exitcode); \
- } while (0)
-
-#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
-
-static void uffd_stats_reset(struct uffd_stats *uffd_stats,
- unsigned long n_cpus)
-{
- int i;
-
- for (i = 0; i < n_cpus; i++) {
- uffd_stats[i].cpu = i;
- uffd_stats[i].missing_faults = 0;
- uffd_stats[i].wp_faults = 0;
- uffd_stats[i].minor_faults = 0;
- }
-}
-
-static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
-{
- int i;
- unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
-
- for (i = 0; i < n_cpus; i++) {
- miss_total += stats[i].missing_faults;
- wp_total += stats[i].wp_faults;
- minor_total += stats[i].minor_faults;
- }
-
- printf("userfaults: ");
- if (miss_total) {
- printf("%llu missing (", miss_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].missing_faults);
- printf("\b) ");
- }
- if (wp_total) {
- printf("%llu wp (", wp_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].wp_faults);
- printf("\b) ");
- }
- if (minor_total) {
- printf("%llu minor (", minor_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].minor_faults);
- printf("\b)");
- }
- printf("\n");
-}
-
-static void anon_release_pages(char *rel_area)
-{
- if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
- err("madvise(MADV_DONTNEED) failed");
-}
-
-static void anon_allocate_area(void **alloc_area, bool is_src)
-{
- *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-}
-
-static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-}
-
-static void hugetlb_release_pages(char *rel_area)
-{
- if (!map_shared) {
- if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
- err("madvise(MADV_DONTNEED) failed");
- } else {
- if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
- err("madvise(MADV_REMOVE) failed");
- }
-}
-
-static void hugetlb_allocate_area(void **alloc_area, bool is_src)
-{
- off_t size = nr_pages * page_size;
- off_t offset = is_src ? 0 : size;
- void *area_alias = NULL;
- char **alloc_area_alias;
-
- *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
- (map_shared ? MAP_SHARED : MAP_PRIVATE) |
- (is_src ? 0 : MAP_NORESERVE),
- mem_fd, offset);
- if (*alloc_area == MAP_FAILED)
- err("mmap of hugetlbfs file failed");
-
- if (map_shared) {
- area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED, mem_fd, offset);
- if (area_alias == MAP_FAILED)
- err("mmap of hugetlb file alias failed");
- }
-
- if (is_src) {
- alloc_area_alias = &area_src_alias;
- } else {
- alloc_area_alias = &area_dst_alias;
- }
- if (area_alias)
- *alloc_area_alias = area_alias;
-}
-
-static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
- if (!map_shared)
- return;
-
- *start = (unsigned long) area_dst_alias + offset;
-}
-
-static void shmem_release_pages(char *rel_area)
-{
- if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
- err("madvise(MADV_REMOVE) failed");
-}
-
-static void shmem_allocate_area(void **alloc_area, bool is_src)
-{
- void *area_alias = NULL;
- size_t bytes = nr_pages * page_size;
- unsigned long offset = is_src ? 0 : bytes;
- char *p = NULL, *p_alias = NULL;
-
- if (test_collapse) {
- p = BASE_PMD_ADDR;
- if (!is_src)
- /* src map + alias + interleaved hpages */
- p += 2 * (bytes + hpage_size);
- p_alias = p;
- p_alias += bytes;
- p_alias += hpage_size; /* Prevent src/dst VMA merge */
- }
-
- *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_fd, offset);
- if (*alloc_area == MAP_FAILED)
- err("mmap of memfd failed");
- if (test_collapse && *alloc_area != p)
- err("mmap of memfd failed at %p", p);
-
- area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_fd, offset);
- if (area_alias == MAP_FAILED)
- err("mmap of memfd alias failed");
- if (test_collapse && area_alias != p_alias)
- err("mmap of anonymous memory failed at %p", p_alias);
-
- if (is_src)
- area_src_alias = area_alias;
- else
- area_dst_alias = area_alias;
-}
-
-static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
- *start = (unsigned long)area_dst_alias + offset;
-}
-
-static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
-{
- if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
- err("Did not find expected %d number of hugepages",
- expect_nr_hpages);
-}
-
-struct uffd_test_ops {
- void (*allocate_area)(void **alloc_area, bool is_src);
- void (*release_pages)(char *rel_area);
- void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
- void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
-};
-
-static struct uffd_test_ops anon_uffd_test_ops = {
- .allocate_area = anon_allocate_area,
- .release_pages = anon_release_pages,
- .alias_mapping = noop_alias_mapping,
- .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops shmem_uffd_test_ops = {
- .allocate_area = shmem_allocate_area,
- .release_pages = shmem_release_pages,
- .alias_mapping = shmem_alias_mapping,
- .check_pmd_mapping = shmem_check_pmd_mapping,
-};
-
-static struct uffd_test_ops hugetlb_uffd_test_ops = {
- .allocate_area = hugetlb_allocate_area,
- .release_pages = hugetlb_release_pages,
- .alias_mapping = hugetlb_alias_mapping,
- .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops *uffd_test_ops;
-
-static inline uint64_t uffd_minor_feature(void)
-{
- if (test_type == TEST_HUGETLB && map_shared)
- return UFFD_FEATURE_MINOR_HUGETLBFS;
- else if (test_type == TEST_SHMEM)
- return UFFD_FEATURE_MINOR_SHMEM;
- else
- return 0;
-}
-
-static uint64_t get_expected_ioctls(uint64_t mode)
-{
- uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
-
- if (test_type == TEST_HUGETLB)
- ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
-
- if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
- ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
-
- if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
- ioctls &= ~(1 << _UFFDIO_CONTINUE);
-
- return ioctls;
-}
-
-static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
-{
- uint64_t expected = get_expected_ioctls(mode);
- uint64_t actual = ioctls & expected;
-
- if (actual != expected) {
- err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
- expected, actual);
- }
-}
-
-static int __userfaultfd_open_dev(void)
-{
- int fd, _uffd;
-
- fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
- if (fd < 0)
- errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
-
- _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
- if (_uffd < 0)
- errexit(errno == ENOTTY ? KSFT_SKIP : 1,
- "creating userfaultfd failed");
- close(fd);
- return _uffd;
-}
-
-static void userfaultfd_open(uint64_t *features)
-{
- struct uffdio_api uffdio_api;
-
- if (test_dev_userfaultfd)
- uffd = __userfaultfd_open_dev();
- else {
- uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
- if (uffd < 0)
- errexit(errno == ENOSYS ? KSFT_SKIP : 1,
- "creating userfaultfd failed");
- }
- uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
- uffdio_api.api = UFFD_API;
- uffdio_api.features = *features;
- if (ioctl(uffd, UFFDIO_API, &uffdio_api))
- err("UFFDIO_API failed.\nPlease make sure to "
- "run with either root or ptrace capability.");
- if (uffdio_api.api != UFFD_API)
- err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
-
- *features = uffdio_api.features;
-}
-
-static inline void munmap_area(void **area)
-{
- if (*area)
- if (munmap(*area, nr_pages * page_size))
- err("munmap");
-
- *area = NULL;
-}
-
-static void uffd_test_ctx_clear(void)
-{
- size_t i;
-
- if (pipefd) {
- for (i = 0; i < nr_cpus * 2; ++i) {
- if (close(pipefd[i]))
- err("close pipefd");
- }
- free(pipefd);
- pipefd = NULL;
- }
-
- if (count_verify) {
- free(count_verify);
- count_verify = NULL;
- }
-
- if (uffd != -1) {
- if (close(uffd))
- err("close uffd");
- uffd = -1;
- }
-
- munmap_area((void **)&area_src);
- munmap_area((void **)&area_src_alias);
- munmap_area((void **)&area_dst);
- munmap_area((void **)&area_dst_alias);
- munmap_area((void **)&area_remap);
-}
-
-static void uffd_test_ctx_init(uint64_t features)
-{
- unsigned long nr, cpu;
-
- uffd_test_ctx_clear();
-
- uffd_test_ops->allocate_area((void **)&area_src, true);
- uffd_test_ops->allocate_area((void **)&area_dst, false);
-
- userfaultfd_open(&features);
-
- count_verify = malloc(nr_pages * sizeof(unsigned long long));
- if (!count_verify)
- err("count_verify");
-
- for (nr = 0; nr < nr_pages; nr++) {
- *area_mutex(area_src, nr) =
- (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
- count_verify[nr] = *area_count(area_src, nr) = 1;
- /*
- * In the transition between 255 to 256, powerpc will
- * read out of order in my_bcmp and see both bytes as
- * zero, so leave a placeholder below always non-zero
- * after the count, to avoid my_bcmp to trigger false
- * positives.
- */
- *(area_count(area_src, nr) + 1) = 1;
- }
-
- /*
- * After initialization of area_src, we must explicitly release pages
- * for area_dst to make sure it's fully empty. Otherwise we could have
- * some area_dst pages be errornously initialized with zero pages,
- * hence we could hit memory corruption later in the test.
- *
- * One example is when THP is globally enabled, above allocate_area()
- * calls could have the two areas merged into a single VMA (as they
- * will have the same VMA flags so they're mergeable). When we
- * initialize the area_src above, it's possible that some part of
- * area_dst could have been faulted in via one huge THP that will be
- * shared between area_src and area_dst. It could cause some of the
- * area_dst won't be trapped by missing userfaults.
- *
- * This release_pages() will guarantee even if that happened, we'll
- * proactively split the thp and drop any accidentally initialized
- * pages within area_dst.
- */
- uffd_test_ops->release_pages(area_dst);
-
- pipefd = malloc(sizeof(int) * nr_cpus * 2);
- if (!pipefd)
- err("pipefd");
- for (cpu = 0; cpu < nr_cpus; cpu++)
- if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
- err("pipe");
-}
-
-static int my_bcmp(char *str1, char *str2, size_t n)
-{
- unsigned long i;
- for (i = 0; i < n; i++)
- if (str1[i] != str2[i])
- return 1;
- return 0;
-}
-
-static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
-{
- struct uffdio_writeprotect prms;
-
- /* Write protection page faults */
- prms.range.start = start;
- prms.range.len = len;
- /* Undo write-protect, do wakeup after that */
- prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
-
- if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
- err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
-}
-
-static void continue_range(int ufd, __u64 start, __u64 len)
-{
- struct uffdio_continue req;
- int ret;
-
- req.range.start = start;
- req.range.len = len;
- req.mode = 0;
-
- if (ioctl(ufd, UFFDIO_CONTINUE, &req))
- err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
- (uint64_t)start);
-
- /*
- * Error handling within the kernel for continue is subtly different
- * from copy or zeropage, so it may be a source of bugs. Trigger an
- * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
- */
- req.mapped = 0;
- ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
- if (ret >= 0 || req.mapped != -EEXIST)
- err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
- ret, (int64_t) req.mapped);
-}
-
-static void *locking_thread(void *arg)
-{
- unsigned long cpu = (unsigned long) arg;
- unsigned long page_nr;
- unsigned long long count;
-
- if (!(bounces & BOUNCE_RANDOM)) {
- page_nr = -bounces;
- if (!(bounces & BOUNCE_RACINGFAULTS))
- page_nr += cpu * nr_pages_per_cpu;
- }
-
- while (!finished) {
- if (bounces & BOUNCE_RANDOM) {
- if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
- err("getrandom failed");
- } else
- page_nr += 1;
- page_nr %= nr_pages;
- pthread_mutex_lock(area_mutex(area_dst, page_nr));
- count = *area_count(area_dst, page_nr);
- if (count != count_verify[page_nr])
- err("page_nr %lu memory corruption %llu %llu",
- page_nr, count, count_verify[page_nr]);
- count++;
- *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
- pthread_mutex_unlock(area_mutex(area_dst, page_nr));
- }
-
- return NULL;
-}
-
-static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
- unsigned long offset)
-{
- uffd_test_ops->alias_mapping(&uffdio_copy->dst,
- uffdio_copy->len,
- offset);
- if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
- /* real retval in ufdio_copy.copy */
- if (uffdio_copy->copy != -EEXIST)
- err("UFFDIO_COPY retry error: %"PRId64,
- (int64_t)uffdio_copy->copy);
- } else {
- err("UFFDIO_COPY retry unexpected: %"PRId64,
- (int64_t)uffdio_copy->copy);
- }
-}
-
-static void wake_range(int ufd, unsigned long addr, unsigned long len)
-{
- struct uffdio_range uffdio_wake;
-
- uffdio_wake.start = addr;
- uffdio_wake.len = len;
-
- if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
- fprintf(stderr, "error waking %lu\n",
- addr), exit(1);
-}
-
-static int __copy_page(int ufd, unsigned long offset, bool retry)
-{
- struct uffdio_copy uffdio_copy;
-
- if (offset >= nr_pages * page_size)
- err("unexpected offset %lu\n", offset);
- uffdio_copy.dst = (unsigned long) area_dst + offset;
- uffdio_copy.src = (unsigned long) area_src + offset;
- uffdio_copy.len = page_size;
- if (test_uffdio_wp)
- uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
- else
- uffdio_copy.mode = 0;
- uffdio_copy.copy = 0;
- if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
- /* real retval in ufdio_copy.copy */
- if (uffdio_copy.copy != -EEXIST)
- err("UFFDIO_COPY error: %"PRId64,
- (int64_t)uffdio_copy.copy);
- wake_range(ufd, uffdio_copy.dst, page_size);
- } else if (uffdio_copy.copy != page_size) {
- err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
- } else {
- if (test_uffdio_copy_eexist && retry) {
- test_uffdio_copy_eexist = false;
- retry_copy_page(ufd, &uffdio_copy, offset);
- }
- return 1;
- }
- return 0;
-}
-
-static int copy_page_retry(int ufd, unsigned long offset)
-{
- return __copy_page(ufd, offset, true);
-}
-
-static int copy_page(int ufd, unsigned long offset)
-{
- return __copy_page(ufd, offset, false);
-}
-
-static int uffd_read_msg(int ufd, struct uffd_msg *msg)
-{
- int ret = read(uffd, msg, sizeof(*msg));
-
- if (ret != sizeof(*msg)) {
- if (ret < 0) {
- if (errno == EAGAIN || errno == EINTR)
- return 1;
- err("blocking read error");
- } else {
- err("short read");
- }
- }
-
- return 0;
-}
-
-static void uffd_handle_page_fault(struct uffd_msg *msg,
- struct uffd_stats *stats)
-{
- unsigned long offset;
-
- if (msg->event != UFFD_EVENT_PAGEFAULT)
- err("unexpected msg event %u", msg->event);
-
- if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
- /* Write protect page faults */
- wp_range(uffd, msg->arg.pagefault.address, page_size, false);
- stats->wp_faults++;
- } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
- uint8_t *area;
- int b;
-
- /*
- * Minor page faults
- *
- * To prove we can modify the original range for testing
- * purposes, we're going to bit flip this range before
- * continuing.
- *
- * Note that this requires all minor page fault tests operate on
- * area_dst (non-UFFD-registered) and area_dst_alias
- * (UFFD-registered).
- */
-
- area = (uint8_t *)(area_dst +
- ((char *)msg->arg.pagefault.address -
- area_dst_alias));
- for (b = 0; b < page_size; ++b)
- area[b] = ~area[b];
- continue_range(uffd, msg->arg.pagefault.address, page_size);
- stats->minor_faults++;
- } else {
- /*
- * Missing page faults.
- *
- * Here we force a write check for each of the missing mode
- * faults. It's guaranteed because the only threads that
- * will trigger uffd faults are the locking threads, and
- * their first instruction to touch the missing page will
- * always be pthread_mutex_lock().
- *
- * Note that here we relied on an NPTL glibc impl detail to
- * always read the lock type at the entry of the lock op
- * (pthread_mutex_t.__data.__type, offset 0x10) before
- * doing any locking operations to guarantee that. It's
- * actually not good to rely on this impl detail because
- * logically a pthread-compatible lib can implement the
- * locks without types and we can fail when linking with
- * them. However since we used to find bugs with this
- * strict check we still keep it around. Hopefully this
- * could be a good hint when it fails again. If one day
- * it'll break on some other impl of glibc we'll revisit.
- */
- if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
- err("unexpected write fault");
-
- offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
- offset &= ~(page_size-1);
-
- if (copy_page(uffd, offset))
- stats->missing_faults++;
- }
-}
-
-static void *uffd_poll_thread(void *arg)
-{
- struct uffd_stats *stats = (struct uffd_stats *)arg;
- unsigned long cpu = stats->cpu;
- struct pollfd pollfd[2];
- struct uffd_msg msg;
- struct uffdio_register uffd_reg;
- int ret;
- char tmp_chr;
-
- pollfd[0].fd = uffd;
- pollfd[0].events = POLLIN;
- pollfd[1].fd = pipefd[cpu*2];
- pollfd[1].events = POLLIN;
-
- for (;;) {
- ret = poll(pollfd, 2, -1);
- if (ret <= 0) {
- if (errno == EINTR || errno == EAGAIN)
- continue;
- err("poll error: %d", ret);
- }
- if (pollfd[1].revents & POLLIN) {
- if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
- err("read pipefd error");
- break;
- }
- if (!(pollfd[0].revents & POLLIN))
- err("pollfd[0].revents %d", pollfd[0].revents);
- if (uffd_read_msg(uffd, &msg))
- continue;
- switch (msg.event) {
- default:
- err("unexpected msg event %u\n", msg.event);
- break;
- case UFFD_EVENT_PAGEFAULT:
- uffd_handle_page_fault(&msg, stats);
- break;
- case UFFD_EVENT_FORK:
- close(uffd);
- uffd = msg.arg.fork.ufd;
- pollfd[0].fd = uffd;
- break;
- case UFFD_EVENT_REMOVE:
- uffd_reg.range.start = msg.arg.remove.start;
- uffd_reg.range.len = msg.arg.remove.end -
- msg.arg.remove.start;
- if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
- err("remove failure");
- break;
- case UFFD_EVENT_REMAP:
- area_remap = area_dst; /* save for later unmap */
- area_dst = (char *)(unsigned long)msg.arg.remap.to;
- break;
- }
- }
-
- return NULL;
-}
-
-pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static void *uffd_read_thread(void *arg)
-{
- struct uffd_stats *stats = (struct uffd_stats *)arg;
- struct uffd_msg msg;
-
- pthread_mutex_unlock(&uffd_read_mutex);
- /* from here cancellation is ok */
-
- for (;;) {
- if (uffd_read_msg(uffd, &msg))
- continue;
- uffd_handle_page_fault(&msg, stats);
- }
-
- return NULL;
-}
-
-static void *background_thread(void *arg)
-{
- unsigned long cpu = (unsigned long) arg;
- unsigned long page_nr, start_nr, mid_nr, end_nr;
-
- start_nr = cpu * nr_pages_per_cpu;
- end_nr = (cpu+1) * nr_pages_per_cpu;
- mid_nr = (start_nr + end_nr) / 2;
-
- /* Copy the first half of the pages */
- for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
- copy_page_retry(uffd, page_nr * page_size);
-
- /*
- * If we need to test uffd-wp, set it up now. Then we'll have
- * at least the first half of the pages mapped already which
- * can be write-protected for testing
- */
- if (test_uffdio_wp)
- wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
- nr_pages_per_cpu * page_size, true);
-
- /*
- * Continue the 2nd half of the page copying, handling write
- * protection faults if any
- */
- for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
- copy_page_retry(uffd, page_nr * page_size);
-
- return NULL;
-}
-
-static int stress(struct uffd_stats *uffd_stats)
-{
- unsigned long cpu;
- pthread_t locking_threads[nr_cpus];
- pthread_t uffd_threads[nr_cpus];
- pthread_t background_threads[nr_cpus];
-
- finished = 0;
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- if (pthread_create(&locking_threads[cpu], &attr,
- locking_thread, (void *)cpu))
- return 1;
- if (bounces & BOUNCE_POLL) {
- if (pthread_create(&uffd_threads[cpu], &attr,
- uffd_poll_thread,
- (void *)&uffd_stats[cpu]))
- return 1;
- } else {
- if (pthread_create(&uffd_threads[cpu], &attr,
- uffd_read_thread,
- (void *)&uffd_stats[cpu]))
- return 1;
- pthread_mutex_lock(&uffd_read_mutex);
- }
- if (pthread_create(&background_threads[cpu], &attr,
- background_thread, (void *)cpu))
- return 1;
- }
- for (cpu = 0; cpu < nr_cpus; cpu++)
- if (pthread_join(background_threads[cpu], NULL))
- return 1;
-
- /*
- * Be strict and immediately zap area_src, the whole area has
- * been transferred already by the background treads. The
- * area_src could then be faulted in a racy way by still
- * running uffdio_threads reading zeropages after we zapped
- * area_src (but they're guaranteed to get -EEXIST from
- * UFFDIO_COPY without writing zero pages into area_dst
- * because the background threads already completed).
- */
- uffd_test_ops->release_pages(area_src);
-
- finished = 1;
- for (cpu = 0; cpu < nr_cpus; cpu++)
- if (pthread_join(locking_threads[cpu], NULL))
- return 1;
-
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- char c;
- if (bounces & BOUNCE_POLL) {
- if (write(pipefd[cpu*2+1], &c, 1) != 1)
- err("pipefd write error");
- if (pthread_join(uffd_threads[cpu],
- (void *)&uffd_stats[cpu]))
- return 1;
- } else {
- if (pthread_cancel(uffd_threads[cpu]))
- return 1;
- if (pthread_join(uffd_threads[cpu], NULL))
- return 1;
- }
- }
-
- return 0;
-}
-
-sigjmp_buf jbuf, *sigbuf;
-
-static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
-{
- if (sig == SIGBUS) {
- if (sigbuf)
- siglongjmp(*sigbuf, 1);
- abort();
- }
-}
-
-/*
- * For non-cooperative userfaultfd test we fork() a process that will
- * generate pagefaults, will mremap the area monitored by the
- * userfaultfd and at last this process will release the monitored
- * area.
- * For the anonymous and shared memory the area is divided into two
- * parts, the first part is accessed before mremap, and the second
- * part is accessed after mremap. Since hugetlbfs does not support
- * mremap, the entire monitored area is accessed in a single pass for
- * HUGETLB_TEST.
- * The release of the pages currently generates event for shmem and
- * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
- * for hugetlb.
- * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
- * monitored area, generate pagefaults and test that signal is delivered.
- * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
- * test robustness use case - we release monitored area, fork a process
- * that will generate pagefaults and verify signal is generated.
- * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
- * feature. Using monitor thread, verify no userfault events are generated.
- */
-static int faulting_process(int signal_test)
-{
- unsigned long nr;
- unsigned long long count;
- unsigned long split_nr_pages;
- unsigned long lastnr;
- struct sigaction act;
- volatile unsigned long signalled = 0;
-
- split_nr_pages = (nr_pages + 1) / 2;
-
- if (signal_test) {
- sigbuf = &jbuf;
- memset(&act, 0, sizeof(act));
- act.sa_sigaction = sighndl;
- act.sa_flags = SA_SIGINFO;
- if (sigaction(SIGBUS, &act, 0))
- err("sigaction");
- lastnr = (unsigned long)-1;
- }
-
- for (nr = 0; nr < split_nr_pages; nr++) {
- volatile int steps = 1;
- unsigned long offset = nr * page_size;
-
- if (signal_test) {
- if (sigsetjmp(*sigbuf, 1) != 0) {
- if (steps == 1 && nr == lastnr)
- err("Signal repeated");
-
- lastnr = nr;
- if (signal_test == 1) {
- if (steps == 1) {
- /* This is a MISSING request */
- steps++;
- if (copy_page(uffd, offset))
- signalled++;
- } else {
- /* This is a WP request */
- assert(steps == 2);
- wp_range(uffd,
- (__u64)area_dst +
- offset,
- page_size, false);
- }
- } else {
- signalled++;
- continue;
- }
- }
- }
-
- count = *area_count(area_dst, nr);
- if (count != count_verify[nr])
- err("nr %lu memory corruption %llu %llu\n",
- nr, count, count_verify[nr]);
- /*
- * Trigger write protection if there is by writing
- * the same value back.
- */
- *area_count(area_dst, nr) = count;
- }
-
- if (signal_test)
- return signalled != split_nr_pages;
-
- area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
- MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
- if (area_dst == MAP_FAILED)
- err("mremap");
- /* Reset area_src since we just clobbered it */
- area_src = NULL;
-
- for (; nr < nr_pages; nr++) {
- count = *area_count(area_dst, nr);
- if (count != count_verify[nr]) {
- err("nr %lu memory corruption %llu %llu\n",
- nr, count, count_verify[nr]);
- }
- /*
- * Trigger write protection if there is by writing
- * the same value back.
- */
- *area_count(area_dst, nr) = count;
- }
-
- uffd_test_ops->release_pages(area_dst);
-
- for (nr = 0; nr < nr_pages; nr++)
- if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
- err("nr %lu is not zero", nr);
-
- return 0;
-}
-
-static void retry_uffdio_zeropage(int ufd,
- struct uffdio_zeropage *uffdio_zeropage,
- unsigned long offset)
-{
- uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
- uffdio_zeropage->range.len,
- offset);
- if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
- if (uffdio_zeropage->zeropage != -EEXIST)
- err("UFFDIO_ZEROPAGE error: %"PRId64,
- (int64_t)uffdio_zeropage->zeropage);
- } else {
- err("UFFDIO_ZEROPAGE error: %"PRId64,
- (int64_t)uffdio_zeropage->zeropage);
- }
-}
-
-static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
-{
- struct uffdio_zeropage uffdio_zeropage;
- int ret;
- bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
- __s64 res;
-
- if (offset >= nr_pages * page_size)
- err("unexpected offset %lu", offset);
- uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
- uffdio_zeropage.range.len = page_size;
- uffdio_zeropage.mode = 0;
- ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
- res = uffdio_zeropage.zeropage;
- if (ret) {
- /* real retval in ufdio_zeropage.zeropage */
- if (has_zeropage)
- err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
- else if (res != -EINVAL)
- err("UFFDIO_ZEROPAGE not -EINVAL");
- } else if (has_zeropage) {
- if (res != page_size) {
- err("UFFDIO_ZEROPAGE unexpected size");
- } else {
- if (test_uffdio_zeropage_eexist && retry) {
- test_uffdio_zeropage_eexist = false;
- retry_uffdio_zeropage(ufd, &uffdio_zeropage,
- offset);
- }
- return 1;
- }
- } else
- err("UFFDIO_ZEROPAGE succeeded");
-
- return 0;
-}
-
-static int uffdio_zeropage(int ufd, unsigned long offset)
-{
- return __uffdio_zeropage(ufd, offset, false);
-}
-
-/* exercise UFFDIO_ZEROPAGE */
-static int userfaultfd_zeropage_test(void)
-{
- struct uffdio_register uffdio_register;
-
- printf("testing UFFDIO_ZEROPAGE: ");
- fflush(stdout);
-
- uffd_test_ctx_init(0);
-
- uffdio_register.range.start = (unsigned long) area_dst;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
- uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure");
-
- assert_expected_ioctls_present(
- uffdio_register.mode, uffdio_register.ioctls);
-
- if (uffdio_zeropage(uffd, 0))
- if (my_bcmp(area_dst, zeropage, page_size))
- err("zeropage is not zero");
-
- printf("done.\n");
- return 0;
-}
-
-static int userfaultfd_events_test(void)
-{
- struct uffdio_register uffdio_register;
- pthread_t uffd_mon;
- int err, features;
- pid_t pid;
- char c;
- struct uffd_stats stats = { 0 };
-
- printf("testing events (fork, remap, remove): ");
- fflush(stdout);
-
- features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
- UFFD_FEATURE_EVENT_REMOVE;
- uffd_test_ctx_init(features);
-
- fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
- uffdio_register.range.start = (unsigned long) area_dst;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
- uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure");
-
- assert_expected_ioctls_present(
- uffdio_register.mode, uffdio_register.ioctls);
-
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
- err("uffd_poll_thread create");
-
- pid = fork();
- if (pid < 0)
- err("fork");
-
- if (!pid)
- exit(faulting_process(0));
-
- waitpid(pid, &err, 0);
- if (err)
- err("faulting process failed");
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
- err("pipe write");
- if (pthread_join(uffd_mon, NULL))
- return 1;
-
- uffd_stats_report(&stats, 1);
-
- return stats.missing_faults != nr_pages;
-}
-
-static int userfaultfd_sig_test(void)
-{
- struct uffdio_register uffdio_register;
- unsigned long userfaults;
- pthread_t uffd_mon;
- int err, features;
- pid_t pid;
- char c;
- struct uffd_stats stats = { 0 };
-
- printf("testing signal delivery: ");
- fflush(stdout);
-
- features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
- uffd_test_ctx_init(features);
-
- fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
- uffdio_register.range.start = (unsigned long) area_dst;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
- uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure");
-
- assert_expected_ioctls_present(
- uffdio_register.mode, uffdio_register.ioctls);
-
- if (faulting_process(1))
- err("faulting process failed");
-
- uffd_test_ops->release_pages(area_dst);
-
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
- err("uffd_poll_thread create");
-
- pid = fork();
- if (pid < 0)
- err("fork");
-
- if (!pid)
- exit(faulting_process(2));
-
- waitpid(pid, &err, 0);
- if (err)
- err("faulting process failed");
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
- err("pipe write");
- if (pthread_join(uffd_mon, (void **)&userfaults))
- return 1;
-
- printf("done.\n");
- if (userfaults)
- err("Signal test failed, userfaults: %ld", userfaults);
-
- return userfaults != 0;
-}
-
-void check_memory_contents(char *p)
-{
- unsigned long i;
- uint8_t expected_byte;
- void *expected_page;
-
- if (posix_memalign(&expected_page, page_size, page_size))
- err("out of memory");
-
- for (i = 0; i < nr_pages; ++i) {
- expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
- memset(expected_page, expected_byte, page_size);
- if (my_bcmp(expected_page, p + (i * page_size), page_size))
- err("unexpected page contents after minor fault");
- }
-
- free(expected_page);
-}
-
-static int userfaultfd_minor_test(void)
-{
- unsigned long p;
- struct uffdio_register uffdio_register;
- pthread_t uffd_mon;
- char c;
- struct uffd_stats stats = { 0 };
-
- if (!test_uffdio_minor)
- return 0;
-
- printf("testing minor faults: ");
- fflush(stdout);
-
- uffd_test_ctx_init(uffd_minor_feature());
-
- uffdio_register.range.start = (unsigned long)area_dst_alias;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure");
-
- assert_expected_ioctls_present(
- uffdio_register.mode, uffdio_register.ioctls);
-
- /*
- * After registering with UFFD, populate the non-UFFD-registered side of
- * the shared mapping. This should *not* trigger any UFFD minor faults.
- */
- for (p = 0; p < nr_pages; ++p) {
- memset(area_dst + (p * page_size), p % ((uint8_t)-1),
- page_size);
- }
-
- if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
- err("uffd_poll_thread create");
-
- /*
- * Read each of the pages back using the UFFD-registered mapping. We
- * expect that the first time we touch a page, it will result in a minor
- * fault. uffd_poll_thread will resolve the fault by bit-flipping the
- * page's contents, and then issuing a CONTINUE ioctl.
- */
- check_memory_contents(area_dst_alias);
-
- if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
- err("pipe write");
- if (pthread_join(uffd_mon, NULL))
- return 1;
-
- uffd_stats_report(&stats, 1);
-
- if (test_collapse) {
- printf("testing collapse of uffd memory into PMD-mapped THPs:");
- if (madvise(area_dst_alias, nr_pages * page_size,
- MADV_COLLAPSE))
- err("madvise(MADV_COLLAPSE)");
-
- uffd_test_ops->check_pmd_mapping(area_dst,
- nr_pages * page_size /
- hpage_size);
- /*
- * This won't cause uffd-fault - it purely just makes sure there
- * was no corruption.
- */
- check_memory_contents(area_dst_alias);
- printf(" done.\n");
- }
-
- return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
-}
-
-#define BIT_ULL(nr) (1ULL << (nr))
-#define PM_SOFT_DIRTY BIT_ULL(55)
-#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
-#define PM_UFFD_WP BIT_ULL(57)
-#define PM_FILE BIT_ULL(61)
-#define PM_SWAP BIT_ULL(62)
-#define PM_PRESENT BIT_ULL(63)
-
-static int pagemap_open(void)
-{
- int fd = open("/proc/self/pagemap", O_RDONLY);
-
- if (fd < 0)
- err("open pagemap");
-
- return fd;
-}
-
-static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
-{
- uint64_t value;
- int ret;
-
- ret = pread(fd, &value, sizeof(uint64_t),
- ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
- if (ret != sizeof(uint64_t))
- err("pread() on pagemap failed");
-
- return value;
-}
-
-/* This macro let __LINE__ works in err() */
-#define pagemap_check_wp(value, wp) do { \
- if (!!(value & PM_UFFD_WP) != wp) \
- err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
- } while (0)
-
-static int pagemap_test_fork(bool present)
-{
- pid_t child = fork();
- uint64_t value;
- int fd, result;
-
- if (!child) {
- /* Open the pagemap fd of the child itself */
- fd = pagemap_open();
- value = pagemap_read_vaddr(fd, area_dst);
- /*
- * After fork() uffd-wp bit should be gone as long as we're
- * without UFFD_FEATURE_EVENT_FORK
- */
- pagemap_check_wp(value, false);
- /* Succeed */
- exit(0);
- }
- waitpid(child, &result, 0);
- return result;
-}
-
-static void userfaultfd_pagemap_test(unsigned int test_pgsize)
-{
- struct uffdio_register uffdio_register;
- int pagemap_fd;
- uint64_t value;
-
- /* Pagemap tests uffd-wp only */
- if (!test_uffdio_wp)
- return;
-
- /* Not enough memory to test this page size */
- if (test_pgsize > nr_pages * page_size)
- return;
-
- printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
- /* Flush so it doesn't flush twice in parent/child later */
- fflush(stdout);
-
- uffd_test_ctx_init(0);
-
- if (test_pgsize > page_size) {
- /* This is a thp test */
- if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
- err("madvise(MADV_HUGEPAGE) failed");
- } else if (test_pgsize == page_size) {
- /* This is normal page test; force no thp */
- if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
- err("madvise(MADV_NOHUGEPAGE) failed");
- }
-
- uffdio_register.range.start = (unsigned long) area_dst;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failed");
-
- pagemap_fd = pagemap_open();
-
- /* Touch the page */
- *area_dst = 1;
- wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
- value = pagemap_read_vaddr(pagemap_fd, area_dst);
- pagemap_check_wp(value, true);
- /* Make sure uffd-wp bit dropped when fork */
- if (pagemap_test_fork(true))
- err("Detected stall uffd-wp bit in child");
-
- /* Exclusive required or PAGEOUT won't work */
- if (!(value & PM_MMAP_EXCLUSIVE))
- err("multiple mapping detected: 0x%"PRIx64, value);
-
- if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
- err("madvise(MADV_PAGEOUT) failed");
-
- /* Uffd-wp should persist even swapped out */
- value = pagemap_read_vaddr(pagemap_fd, area_dst);
- pagemap_check_wp(value, true);
- /* Make sure uffd-wp bit dropped when fork */
- if (pagemap_test_fork(false))
- err("Detected stall uffd-wp bit in child");
-
- /* Unprotect; this tests swap pte modifications */
- wp_range(uffd, (uint64_t)area_dst, page_size, false);
- value = pagemap_read_vaddr(pagemap_fd, area_dst);
- pagemap_check_wp(value, false);
-
- /* Fault in the page from disk */
- *area_dst = 2;
- value = pagemap_read_vaddr(pagemap_fd, area_dst);
- pagemap_check_wp(value, false);
-
- close(pagemap_fd);
- printf("done\n");
-}
-
-static int userfaultfd_stress(void)
-{
- void *area;
- unsigned long nr;
- struct uffdio_register uffdio_register;
- struct uffd_stats uffd_stats[nr_cpus];
-
- uffd_test_ctx_init(0);
-
- if (posix_memalign(&area, page_size, page_size))
- err("out of memory");
- zeropage = area;
- bzero(zeropage, page_size);
-
- pthread_mutex_lock(&uffd_read_mutex);
-
- pthread_attr_init(&attr);
- pthread_attr_setstacksize(&attr, 16*1024*1024);
-
- while (bounces--) {
- printf("bounces: %d, mode:", bounces);
- if (bounces & BOUNCE_RANDOM)
- printf(" rnd");
- if (bounces & BOUNCE_RACINGFAULTS)
- printf(" racing");
- if (bounces & BOUNCE_VERIFY)
- printf(" ver");
- if (bounces & BOUNCE_POLL)
- printf(" poll");
- else
- printf(" read");
- printf(", ");
- fflush(stdout);
-
- if (bounces & BOUNCE_POLL)
- fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
- else
- fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
-
- /* register */
- uffdio_register.range.start = (unsigned long) area_dst;
- uffdio_register.range.len = nr_pages * page_size;
- uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
- uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure");
- assert_expected_ioctls_present(
- uffdio_register.mode, uffdio_register.ioctls);
-
- if (area_dst_alias) {
- uffdio_register.range.start = (unsigned long)
- area_dst_alias;
- if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
- err("register failure alias");
- }
-
- /*
- * The madvise done previously isn't enough: some
- * uffd_thread could have read userfaults (one of
- * those already resolved by the background thread)
- * and it may be in the process of calling
- * UFFDIO_COPY. UFFDIO_COPY will read the zapped
- * area_src and it would map a zero page in it (of
- * course such a UFFDIO_COPY is perfectly safe as it'd
- * return -EEXIST). The problem comes at the next
- * bounce though: that racing UFFDIO_COPY would
- * generate zeropages in the area_src, so invalidating
- * the previous MADV_DONTNEED. Without this additional
- * MADV_DONTNEED those zeropages leftovers in the
- * area_src would lead to -EEXIST failure during the
- * next bounce, effectively leaving a zeropage in the
- * area_dst.
- *
- * Try to comment this out madvise to see the memory
- * corruption being caught pretty quick.
- *
- * khugepaged is also inhibited to collapse THP after
- * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
- * required to MADV_DONTNEED here.
- */
- uffd_test_ops->release_pages(area_dst);
-
- uffd_stats_reset(uffd_stats, nr_cpus);
-
- /* bounce pass */
- if (stress(uffd_stats))
- return 1;
-
- /* Clear all the write protections if there is any */
- if (test_uffdio_wp)
- wp_range(uffd, (unsigned long)area_dst,
- nr_pages * page_size, false);
-
- /* unregister */
- if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
- err("unregister failure");
- if (area_dst_alias) {
- uffdio_register.range.start = (unsigned long) area_dst;
- if (ioctl(uffd, UFFDIO_UNREGISTER,
- &uffdio_register.range))
- err("unregister failure alias");
- }
-
- /* verification */
- if (bounces & BOUNCE_VERIFY)
- for (nr = 0; nr < nr_pages; nr++)
- if (*area_count(area_dst, nr) != count_verify[nr])
- err("error area_count %llu %llu %lu\n",
- *area_count(area_src, nr),
- count_verify[nr], nr);
-
- /* prepare next bounce */
- swap(area_src, area_dst);
-
- swap(area_src_alias, area_dst_alias);
-
- uffd_stats_report(uffd_stats, nr_cpus);
- }
-
- if (test_type == TEST_ANON) {
- /*
- * shmem/hugetlb won't be able to run since they have different
- * behavior on fork() (file-backed memory normally drops ptes
- * directly when fork), meanwhile the pagemap test will verify
- * pgtable entry of fork()ed child.
- */
- userfaultfd_pagemap_test(page_size);
- /*
- * Hard-code for x86_64 for now for 2M THP, as x86_64 is
- * currently the only one that supports uffd-wp
- */
- userfaultfd_pagemap_test(page_size * 512);
- }
-
- return userfaultfd_zeropage_test() || userfaultfd_sig_test()
- || userfaultfd_events_test() || userfaultfd_minor_test();
-}
-
-/*
- * Copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
- unsigned long hps = 0;
- char *line = NULL;
- size_t linelen = 0;
- FILE *f = fopen("/proc/meminfo", "r");
-
- if (!f)
- return 0;
- while (getline(&line, &linelen, f) > 0) {
- if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
- hps <<= 10;
- break;
- }
- }
-
- free(line);
- fclose(f);
- return hps;
-}
-
-static void set_test_type(const char *type)
-{
- if (!strcmp(type, "anon")) {
- test_type = TEST_ANON;
- uffd_test_ops = &anon_uffd_test_ops;
- } else if (!strcmp(type, "hugetlb")) {
- test_type = TEST_HUGETLB;
- uffd_test_ops = &hugetlb_uffd_test_ops;
- } else if (!strcmp(type, "hugetlb_shared")) {
- map_shared = true;
- test_type = TEST_HUGETLB;
- uffd_test_ops = &hugetlb_uffd_test_ops;
- /* Minor faults require shared hugetlb; only enable here. */
- test_uffdio_minor = true;
- } else if (!strcmp(type, "shmem")) {
- map_shared = true;
- test_type = TEST_SHMEM;
- uffd_test_ops = &shmem_uffd_test_ops;
- test_uffdio_minor = true;
- }
-}
-
-static void parse_test_type_arg(const char *raw_type)
-{
- char *buf = strdup(raw_type);
- uint64_t features = UFFD_API_FEATURES;
-
- while (buf) {
- const char *token = strsep(&buf, ":");
-
- if (!test_type)
- set_test_type(token);
- else if (!strcmp(token, "dev"))
- test_dev_userfaultfd = true;
- else if (!strcmp(token, "syscall"))
- test_dev_userfaultfd = false;
- else if (!strcmp(token, "collapse"))
- test_collapse = true;
- else
- err("unrecognized test mod '%s'", token);
- }
-
- if (!test_type)
- err("failed to parse test type argument: '%s'", raw_type);
-
- if (test_collapse && test_type != TEST_SHMEM)
- err("Unsupported test: %s", raw_type);
-
- if (test_type == TEST_HUGETLB)
- page_size = hpage_size;
- else
- page_size = sysconf(_SC_PAGE_SIZE);
-
- if (!page_size)
- err("Unable to determine page size");
- if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
- > page_size)
- err("Impossible to run this test");
-
- /*
- * Whether we can test certain features depends not just on test type,
- * but also on whether or not this particular kernel supports the
- * feature.
- */
-
- userfaultfd_open(&features);
-
- test_uffdio_wp = test_uffdio_wp &&
- (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
- test_uffdio_minor = test_uffdio_minor &&
- (features & uffd_minor_feature());
-
- close(uffd);
- uffd = -1;
-}
-
-static void sigalrm(int sig)
-{
- if (sig != SIGALRM)
- abort();
- test_uffdio_copy_eexist = true;
- test_uffdio_zeropage_eexist = true;
- alarm(ALARM_INTERVAL_SECS);
-}
-
-int main(int argc, char **argv)
-{
- size_t bytes;
-
- if (argc < 4)
- usage();
-
- if (signal(SIGALRM, sigalrm) == SIG_ERR)
- err("failed to arm SIGALRM");
- alarm(ALARM_INTERVAL_SECS);
-
- hpage_size = default_huge_page_size();
- parse_test_type_arg(argv[1]);
- bytes = atol(argv[2]) * 1024 * 1024;
-
- if (test_collapse && bytes & (hpage_size - 1))
- err("MiB must be multiple of %lu if :collapse mod set",
- hpage_size >> 20);
-
- nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-
- if (test_collapse) {
- /* nr_cpus must divide (bytes / page_size), otherwise,
- * area allocations of (nr_pages * paze_size) won't be a
- * multiple of hpage_size, even if bytes is a multiple of
- * hpage_size.
- *
- * This means that nr_cpus must divide (N * (2 << (H-P))
- * where:
- * bytes = hpage_size * N
- * hpage_size = 2 << H
- * page_size = 2 << P
- *
- * And we want to chose nr_cpus to be the largest value
- * satisfying this constraint, not larger than the number
- * of online CPUs. Unfortunately, prime factorization of
- * N and nr_cpus may be arbitrary, so have to search for it.
- * Instead, just use the highest power of 2 dividing both
- * nr_cpus and (bytes / page_size).
- */
- int x = factor_of_2(nr_cpus);
- int y = factor_of_2(bytes / page_size);
-
- nr_cpus = x < y ? x : y;
- }
- nr_pages_per_cpu = bytes / page_size / nr_cpus;
- if (!nr_pages_per_cpu) {
- _err("invalid MiB");
- usage();
- }
-
- bounces = atoi(argv[3]);
- if (bounces <= 0) {
- _err("invalid bounces");
- usage();
- }
- nr_pages = nr_pages_per_cpu * nr_cpus;
-
- if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
- unsigned int memfd_flags = 0;
-
- if (test_type == TEST_HUGETLB)
- memfd_flags = MFD_HUGETLB;
- mem_fd = memfd_create(argv[0], memfd_flags);
- if (mem_fd < 0)
- err("memfd_create");
- if (ftruncate(mem_fd, nr_pages * page_size * 2))
- err("ftruncate");
- if (fallocate(mem_fd,
- FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
- nr_pages * page_size * 2))
- err("fallocate");
- }
- printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
- nr_pages, nr_pages_per_cpu);
- return userfaultfd_stress();
-}
-
-#else /* __NR_userfaultfd */
-
-#warning "missing __NR_userfaultfd definition"
-
-int main(void)
-{
- printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
- return KSFT_SKIP;
-}
-
-#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h
deleted file mode 100644
index b27d26199334..000000000000
--- a/tools/testing/selftests/mm/util.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __KSELFTEST_VM_UTIL_H
-#define __KSELFTEST_VM_UTIL_H
-
-#include <stdint.h>
-#include <sys/mman.h>
-#include <err.h>
-#include <string.h> /* ffsl() */
-#include <unistd.h> /* _SC_PAGESIZE */
-
-static unsigned int __page_size;
-static unsigned int __page_shift;
-
-static inline unsigned int page_size(void)
-{
- if (!__page_size)
- __page_size = sysconf(_SC_PAGESIZE);
- return __page_size;
-}
-
-static inline unsigned int page_shift(void)
-{
- if (!__page_shift)
- __page_shift = (ffsl(page_size()) - 1);
- return __page_shift;
-}
-
-#define PAGE_SHIFT (page_shift())
-#define PAGE_SIZE (page_size())
-/*
- * On ppc64 this will only work with radix 2M hugepage size
- */
-#define HPAGE_SHIFT 21
-#define HPAGE_SIZE (1 << HPAGE_SHIFT)
-
-#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
-#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
-
-
-static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
-{
- uint64_t ent[2];
-
- /* drop pmd */
- if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
- MAP_FIXED | MAP_ANONYMOUS |
- MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
- errx(2, "mmap transhuge");
-
- if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
- err(2, "MADV_HUGEPAGE");
-
- /* allocate transparent huge page */
- *(volatile void **)ptr = ptr;
-
- if (pread(pagemap_fd, ent, sizeof(ent),
- (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
- err(2, "read pagemap");
-
- if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
- PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
- !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
- return PAGEMAP_PFN(ent[0]);
-
- return -1;
-}
-
-#endif
diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index 1d2068989883..7cfaf4a74c57 100644
--- a/tools/testing/selftests/mm/va_128TBswitch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -17,18 +17,38 @@
* This will work with 16M and 2M hugepage size
*/
#define HUGETLB_SIZE (16 << 20)
+#elif __aarch64__
+/*
+ * The default hugepage size for 64k base pagesize
+ * is 512MB.
+ */
+#define PAGE_SIZE (64 << 10)
+#define HUGETLB_SIZE (512 << 20)
#else
#define PAGE_SIZE (4 << 10)
#define HUGETLB_SIZE (2 << 20)
#endif
/*
- * >= 128TB is the hint addr value we used to select
- * large address space.
+ * The hint addr value is used to allocate addresses
+ * beyond the high address switch boundary.
*/
-#define ADDR_SWITCH_HINT (1UL << 47)
+
+#define ADDR_MARK_128TB (1UL << 47)
+#define ADDR_MARK_256TB (1UL << 48)
+
+#define HIGH_ADDR_128TB ((void *) (1UL << 48))
+#define HIGH_ADDR_256TB ((void *) (1UL << 49))
+
#define LOW_ADDR ((void *) (1UL << 30))
-#define HIGH_ADDR ((void *) (1UL << 48))
+
+#ifdef __aarch64__
+#define ADDR_SWITCH_HINT ADDR_MARK_256TB
+#define HIGH_ADDR HIGH_ADDR_256TB
+#else
+#define ADDR_SWITCH_HINT ADDR_MARK_128TB
+#define HIGH_ADDR HIGH_ADDR_128TB
+#endif
struct testcase {
void *addr;
@@ -53,9 +73,10 @@ static struct testcase testcases[] = {
},
{
/*
- * We should never allocate at the requested address or above it
- * The len cross the 128TB boundary. Without MAP_FIXED
- * we will always search in the lower address space.
+ * Unless MAP_FIXED is specified, allocation based on hint
+ * addr is never at requested address or above it, which is
+ * beyond high address switch boundary in this case. Instead,
+ * a suitable allocation is found in lower address space.
*/
.addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
.size = 2 * PAGE_SIZE,
@@ -65,8 +86,8 @@ static struct testcase testcases[] = {
},
{
/*
- * Exact mapping at 128TB, the area is free we should get that
- * even without MAP_FIXED.
+ * Exact mapping at high address switch boundary, should
+ * be obtained even without MAP_FIXED as area is free.
*/
.addr = ((void *)(ADDR_SWITCH_HINT)),
.size = PAGE_SIZE,
@@ -270,6 +291,8 @@ static int supported_arch(void)
return 1;
#elif defined(__x86_64__)
return 1;
+#elif defined(__aarch64__)
+ return 1;
#else
return 0;
#endif
diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index 41580751dc51..45cae7cab27e 100644
--- a/tools/testing/selftests/mm/va_128TBswitch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -51,4 +51,8 @@ check_test_requirements()
}
check_test_requirements
-./va_128TBswitch
+./va_high_addr_switch
+
+# In order to run hugetlb testcases, "--run-hugetlb" must be appended
+# to the binary.
+./va_high_addr_switch --run-hugetlb
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index c0592646ed93..bae0ceaf95b1 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -15,11 +15,15 @@
/*
* Maximum address range mapped with a single mmap()
- * call is little bit more than 16GB. Hence 16GB is
+ * call is little bit more than 1GB. Hence 1GB is
* chosen as the single chunk size for address space
* mapping.
*/
-#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */
+
+#define SZ_1GB (1024 * 1024 * 1024UL)
+#define SZ_1TB (1024 * 1024 * 1024 * 1024UL)
+
+#define MAP_CHUNK_SIZE SZ_1GB
/*
* Address space till 128TB is mapped without any hint
@@ -32,13 +36,15 @@
* till it reaches 512TB. One with size 128TB and the
* other being 384TB.
*
- * On Arm64 the address space is 256TB and no high mappings
- * are supported so far.
+ * On Arm64 the address space is 256TB and support for
+ * high mappings up to 4PB virtual address space has
+ * been added.
*/
-#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */
#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
+#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL)
#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
@@ -47,7 +53,7 @@
#define HIGH_ADDR_MARK ADDR_MARK_256TB
#define HIGH_ADDR_SHIFT 49
#define NR_CHUNKS_LOW NR_CHUNKS_256TB
-#define NR_CHUNKS_HIGH 0
+#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB
#else
#define HIGH_ADDR_MARK ADDR_MARK_128TB
#define HIGH_ADDR_SHIFT 48
@@ -97,7 +103,7 @@ static int validate_lower_address_hint(void)
int main(int argc, char *argv[])
{
char *ptr[NR_CHUNKS_LOW];
- char *hptr[NR_CHUNKS_HIGH];
+ char **hptr;
char *hint;
unsigned long i, lchunks, hchunks;
@@ -115,6 +121,9 @@ int main(int argc, char *argv[])
return 1;
}
lchunks = i;
+ hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
+ if (hptr == NULL)
+ return 1;
for (i = 0; i < NR_CHUNKS_HIGH; i++) {
hint = hind_addr();
@@ -135,5 +144,6 @@ int main(int argc, char *argv[])
for (i = 0; i < hchunks; i++)
munmap(hptr[i], MAP_CHUNK_SIZE);
+ free(hptr);
return 0;
}
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 40e795624ff3..9b06a5034808 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -1,6 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include <string.h>
#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <sys/syscall.h>
+#include <unistd.h>
#include "../kselftest.h"
#include "vm_util.h"
@@ -8,6 +12,9 @@
#define SMAP_FILE_PATH "/proc/self/smaps"
#define MAX_LINE_LENGTH 500
+unsigned int __page_size;
+unsigned int __page_shift;
+
uint64_t pagemap_get_entry(int fd, char *start)
{
const unsigned long pfn = (unsigned long)start / getpagesize();
@@ -22,25 +29,17 @@ uint64_t pagemap_get_entry(int fd, char *start)
bool pagemap_is_softdirty(int fd, char *start)
{
- uint64_t entry = pagemap_get_entry(fd, start);
-
- // Check if dirty bit (55th bit) is set
- return entry & 0x0080000000000000ull;
+ return pagemap_get_entry(fd, start) & PM_SOFT_DIRTY;
}
bool pagemap_is_swapped(int fd, char *start)
{
- uint64_t entry = pagemap_get_entry(fd, start);
-
- return entry & 0x4000000000000000ull;
+ return pagemap_get_entry(fd, start) & PM_SWAP;
}
bool pagemap_is_populated(int fd, char *start)
{
- uint64_t entry = pagemap_get_entry(fd, start);
-
- /* Present or swapped. */
- return entry & 0xc000000000000000ull;
+ return pagemap_get_entry(fd, start) & (PM_PRESENT | PM_SWAP);
}
unsigned long pagemap_get_pfn(int fd, char *start)
@@ -48,7 +47,7 @@ unsigned long pagemap_get_pfn(int fd, char *start)
uint64_t entry = pagemap_get_entry(fd, start);
/* If present (63th bit), PFN is at bit 0 -- 54. */
- if (entry & 0x8000000000000000ull)
+ if (entry & PM_PRESENT)
return entry & 0x007fffffffffffffull;
return -1ul;
}
@@ -84,12 +83,12 @@ uint64_t read_pmd_pagesize(void)
fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
if (fd == -1)
- ksft_exit_fail_msg("Open hpage_pmd_size failed\n");
+ return 0;
num_read = read(fd, buf, 19);
if (num_read < 1) {
close(fd);
- ksft_exit_fail_msg("Read hpage_pmd_size failed\n");
+ return 0;
}
buf[num_read] = '\0';
close(fd);
@@ -149,3 +148,156 @@ bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
{
return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
}
+
+int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+ uint64_t ent[2];
+
+ /* drop pmd */
+ if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANONYMOUS |
+ MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+ errx(2, "mmap transhuge");
+
+ if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ /* allocate transparent huge page */
+ *(volatile void **)ptr = ptr;
+
+ if (pread(pagemap_fd, ent, sizeof(ent),
+ (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent))
+ err(2, "read pagemap");
+
+ if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+ PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+ !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - pshift())) - 1)))
+ return PAGEMAP_PFN(ent[0]);
+
+ return -1;
+}
+
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+/* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */
+int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor, uint64_t *ioctls)
+{
+ struct uffdio_register uffdio_register = { 0 };
+ uint64_t mode = 0;
+ int ret = 0;
+
+ if (miss)
+ mode |= UFFDIO_REGISTER_MODE_MISSING;
+ if (wp)
+ mode |= UFFDIO_REGISTER_MODE_WP;
+ if (minor)
+ mode |= UFFDIO_REGISTER_MODE_MINOR;
+
+ uffdio_register.range.start = (unsigned long)addr;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = mode;
+
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
+ ret = -errno;
+ else if (ioctls)
+ *ioctls = uffdio_register.ioctls;
+
+ return ret;
+}
+
+int uffd_register(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor)
+{
+ return uffd_register_with_ioctls(uffd, addr, len,
+ miss, wp, minor, NULL);
+}
+
+int uffd_unregister(int uffd, void *addr, uint64_t len)
+{
+ struct uffdio_range range = { .start = (uintptr_t)addr, .len = len };
+ int ret = 0;
+
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &range) == -1)
+ ret = -errno;
+
+ return ret;
+}
+
+int uffd_open_dev(unsigned int flags)
+{
+ int fd, uffd;
+
+ fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+ uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
+ close(fd);
+
+ return uffd;
+}
+
+int uffd_open_sys(unsigned int flags)
+{
+#ifdef __NR_userfaultfd
+ return syscall(__NR_userfaultfd, flags);
+#else
+ return -1;
+#endif
+}
+
+int uffd_open(unsigned int flags)
+{
+ int uffd = uffd_open_sys(flags);
+
+ if (uffd < 0)
+ uffd = uffd_open_dev(flags);
+
+ return uffd;
+}
+
+int uffd_get_features(uint64_t *features)
+{
+ struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
+ /*
+ * This should by default work in most kernels; the feature list
+ * will be the same no matter what we pass in here.
+ */
+ int fd = uffd_open(UFFD_USER_MODE_ONLY);
+
+ if (fd < 0)
+ /* Maybe the kernel is older than user-only mode? */
+ fd = uffd_open(0);
+
+ if (fd < 0)
+ return fd;
+
+ if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
+ close(fd);
+ return -errno;
+ }
+
+ *features = uffdio_api.features;
+ close(fd);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 1995ee911ef2..b950bd16083a 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -1,6 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <stdint.h>
#include <stdbool.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <string.h> /* ffsl() */
+#include <unistd.h> /* _SC_PAGESIZE */
+
+#define BIT_ULL(nr) (1ULL << (nr))
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
+extern unsigned int __page_size;
+extern unsigned int __page_shift;
+
+static inline unsigned int psize(void)
+{
+ if (!__page_size)
+ __page_size = sysconf(_SC_PAGESIZE);
+ return __page_size;
+}
+
+static inline unsigned int pshift(void)
+{
+ if (!__page_shift)
+ __page_shift = (ffsl(psize()) - 1);
+ return __page_shift;
+}
uint64_t pagemap_get_entry(int fd, char *start);
bool pagemap_is_softdirty(int fd, char *start);
@@ -13,3 +42,24 @@ uint64_t read_pmd_pagesize(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
+int64_t allocate_transhuge(void *ptr, int pagemap_fd);
+unsigned long default_huge_page_size(void);
+
+int uffd_register(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor);
+int uffd_unregister(int uffd, void *addr, uint64_t len);
+int uffd_open_dev(unsigned int flags);
+int uffd_open_sys(unsigned int flags);
+int uffd_open(unsigned int flags);
+int uffd_get_features(uint64_t *features);
+int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor, uint64_t *ioctls);
+
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
+#define HPAGE_SHIFT 21
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))