// SPDX-License-Identifier: GPL-2.0-or-later
#define _GNU_SOURCE
#include "../kselftest_harness.h"
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/mman.h>
#include "vm_util.h"
#include "../pidfd/pidfd.h"

FIXTURE(process_madvise)
{
	unsigned long page_size;
	pid_t child_pid;
	int remote_pidfd;
	int pidfd;
};

FIXTURE_SETUP(process_madvise)
{
	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
	self->pidfd = PIDFD_SELF;
	self->remote_pidfd = -1;
	self->child_pid = -1;
};

FIXTURE_TEARDOWN_PARENT(process_madvise)
{
	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
	if (self->child_pid > 0) {
		kill(self->child_pid, SIGKILL);
		waitpid(self->child_pid, NULL, 0);
	}

	if (self->remote_pidfd >= 0)
		close(self->remote_pidfd);
}

static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
}

/*
 * This test uses PIDFD_SELF to target the current process. The main
 * goal is to verify the basic behavior of process_madvise() with
 * a vector of non-contiguous memory ranges, not its cross-process
 * capabilities.
 */
TEST_F(process_madvise, basic)
{
	const unsigned long pagesize = self->page_size;
	const int madvise_pages = 4;
	struct iovec vec[madvise_pages];
	int pidfd = self->pidfd;
	ssize_t ret;
	char *map;

	/*
	 * Create a single large mapping. We will pick pages from this
	 * mapping to advise on. This ensures we test non-contiguous iovecs.
	 */
	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	/* Fill the entire region with a known pattern. */
	memset(map, 'A', pagesize * 10);

	/*
	 * Set up the iovec to point to 4 non-contiguous pages
	 * within the mapping.
	 */
	vec[0].iov_base = &map[0 * pagesize];
	vec[0].iov_len = pagesize;
	vec[1].iov_base = &map[3 * pagesize];
	vec[1].iov_len = pagesize;
	vec[2].iov_base = &map[5 * pagesize];
	vec[2].iov_len = pagesize;
	vec[3].iov_base = &map[8 * pagesize];
	vec[3].iov_len = pagesize;

	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
	if (ret == -1 && errno == EPERM)
		SKIP(return,
		     "process_madvise() unsupported or permission denied, try running as root.\n");
	else if (ret == -1 && errno == EINVAL)
		SKIP(return,
		     "process_madvise() unsupported or parameter invalid, please check arguments.\n");

	/* The call should succeed and report the total bytes processed. */
	ASSERT_EQ(ret, madvise_pages * pagesize);

	/* Check that advised pages are now zero. */
	for (int i = 0; i < madvise_pages; i++) {
		char *advised_page = (char *)vec[i].iov_base;

		/* Content must be 0, not 'A'. */
		ASSERT_EQ(*advised_page, '\0');
	}

	/* Check that an un-advised page in between is still 'A'. */
	char *unadvised_page = &map[1 * pagesize];

	for (int i = 0; i < pagesize; i++)
		ASSERT_EQ(unadvised_page[i], 'A');

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize * 10), 0);
}
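/*
 * Illustrative sketch, not wired into any test above: the basic test
 * observes MADV_DONTNEED indirectly, by reading back zeroes from the
 * discarded anonymous pages. Another way to observe the discard would be
 * mincore(), which reports per-page residency; a freshly discarded page
 * should no longer be resident until it is touched again. The helper
 * below is a hypothetical addition for illustration only.
 */
static int __attribute__((unused))
page_is_resident(void *addr, unsigned long page_size)
{
	/* mincore() requires a page-aligned address and one byte per page. */
	void *aligned = (void *)((unsigned long)addr & ~(page_size - 1));
	unsigned char vec;

	if (mincore(aligned, page_size, &vec))
		return -1;

	/* Bit 0 of each vector byte indicates residency. */
	return vec & 1;
}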
/*
 * This test deterministically validates process_madvise() with
 * MADV_COLLAPSE on a remote process; other advice values are difficult
 * to verify reliably.
 *
 * It advises a memory region in a child process and focuses on the
 * result of the remote call, checking only the addresses and lengths
 * passed to process_madvise(). The correctness of MADV_COLLAPSE itself
 * is covered by the khugepaged selftests.
 */
TEST_F(process_madvise, remote_collapse)
{
	const unsigned long pagesize = self->page_size;
	long huge_page_size;
	int pipe_info[2];
	ssize_t ret;
	struct iovec vec;

	struct child_info {
		pid_t pid;
		void *map_addr;
	} info;

	huge_page_size = read_pmd_pagesize();
	if (huge_page_size <= 0)
		SKIP(return, "Could not determine a valid huge page size.\n");

	ASSERT_EQ(pipe(pipe_info), 0);

	self->child_pid = fork();
	ASSERT_NE(self->child_pid, -1);

	if (self->child_pid == 0) {
		char *map;
		size_t map_size = 2 * huge_page_size;

		close(pipe_info[0]);

		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		ASSERT_NE(map, MAP_FAILED);

		/* Fault in as small pages. */
		for (size_t i = 0; i < map_size; i += pagesize)
			map[i] = 'A';

		/* Send info and pause. */
		info.pid = getpid();
		info.map_addr = map;
		ret = write(pipe_info[1], &info, sizeof(info));
		ASSERT_EQ(ret, sizeof(info));
		close(pipe_info[1]);

		pause();
		exit(0);
	}

	close(pipe_info[1]);

	/* Receive child info. */
	ret = read(pipe_info[0], &info, sizeof(info));
	if (ret <= 0) {
		waitpid(self->child_pid, NULL, 0);
		SKIP(return, "Failed to read child info from pipe.\n");
	}
	ASSERT_EQ(ret, sizeof(info));
	close(pipe_info[0]);
	self->child_pid = info.pid;

	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	vec.iov_base = info.map_addr;
	vec.iov_len = huge_page_size;

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
				  0);
	if (ret == -1) {
		if (errno == EINVAL)
			SKIP(return, "MADV_COLLAPSE is not supported.\n");
		else if (errno == EPERM)
			SKIP(return,
			     "No process_madvise() permissions, try running as root.\n");
		return;
	}

	ASSERT_EQ(ret, huge_page_size);
}

/*
 * Test process_madvise() with a pidfd for a process that has already
 * exited to ensure correct error handling.
 */
TEST_F(process_madvise, exited_process_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/*
	 * Using a pidfd for a process that has already exited should fail
	 * with ESRCH.
	 */
	self->child_pid = fork();
	ASSERT_NE(self->child_pid, -1);

	if (self->child_pid == 0)
		exit(0);

	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	/* Wait for the child to ensure it has terminated. */
	waitpid(self->child_pid, NULL, 0);

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
				  0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, ESRCH);
}

/*
 * Test process_madvise() with bad pidfds to ensure correct error
 * handling.
 */
TEST_F(process_madvise, bad_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/* Using an invalid fd number (-1) should fail with EBADF. */
	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);

	/*
	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
	 * with EBADF.
	 */
	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);
}
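/*
 * Illustrative sketch, not wired into any test above: the remote_collapse
 * test deliberately checks only the syscall's return value, deferring
 * collapse correctness to the khugepaged selftests. One hypothetical way
 * to cross-check a remote MADV_COLLAPSE would be to sum the child's
 * "AnonHugePages:" entries from /proc/<pid>/smaps before and after the
 * call; a successful collapse should raise the total. Helper and field
 * parsing below are for illustration only.
 */
static long __attribute__((unused))
read_anon_huge_kb(pid_t pid)
{
	char path[64], line[256];
	long total_kb = 0;
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/%d/smaps", pid);
	fp = fopen(path, "r");
	if (!fp)
		return -1;

	/* Accumulate AnonHugePages across all VMAs of the target. */
	while (fgets(line, sizeof(line), fp)) {
		long kb;

		if (sscanf(line, "AnonHugePages: %ld kB", &kb) == 1)
			total_kb += kb;
	}

	fclose(fp);
	return total_kb;
}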
/*
 * Test that process_madvise() rejects vlen > UIO_MAXIOV.
 * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
 */
TEST_F(process_madvise, invalid_vlen)
{
	const unsigned long pagesize = self->page_size;
	int pidfd = self->pidfd;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}

/*
 * Test process_madvise() with an invalid flag value. Currently, only a flag
 * value of 0 is supported. This test is reserved for the future, e.g., if
 * synchronous flags are added.
 */
TEST_F(process_madvise, flag)
{
	const unsigned long pagesize = self->page_size;
	unsigned int invalid_flag;
	int pidfd = self->pidfd;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	invalid_flag = 0x80000000;

	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}

TEST_HARNESS_MAIN
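/*
 * Usage sketch (assuming this file is built as part of the mm selftests,
 * e.g. under tools/testing/selftests/mm/):
 *
 *   $ make -C tools/testing/selftests/mm
 *   $ sudo ./<test binary>    # root is needed for the remote-process tests
 *
 * Each TEST_F() above runs as an independent kselftest-harness test case;
 * TEST_HARNESS_MAIN provides main() and drives them all.
 */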