1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
|
// SPDX-License-Identifier: GPL-2.0-or-later
#define _GNU_SOURCE
#include "../kselftest_harness.h"
#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sched.h>
#include "vm_util.h"
#include "../pidfd/pidfd.h"
/*
 * Per-test state for the process_madvise test cases. Every field is
 * (re)initialised by the fixture setup before a test body runs.
 */
FIXTURE(process_madvise)
{
	/* pidfd naming the calling process; setup stores PIDFD_SELF here. */
	int pidfd;
	/* pidfd a test opened on a forked child, or -1 when none is open. */
	int remote_pidfd;
	/* PID of a forked child for teardown to kill, or -1 when none. */
	pid_t child_pid;
	/* Page size in bytes as reported by sysconf(_SC_PAGESIZE). */
	unsigned long page_size;
};
/*
 * Initialise the fixture: cache the page size, target the calling process by
 * default, and mark the child PID and remote pidfd as unused (-1) so that
 * teardown only cleans up what a test actually created.
 */
FIXTURE_SETUP(process_madvise)
{
	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
	self->pidfd = PIDFD_SELF;
	self->remote_pidfd = -1;
	self->child_pid = -1;
}
/*
 * Release whatever a test left behind: a still-recorded child process and/or
 * an open remote pidfd. Values of -1 (the setup defaults) mean "nothing to do".
 */
FIXTURE_TEARDOWN_PARENT(process_madvise)
{
	const pid_t leftover_child = self->child_pid;
	const int leftover_fd = self->remote_pidfd;

	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
	if (leftover_child > 0) {
		/* Force the child down first, then collect it. */
		kill(leftover_child, SIGKILL);
		waitpid(leftover_child, NULL, 0);
	}

	if (leftover_fd >= 0)
		close(leftover_fd);
}
/*
 * Issue the raw process_madvise system call.
 *
 * All arguments are forwarded to syscall() unchanged and its result is
 * returned as-is, so callers inspect the return value and errno themselves.
 */
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	const long rc = syscall(__NR_process_madvise, pidfd, iovec, vlen,
				advice, flags);

	return rc;
}
/*
* This test uses PIDFD_SELF to target the current process. The main
* goal is to verify the basic behavior of process_madvise() with
* a vector of non-contiguous memory ranges, not its cross-process
* capabilities.
*/
TEST_F(process_madvise, basic)
{
	/* Indices of the non-adjacent pages inside the mapping to advise. */
	static const unsigned long advised_idx[] = { 0, 3, 5, 8 };
	enum { madvise_pages = sizeof(advised_idx) / sizeof(advised_idx[0]) };
	const unsigned long pagesize = self->page_size;
	const size_t map_size = pagesize * 10;
	struct iovec vec[madvise_pages];
	int pidfd = self->pidfd;
	int saved_errno;
	ssize_t ret;
	char *map;

	/*
	 * Create a single large mapping. We will pick pages from this
	 * mapping to advise on. This ensures we test non-contiguous iovecs.
	 */
	map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	/* Fill the entire region with a known pattern. */
	memset(map, 'A', map_size);

	/* One single-page iovec entry per selected page. */
	for (int i = 0; i < madvise_pages; i++) {
		vec[i].iov_base = &map[advised_idx[i] * pagesize];
		vec[i].iov_len = pagesize;
	}

	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
	/* Capture errno before any other call can overwrite it. */
	saved_errno = errno;

	/*
	 * errno is only meaningful when the call failed; a stale value left
	 * over from an earlier call must not turn a success into a skip.
	 */
	if (ret == -1 && (saved_errno == EPERM || saved_errno == EINVAL)) {
		munmap(map, map_size);
		if (saved_errno == EPERM)
			SKIP(return,
			     "process_madvise() unsupported or permission denied, try running as root.\n");
		SKIP(return,
		     "process_madvise() unsupported or parameter invalid, please check arguments.\n");
	}

	/* The call should succeed and report the total bytes processed. */
	ASSERT_EQ(ret, madvise_pages * pagesize);

	/* Every byte of each advised page must now read back as zero. */
	for (int i = 0; i < madvise_pages; i++) {
		const char *advised_page = vec[i].iov_base;

		for (unsigned long off = 0; off < pagesize; off++)
			ASSERT_EQ(advised_page[off], '\0');
	}

	/* Check that an un-advised page in between is still 'A'. */
	const char *unadvised_page = &map[1 * pagesize];

	for (unsigned long off = 0; off < pagesize; off++)
		ASSERT_EQ(unadvised_page[off], 'A');

	/* Cleanup. */
	ASSERT_EQ(munmap(map, map_size), 0);
}
/*
 * This test deterministically validates process_madvise() with MADV_COLLAPSE
 * on a remote process; other advice values are difficult to verify reliably.
 *
 * The test targets a memory region in a child process and focuses on the
 * result of the remote process_madvise() call itself: only the address and
 * length handling is checked. The correctness of MADV_COLLAPSE itself is
 * covered by the relevant khugepaged tests.
 */
TEST_F(process_madvise, remote_collapse)
{
	const unsigned long pagesize = self->page_size;
	long huge_page_size;
	int pipe_info[2];
	int saved_errno;
	ssize_t ret;
	struct iovec vec;
	pid_t pid;
	/* Message the child sends to the parent over the pipe. */
	struct child_info {
		pid_t pid;
		void *map_addr;
	} info;

	huge_page_size = read_pmd_pagesize();
	if (huge_page_size <= 0)
		SKIP(return, "Could not determine a valid huge page size.\n");

	ASSERT_EQ(pipe(pipe_info), 0);

	/*
	 * Keep fork()'s result in a local. The harness keeps `self` in memory
	 * shared across forks for FIXTURE_TEARDOWN_PARENT fixtures, so storing
	 * the result directly would let the child overwrite child_pid with 0.
	 */
	pid = fork();
	ASSERT_NE(pid, -1);

	if (pid == 0) {
		char *map;
		size_t map_size = 2 * huge_page_size;

		close(pipe_info[0]);

		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		ASSERT_NE(map, MAP_FAILED);

		/* Fault in as small pages */
		for (size_t i = 0; i < map_size; i += pagesize)
			map[i] = 'A';

		/* Send info and pause */
		info.pid = getpid();
		info.map_addr = map;
		ret = write(pipe_info[1], &info, sizeof(info));
		ASSERT_EQ(ret, sizeof(info));
		close(pipe_info[1]);

		/* Stay alive until teardown (or the parent) kills us. */
		pause();
		exit(0);
	}

	/* Parent only: record the child so teardown can kill it. */
	self->child_pid = pid;
	close(pipe_info[1]);

	/* Receive child info */
	ret = read(pipe_info[0], &info, sizeof(info));
	close(pipe_info[0]);
	if (ret <= 0) {
		/*
		 * The child may still be alive (e.g. the read itself failed),
		 * so kill it before waiting to avoid blocking forever. Then
		 * forget the reaped PID so teardown does not signal it again.
		 */
		kill(pid, SIGKILL);
		waitpid(pid, NULL, 0);
		self->child_pid = -1;
		SKIP(return, "Failed to read child info from pipe.\n");
	}
	ASSERT_EQ(ret, sizeof(info));

	self->remote_pidfd = syscall(__NR_pidfd_open, pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	vec.iov_base = info.map_addr;
	vec.iov_len = huge_page_size;

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
				  0);
	if (ret == -1) {
		saved_errno = errno;
		if (saved_errno == EINVAL)
			SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
		else if (saved_errno == EPERM)
			SKIP(return,
			     "No process_madvise() permissions, try running as root.\n");
		/*
		 * Any other failure is still not treated as a test failure,
		 * but report it as a skip instead of silently passing
		 * without having verified anything.
		 */
		SKIP(return, "process_madvise(MADV_COLLAPSE) failed: %s\n",
		     strerror(saved_errno));
	}

	ASSERT_EQ(ret, huge_page_size);
}
/*
* Test process_madvise() with a pidfd for a process that has already
* exited to ensure correct error handling.
*/
TEST_F(process_madvise, exited_process_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	pid_t pid;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/*
	 * Using a pidfd for a process that has already exited should fail
	 * with ESRCH.
	 *
	 * Keep fork()'s result in a local. The harness keeps `self` in memory
	 * shared across forks for FIXTURE_TEARDOWN_PARENT fixtures, so storing
	 * the result directly would let the child overwrite child_pid with 0
	 * and could make the parent take the child's exit path.
	 */
	pid = fork();
	ASSERT_NE(pid, -1);

	if (pid == 0)
		exit(0);

	/* Parent only: record the child so teardown can clean it up. */
	self->child_pid = pid;

	self->remote_pidfd = syscall(__NR_pidfd_open, pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	/* Wait for the child to ensure it has terminated. */
	ASSERT_EQ(waitpid(pid, NULL, 0), pid);
	/*
	 * The child is reaped, so its PID may be recycled: forget it so that
	 * teardown never sends SIGKILL to an unrelated process.
	 */
	self->child_pid = -1;

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
				  0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, ESRCH);

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}
/*
* Test process_madvise() with bad pidfds to ensure correct error
* handling.
*/
TEST_F(process_madvise, bad_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	char *map;
	ssize_t ret;

	/* A valid one-page range, so only the pidfd argument is at fault. */
	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/* Using an invalid fd number (-1) should fail with EBADF. */
	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);

	/*
	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
	 * with EBADF.
	 */
	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);

	/* Cleanup, matching the other single-page tests in this file. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}
/*
* Test that process_madvise() rejects vlen > UIO_MAXIOV.
* The kernel should return -EINVAL when the number of iovecs exceeds 1024.
*/
TEST_F(process_madvise, invalid_vlen)
{
	/* One entry more than the 1024-iovec limit described above. */
	const size_t oversized_vlen = 1025;
	const unsigned long pagesize = self->page_size;
	struct iovec single_range;
	ssize_t ret;
	char *map;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	/* Only one real entry exists; the count alone is what is invalid. */
	single_range.iov_base = map;
	single_range.iov_len = pagesize;

	ret = sys_process_madvise(self->pidfd, &single_range, oversized_vlen,
				  MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Release the scratch page. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}
/*
* Test process_madvise() with an invalid flag value. Currently, only a flag
* value of 0 is supported. This test is reserved for the future, e.g., if
* synchronous flags are added.
*/
TEST_F(process_madvise, flag)
{
	/* A flag bit that is not zero, which this test expects to be rejected. */
	const unsigned int invalid_flag = 0x80000000;
	const unsigned long pagesize = self->page_size;
	struct iovec target;
	ssize_t ret;
	char *map;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	/* A valid one-page range, so only the flags argument is at fault. */
	target.iov_base = map;
	target.iov_len = pagesize;

	ret = sys_process_madvise(self->pidfd, &target, 1, MADV_DONTNEED,
				  invalid_flag);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Release the scratch page. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}
/*
 * Harness-provided program entry point (macro from kselftest_harness.h);
 * presumably runs every TEST_F registered above.
 */
TEST_HARNESS_MAIN
|