// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/nsfs.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#include "../filesystems/utils.h"
#include "../pidfd/pidfd.h"
#include "wrappers.h"
/*
 * Test listns() error handling with invalid buffer addresses.
 *
 * When the buffer is only partially valid (e.g., it crosses a page
 * boundary into unmapped memory), listns() is expected to fail with
 * EFAULT.
 *
 * This test also creates mount namespaces that get destroyed during
 * iteration, testing that namespace cleanup happens outside the RCU
 * read lock.
 */
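/*
 * The request structure and syscall wrapper used throughout are provided
 * by wrappers.h and the kernel UAPI headers. For reference, they are
 * assumed to look roughly like this (field meanings inferred from how the
 * tests below use them; the kernel's definition is authoritative):
 *
 *	struct ns_id_req {
 *		__u32 size;		- sizeof(struct ns_id_req)
 *		__u32 spare;		- must be zero
 *		__u64 ns_id;		- resume cursor, 0 to start from the top
 *		__u32 ns_type;		- CLONE_NEW* filter, 0 for all types
 *		__u32 spare2;		- must be zero
 *		__u64 user_ns_id;	- owning user ns filter, 0 for a global listing
 *	};
 *
 *	ssize_t sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
 *			   size_t nr_ns_ids, unsigned int flags);
 *
 * The wrapper behaves like a raw syscall in the tests below: it returns -1
 * with errno set on failure (EFAULT for bad buffers, ENOSYS when listns()
 * is not available).
 */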
TEST(listns_partial_fault_with_ns_cleanup)
{
void *map;
__u64 *ns_ids;
ssize_t ret;
long page_size;
pid_t pid, iter_pid;
int pidfds[5];
int sv[5][2];
int iter_pidfd;
int i, status;
char c;
page_size = sysconf(_SC_PAGESIZE);
ASSERT_GT(page_size, 0);
/*
* Map two pages:
* - First page: readable and writable
* - Second page: will be unmapped to trigger EFAULT
*/
map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(map, MAP_FAILED);
/* Unmap the second page */
ret = munmap((char *)map + page_size, page_size);
ASSERT_EQ(ret, 0);
/*
* Position the buffer pointer so there's room for exactly one u64
* before the page boundary. The second u64 would fall into the
* unmapped page.
*/
ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
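/*
 * Resulting layout (each slot is one __u64):
 *
 *	... mapped page  | unmapped page ...
 *	       ns_ids[0] | ns_ids[1]
 *	                 ^ page boundary
 *
 * so the kernel can copy out exactly one namespace ID before faulting.
 */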
/*
* Create a separate process to run listns() in a loop concurrently
* with namespace creation and destruction.
*/
iter_pid = create_child(&iter_pidfd, 0);
ASSERT_NE(iter_pid, -1);
if (iter_pid == 0) {
struct ns_id_req req = {
.size = sizeof(req),
.spare = 0,
.ns_id = 0,
.ns_type = 0, /* All types */
.spare2 = 0,
.user_ns_id = 0, /* Global listing */
};
int iter_ret;
/*
* Loop calling listns() until killed.
* The kernel should:
* 1. Successfully write the first namespace ID (within valid page)
* 2. Fail with EFAULT when trying to write the second ID (unmapped page)
* 3. Handle concurrent namespace destruction without deadlock
*/
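/*
 * Note: the return value of these racing calls is deliberately not
 * asserted; the only explicit check is for ENOSYS. Correct behaviour
 * here means no crash, hang or deadlock, which the parent verifies by
 * still being able to SIGKILL and reap this child at the end.
 */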
while (1) {
iter_ret = sys_listns(&req, ns_ids, 2, 0);
if (iter_ret == -1 && errno == ENOSYS)
_exit(PIDFD_SKIP);
}
}
/* Small delay to let iterator start looping */
usleep(50000);
/*
* Create several child processes, each in its own mount namespace.
* These will be destroyed while the iterator is running listns().
*/
for (i = 0; i < 5; i++) {
/* Create socketpair for synchronization */
ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
pid = create_child(&pidfds[i], CLONE_NEWNS);
ASSERT_NE(pid, -1);
if (pid == 0) {
close(sv[i][0]); /* Close parent end */
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
_exit(1);
/* Child: create a couple of tmpfs mounts */
if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
_exit(1);
/* Signal parent that setup is complete */
if (write_nointr(sv[i][1], "R", 1) != 1)
_exit(1);
/* Wait for parent to signal us to exit */
if (read_nointr(sv[i][1], &c, 1) != 1)
_exit(1);
close(sv[i][1]);
_exit(0);
}
close(sv[i][1]); /* Close child end */
}
/* Wait for all children to finish setup */
for (i = 0; i < 5; i++) {
ret = read_nointr(sv[i][0], &c, 1);
ASSERT_EQ(ret, 1);
ASSERT_EQ(c, 'R');
}
/*
 * Signal children to exit. This destroys their mount namespaces while
 * listns() is iterating the namespace tree, exercising the requirement
 * that namespace cleanup (which may sleep) happens outside the RCU
 * read lock.
 */
for (i = 0; i < 5; i++)
write_nointr(sv[i][0], "X", 1);
/* Wait for all mount namespace children to exit and cleanup */
for (i = 0; i < 5; i++) {
waitpid(-1, NULL, 0);
close(sv[i][0]);
close(pidfds[i]);
}
/* Kill iterator and wait for it */
sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
ret = waitpid(iter_pid, &status, 0);
ASSERT_EQ(ret, iter_pid);
close(iter_pidfd);
/* Should have been killed */
ASSERT_TRUE(WIFSIGNALED(status));
ASSERT_EQ(WTERMSIG(status), SIGKILL);
/* Clean up */
munmap(map, page_size);
}
/*
* Test listns() error handling when the entire buffer is invalid.
* This is a sanity check that basic invalid pointer detection works.
*/
TEST(listns_complete_fault)
{
struct ns_id_req req = {
.size = sizeof(req),
.spare = 0,
.ns_id = 0,
.ns_type = 0,
.spare2 = 0,
.user_ns_id = 0,
};
__u64 *ns_ids;
ssize_t ret;
/* Use a clearly invalid pointer */
ns_ids = (__u64 *)0xdeadbeef;
ret = sys_listns(&req, ns_ids, 10, 0);
if (ret == -1 && errno == ENOSYS)
SKIP(return, "listns() not supported");
/* Should fail with EFAULT */
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EFAULT);
}
/*
* Test listns() error handling when the buffer is NULL.
*/
TEST(listns_null_buffer)
{
struct ns_id_req req = {
.size = sizeof(req),
.spare = 0,
.ns_id = 0,
.ns_type = 0,
.spare2 = 0,
.user_ns_id = 0,
};
ssize_t ret;
/* NULL buffer with non-zero count should fail */
ret = sys_listns(&req, NULL, 10, 0);
if (ret == -1 && errno == ENOSYS)
SKIP(return, "listns() not supported");
/* Should fail with EFAULT */
ASSERT_EQ(ret, -1);
ASSERT_EQ(errno, EFAULT);
}
/*
 * Test listns() with a buffer that faults partway through the copy-out
 * (after several entries have been written successfully), combined with
 * mount namespace destruction to exercise the RCU cleanup logic.
 */
TEST(listns_late_fault_with_ns_cleanup)
{
void *map;
__u64 *ns_ids;
ssize_t ret;
long page_size;
pid_t pid, iter_pid;
int pidfds[10];
int sv[10][2];
int iter_pidfd;
int i, status;
char c;
page_size = sysconf(_SC_PAGESIZE);
ASSERT_GT(page_size, 0);
/* Map two pages */
map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(map, MAP_FAILED);
/* Unmap the second page */
ret = munmap((char *)map + page_size, page_size);
ASSERT_EQ(ret, 0);
/*
* Position buffer so we can write several u64s successfully
* before hitting the page boundary.
*/
ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
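/* ns_ids[0..4] land in the mapped page; ns_ids[5] is the first faulting slot. */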
/*
* Create a separate process to run listns() concurrently.
*/
iter_pid = create_child(&iter_pidfd, 0);
ASSERT_NE(iter_pid, -1);
if (iter_pid == 0) {
struct ns_id_req req = {
.size = sizeof(req),
.spare = 0,
.ns_id = 0,
.ns_type = 0,
.spare2 = 0,
.user_ns_id = 0,
};
int iter_ret;
/*
 * Loop calling listns() until killed.
 * Request 10 namespace IDs while namespaces are being destroyed.
 * This tests:
 * 1. EFAULT handling when the copy-out crosses into the unmapped page
 * 2. Namespace cleanup outside the RCU read lock during iteration
 */
while (1) {
iter_ret = sys_listns(&req, ns_ids, 10, 0);
if (iter_ret == -1 && errno == ENOSYS)
_exit(PIDFD_SKIP);
}
}
/* Small delay to let iterator start looping */
usleep(50000);
/*
* Create more children with mount namespaces to increase the
* likelihood that namespace cleanup happens during iteration.
*/
for (i = 0; i < 10; i++) {
/* Create socketpair for synchronization */
ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
pid = create_child(&pidfds[i], CLONE_NEWNS);
ASSERT_NE(pid, -1);
if (pid == 0) {
close(sv[i][0]); /* Close parent end */
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
_exit(1);
/* Child: create tmpfs mounts */
if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
_exit(1);
/* Signal parent that setup is complete */
if (write_nointr(sv[i][1], "R", 1) != 1)
_exit(1);
/* Wait for parent to signal us to exit */
if (read_nointr(sv[i][1], &c, 1) != 1)
_exit(1);
close(sv[i][1]);
_exit(0);
}
close(sv[i][1]); /* Close child end */
}
/* Wait for all children to finish setup */
for (i = 0; i < 10; i++) {
ret = read_nointr(sv[i][0], &c, 1);
ASSERT_EQ(ret, 1);
ASSERT_EQ(c, 'R');
}
/* Kill half the children */
for (i = 0; i < 5; i++)
write_nointr(sv[i][0], "X", 1);
/* Small delay to let some exit */
usleep(10000);
/* Kill remaining children */
for (i = 5; i < 10; i++)
write_nointr(sv[i][0], "X", 1);
/* Wait for all children and cleanup */
for (i = 0; i < 10; i++) {
waitpid(-1, NULL, 0);
close(sv[i][0]);
close(pidfds[i]);
}
/* Kill iterator and wait for it */
sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
ret = waitpid(iter_pid, &status, 0);
ASSERT_EQ(ret, iter_pid);
close(iter_pidfd);
/* Should have been killed */
ASSERT_TRUE(WIFSIGNALED(status));
ASSERT_EQ(WTERMSIG(status), SIGKILL);
/* Clean up */
munmap(map, page_size);
}
/*
 * Test focused specifically on mount namespace cleanup when listns()
 * hits an EFAULT. The request filters for mount namespaces only, so the
 * iteration walks exactly the namespaces being torn down below, which
 * makes the race with their destruction more likely.
 */
TEST(listns_mnt_ns_cleanup_on_fault)
{
void *map;
__u64 *ns_ids;
ssize_t ret;
long page_size;
pid_t pid, iter_pid;
int pidfds[8];
int sv[8][2];
int iter_pidfd;
int i, status;
char c;
page_size = sysconf(_SC_PAGESIZE);
ASSERT_GT(page_size, 0);
/* Set up partial fault buffer */
map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(map, MAP_FAILED);
ret = munmap((char *)map + page_size, page_size);
ASSERT_EQ(ret, 0);
/* Position for 3 successful writes, then fault */
ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
/*
* Create a separate process to run listns() concurrently.
*/
iter_pid = create_child(&iter_pidfd, 0);
ASSERT_NE(iter_pid, -1);
if (iter_pid == 0) {
struct ns_id_req req = {
.size = sizeof(req),
.spare = 0,
.ns_id = 0,
.ns_type = CLONE_NEWNS, /* Only mount namespaces */
.spare2 = 0,
.user_ns_id = 0,
};
int iter_ret;
/*
* Loop calling listns() until killed.
* Call listns() to race with namespace destruction.
*/
while (1) {
iter_ret = sys_listns(&req, ns_ids, 10, 0);
if (iter_ret == -1 && errno == ENOSYS)
_exit(PIDFD_SKIP);
}
}
/* Small delay to let iterator start looping */
usleep(50000);
/* Create children with mount namespaces */
for (i = 0; i < 8; i++) {
/* Create socketpair for synchronization */
ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
pid = create_child(&pidfds[i], CLONE_NEWNS);
ASSERT_NE(pid, -1);
if (pid == 0) {
close(sv[i][0]); /* Close parent end */
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
_exit(1);
/* Do some mount operations to make cleanup more interesting */
if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
_exit(1);
if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
_exit(1);
/* Signal parent that setup is complete */
if (write_nointr(sv[i][1], "R", 1) != 1)
_exit(1);
/* Wait for parent to signal us to exit */
if (read_nointr(sv[i][1], &c, 1) != 1)
_exit(1);
close(sv[i][1]);
_exit(0);
}
close(sv[i][1]); /* Close child end */
}
/* Wait for all children to finish setup */
for (i = 0; i < 8; i++) {
ret = read_nointr(sv[i][0], &c, 1);
ASSERT_EQ(ret, 1);
ASSERT_EQ(c, 'R');
}
/* Kill children to trigger namespace destruction during iteration */
for (i = 0; i < 8; i++)
write_nointr(sv[i][0], "X", 1);
/* Wait for children and cleanup */
for (i = 0; i < 8; i++) {
waitpid(-1, NULL, 0);
close(sv[i][0]);
close(pidfds[i]);
}
/* Kill iterator and wait for it */
sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
ret = waitpid(iter_pid, &status, 0);
ASSERT_EQ(ret, iter_pid);
close(iter_pidfd);
/* Should have been killed */
ASSERT_TRUE(WIFSIGNALED(status));
ASSERT_EQ(WTERMSIG(status), SIGKILL);
munmap(map, page_size);
}
TEST_HARNESS_MAIN
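/*
 * Build/run note (an assumption, not part of the test itself): as a
 * kselftest harness binary this is normally built and run through the
 * selftests Makefile, e.g.
 *
 *	make -C tools/testing/selftests TARGETS=<this test's directory> run_tests
 *
 * or executed directly, in which case the harness prints TAP output for
 * each TEST() above.
 */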