diff options
| -rw-r--r-- | fs/nsfs.c | 2 | ||||
| -rw-r--r-- | include/linux/ns_common.h | 49 | ||||
| -rw-r--r-- | kernel/nscommon.c | 52 | ||||
| -rw-r--r-- | kernel/nstree.c | 44 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/.gitignore | 2 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/Makefile | 6 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/listns_efault_test.c | 530 | ||||
| -rw-r--r-- | tools/testing/selftests/namespaces/regression_pidfd_setns_test.c | 113 |
8 files changed, 724 insertions, 74 deletions
diff --git a/fs/nsfs.c b/fs/nsfs.c index ba6c8975c82e..a80f8d2a4122 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data) * ioctl on such a socket will resurrect the relevant namespace * subtree. */ - __ns_ref_active_resurrect(ns); + __ns_ref_active_get(ns); return 0; } diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bd4492ef6ffc..66ea09b48377 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -281,54 +287,25 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -void __ns_ref_active_get_owner(struct ns_common *ns); - -static __always_inline void __ns_ref_active_get(struct ns_common *ns) -{ - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); -} -#define ns_ref_active_get(__ns) \ - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) - -static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) -{ - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - return true; - } - return false; -} - -#define ns_ref_active_get_owner(__ns) \ - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) - -void __ns_ref_active_put_owner(struct ns_common *ns); +void __ns_ref_active_put(struct ns_common *ns); -static __always_inline void __ns_ref_active_put(struct ns_common *ns) -{ - if (atomic_dec_and_test(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - __ns_ref_active_put_owner(ns); - } -} #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); - if (!__ns_ref_active_read(ns)) + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; + } if (!__ns_ref_get(ns)) return NULL; return ns; } -void __ns_ref_active_resurrect(struct ns_common *ns); +void __ns_ref_active_get(struct ns_common *ns); -#define ns_ref_active_resurrect(__ns) \ - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 6fe1c747fa46..c910b979e433 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -54,7 +54,7 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { - int ret; + int ret = 0; refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; @@ -74,11 +74,10 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - ret = proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); if (ret) return ret; /* @@ -115,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns) return to_ns_common(owner); } -void __ns_ref_active_get_owner(struct ns_common *ns) -{ - ns = ns_owner(ns); - if (ns) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} - /* * The active reference count works by having each namespace that gets * created take a single active reference on its owning user namespace. @@ -172,14 +164,29 @@ void __ns_ref_active_get_owner(struct ns_common *ns) * The iteration stops once we reach a namespace that still has active * references. */ -void __ns_ref_active_put_owner(struct ns_common *ns) +void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + for (;;) { ns = ns_owner(ns); if (!ns) return; - if (!atomic_dec_and_test(&ns->__ns_ref_active)) + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); return; + } } } @@ -275,10 +282,18 @@ void __ns_ref_active_put_owner(struct ns_common *ns) * it also needs to take another reference on its owning user namespace * and so on. */ -void __ns_ref_active_resurrect(struct ns_common *ns) +void __ns_ref_active_get(struct ns_common *ns) { + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + /* If we didn't resurrect the namespace we're done. */ - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; /* @@ -290,7 +305,10 @@ void __ns_ref_active_resurrect(struct ns_common *ns) if (!ns) return; - if (atomic_fetch_add(1, &ns->__ns_ref_active)) + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) return; } } diff --git a/kernel/nstree.c b/kernel/nstree.c index 4a8838683b6b..97404fb90749 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_sequnlock(&ns_tree_lock); VFS_WARN_ON_ONCE(node); - - /* - * Take an active reference on the owner namespace. This ensures - * that the owner remains visible while any of its child namespaces - * are active. For init namespaces this is a no-op as ns_owner() - * returns NULL for namespaces owned by init_user_ns. - */ - __ns_ref_active_get_owner(ns); } void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) @@ -505,13 +497,13 @@ static inline bool __must_check may_list_ns(const struct klistns *kls, return false; } -static void __ns_put(struct ns_common *ns) +static inline void ns_put(struct ns_common *ns) { - if (ns->ops) + if (ns && ns->ops) ns->ops->put(ns); } -DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, struct ns_common *candidate) @@ -535,7 +527,7 @@ static ssize_t do_listns_userns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns = NULL, *first_ns = NULL; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; const struct list_head *head; ssize_t ret; @@ -568,9 +560,10 @@ static ssize_t do_listns_userns(struct klistns *kls) if (!first_ns) first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); + for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -578,8 +571,14 @@ static ssize_t do_listns_userns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) - return -EINVAL; + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + nr_ns_ids--; ret++; @@ -587,6 +586,7 @@ static ssize_t do_listns_userns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } @@ -668,7 +668,7 @@ static ssize_t do_listns(struct klistns *kls) { u64 __user *ns_ids = kls->uns_ids; size_t nr_ns_ids = kls->nr_ns_ids; - struct ns_common *ns, *first_ns = NULL; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; struct ns_tree *ns_tree = NULL; const struct list_head *head; u32 ns_type; @@ -705,7 +705,7 @@ static ssize_t do_listns(struct klistns *kls) for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; ns = next_ns_common(ns, ns_tree)) { - struct ns_common *valid __free(ns_put); + struct ns_common *valid; valid = legitimize_ns(kls, ns); if (!valid) @@ -713,8 +713,13 @@ static ssize_t do_listns(struct klistns *kls) rcu_read_unlock(); - if (put_user(valid->ns_id, ns_ids + ret)) - return -EINVAL; + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } nr_ns_ids--; ret++; @@ -723,6 +728,7 @@ static ssize_t do_listns(struct klistns *kls) } rcu_read_unlock(); + ns_put(prev); return ret; } diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index f4d2209ca4e4..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -4,7 +4,9 @@ init_ino_test ns_active_ref_test listns_test listns_permissions_test +listns_efault_test siocgskns_test cred_change_test stress_test listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 01569e0abbdb..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -8,18 +8,22 @@ TEST_GEN_PROGS := nsid_test \ ns_active_ref_test \ listns_test \ listns_permissions_test \ + listns_efault_test \ siocgskns_test \ cred_change_test \ stress_test \ - listns_pagination_bug + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c $(OUTPUT)/listns_test: ../filesystems/utils.c $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c $(OUTPUT)/siocgskns_test: ../filesystems/utils.c $(OUTPUT)/cred_change_test: ../filesystems/utils.c $(OUTPUT)/stress_test: ../filesystems/utils.c $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..c7ed4023d7a8 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() returns EINVAL. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. + */ +TEST(listns_partial_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[5]; + int sv[5][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* + * Map two pages: + * - First page: readable and writable + * - Second page: will be unmapped to trigger EFAULT + */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position the buffer pointer so there's room for exactly one u64 + * before the page boundary. The second u64 would fall into the + * unmapped page. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; + + /* + * Create a separate process to run listns() in a loop concurrently + * with namespace creation and destruction. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * The kernel should: + * 1. Successfully write the first namespace ID (within valid page) + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) + * 3. Handle concurrent namespace destruction without deadlock + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 2, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create several child processes, each in its own mount namespace. + * These will be destroyed while the iterator is running listns(). + */ + for (i = 0; i < 5; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create a couple of tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 5; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* + * Signal children to exit. This will destroy their mount namespaces + * while listns() is iterating the namespace tree. + * This tests that cleanup happens outside the RCU read lock. + */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all mount namespace children to exit and cleanup */ + for (i = 0; i < 5; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test listns() error handling when the entire buffer is invalid. + * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. + */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. + */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN |
