Merge patch series "ns: fixes for namespace iteration and active reference counting"

Christian Brauner <brauner@kernel.org> says: * Make sure to initialize the active reference count for the initial network namespace and prevent __ns_common_init() from returning too early. * Make sure that passive reference counts are dropped outside of rcu read locks as some namespaces such as the mount namespace do in fact sleep when putting the last reference. * The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before commit_nsset(), the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // its active reference count now goes // to zero... -------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. 
// Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded in grabbing passive references to the set of namespaces the setns() should simply succeed even if the target task exits or gets reaped in the meantime. The race is rare and can only be triggered when using pidfds to setns() to namespaces. Also note that active references on initial namespaces are nops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active. * patches from https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org: selftests/namespaces: test for efault selftests/namespaces: add active reference count regression test ns: add asserts for active refcount underflow ns: handle setns(pidfd, ...) cleanly ns: return EFAULT on put_user() error ns: make sure reference are dropped outside of rcu lock ns: don't increment or decrement initial namespaces ns: don't skip active reference count initialization Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
author: Christian Brauner <brauner@kernel.org> 2025-11-10 10:21:00 +0100
committer: Christian Brauner <brauner@kernel.org> 2025-11-10 15:54:02 +0100
commit: ae901e5e2e9b079761d26a366e0c80530d8aad22 (patch)
tree: 65f2dcf3975dd9d7ba1d412fa3f116b5fce9e534 /tools
parent: 8ebfb9896c97ab609222460e705f425cb3f0aad0 (diff)
parent: 07d7ad46dad48a81ffc796fb7875b1ec141c8b48 (diff)
4 files changed, 650 insertions, 1 deletions
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index f4d2209ca4e4..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -4,7 +4,9 @@ init_ino_test
 ns_active_ref_test
 listns_test
 listns_permissions_test
+listns_efault_test
 siocgskns_test
 cred_change_test
 stress_test
 listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 01569e0abbdb..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -8,18 +8,22 @@ TEST_GEN_PROGS := nsid_test \
 		  ns_active_ref_test \
 		  listns_test \
 		  listns_permissions_test \
+		  listns_efault_test \
 		  siocgskns_test \
 		  cred_change_test \
 		  stress_test \
-		  listns_pagination_bug
+		  listns_pagination_bug \
+		  regression_pidfd_setns_test
 
 include ../lib.mk
 
 $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
 $(OUTPUT)/listns_test: ../filesystems/utils.c
 $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
 $(OUTPUT)/siocgskns_test: ../filesystems/utils.c
 $(OUTPUT)/cred_change_test: ../filesystems/utils.c
 $(OUTPUT)/stress_test: ../filesystems/utils.c
 $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
 
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() is expected to fail with EFAULT.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[5], iter_pid;
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger EFAULT
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,  /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0,  /* Global listing */
+		};
+
+		/*
+		 * Loop calling listns() until killed.
+		 * The kernel should:
+		 * 1. Successfully write the first namespace ID (within valid page)
+		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+		 * 3. Handle concurrent namespace destruction without deadlock
+		 * Only ENOSYS ends the loop early, so the parent can skip the test.
+		 */
+		while (1) {
+			if (sys_listns(&req, ns_ids, 2, 0) == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each mount namespace child by pid so the iterator is never reaped here */
+	for (i = 0; i < 5; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	/* The iterator only exits on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Sanity check for listns(): hand it an output buffer that is invalid
+ * from the very first byte and expect the call to be rejected.
+ * This covers basic invalid pointer detection.
+ */
+TEST(listns_complete_fault)
+{
+	/* Use a clearly invalid pointer */
+	__u64 *bogus_ids = (__u64 *)0xdeadbeef;
+	struct ns_id_req request = {
+		.size = sizeof(request),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	ssize_t ret = sys_listns(&request, bogus_ids, 10, 0);
+
+	/* Kernels without listns() cannot run this check at all */
+	if (ret == -1) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+	}
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * listns() must reject a NULL buffer when entries are requested.
+ */
+TEST(listns_null_buffer)
+{
+	struct ns_id_req request = {
+		.size = sizeof(request),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	/* NULL buffer with non-zero count should fail */
+	ssize_t ret = sys_listns(&request, NULL, 10, 0);
+
+	if (ret == -1) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+	}
+
+	/* Should fail with EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[10], iter_pid;
+	int pidfds[10];
+	int sv[10][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Map two pages */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position buffer so we can write several u64s successfully
+	 * before hitting the page boundary.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Request 10 namespace IDs while namespaces are being destroyed.
+		 * This tests:
+		 * 1. EFAULT handling when buffer becomes invalid
+		 * 2. Namespace cleanup outside RCU read lock during iteration
+		 * Only ENOSYS ends the loop early, so the parent can skip the test.
+		 */
+		while (1) {
+			if (sys_listns(&req, ns_ids, 10, 0) == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create more children with mount namespaces to increase the
+	 * likelihood that namespace cleanup happens during iteration.
+	 */
+	for (i = 0; i < 10; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 10; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Kill half the children */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Small delay to let some exit */
+	usleep(10000);
+
+	/* Kill remaining children */
+	for (i = 5; i < 10; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each mount namespace child by pid so the iterator is never reaped here */
+	for (i = 0; i < 10; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	/* The iterator only exits on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pids[8], iter_pid;
+	int pidfds[8];
+	int sv[8][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Set up partial fault buffer */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* Position for 3 successful writes, then fault */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNS,  /* Only mount namespaces */
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Call listns() to race with namespace destruction.
+		 */
+		while (1) {
+			if (sys_listns(&req, ns_ids, 10, 0) == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/* Create children with mount namespaces */
+	for (i = 0; i < 8; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pids[i], -1);
+
+		if (pids[i] == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Do some mount operations to make cleanup more interesting */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 8; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Kill children to trigger namespace destruction during iteration */
+	for (i = 0; i < 8; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each mount namespace child by pid so the iterator is never reaped here */
+	for (i = 0; i < 8; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	/* The iterator only exits on its own when listns() is missing */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+
+	/* Otherwise it should have been killed */
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Minimal setns(pidfd) reproducer built from create_child()+unshare().
+ *
+ * The child unshares, reports readiness and exits. Without the fix, the
+ * parent's setns(pidfd) on the already exited child would trigger active
+ * refcount warnings once the parent itself exits.
+ */
+TEST(simple_pidfd_setns)
+{
+	char ready;
+	int sv[2];
+	int ret;
+	int pidfd = -1;
+	pid_t child_pid;
+
+	/* Ignore SIGCHLD for autoreap */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create a child process without namespaces initially */
+	child_pid = create_child(&pidfd, 0);
+	ASSERT_GE(child_pid, 0);
+
+	if (child_pid == 0) {
+		/* Exit code stays 1 unless both steps below succeed */
+		int exit_code = 1;
+
+		close(sv[0]);
+		/*
+		 * Enter the new namespaces, then signal the parent that
+		 * they are ready. The write is skipped if unshare() fails.
+		 */
+		if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) >= 0 &&
+		    write_nointr(sv[1], "1", 1) >= 0)
+			exit_code = 0;
+
+		close(sv[1]);
+		_exit(exit_code);
+	}
+	ASSERT_GE(pidfd, 0);
+	EXPECT_EQ(close(sv[1]), 0);
+
+	ret = read_nointr(sv[0], &ready, 1);
+	ASSERT_EQ(ret, 1);
+	EXPECT_EQ(close(sv[0]), 0);
+
+	/* Set to child's namespaces via pidfd */
+	ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	TH_LOG("setns() returned %d", ret);
+	close(pidfd);
+}
+
+/*
+ * Clone-time variant of the pidfd setns test: the namespace flags are
+ * handed to create_child() itself, so the child's namespaces are
+ * created immediately at clone time.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+	const unsigned int ns_flags = CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET;
+	int ret;
+	int pidfd = -1;
+	pid_t child_pid;
+
+	/* Ignore SIGCHLD for autoreap */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	/* Create a child process with new namespaces using create_child() */
+	child_pid = create_child(&pidfd, ns_flags);
+	ASSERT_GE(child_pid, 0);
+
+	/* Child: sleep for a while so parent can setns to us */
+	if (child_pid == 0) {
+		sleep(2);
+		_exit(0);
+	}
+
+	/* Parent: pidfd was already created by create_child() */
+	ASSERT_GE(pidfd, 0);
+
+	/* Set to child's namespaces via pidfd */
+	ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	close(pidfd);
+	TH_LOG("setns() returned %d", ret);
+}
+
+TEST_HARNESS_MAIN
author	Christian Brauner <brauner@kernel.org>	2025-11-10 10:21:00 +0100
committer	Christian Brauner <brauner@kernel.org>	2025-11-10 15:54:02 +0100
commit	ae901e5e2e9b079761d26a366e0c80530d8aad22 (patch)
tree	65f2dcf3975dd9d7ba1d412fa3f116b5fce9e534 /tools
parent	8ebfb9896c97ab609222460e705f425cb3f0aad0 (diff)
parent	07d7ad46dad48a81ffc796fb7875b1ec141c8b48 (diff)