diff --git a/MAINTAINERS b/MAINTAINERS index 6765bca373d8..e7dc9e6fad2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18191,6 +18191,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/next F: drivers/mtd/nand/ F: include/linux/mtd/*nand*.h +NAMESPACES: +M: Christian Brauner +R: Pavel Tikhomirov +L: linux-kernel@vger.kernel.org +S: Maintained +F: rust/kernel/pid_namespace.rs +F: kernel/pid_namespace.c +F: tools/testing/selftests/pid_namespace/ + NATIONAL INSTRUMENTS SERIAL DRIVER M: Chaitanya Vadrevu L: linux-serial@vger.kernel.org @@ -20804,10 +20813,8 @@ M: Christian Brauner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git -F: rust/kernel/pid_namespace.rs F: samples/pidfd/ F: tools/testing/selftests/clone3/ -F: tools/testing/selftests/pid_namespace/ F: tools/testing/selftests/pidfd/ K: (?i)pidfd K: (?i)clone3 diff --git a/kernel/exit.c b/kernel/exit.c index ede3117fa7d4..31b714c3a791 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father, reaper = find_alive_thread(father); if (reaper) { - pid_ns->child_reaper = reaper; + ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper); + WRITE_ONCE(pid_ns->child_reaper, reaper); return reaper; } diff --git a/kernel/fork.c b/kernel/fork.c index 9c194fc58736..8c61c8dd4372 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2469,7 +2469,10 @@ __latent_entropy struct task_struct *copy_process( init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { - ns_of_pid(pid)->child_reaper = p; + struct pid_namespace *ns = ns_of_pid(pid); + + ASSERT_EXCLUSIVE_WRITER(ns->child_reaper); + WRITE_ONCE(ns->child_reaper, p); p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; diff --git a/kernel/pid.c b/kernel/pid.c index 3b96571d0fe6..677c84e319dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -128,7 +128,7 @@ 
void free_pid(struct pid *pid) * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ - wake_up_process(ns->child_reaper); + wake_up_process(READ_ONCE(ns->child_reaper)); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ @@ -215,12 +215,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retval = -EINVAL; if (tid < 1 || tid >= pid_max[ns->level - i]) goto out_abort; - /* - * Also fail if a PID != 1 is requested and - * no PID 1 exists. - */ - if (tid != 1 && !tmp->child_reaper) - goto out_abort; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_abort; @@ -296,9 +290,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; - tmp = tmp->parent; i--; retried_preload = false; + + /* + * PID 1 (init) must be created first. + */ + if (!READ_ONCE(tmp->child_reaper) && nr != 1) { + retval = -EINVAL; + goto out_free; + } + + tmp = tmp->parent; } /* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e48f5de41361..d36afc58ee1d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task) } task_unlock(task); - if (ns) { - read_lock(&tasklist_lock); - if (!ns->child_reaper) { - put_pid_ns(ns); - ns = NULL; - } - read_unlock(&tasklist_lock); - } - return ns ? &ns->ns : NULL; } diff --git a/kernel/signal.c b/kernel/signal.c index e61f39fa8c8a..21d881b95ffb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2818,8 +2818,9 @@ bool get_signal(struct ksignal *ksig) /* * Do this once, we can't return to user-mode if freezing() == T. - * do_signal_stop() and ptrace_stop() do freezable_schedule() and - * thus do not need another check after return. 
+ * do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED + * and the freezer handles those states via TASK_FROZEN, thus they + * do not need another check after return. */ try_to_freeze(); diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore index 5118f0f3edf4..c647c6eb3367 100644 --- a/tools/testing/selftests/pid_namespace/.gitignore +++ b/tools/testing/selftests/pid_namespace/.gitignore @@ -1,2 +1,3 @@ pid_max +pidns_init_via_setns regression_enomem diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile index b972f55d07ae..b01a924ac04b 100644 --- a/tools/testing/selftests/pid_namespace/Makefile +++ b/tools/testing/selftests/pid_namespace/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -g $(KHDR_INCLUDES) -TEST_GEN_PROGS = regression_enomem pid_max +TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h diff --git a/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c b/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c new file mode 100644 index 000000000000..520835ca42ed --- /dev/null +++ b/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include "kselftest_harness.h" +#include "../pidfd/pidfd.h" + +/* + * Test that a process can become PID 1 (init) in a new PID namespace + * created via unshare() and joined via setns(). + * + * Flow: + * 1. Parent creates a pipe for synchronization. + * 2. Parent forks a child. + * 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace. + * 4. Parent signals the child via the pipe. + * 5. Child opens parent's /proc/<pid>/ns/pid_for_children and calls + * setns(fd, CLONE_NEWPID) to join the new namespace. + * 6. Child forks a grandchild. + * 7. 
Grandchild verifies getpid() == 1. + */ +TEST(pidns_init_via_setns) +{ + pid_t child, parent_pid; + int pipe_fd[2]; + char buf; + + if (geteuid()) + ASSERT_EQ(0, unshare(CLONE_NEWUSER)); + + parent_pid = getpid(); + + ASSERT_EQ(0, pipe(pipe_fd)); + + child = fork(); + ASSERT_GE(child, 0); + + if (child == 0) { + char path[256]; + int nsfd; + pid_t grandchild; + + close(pipe_fd[1]); + + /* Wait for parent to complete unshare */ + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1)); + close(pipe_fd[0]); + + snprintf(path, sizeof(path), + "/proc/%d/ns/pid_for_children", parent_pid); + nsfd = open(path, O_RDONLY); + ASSERT_GE(nsfd, 0); + + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID)); + close(nsfd); + + grandchild = fork(); + ASSERT_GE(grandchild, 0); + + if (grandchild == 0) { + /* Should be init (PID 1) in the new namespace */ + if (getpid() != 1) + _exit(1); + _exit(0); + } + + ASSERT_EQ(0, wait_for_pid(grandchild)); + _exit(0); + } + + close(pipe_fd[0]); + + ASSERT_EQ(0, unshare(CLONE_NEWPID)); + + /* Signal child that the new PID namespace is ready */ + buf = 0; + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1)); + close(pipe_fd[1]); + + ASSERT_EQ(0, wait_for_pid(child)); +} + +/* + * Similar to pidns_init_via_setns, but: + * 1. Parent enters a new PID namespace right from the start to be able to + * later freely use pid 1001 in it. + * 2. After forking child, parent also calls unshare(CLONE_NEWUSER) + * before unshare(CLONE_NEWPID) so that the old and new pid namespaces have + * different user namespace owners. + * 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and + * grandchild checks that it gets the desired pids. + * + * Flow: + * 1. Test process creates a new PID namespace and forks a wrapper + * (PID 1 in the outer namespace). + * 2. Wrapper forks a child. + * 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID) + * to create an inner PID namespace. + * 4. Wrapper signals the child via pipe. + * 5. 
Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls + * setns(fd, CLONE_NEWPID) to join the inner namespace. + * 6. Child calls clone3() with set_tid={1, 1001}. + * 7. Grandchild verifies its NSpid ends with "1001 1". + */ + +pid_t set_tid[] = {1, 1001}; + +static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata *_metadata) +{ + char *line = NULL; + size_t len = 0; + int found = 0; + FILE *gf; + + gf = fopen("/proc/self/status", "r"); + ASSERT_NE(gf, NULL); + + while (getline(&line, &len, gf) != -1) { + if (strncmp(line, "NSpid:", 6) != 0) + continue; + + for (int i = 0; i < 2; i++) { + char *last = strrchr(line, '\t'); + pid_t pid; + + ASSERT_NE(last, NULL); + ASSERT_EQ(sscanf(last, "%d", &pid), 1); + ASSERT_EQ(pid, set_tid[i]); + *last = '\0'; + } + + found = true; + break; + } + + free(line); + fclose(gf); + ASSERT_TRUE(found); + return 0; +} + +static int pidns_init_via_setns_set_tid_child(struct __test_metadata *_metadata, + pid_t parent_pid, int pipe_fd[2]) +{ + struct __clone_args args = { + .exit_signal = SIGCHLD, + .set_tid = ptr_to_u64(set_tid), + .set_tid_size = 2, + }; + pid_t grandchild; + char path[256]; + char buf; + int nsfd; + + close(pipe_fd[1]); + + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1)); + close(pipe_fd[0]); + + snprintf(path, sizeof(path), + "/proc/%d/ns/pid_for_children", parent_pid); + nsfd = open(path, O_RDONLY); + ASSERT_GE(nsfd, 0); + + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID)); + close(nsfd); + + grandchild = sys_clone3(&args, sizeof(args)); + ASSERT_GE(grandchild, 0); + + if (grandchild == 0) + _exit(pidns_init_via_setns_set_tid_grandchild(_metadata)); + + ASSERT_EQ(0, wait_for_pid(grandchild)); + return 0; +} + +static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata *_metadata) +{ + int pipe_fd[2]; + pid_t child, parent_pid; + char buf; + FILE *f; + + /* + * We are PID 1 inside the new namespace, but /proc is + * mounted from the host. 
Read our host-visible PID so + * the child can reach our pid_for_children via /proc. + */ + f = fopen("/proc/self/stat", "r"); + ASSERT_NE(f, NULL); + ASSERT_EQ(fscanf(f, "%d", &parent_pid), 1); + ASSERT_EQ(0, pipe(pipe_fd)); + + child = fork(); + ASSERT_GE(child, 0); + + if (child == 0) + _exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid, pipe_fd)); + + close(pipe_fd[0]); + + ASSERT_EQ(0, unshare(CLONE_NEWUSER)); + ASSERT_EQ(0, unshare(CLONE_NEWPID)); + + buf = 0; + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1)); + close(pipe_fd[1]); + + ASSERT_EQ(0, wait_for_pid(child)); + + fclose(f); + return 0; +} + +TEST(pidns_init_via_setns_set_tid) +{ + pid_t wrapper; + + if (geteuid()) + SKIP(return, "This test needs root to run!"); + + ASSERT_EQ(0, unshare(CLONE_NEWPID)); + + wrapper = fork(); + ASSERT_GE(wrapper, 0); + + if (wrapper == 0) + _exit(pidns_init_via_setns_set_tid_wrapper(_metadata)); + + ASSERT_EQ(0, wait_for_pid(wrapper)); +} + +TEST_HARNESS_MAIN