author    Christian Brauner <brauner@kernel.org>    2025-02-07 10:17:54 +0100
committer Christian Brauner <brauner@kernel.org>    2025-02-07 11:22:44 +0100
commit    0a7713ac0d98d02f2c69145754c93715ab07b307 (patch)
tree      4eccbd49259e181f1a57e14b6ef89f276daae49e
parent    33be3ffd30b3ae67c7a1a405f98b963fd915d61a (diff)
parent    627454c0f6708cb79dc58332e8033b8fa904c999 (diff)
Merge patch series "reduce tasklist_lock hold time on exit and do some pid cleanup"
Mateusz Guzik <mjguzik@gmail.com> says:

The clone side contends with the exit side in a way which avoidably
exacerbates the problem: the exit side waits on locks held by the clone
side while itself holding tasklist_lock.

Whacking this for both add_device_randomness() and pid allocation gives
me a 15% speedup for thread creation/destruction in a 24-core vm. The
random patch is worth about 4%.

The new bottleneck is pidmap_lock itself, with the biggest problem being
that the allocation itself takes the lock *twice*.

Bench (plop into will-it-scale):

$ cat tests/threadspawn1.c
char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
	return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
	pthread_t thread;
	int error;

	while (1) {
		error = pthread_create(&thread, NULL, worker, NULL);
		assert(error == 0);
		error = pthread_join(thread, NULL);
		assert(error == 0);
		(*iterations)++;
	}
}

* patches from https://lore.kernel.org/r/20250206164415.450051-1-mjguzik@gmail.com:
  pid: drop irq disablement around pidmap_lock
  pid: perform free_pid() calls outside of tasklist_lock
  pid: sprinkle tasklist_lock asserts
  exit: hoist get_pid() in release_task() outside of tasklist_lock
  exit: perform add_device_randomness() without tasklist_lock

Link: https://lore.kernel.org/r/20250206164415.450051-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
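[Editor's note] The core idea of the series, stripped of kernel specifics, is to stash objects that become freeable while a hot lock is held and only free them once the lock is dropped, exactly as the new struct release_task_post / free_pids() pair does below. A minimal userspace sketch of that pattern, assuming a pthread rwlock standing in for tasklist_lock (all names here — release_post, detach_slot, release_all, NSLOTS — are illustrative, not from the kernel code):

#include <pthread.h>
#include <stdlib.h>

#define NSLOTS 4

static pthread_rwlock_t biglock = PTHREAD_RWLOCK_INITIALIZER;

/* Analogue of struct release_task_post: freeable objects collected
 * while the write lock is held. */
struct release_post {
	void *to_free[NSLOTS];
};

/* Called with the write lock held: unhash the object and stash it
 * for later freeing instead of freeing it on the spot. */
static void detach_slot(struct release_post *post, void **slot, int idx)
{
	post->to_free[idx] = *slot;
	*slot = NULL;
}

static void release_all(void **slots)
{
	struct release_post post = { 0 };
	int i;

	pthread_rwlock_wrlock(&biglock);
	for (i = 0; i < NSLOTS; i++)
		detach_slot(&post, &slots[i], i);
	pthread_rwlock_unlock(&biglock);

	/* The free()s, like free_pids(), run after the hot lock is
	 * dropped, shortening the hold time seen by other threads. */
	for (i = 0; i < NSLOTS; i++)
		free(post.to_free[i]);
}

int main(void)
{
	void *slots[NSLOTS];
	int i;

	for (i = 0; i < NSLOTS; i++)
		slots[i] = malloc(16);
	release_all(slots);
	return 0;
}

The same shape appears in the kernel/sys.c hunks further down: a stack-local pids array is threaded through change_pid() under tasklist_lock, and free_pids() runs after write_unlock_irq().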
-rw-r--r--	include/linux/pid.h	7
-rw-r--r--	kernel/exit.c	36
-rw-r--r--	kernel/pid.c	82
-rw-r--r--	kernel/sys.c	14
4 files changed, 82 insertions, 57 deletions
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..311ecebd7d56 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
* these helpers must be called with the tasklist_lock write-held.
*/
extern void attach_pid(struct task_struct *task, enum pid_type);
-extern void detach_pid(struct task_struct *task, enum pid_type);
-extern void change_pid(struct task_struct *task, enum pid_type,
- struct pid *pid);
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
+ struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid);
+void free_pids(struct pid **pids);
extern void disable_pid_allocation(struct pid_namespace *ns);
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 0acb94b17caa..fd960a67a4dc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,14 +122,22 @@ static __init int kernel_exit_sysfs_init(void)
late_initcall(kernel_exit_sysfs_init);
#endif
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
{
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+ detach_pid(post->pids, p, PIDTYPE_PID);
if (group_dead) {
- detach_pid(p, PIDTYPE_TGID);
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ detach_pid(post->pids, p, PIDTYPE_TGID);
+ detach_pid(post->pids, p, PIDTYPE_PGID);
+ detach_pid(post->pids, p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
@@ -141,7 +149,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
@@ -174,9 +182,6 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
-
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
@@ -197,7 +202,7 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
tsk->sighand = NULL;
@@ -231,10 +236,13 @@ void __weak release_thread(struct task_struct *dead_task)
void release_task(struct task_struct *p)
{
+ struct release_task_post post;
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
+ memset(&post, 0, sizeof(post));
+
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -243,10 +251,11 @@ repeat:
cgroup_release(p);
+ thread_pid = get_pid(p->thread_pid);
+
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- thread_pid = get_pid(p->thread_pid);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -270,6 +279,9 @@ repeat:
write_unlock_irq(&tasklist_lock);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
+ add_device_randomness(&p->se.sum_exec_runtime,
+ sizeof(p->se.sum_exec_runtime));
+ free_pids(post.pids);
release_thread(p);
/*
* This task was already removed from the process/thread/pid lists
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8..900193de4232 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
@@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
- spin_lock_irqsave(&pidmap_lock, flags);
+ lockdep_assert_not_held(&tasklist_lock);
+
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
@@ -155,11 +141,23 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
pidfs_remove_pid(pid);
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
@@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
@@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
pidfs_add_pid(pid);
@@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns);
out_free:
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -302,7 +300,7 @@ out_free:
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -310,9 +308,9 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid *pid = *task_pid_ptr(task, type);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid **pid_ptr = task_pid_ptr(task, type);
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid_ptr = task_pid_ptr(task, type);
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
@@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
@@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+ lockdep_assert_held_write(&tasklist_lock);
+
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
@@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff8703a..4efca8a97d62 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
struct pid *pgrp;
int err;
@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
goto out;
if (task_pgrp(p) != pgrp)
- change_pid(p, PIDTYPE_PGID, pgrp);
+ change_pid(pids, p, PIDTYPE_PGID, pgrp);
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
+ free_pids(pids);
return err;
}
@@ -1222,21 +1224,22 @@ out:
return retval;
}
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
{
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
+ change_pid(pids, curr, PIDTYPE_SID, pid);
if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
+ change_pid(pids, curr, PIDTYPE_PGID, pid);
}
int ksys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
struct pid *sid = task_pid(group_leader);
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
pid_t session = pid_vnr(sid);
int err = -EPERM;
@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
goto out;
group_leader->signal->leader = 1;
- set_special_pids(sid);
+ set_special_pids(pids, sid);
proc_clear_tty(group_leader);
err = session;
out:
write_unlock_irq(&tasklist_lock);
+ free_pids(pids);
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);