From fb3bbcfe344e64a46574a638b051ffd78762c12d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 6 Feb 2025 16:23:14 +0100
Subject: exit: change the release_task() paths to call flush_sigqueue() lockless

A task can block a signal, accumulate up to RLIMIT_SIGPENDING sigqueues,
and exit. In this case __exit_signal()->flush_sigqueue() called with irqs
disabled can trigger a hard lockup, see
https://lore.kernel.org/all/20190322114917.GC28876@redhat.com/

Fortunately, after the recent posixtimer changes sys_timer_delete() paths
no longer try to clear SIGQUEUE_PREALLOC and/or free tmr->sigq, and after
the exiting task passes __exit_signal() lock_task_sighand() can't succeed
and pid_task(tmr->it_pid) will return NULL.

This means that after __exit_signal(tsk) nobody can play with tsk->pending
or (if group_dead) with tsk->signal->shared_pending, so release_task() can
safely call flush_sigqueue() after write_unlock_irq(&tasklist_lock).

TODO:
- we can probably shift posix_cpu_timers_exit() as well
- do_sigaction() can hit a similar problem

Signed-off-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250206152314.GA14620@redhat.com
Reviewed-by: Frederic Weisbecker
Signed-off-by: Christian Brauner
---
 kernel/exit.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 3485e5fc499e..2d7444da743d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -200,20 +200,13 @@ static void __exit_signal(struct task_struct *tsk)
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
-	/*
-	 * Do this under ->siglock, we can race with another thread
-	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
-	 */
-	flush_sigqueue(&tsk->pending);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
-	if (group_dead) {
-		flush_sigqueue(&sig->shared_pending);
+	if (group_dead)
 		tty_kref_put(tty);
-	}
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -279,6 +272,16 @@ repeat:
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
 	release_thread(p);
+	/*
+	 * This task was already removed from the process/thread/pid lists
+	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
+	 * ->pending or, if group dead, signal->shared_pending. We can call
+	 * flush_sigqueue() lockless.
+	 */
+	flush_sigqueue(&p->pending);
+	if (thread_group_leader(p))
+		flush_sigqueue(&p->signal->shared_pending);
+
 	put_task_struct_rcu_user(p);
 
 	p = leader;
--
cgit v1.2.3
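For context, not part of the series: the hard-lockup risk exists because
flush_sigqueue() frees every queued entry one at a time, so its runtime
grows with the number of pending signals. A simplified sketch of the
routine (modeled on kernel/signal.c; exact details may differ by kernel
version):

	void flush_sigqueue(struct sigpending *queue)
	{
		struct sigqueue *q;

		sigemptyset(&queue->signal);
		while (!list_empty(&queue->list)) {
			/* one allocation freed per pending signal */
			q = list_entry(queue->list.next, struct sigqueue, list);
			list_del_init(&q->list);
			__sigqueue_free(q);
		}
	}

With up to RLIMIT_SIGPENDING entries queued, running this loop under
write_lock_irq(&tasklist_lock) keeps interrupts off for an unbounded time,
which is exactly what the patch above avoids.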
From 43966114b49988ebca6b7d17eb68bad7740e9fb2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 6 Feb 2025 16:23:34 +0100
Subject: exit: kill the pointless __exit_signal()->clear_tsk_thread_flag(TIF_SIGPENDING)

It predates the git history and most probably it was never needed. It
doesn't really hurt, but it looks confusing because its purpose is not
clear at all.

release_task(p) is called when this task has already passed exit_notify(),
so signal_pending(p) == T shouldn't make any difference.

And even _if_ there were a subtle reason to clear TIF_SIGPENDING after
exit_notify(), this clear_tsk_thread_flag() can't help anyway. If the
exiting task is a group leader or if it is ptraced, release_task() will
likely be called when this task has already done its last schedule() from
do_task_dead().

Signed-off-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250206152334.GB14620@redhat.com
Acked-by: Frederic Weisbecker
Signed-off-by: Christian Brauner
---
 kernel/exit.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 2d7444da743d..0acb94b17caa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -204,7 +204,6 @@ static void __exit_signal(struct task_struct *tsk)
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
-	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 	if (group_dead)
 		tty_kref_put(tty);
 }
--
cgit v1.2.3

From 1ab2785694971257f6a7dbd5a71bd8402b5fc305 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Thu, 6 Feb 2025 17:44:10 +0100
Subject: exit: perform add_device_randomness() without tasklist_lock

Parallel calls to add_device_randomness() contend on their own.

The clone side already runs outside of tasklist_lock, which in turn means
any caller on the exit side extends the tasklist_lock hold time while
contending on the random-private lock.

Reviewed-by: Oleg Nesterov
Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250206164415.450051-2-mjguzik@gmail.com
Acked-by: "Liam R. Howlett"
Signed-off-by: Christian Brauner
---
 kernel/exit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 0acb94b17caa..a00273db024b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -174,9 +174,6 @@ static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
-
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
 	 * skip the group leader because it is the last user of signal_struct,
@@ -270,6 +267,8 @@ repeat:
 	write_unlock_irq(&tasklist_lock);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
+	add_device_randomness(&p->se.sum_exec_runtime,
+			      sizeof(p->se.sum_exec_runtime));
 	release_thread(p);
 	/*
 	 * This task was already removed from the process/thread/pid lists
--
cgit v1.2.3

From 6731cd97e60d6f3a30057c0fc513aed543187104 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Thu, 6 Feb 2025 17:44:11 +0100
Subject: exit: hoist get_pid() in release_task() outside of tasklist_lock

Reduces hold time as get_pid() contains an atomic.

Reviewed-by: Oleg Nesterov
Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250206164415.450051-3-mjguzik@gmail.com
Acked-by: "Liam R. Howlett"
Signed-off-by: Christian Brauner
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index a00273db024b..ebc1c5bec512 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -240,9 +240,10 @@ repeat:
 
 	cgroup_release(p);
 
+	thread_pid = get_pid(p->thread_pid);
+
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
 
 	/*
--
cgit v1.2.3
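For context, not part of the series: the atomic that the patch above moves
out of the critical section is the reference-count increment in get_pid(),
roughly as defined in include/linux/pid.h:

	static inline struct pid *get_pid(struct pid *pid)
	{
		if (pid)
			refcount_inc(&pid->count);
		return pid;
	}

refcount_inc() is an atomic read-modify-write that can stall on a contended
cacheline. The hoist relies on p->thread_pid being stable at this point,
which holds because the caller still owns a reference to the (already dead)
task, so nothing requires taking it under tasklist_lock.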
Howlett" Signed-off-by: Christian Brauner --- kernel/pid.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8..2ae872f689a7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -339,17 +339,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); @@ -386,6 +392,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +411,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } -- cgit v1.2.3 From 7903f907a226058ed99f86e9924e082aea57fc45 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:13 +0100 Subject: pid: perform free_pid() calls outside of tasklist_lock As the clone side already executes pid allocation with only pidmap_lock held, issuing free_pid() while still holding tasklist_lock exacerbates total hold time of the latter. More things may show up later which require initial clean up with the lock held and allow finishing without it. For that reason a struct to collect such work is added instead of merely passing the pid array. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-5-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- include/linux/pid.h | 7 ++++--- kernel/exit.c | 28 ++++++++++++++++++++-------- kernel/pid.c | 44 ++++++++++++++++++++++---------------------- kernel/sys.c | 14 +++++++++----- 4 files changed, 55 insertions(+), 38 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 98837a1ff0f3..311ecebd7d56 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); * these helpers must be called with the tasklist_lock write-held. 
From 7903f907a226058ed99f86e9924e082aea57fc45 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Thu, 6 Feb 2025 17:44:13 +0100
Subject: pid: perform free_pid() calls outside of tasklist_lock

As the clone side already executes pid allocation with only pidmap_lock
held, issuing free_pid() while still holding tasklist_lock exacerbates the
total hold time of the latter.

More things may show up later that require initial cleanup with the lock
held but can finish without it. For that reason a struct collecting such
deferred work is added instead of merely passing the pid array.

Reviewed-by: Oleg Nesterov
Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250206164415.450051-5-mjguzik@gmail.com
Acked-by: "Liam R. Howlett"
Signed-off-by: Christian Brauner
---
 include/linux/pid.h |  7 ++++---
 kernel/exit.c       | 28 ++++++++++++++++++++--------
 kernel/pid.c        | 44 ++++++++++++++++++++++----------------------
 kernel/sys.c        | 14 +++++++++-----
 4 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..311ecebd7d56 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * these helpers must be called with the tasklist_lock write-held.
  */
 extern void attach_pid(struct task_struct *task, enum pid_type);
-extern void detach_pid(struct task_struct *task, enum pid_type);
-extern void change_pid(struct task_struct *task, enum pid_type,
-		       struct pid *pid);
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
+		struct pid *pid);
 extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			     size_t set_tid_size);
 extern void free_pid(struct pid *pid);
+void free_pids(struct pid **pids);
 extern void disable_pid_allocation(struct pid_namespace *ns);
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index ebc1c5bec512..fd960a67a4dc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,14 +122,22 @@ static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+	detach_pid(post->pids, p, PIDTYPE_PID);
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		detach_pid(post->pids, p, PIDTYPE_TGID);
+		detach_pid(post->pids, p, PIDTYPE_PGID);
+		detach_pid(post->pids, p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -141,7 +149,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -194,7 +202,7 @@ static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
 	tsk->sighand = NULL;
@@ -228,10 +236,13 @@ void __weak release_thread(struct task_struct *dead_task)
 
 void release_task(struct task_struct *p)
 {
+	struct release_task_post post;
 	struct task_struct *leader;
 	struct pid *thread_pid;
 	int zap_leader;
 repeat:
+	memset(&post, 0, sizeof(post));
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -244,7 +255,7 @@ repeat:
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -270,6 +281,7 @@ repeat:
 	put_pid(thread_pid);
 	add_device_randomness(&p->se.sum_exec_runtime,
 			      sizeof(p->se.sum_exec_runtime));
+	free_pids(post.pids);
 	release_thread(p);
 	/*
 	 * This task was already removed from the process/thread/pid lists
diff --git a/kernel/pid.c b/kernel/pid.c
index 2ae872f689a7..73625f28c166 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
@@ -128,10 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
 
 void free_pid(struct pid *pid)
 {
-	/* We can be called with write_lock_irq(&tasklist_lock) held */
 	int i;
 	unsigned long flags;
 
+	lockdep_assert_not_held(&tasklist_lock);
+
 	spin_lock_irqsave(&pidmap_lock, flags);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
@@ -160,6 +147,18 @@ void free_pid(struct pid *pid)
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
+void free_pids(struct pid **pids)
+{
+	int tmp;
+
+	/*
+	 * This can batch pidmap_lock.
+	 */
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (pids[tmp])
+			free_pid(pids[tmp]);
+}
+
 struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		      size_t set_tid_size)
 {
@@ -347,8 +346,8 @@ void attach_pid(struct task_struct *task, enum pid_type type)
 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
-			struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+			 enum pid_type type, struct pid *new)
 {
 	struct pid **pid_ptr, *pid;
 	int tmp;
@@ -370,18 +369,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 	if (pid_has_task(pid, tmp))
 		return;
 
-	free_pid(pid);
+	WARN_ON(pids[type]);
+	pids[type] = pid;
 }
 
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	__change_pid(pids, task, type, NULL);
 }
 
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	__change_pid(pids, task, type, pid);
 	attach_pid(task, type);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff8703a..4efca8a97d62 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 {
 	struct task_struct *p;
 	struct task_struct *group_leader = current->group_leader;
+	struct pid *pids[PIDTYPE_MAX] = { 0 };
 	struct pid *pgrp;
 	int err;
 
@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 		goto out;
 
 	if (task_pgrp(p) != pgrp)
-		change_pid(p, PIDTYPE_PGID, pgrp);
+		change_pid(pids, p, PIDTYPE_PGID, pgrp);
 
 	err = 0;
 out:
 	/* All paths lead to here, thus we are safe. -DaveM */
 	write_unlock_irq(&tasklist_lock);
 	rcu_read_unlock();
+	free_pids(pids);
 	return err;
 }
 
@@ -1222,21 +1224,22 @@ out:
 	return retval;
 }
 
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
 
 	if (task_session(curr) != pid)
-		change_pid(curr, PIDTYPE_SID, pid);
+		change_pid(pids, curr, PIDTYPE_SID, pid);
 
 	if (task_pgrp(curr) != pid)
-		change_pid(curr, PIDTYPE_PGID, pid);
+		change_pid(pids, curr, PIDTYPE_PGID, pid);
 }
 
 int ksys_setsid(void)
 {
 	struct task_struct *group_leader = current->group_leader;
 	struct pid *sid = task_pid(group_leader);
+	struct pid *pids[PIDTYPE_MAX] = { 0 };
 	pid_t session = pid_vnr(sid);
 	int err = -EPERM;
 
@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
 		goto out;
 
 	group_leader->signal->leader = 1;
-	set_special_pids(sid);
+	set_special_pids(pids, sid);
 
 	proc_clear_tty(group_leader);
 
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
+	free_pids(pids);
 	if (err > 0) {
 		proc_sid_connector(group_leader);
 		sched_autogroup_create_attach(group_leader);
--
cgit v1.2.3
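Putting the pieces of the patch above together, release_task() now follows
a collect-then-free pattern (condensed illustration, not a hunk from the
series):

	struct release_task_post post;

	memset(&post, 0, sizeof(post));
	write_lock_irq(&tasklist_lock);
	__exit_signal(&post, p);	/* detach_pid() now only parks
					 * unreferenced pids in post.pids */
	write_unlock_irq(&tasklist_lock);
	/* ... */
	free_pids(post.pids);		/* pidmap_lock taken without
					 * tasklist_lock held */

Because free_pids() receives the whole PIDTYPE_MAX-sized array at once, a
later change can also batch the pidmap_lock acquisitions, as the comment in
free_pids() anticipates.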
Howlett" Signed-off-by: Christian Brauner --- kernel/pid.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 73625f28c166..900193de4232 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -115,11 +115,10 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; - unsigned long flags; lockdep_assert_not_held(&tasklist_lock); - spin_lock_irqsave(&pidmap_lock, flags); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -142,7 +141,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } @@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -237,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -271,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -280,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -301,7 +300,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -309,9 +308,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) -- cgit v1.2.3