author     Christian Brauner <brauner@kernel.org>   2025-04-25 10:35:27 +0200
committer  Christian Brauner <brauner@kernel.org>   2025-04-28 11:05:24 +0200
commit     923ea4d4482be9475c31f1bc3691d7d74368348c (patch)
tree       d72065d553976bdf4ba27a5a555434893293ced9
parent     b590c928cca7bdc7fd580d52e42bfdc3ac5eeacb (diff)
parent     20b70e58961b855e61f2c6a7a19a38a64ad1bed6 (diff)
Merge patch series "net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid"
Christian Brauner <brauner@kernel.org> says:
SO_PEERPIDFD currently doesn't support handing out pidfds if the
sk->sk_peer_pid thread-group leader has already been reaped; in that
case it returns EINVAL. Userspace still wants to get a pidfd for a
reaped process so it has a stable handle it can pass on.
This is especially useful now that it is possible to retrieve exit
information through a pidfd via the PIDFD_GET_INFO ioctl()'s
PIDFD_INFO_EXIT flag.
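For illustration, the flow this enables looks roughly like the sketch
below (assumptions: a connected AF_UNIX socket `sock` and uapi headers
that already provide SO_PEERPIDFD, PIDFD_GET_INFO and struct pidfd_info;
error handling trimmed):

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <linux/pidfd.h>

static void print_peer_exit_status(int sock)
{
	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
	socklen_t len = sizeof(int);
	int pidfd;

	/* With this series this also succeeds for an already reaped peer. */
	if (getsockopt(sock, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len) < 0)
		return;

	/* PIDFD_INFO_EXIT is only set in the reply once the task was reaped. */
	if (ioctl(pidfd, PIDFD_GET_INFO, &info) == 0 &&
	    (info.mask & PIDFD_INFO_EXIT) && WIFEXITED(info.exit_code))
		printf("peer exited with status %d\n", WEXITSTATUS(info.exit_code));
}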
Another summary has been provided by David in [1]:
> A pidfd can outlive the task it refers to, and thus user-space must
> already be prepared that the task underlying a pidfd is gone at the time
> they get their hands on the pidfd. For instance, resolving the pidfd to
> a PID via the fdinfo must be prepared to read `-1`.
>
> Despite user-space knowing that a pidfd might be stale, several kernel
> APIs currently add another layer that checks for this. In particular,
> SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped,
> but returns a stale pidfd if the task is reaped immediately after the
> respective alive-check.
>
> This has the unfortunate effect that user-space now has two ways to
> check for the exact same scenario: A syscall might return
> EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no
> particular reason to distinguish both cases. This also propagates
> through user-space APIs, which pass on pidfds. They must be prepared to
> pass on `-1` *or* the pidfd, because there is no guaranteed way to get a
> stale pidfd from the kernel.
> Userspace must already deal with a pidfd referring to a reaped task as
> the task may exit and get reaped at any time while there are still many
> pidfds referring to it.
In order to allow handing out pidfds for reaped tasks, SO_PEERPIDFD
needs to ensure that PIDFD_INFO_EXIT information is available whenever
a pidfd for a reaped task is handed out. The uapi promises that reaped
pidfds are only handed out if it is guaranteed that the caller sees the
exit information:
TEST_F(pidfd_info, success_reaped)
{
	struct pidfd_info info = {
		.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
	};

	/*
	 * Process has already been reaped and PIDFD_INFO_EXIT been set.
	 * Verify that we can retrieve the exit status of the process.
	 */
	ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0);
	ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
	ASSERT_TRUE(WIFEXITED(info.exit_code));
	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
}
To hand out pidfds for reaped processes we thus allocate a pidfs entry
for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is
stashed and drop it when the socket is destroyed. This guarantees that
exit information will always be recorded for the sk->sk_peer_pid task
and we can hand out pidfds for reaped processes.
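Sketched out, the pairing looks roughly like this (a simplified
illustration of the af_unix changes below, not the literal code;
stash_peer_pid() and drop_peer_pid() are hypothetical stand-ins for
prepare_peercred() and unix_sock_destructor(), with locking and the
cred handling omitted):

static int stash_peer_pid(struct sock *sk)
{
	struct pid *pid = task_tgid(current);
	int err;

	/* Pin a pidfs entry so exit information gets recorded even if the
	 * task is reaped before SO_PEERPIDFD is ever called. */
	err = pidfs_register_pid(pid);
	if (err)
		return err;

	sk->sk_peer_pid = get_pid(pid);
	return 0;
}

static void drop_peer_pid(struct sock *sk)
{
	/* Paired with pidfs_register_pid() in stash_peer_pid(). */
	pidfs_put_pid(sk->sk_peer_pid);
	put_pid(sk->sk_peer_pid);
}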
* patches from https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org:
net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid
pidfs: get rid of __pidfd_prepare()
net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid
pidfs: register pid in pidfs
Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
 fs/pidfs.c                 | 81
 include/linux/pid.h        |  2
 include/linux/pidfs.h      |  3
 include/uapi/linux/pidfd.h |  2
 kernel/fork.c              | 83
 net/core/sock.c            | 22
 net/unix/af_unix.c         | 85
 7 files changed, 192 insertions, 86 deletions
diff --git a/fs/pidfs.c b/fs/pidfs.c
index d64a4cbeb0da..9d993ecadad7 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 {
 	enum pid_type type;
 
-	if (flags & PIDFD_CLONE)
+	if (flags & PIDFD_STALE)
 		return true;
 
 	/*
@@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 	 * pidfd has been allocated perform another check that the pid
 	 * is still alive. If it is exit information is available even
 	 * if the task gets reaped before the pidfd is returned to
-	 * userspace. The only exception is PIDFD_CLONE where no task
-	 * linkage has been established for @pid yet and the kernel is
-	 * in the middle of process creation so there's nothing for
-	 * pidfs to miss.
+	 * userspace. The only exception are indicated by PIDFD_STALE:
+	 *
+	 * (1) The kernel is in the middle of task creation and thus no
+	 *     task linkage has been established yet.
+	 * (2) The caller knows @pid has been registered in pidfs at a
+	 *     time when the task was still alive.
+	 *
+	 * In both cases exit information will have been reported.
 	 */
 	if (flags & PIDFD_THREAD)
 		type = PIDTYPE_PID;
@@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	int ret;
 
 	/*
-	 * Ensure that PIDFD_CLONE can be passed as a flag without
+	 * Ensure that PIDFD_STALE can be passed as a flag without
 	 * overloading other uapi pidfd flags.
 	 */
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
 	if (ret < 0)
@@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	if (!pidfs_pid_valid(pid, &path, flags))
 		return ERR_PTR(-ESRCH);
 
-	flags &= ~PIDFD_CLONE;
+	flags &= ~PIDFD_STALE;
 	pidfd_file = dentry_open(&path, flags, current_cred());
 	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
 	if (!IS_ERR(pidfd_file))
@@ -896,6 +900,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct path path __free(path_put) = {};
+	int ret;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+	if (unlikely(ret))
+		return ret;
+	/* Keep the dentry and only put the reference to the mount. */
+	path.dentry = NULL;
+	return 0;
+}
+
+/**
+ * pidfs_get_pid - pin a struct pid through pidfs
+ * @pid: pid to pin
+ *
+ * Similar to pidfs_register_pid() but only valid if the caller knows
+ * there's a reference to the @pid through a dentry already that can't
+ * go away.
+ */
+void pidfs_get_pid(struct pid *pid)
+{
+	if (!pid)
+		return;
+	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
+}
+
+/**
+ * pidfs_put_pid - drop a pidfs reference
+ * @pid: pid to drop
+ *
+ * Drop a reference to @pid via pidfs. This is only safe if the
+ * reference has been taken via pidfs_get_pid().
+ */
+void pidfs_put_pid(struct pid *pid)
+{
+	might_sleep();
+
+	if (!pid)
+		return;
+	VFS_WARN_ON_ONCE(!pid->stashed);
+	dput(pid->stashed);
+}
+
 static void pidfs_inode_init_once(void *data)
 {
 	struct pidfs_inode *pi = data;
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 311ecebd7d56..453ae6d8a68d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -77,7 +77,7 @@ struct file;
 struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
 void do_notify_pidfd(struct task_struct *task);
 
 static inline struct pid *get_pid(struct pid *pid)
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 05e6f8f4a026..2676890c4d0d 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
 extern const struct dentry_operations pidfs_dentry_operations;
+int pidfs_register_pid(struct pid *pid);
+void pidfs_get_pid(struct pid *pid);
+void pidfs_put_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 2970ef44655a..8c1511edd0e9 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -12,7 +12,7 @@
 #define PIDFD_THREAD O_EXCL
 #ifdef __KERNEL__
 #include <linux/sched.h>
-#define PIDFD_CLONE CLONE_PIDFD
+#define PIDFD_STALE CLONE_PIDFD
 #endif
 
 /* Flags for pidfd_send_signal(). */
diff --git a/kernel/fork.c b/kernel/fork.c
index f7403e1fb0d4..1d95f4dae327 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2036,54 +2036,10 @@ static inline void rcu_copy_process(struct task_struct *p)
 }
 
 /**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- *         pidfd file is returned in the last argument to the function. On
- *         error, a negative error code is returned from the function and the
- *         last argument remains unchanged.
- */
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
-	struct file *pidfd_file;
-
-	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
-	if (pidfd < 0)
-		return pidfd;
-
-	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
-	if (IS_ERR(pidfd_file))
-		return PTR_ERR(pidfd_file);
-
-	*ret = pidfd_file;
-	return take_fd(pidfd);
-}
-
-/**
  * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid: the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
+ * @ret_file: return the new pidfs file
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  *         error, a negative error code is returned from the function and the
  *         last argument remains unchanged.
  */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
 {
+	struct file *pidfs_file;
+
 	/*
-	 * While holding the pidfd waitqueue lock removing the task
-	 * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't
-	 * possible. Thus, if there's still task linkage for PIDTYPE_PID
-	 * not having thread-group leader linkage for the pid means it
-	 * wasn't a thread-group leader in the first place.
+	 * PIDFD_STALE is only allowed to be passed if the caller knows
+	 * that @pid is already registered in pidfs and thus
+	 * PIDFD_INFO_EXIT information is guaranteed to be available.
 	 */
-	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
+	if (!(flags & PIDFD_STALE)) {
+		/*
+		 * While holding the pidfd waitqueue lock removing the
+		 * task linkage for the thread-group leader pid
+		 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+		 * task linkage for PIDTYPE_PID not having thread-group
+		 * leader linkage for the pid means it wasn't a
+		 * thread-group leader in the first place.
+		 */
+		guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
 		/* Task has already been reaped. */
 		if (!pid_has_task(pid, PIDTYPE_PID))
 			return -ESRCH;
@@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 			return -ENOENT;
 	}
 
-	return __pidfd_prepare(pid, flags, ret);
+	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
+	if (pidfd < 0)
+		return pidfd;
+
+	pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+	if (IS_ERR(pidfs_file))
+		return PTR_ERR(pidfs_file);
+
+	*ret_file = pidfs_file;
+	return take_fd(pidfd);
 }
 
 static void __delayed_free_task(struct rcu_head *rhp)
@@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process(
 		 * Note that no task has been attached to @pid yet indicate
 		 * that via CLONE_PIDFD.
 		 */
-		retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+		retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;
diff --git a/net/core/sock.c b/net/core/sock.c
index b969d2210656..180f441f4ab6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -148,6 +148,8 @@
 
 #include <linux/ethtool.h>
 
+#include <uapi/linux/pidfd.h>
+
 #include "dev.h"
 
 static DEFINE_MUTEX(proto_list_mutex);
@@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 	{
 		struct pid *peer_pid;
 		struct file *pidfd_file = NULL;
+		unsigned int flags = 0;
 		int pidfd;
 
 		if (len > sizeof(pidfd))
@@ -1891,18 +1894,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 		if (!peer_pid)
 			return -ENODATA;
 
-		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+		/* The use of PIDFD_STALE requires stashing of struct pid
+		 * on pidfs with pidfs_register_pid() and only AF_UNIX
+		 * were prepared for this.
+		 */
+		if (sk->sk_family == AF_UNIX)
+			flags = PIDFD_STALE;
+
+		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
 		put_pid(peer_pid);
-		if (pidfd < 0) {
-			/*
-			 * dbus-broker relies on -EINVAL being returned
-			 * to indicate ESRCH. Paper over it until this
-			 * is fixed in userspace.
-			 */
-			if (pidfd == -ESRCH)
-				pidfd = -EINVAL;
+		if (pidfd < 0)
 			return pidfd;
-		}
 
 		if (copy_to_sockptr(optval, &pidfd, len) ||
 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f78a2492826f..472f8aa9ea15 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -100,6 +100,7 @@
 #include <linux/splice.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 #include <net/af_unix.h>
 #include <net/net_namespace.h>
 #include <net/scm.h>
@@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
 
+	if (sk->sk_peer_pid)
+		pidfs_put_pid(sk->sk_peer_pid);
+
 	if (u->addr)
 		unix_release_addr(u->addr);
 
@@ -734,13 +738,48 @@ static void unix_release_sock(struct sock *sk, int embrion)
 		unix_gc();		/* Garbage collect fds */
 }
 
-static void init_peercred(struct sock *sk)
+struct unix_peercred {
+	struct pid *peer_pid;
+	const struct cred *peer_cred;
+};
+
+static inline int prepare_peercred(struct unix_peercred *peercred)
 {
-	sk->sk_peer_pid = get_pid(task_tgid(current));
-	sk->sk_peer_cred = get_current_cred();
+	struct pid *pid;
+	int err;
+
+	pid = task_tgid(current);
+	err = pidfs_register_pid(pid);
+	if (likely(!err)) {
+		peercred->peer_pid = get_pid(pid);
+		peercred->peer_cred = get_current_cred();
+	}
+	return err;
 }
 
-static void update_peercred(struct sock *sk)
+static void drop_peercred(struct unix_peercred *peercred)
+{
+	const struct cred *cred = NULL;
+	struct pid *pid = NULL;
+
+	might_sleep();
+
+	swap(peercred->peer_pid, pid);
+	swap(peercred->peer_cred, cred);
+
+	pidfs_put_pid(pid);
+	put_pid(pid);
+	put_cred(cred);
+}
+
+static inline void init_peercred(struct sock *sk,
+				 const struct unix_peercred *peercred)
+{
+	sk->sk_peer_pid = peercred->peer_pid;
+	sk->sk_peer_cred = peercred->peer_cred;
+}
+
+static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
 {
 	const struct cred *old_cred;
 	struct pid *old_pid;
@@ -748,11 +787,11 @@ static void update_peercred(struct sock *sk)
 	spin_lock(&sk->sk_peer_lock);
 	old_pid = sk->sk_peer_pid;
 	old_cred = sk->sk_peer_cred;
-	init_peercred(sk);
+	init_peercred(sk, peercred);
 	spin_unlock(&sk->sk_peer_lock);
 
-	put_pid(old_pid);
-	put_cred(old_cred);
+	peercred->peer_pid = old_pid;
+	peercred->peer_cred = old_cred;
 }
 
 static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
 
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
+	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
@@ -770,6 +810,7 @@ static int unix_listen(struct socket *sock, int backlog)
 	int err;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct unix_peercred peercred = {};
 
 	err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -777,6 +818,9 @@ static int unix_listen(struct socket *sock, int backlog)
 	err = -EINVAL;
 	if (!READ_ONCE(u->addr))
 		goto out;	/* No listens on an unbound socket */
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
 	unix_state_lock(sk);
 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 		goto out_unlock;
@@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog)
 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
 
 	/* set credentials so connect can copy them */
-	update_peercred(sk);
+	update_peercred(sk, &peercred);
 	err = 0;
 
 out_unlock:
 	unix_state_unlock(sk);
+	drop_peercred(&peercred);
 out:
 	return err;
 }
@@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+	struct unix_peercred peercred = {};
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb = NULL;
 	unsigned char state;
@@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	}
 
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
+
 	/* Allocate skb for sending to listening sock */
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (!skb) {
@@ -1636,7 +1686,7 @@ restart:
 	unix_peer(newsk) = sk;
 	newsk->sk_state = TCP_ESTABLISHED;
 	newsk->sk_type = sk->sk_type;
-	init_peercred(newsk);
+	init_peercred(newsk, &peercred);
 	newu = unix_sk(newsk);
 	newu->listener = other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1745,33 @@ out_free_skb:
 out_free_sk:
 	unix_release_sock(newsk, 0);
 out:
+	drop_peercred(&peercred);
 	return err;
 }
 
 static int unix_socketpair(struct socket *socka, struct socket *sockb)
 {
+	struct unix_peercred ska_peercred = {}, skb_peercred = {};
 	struct sock *ska = socka->sk, *skb = sockb->sk;
+	int err;
+
+	err = prepare_peercred(&ska_peercred);
+	if (err)
+		return err;
+
+	err = prepare_peercred(&skb_peercred);
+	if (err) {
+		drop_peercred(&ska_peercred);
+		return err;
+	}
 
 	/* Join our sockets back to back */
 	sock_hold(ska);
 	sock_hold(skb);
 	unix_peer(ska) = skb;
 	unix_peer(skb) = ska;
-	init_peercred(ska);
-	init_peercred(skb);
+	init_peercred(ska, &ska_peercred);
+	init_peercred(skb, &skb_peercred);
 
 	ska->sk_state = TCP_ESTABLISHED;
 	skb->sk_state = TCP_ESTABLISHED;