diff options
Diffstat (limited to 'net/unix/af_unix.c')
-rw-r--r-- | net/unix/af_unix.c | 1225 |
1 files changed, 903 insertions, 322 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5d1192ceb139..c19569819866 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -89,6 +89,7 @@ #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> +#include <linux/filter.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> @@ -113,27 +114,68 @@ #include <linux/security.h> #include <linux/freezer.h> #include <linux/file.h> +#include <linux/btf_ids.h> #include "scm.h" +spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +EXPORT_SYMBOL_GPL(unix_table_locks); struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); -DEFINE_SPINLOCK(unix_table_lock); -EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; +/* SMP locking strategy: + * hash table is protected with spinlock unix_table_locks + * each socket state is protected by separate spin lock. + */ -static struct hlist_head *unix_sockets_unbound(void *addr) +static unsigned int unix_unbound_hash(struct sock *sk) { - unsigned long hash = (unsigned long)addr; + unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; - hash %= UNIX_HASH_SIZE; - return &unix_socket_table[UNIX_HASH_SIZE + hash]; + hash ^= sk->sk_type; + + return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); +} + +static unsigned int unix_bsd_hash(struct inode *i) +{ + return i->i_ino & (UNIX_HASH_SIZE - 1); +} + +static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, + int addr_len, int type) +{ + __wsum csum = csum_partial(sunaddr, addr_len, 0); + unsigned int hash; + + hash = (__force unsigned int)csum_fold(csum); + hash ^= hash >> 8; + hash ^= type; + + return hash & (UNIX_HASH_SIZE - 1); +} + +static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +{ + /* hash1 and hash2 is never the same because + * one is between 0 and UNIX_HASH_SIZE - 1, and + * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. + */ + if (hash1 > hash2) + swap(hash1, hash2); + + spin_lock(&unix_table_locks[hash1]); + spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); } -#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) +static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +{ + spin_unlock(&unix_table_locks[hash1]); + spin_unlock(&unix_table_locks[hash2]); +} #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -163,20 +205,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -/* - * SMP locking strategy: - * hash table is protected with spinlock unix_table_lock - * each socket state is protected by separate spin lock. - */ - -static inline unsigned int unix_hash_fold(__wsum n) -{ - unsigned int hash = (__force unsigned int)csum_fold(n); - - hash ^= hash>>8; - return hash&(UNIX_HASH_SIZE-1); -} - #define unix_peer(sk) (unix_sk(sk)->peer) static inline int unix_our_peer(struct sock *sk, struct sock *osk) @@ -213,6 +241,22 @@ struct sock *unix_peer_get(struct sock *s) } EXPORT_SYMBOL_GPL(unix_peer_get); +static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, + int addr_len) +{ + struct unix_address *addr; + + addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); + if (!addr) + return NULL; + + refcount_set(&addr->refcnt, 1); + addr->len = addr_len; + memcpy(addr->name, sunaddr, addr_len); + + return addr; +} + static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) @@ -226,29 +270,29 @@ static inline void unix_release_addr(struct unix_address *addr) * - if started by zero, it is abstract name. */ -static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) +static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) { - *hashp = 0; - - if (len <= sizeof(short) || len > sizeof(*sunaddr)) + if (addr_len <= offsetof(struct sockaddr_un, sun_path) || + addr_len > sizeof(*sunaddr)) return -EINVAL; - if (!sunaddr || sunaddr->sun_family != AF_UNIX) + + if (sunaddr->sun_family != AF_UNIX) return -EINVAL; - if (sunaddr->sun_path[0]) { - /* - * This may look like an off by one error but it is a bit more - * subtle. 108 is the longest valid AF_UNIX path for a binding. - * sun_path[108] doesn't as such exist. However in kernel space - * we are guaranteed that it is a valid memory location in our - * kernel address buffer. - */ - ((char *)sunaddr)[len] = 0; - len = strlen(sunaddr->sun_path)+1+sizeof(short); - return len; - } - *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)); - return len; + return 0; +} + +static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) +{ + /* This may look like an off by one error but it is a bit more + * subtle. 108 is the longest valid AF_UNIX path for a binding. + * sun_path[108] doesn't as such exist. However in kernel space + * we are guaranteed that it is a valid memory location in our + * kernel address buffer because syscall functions always pass + * a pointer of struct sockaddr_storage which has a bigger buffer + * than 108. + */ + ((char *)sunaddr)[addr_len] = 0; } static void __unix_remove_socket(struct sock *sk) @@ -256,33 +300,43 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void __unix_insert_socket(struct sock *sk) { WARN_ON(!sk_unhashed(sk)); - sk_add_node(sk, list); + sk_add_node(sk, &unix_socket_table[sk->sk_hash]); } -static inline void unix_remove_socket(struct sock *sk) +static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, + unsigned int hash) { - spin_lock(&unix_table_lock); __unix_remove_socket(sk); - spin_unlock(&unix_table_lock); + smp_store_release(&unix_sk(sk)->addr, addr); + + sk->sk_hash = hash; + __unix_insert_socket(sk); } -static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void unix_remove_socket(struct sock *sk) { - spin_lock(&unix_table_lock); - __unix_insert_socket(list, sk); - spin_unlock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); + __unix_remove_socket(sk); + spin_unlock(&unix_table_locks[sk->sk_hash]); +} + +static void unix_insert_unbound_socket(struct sock *sk) +{ + spin_lock(&unix_table_locks[sk->sk_hash]); + __unix_insert_socket(sk); + spin_unlock(&unix_table_locks[sk->sk_hash]); } static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, unsigned int hash) + int len, unsigned int hash) { struct sock *s; - sk_for_each(s, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -297,37 +351,35 @@ static struct sock *__unix_find_socket_byname(struct net *net, static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, - unsigned int hash) + int len, unsigned int hash) { struct sock *s; - spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(net, sunname, len, type, hash); + spin_lock(&unix_table_locks[hash]); + s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[hash]); return s; } static struct sock *unix_find_socket_byinode(struct inode *i) { + unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_lock); - sk_for_each(s, - &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { + spin_lock(&unix_table_locks[hash]); + sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - goto found; + spin_unlock(&unix_table_locks[hash]); + return s; } } - s = NULL; -found: - spin_unlock(&unix_table_lock); - return s; + spin_unlock(&unix_table_locks[hash]); + return NULL; } /* Support code for asymmetrically connected dgram sockets @@ -484,9 +536,10 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) */ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { other->sk_err = ECONNRESET; - other->sk_error_report(other); + sk_error_report(other); } } + other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) @@ -495,6 +548,12 @@ static void unix_sock_destructor(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); @@ -507,9 +566,7 @@ static void unix_sock_destructor(struct sock *sk) unix_release_addr(u->addr); atomic_long_dec(&unix_nr_socks); - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, atomic_long_read(&unix_nr_socks)); @@ -593,20 +650,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -662,6 +741,10 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -715,6 +798,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, + .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -739,6 +823,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, + .read_sock = unix_read_sock, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -770,27 +855,66 @@ static const struct proto_ops unix_seqpacket_ops = { .show_fdinfo = unix_show_fdinfo, }; -static struct proto unix_proto = { +static void unix_close(struct sock *sk, long timeout) +{ + /* Nothing to do here, unix socket does not need a ->close(). + * This is merely for sockmap. + */ +} + +static void unix_unhash(struct sock *sk) +{ + /* Nothing to do here, unix socket does not need a ->unhash(). + * This is merely for sockmap. + */ +} + +struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), + .close = unix_close, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_dgram_bpf_update_proto, +#endif +}; + +struct proto unix_stream_proto = { + .name = "UNIX-STREAM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), + .close = unix_close, + .unhash = unix_unhash, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_stream_bpf_update_proto, +#endif }; -static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); - if (!sk) - goto out; + if (type == SOCK_STREAM) + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); + else /*dgram and seqpacket */ + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); + + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); + sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; @@ -806,21 +930,22 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + unix_insert_unbound_socket(sk); + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -847,7 +972,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -857,21 +986,97 @@ static int unix_release(struct socket *sock) if (!sk) return 0; + sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; return 0; } -static int unix_autobind(struct socket *sock) +static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, + int addr_len, int type) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + struct inode *inode; + struct path path; + struct sock *sk; + int err; + + unix_mkname_bsd(sunaddr, addr_len); + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + err = path_permission(&path, MAY_WRITE); + if (err) + goto path_put; + + err = -ECONNREFUSED; + inode = d_backing_inode(path.dentry); + if (!S_ISSOCK(inode->i_mode)) + goto path_put; + + sk = unix_find_socket_byinode(inode); + if (!sk) + goto path_put; + + err = -EPROTOTYPE; + if (sk->sk_type == type) + touch_atime(&path); + else + goto sock_put; + + path_put(&path); + + return sk; + +sock_put: + sock_put(sk); +path_put: + path_put(&path); +fail: + return ERR_PTR(err); +} + +static struct sock *unix_find_abstract(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type) +{ + unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); + struct dentry *dentry; + struct sock *sk; + + sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); + if (!sk) + return ERR_PTR(-ECONNREFUSED); + + dentry = unix_sk(sk)->path.dentry; + if (dentry) + touch_atime(&unix_sk(sk)->path); + + return sk; +} + +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type) +{ + struct sock *sk; + + if (sunaddr->sun_path[0]) + sk = unix_find_bsd(net, sunaddr, addr_len, type); + else + sk = unix_find_abstract(net, sunaddr, addr_len, type); + + return sk; +} + +static int unix_autobind(struct sock *sk) +{ + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); - static u32 ordernum = 1; struct unix_address *addr; + u32 lastnum, ordernum; int err; - unsigned int retries = 0; err = mutex_lock_interruptible(&u->bindlock); if (err) @@ -881,221 +1086,182 @@ static int unix_autobind(struct socket *sock) goto out; err = -ENOMEM; - addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + addr = kzalloc(sizeof(*addr) + + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; + addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + ordernum = prandom_u32(); + lastnum = ordernum & 0xFFFFF; retry: - addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); - addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + ordernum = (ordernum + 1) & 0xFFFFF; + sprintf(addr->name->sun_path + 1, "%05x", ordernum); - spin_lock(&unix_table_lock); - ordernum = (ordernum+1)&0xFFFFF; + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + unix_table_double_lock(old_hash, new_hash); - if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, - addr->hash)) { - spin_unlock(&unix_table_lock); - /* - * __unix_find_socket_byname() may take long time if many names + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + new_hash)) { + unix_table_double_unlock(old_hash, new_hash); + + /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); - /* Give up if all names seems to be in use. */ - if (retries++ == 0xFFFFF) { + + if (ordernum == lastnum) { + /* Give up if all names seems to be in use. */ err = -ENOSPC; - kfree(addr); + unix_release_addr(addr); goto out; } + goto retry; } - addr->hash ^= sk->sk_type; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(&unix_socket_table[addr->hash], sk); - spin_unlock(&unix_table_lock); + __unix_set_addr_hash(sk, addr, new_hash); + unix_table_double_unlock(old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); return err; } -static struct sock *unix_find_other(struct net *net, - struct sockaddr_un *sunname, int len, - int type, unsigned int hash, int *error) +static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { - struct sock *u; - struct path path; - int err = 0; - - if (sunname->sun_path[0]) { - struct inode *inode; - err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - inode = d_backing_inode(path.dentry); - err = path_permission(&path, MAY_WRITE); - if (err) - goto put_fail; - - err = -ECONNREFUSED; - if (!S_ISSOCK(inode->i_mode)) - goto put_fail; - u = unix_find_socket_byinode(inode); - if (!u) - goto put_fail; - - if (u->sk_type == type) - touch_atime(&path); + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int new_hash, old_hash = sk->sk_hash; + struct unix_sock *u = unix_sk(sk); + struct user_namespace *ns; // barf... + struct unix_address *addr; + struct dentry *dentry; + struct path parent; + int err; - path_put(&path); + unix_mkname_bsd(sunaddr, addr_len); + addr_len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; - err = -EPROTOTYPE; - if (u->sk_type != type) { - sock_put(u); - goto fail; - } - } else { - err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type, hash); - if (u) { - struct dentry *dentry; - dentry = unix_sk(u)->path.dentry; - if (dentry) - touch_atime(&unix_sk(u)->path); - } else - goto fail; - } - return u; - -put_fail: - path_put(&path); -fail: - *error = err; - return NULL; -} + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) -{ - struct dentry *dentry; - struct path path; - int err = 0; /* * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - return err; + dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } /* * All right, let's create it. */ - err = security_path_mknod(&path, dentry, mode, 0); - if (!err) { - err = vfs_mknod(mnt_user_ns(path.mnt), d_inode(path.dentry), - dentry, mode, 0); - if (!err) { - res->mnt = mntget(path.mnt); - res->dentry = dget(dentry); - } - } - done_path_create(&path, dentry); - return err; + ns = mnt_user_ns(parent.mnt); + err = security_path_mknod(&parent, dentry, mode, 0); + if (!err) + err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0); + if (err) + goto out_path; + err = mutex_lock_interruptible(&u->bindlock); + if (err) + goto out_unlink; + if (u->addr) + goto out_unlock; + + new_hash = unix_bsd_hash(d_backing_inode(dentry)); + unix_table_double_lock(old_hash, new_hash); + u->path.mnt = mntget(parent.mnt); + u->path.dentry = dget(dentry); + __unix_set_addr_hash(sk, addr, new_hash); + unix_table_double_unlock(old_hash, new_hash); + mutex_unlock(&u->bindlock); + done_path_create(&parent, dentry); + return 0; + +out_unlock: + mutex_unlock(&u->bindlock); + err = -EINVAL; +out_unlink: + /* failed after successful mknod? unlink what we'd created... */ + vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL); +out_path: + done_path_create(&parent, dentry); +out: + unix_release_addr(addr); + return err == -EEXIST ? -EADDRINUSE : err; } -static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); - struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - char *sun_path = sunaddr->sun_path; - int err; - unsigned int hash; struct unix_address *addr; - struct hlist_head *list; - struct path path = { }; - - err = -EINVAL; - if (addr_len < offsetofend(struct sockaddr_un, sun_family) || - sunaddr->sun_family != AF_UNIX) - goto out; + int err; - if (addr_len == sizeof(short)) { - err = unix_autobind(sock); - goto out; - } + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) + err = mutex_lock_interruptible(&u->bindlock); + if (err) goto out; - addr_len = err; - if (sun_path[0]) { - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - goto out; - } + if (u->addr) { + err = -EINVAL; + goto out_mutex; } - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_put; + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + unix_table_double_lock(old_hash, new_hash); - err = -EINVAL; - if (u->addr) - goto out_up; - - err = -ENOMEM; - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - goto out_up; + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + new_hash)) + goto out_spin; - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; - refcount_set(&addr->refcnt, 1); + __unix_set_addr_hash(sk, addr, new_hash); + unix_table_double_unlock(old_hash, new_hash); + mutex_unlock(&u->bindlock); + return 0; - if (sun_path[0]) { - addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); - spin_lock(&unix_table_lock); - u->path = path; - list = &unix_socket_table[hash]; - } else { - spin_lock(&unix_table_lock); - err = -EADDRINUSE; - if (__unix_find_socket_byname(net, sunaddr, addr_len, - sk->sk_type, hash)) { - unix_release_addr(addr); - goto out_unlock; - } +out_spin: + unix_table_double_unlock(old_hash, new_hash); + err = -EADDRINUSE; +out_mutex: + mutex_unlock(&u->bindlock); +out: + unix_release_addr(addr); + return err; +} - list = &unix_socket_table[addr->hash]; - } +static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; + struct sock *sk = sock->sk; + int err; - err = 0; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(list, sk); + if (addr_len == offsetof(struct sockaddr_un, sun_path) && + sunaddr->sun_family == AF_UNIX) + return unix_autobind(sk); -out_unlock: - spin_unlock(&unix_table_lock); -out_up: - mutex_unlock(&u->bindlock); -out_put: + err = unix_validate_addr(sunaddr, addr_len); if (err) - path_put(&path); -out: + return err; + + if (sunaddr->sun_path[0]) + err = unix_bind_bsd(sk, sunaddr, addr_len); + else + err = unix_bind_abstract(sk, sunaddr, addr_len); + return err; } @@ -1131,7 +1297,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *other; - unsigned int hash; int err; err = -EINVAL; @@ -1139,19 +1304,23 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, goto out; if (addr->sa_family != AF_UNSPEC) { - err = unix_mkname(sunaddr, alen, &hash); - if (err < 0) + err = unix_validate_addr(sunaddr, alen); + if (err) goto out; - alen = err; if (test_bit(SOCK_PASSCRED, &sock->flags) && - !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) - goto out; + !unix_sk(sk)->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } restart: - other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, alen, sock->type); + if (IS_ERR(other)) { + err = PTR_ERR(other); goto out; + } unix_state_double_lock(sk, other); @@ -1170,6 +1339,7 @@ restart: if (err) goto out_unlock; + sk->sk_state = other->sk_state = TCP_ESTABLISHED; } else { /* * 1003.1g breaking connected state with AF_UNSPEC @@ -1183,7 +1353,10 @@ restart: */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); + unix_peer(sk) = other; + if (!other) + sk->sk_state = TCP_CLOSE; unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); @@ -1195,6 +1368,7 @@ restart: unix_peer(sk) = other; unix_state_double_unlock(sk, other); } + return 0; out_unlock: @@ -1236,19 +1410,19 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *newsk = NULL; struct sock *other = NULL; struct sk_buff *skb = NULL; - unsigned int hash; int st; int err; long timeo; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) + err = unix_validate_addr(sunaddr, addr_len); + if (err) goto out; - addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && - (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); @@ -1257,12 +1431,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0); - if (newsk == NULL) + newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); @@ -1271,9 +1448,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out; + } /* Latch state of peer */ unix_state_lock(other); @@ -1361,9 +1541,9 @@ restart: * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_lock. Insertion + * otheru in hash under unix_table_locks. Insertion * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_lock, + * in an earlier critical area protected by unix_table_locks, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * @@ -1393,7 +1573,7 @@ restart: unix_state_unlock(sk); - /* take ten and and send info to listening sock */ + /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -1427,12 +1607,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) init_peercred(ska); init_peercred(skb); - if (ska->sk_type != SOCK_DGRAM) { - ska->sk_state = TCP_ESTABLISHED; - skb->sk_state = TCP_ESTABLISHED; - socka->state = SS_CONNECTED; - sockb->state = SS_CONNECTED; - } + ska->sk_state = TCP_ESTABLISHED; + skb->sk_state = TCP_ESTABLISHED; + socka->state = SS_CONNECTED; + sockb->state = SS_CONNECTED; return 0; } @@ -1512,7 +1690,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; - err = sizeof(short); + err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); @@ -1522,6 +1700,53 @@ out: return err; } +static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* + * Garbage collection of unix sockets starts by selecting a set of + * candidate sockets which have reference only from being in flight + * (total_refs == inflight_refs). This condition is checked once during + * the candidate collection phase, and candidates are marked as such, so + * that non-candidates can later be ignored. While inflight_refs is + * protected by unix_gc_lock, total_refs (file count) is not, hence this + * is an instantaneous decision. + * + * Once a candidate, however, the socket must not be reinstalled into a + * file descriptor while the garbage collection is in progress. + * + * If the above conditions are met, then the directed graph of + * candidates (*) does not change while unix_gc_lock is held. + * + * Any operations that changes the file count through file descriptors + * (dup, close, sendmsg) does not change the graph since candidates are + * not installed in fds. + * + * Dequeing a candidate via recvmsg would install it into an fd, but + * that takes unix_gc_lock to decrement the inflight count, so it's + * serialized with garbage collection. + * + * MSG_PEEK is special in that it does not change the inflight count, + * yet does install the socket into an fd. The following lock/unlock + * pair is to ensure serialization with garbage collection. It must be + * done between incrementing the file count and installing the file into + * an fd. + * + * If garbage collection starts after the barrier provided by the + * lock/unlock, then it will see the elevated refcount and not mark this + * as a candidate. If a garbage collection is already in progress + * before the file count was incremented, then the lock/unlock pair will + * ensure that garbage collection is finished before progressing to + * installing the fd. + * + * (*) A -> B where B is on the queue of A or B is on the queue of C + * which is on the queue of listening socket A. + */ + spin_lock(&unix_gc_lock); + spin_unlock(&unix_gc_lock); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; @@ -1621,9 +1846,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; - int namelen = 0; /* fake GCC */ int err; - unsigned int hash; struct sk_buff *skb; long timeo; struct scm_cookie scm; @@ -1640,10 +1863,9 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; if (msg->msg_namelen) { - err = unix_mkname(sunaddr, msg->msg_namelen, &hash); - if (err < 0) + err = unix_validate_addr(sunaddr, msg->msg_namelen); + if (err) goto out; - namelen = err; } else { sunaddr = NULL; err = -ENOTCONN; @@ -1652,9 +1874,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr - && (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) @@ -1694,10 +1918,13 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, namelen, sk->sk_type, - hash, &err); - if (other == NULL) + other = unix_find_other(net, sunaddr, msg->msg_namelen, + sk->sk_type); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out_free; + } } if (sk_filter(other, skb) < 0) { @@ -1731,6 +1958,7 @@ restart_locked: unix_state_unlock(sk); + sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; @@ -1821,6 +2049,53 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other) +{ + struct unix_sock *ousk = unix_sk(other); + struct sk_buff *skb; + int err = 0; + + skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + + if (!skb) + return err; + + skb_put(skb, 1); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); + + if (err) { + kfree_skb(skb); + return err; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + (other->sk_shutdown & RCV_SHUTDOWN)) { + unix_state_unlock(other); + kfree_skb(skb); + return -EPIPE; + } + + maybe_add_creds(skb, sock, other); + skb_get(skb); + + if (ousk->oob_skb) + consume_skb(ousk->oob_skb); + + ousk->oob_skb = skb; + + scm_stat_add(other, skb); + skb_queue_tail(&other->sk_receive_queue, skb); + sk_send_sigurg(other); + unix_state_unlock(other); + other->sk_data_ready(other); + + return err; +} +#endif + static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1839,8 +2114,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return err; err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) - goto out_err; + if (msg->msg_flags & MSG_OOB) { +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (len) + len--; + else +#endif + goto out_err; + } if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; @@ -1905,6 +2186,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, sent += size; } +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (msg->msg_flags & MSG_OOB) { + err = queue_oob(sock, msg, other); + if (err) + goto out_err; + sent++; + } +#endif + scm_destroy(&scm); return sent; @@ -2077,11 +2367,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk) } } -static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, + int flags) { struct scm_cookie scm; - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; @@ -2171,7 +2461,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, sk_peek_offset_fwd(sk, size); if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size; @@ -2184,6 +2474,55 @@ out: return err; } +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + +#ifdef CONFIG_BPF_SYSCALL + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_dgram_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif + return __unix_dgram_recvmsg(sk, msg, size, flags); +} + +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int used, err; + + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, 0, 1, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; + + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + kfree_skb(skb); + break; + } else if (used <= skb->len) { + copied += used; + } + + kfree_skb(skb); + if (!desc->count) + break; + } + + return copied; +} + /* * Sleep until more data has arrived. But check for races.. */ @@ -2243,6 +2582,86 @@ struct unix_stream_read_state { unsigned int splice_flags; }; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +static int unix_stream_recv_urg(struct unix_stream_read_state *state) +{ + struct socket *sock = state->socket; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + int chunk = 1; + struct sk_buff *oob_skb; + + mutex_lock(&u->iolock); + unix_state_lock(sk); + + if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { + unix_state_unlock(sk); + mutex_unlock(&u->iolock); + return -EINVAL; + } + + oob_skb = u->oob_skb; + + if (!(state->flags & MSG_PEEK)) { + u->oob_skb = NULL; + } + + unix_state_unlock(sk); + + chunk = state->recv_actor(oob_skb, 0, chunk, state); + + if (!(state->flags & MSG_PEEK)) { + UNIXCB(oob_skb).consumed += 1; + kfree_skb(oob_skb); + } + + mutex_unlock(&u->iolock); + + if (chunk < 0) + return -EFAULT; + + state->msg->msg_flags |= MSG_OOB; + return 1; +} + +static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, + int flags, int copied) +{ + struct unix_sock *u = unix_sk(sk); + + if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = NULL; + } else { + if (skb == u->oob_skb) { + if (copied) { + skb = NULL; + } else if (sock_flag(sk, SOCK_URGINLINE)) { + if (!(flags & MSG_PEEK)) { + u->oob_skb = NULL; + consume_skb(skb); + } + } else if (!(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = skb_peek(&sk->sk_receive_queue); + } + } + } + return skb; +} +#endif + +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + return -ENOTCONN; + + return unix_read_sock(sk, desc, recv_actor); +} + static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { @@ -2268,6 +2687,9 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + err = unix_stream_recv_urg(state); +#endif goto out; } @@ -2296,6 +2718,18 @@ redo: } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb) { + skb = manage_oob(skb, sk, flags, copied); + if (!skb) { + unix_state_unlock(sk); + if (copied) + break; + goto redo; + } + } +#endif again: if (skb == NULL) { if (copied >= target) @@ -2414,7 +2848,7 @@ unlock: /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); sk_peek_offset_fwd(sk, chunk); @@ -2453,6 +2887,20 @@ static int unix_stream_read_actor(struct sk_buff *skb, return ret ?: chunk; } +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sk->sk_socket, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -2464,6 +2912,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_stream_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif return unix_stream_read_generic(&state, true); } @@ -2524,7 +2980,10 @@ static int unix_shutdown(struct socket *sock, int mode) (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; + const struct proto *prot = READ_ONCE(other->sk_prot); + if (prot->unhash) + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) @@ -2631,6 +3090,20 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCUNIXFILE: err = unix_open_file(sk); break; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + case SIOCATMARK: + { + struct sk_buff *skb; + struct unix_sock *u = unix_sk(sk); + int answ = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb && skb == u->oob_skb) + answ = 1; + err = put_user(answ, (int __user *)arg); + } + break; +#endif default: err = -ENOIOCTLCMD; break; @@ -2664,6 +3137,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && @@ -2703,6 +3178,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET) { @@ -2723,7 +3200,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; @@ -2743,7 +3220,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) @@ -2767,7 +3244,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, struct sock *sk, loff_t *pos) { - unsigned long bucket; + unsigned long bucket = get_bucket(*pos); while (sk > (struct sock *)SEQ_START_TOKEN) { sk = sk_next(sk); @@ -2778,12 +3255,13 @@ static struct sock *unix_next_socket(struct seq_file *seq, } do { + spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; next_bucket: - bucket = get_bucket(*pos) + 1; + spin_unlock(&unix_table_locks[bucket++]); *pos = set_bucket_offset(bucket, 1); } while (bucket < ARRAY_SIZE(unix_socket_table)); @@ -2791,10 +3269,7 @@ next_bucket: } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(unix_table_lock) { - spin_lock(&unix_table_lock); - if (!*pos) return SEQ_START_TOKEN; @@ -2811,9 +3286,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void unix_seq_stop(struct seq_file *seq, void *v) - __releases(unix_table_lock) { - spin_unlock(&unix_table_lock); + struct sock *sk = v; + + if (sk) + spin_unlock(&unix_table_locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) @@ -2838,15 +3315,16 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { // under unix_table_lock here + if (u->addr) { // under unix_table_locks here int i, len; seq_putc(seq, ' '); i = 0; - len = u->addr->len - sizeof(short); - if (!UNIX_ABSTRACT(s)) + len = u->addr->len - + offsetof(struct sockaddr_un, sun_path); + if (u->addr->name->sun_path[0]) { len--; - else { + } else { seq_putc(seq, '@'); i++; } @@ -2867,6 +3345,64 @@ static const struct seq_operations unix_seq_ops = { .stop = unix_seq_stop, .show = unix_seq_show, }; + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__unix { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct unix_sock *, unix_sk); + uid_t uid __aligned(8); +}; + +static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) +{ + struct bpf_iter__unix ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.unix_sk = unix_sk; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return unix_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)unix_prog_seq_show(prog, &meta, v, 0); + } + + unix_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = bpf_iter_unix_seq_stop, + .show = bpf_iter_unix_seq_show, +}; +#endif #endif static const struct net_proto_family unix_family_ops = { @@ -2907,13 +3443,51 @@ static struct pernet_operations unix_net_ops = { .exit = unix_net_exit, }; +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) + +static const struct bpf_iter_seq_info unix_seq_info = { + .seq_ops = &bpf_iter_unix_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct seq_net_private), +}; + +static struct bpf_iter_reg unix_reg_info = { + .target = "unix", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__unix, unix_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &unix_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; + if (bpf_iter_reg_target(&unix_reg_info)) + pr_warn("Warning: could not register bpf iterator unix\n"); +} +#endif + static int __init af_unix_init(void) { - int rc = -1; + int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - rc = proto_register(&unix_proto, 1); + for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + spin_lock_init(&unix_table_locks[i]); + + rc = proto_register(&unix_dgram_proto, 1); + if (rc != 0) { + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + goto out; + } + + rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; @@ -2921,6 +3495,12 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); + unix_bpf_build_proto(); + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif + out: return rc; } @@ -2928,7 +3508,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - proto_unregister(&unix_proto); + proto_unregister(&unix_dgram_proto); + proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } |