summaryrefslogtreecommitdiff
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/options.c3
-rw-r--r--net/mptcp/pm_netlink.c89
-rw-r--r--net/mptcp/pm_userspace.c5
-rw-r--r--net/mptcp/protocol.c112
-rw-r--r--net/mptcp/protocol.h8
-rw-r--r--net/mptcp/sockopt.c14
-rw-r--r--net/mptcp/subflow.c143
-rw-r--r--net/mptcp/token.c14
-rw-r--r--net/mptcp/token_test.c3
9 files changed, 208 insertions, 183 deletions
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 5ded85e2c374..b30cea2fbf3f 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1594,8 +1594,7 @@ mp_rst:
TCPOLEN_MPTCP_PRIO,
opts->backup, TCPOPT_NOP);
- MPTCP_INC_STATS(sock_net((const struct sock *)tp),
- MPTCP_MIB_MPPRIOTX);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX);
}
mp_capable_done:
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 2ea7eae43bdb..5c8dea49626c 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -152,7 +152,6 @@ static struct mptcp_pm_addr_entry *
select_local_address(const struct pm_nl_pernet *pernet,
const struct mptcp_sock *msk)
{
- const struct sock *sk = (const struct sock *)msk;
struct mptcp_pm_addr_entry *entry, *ret = NULL;
msk_owned_by_me(msk);
@@ -165,16 +164,6 @@ select_local_address(const struct pm_nl_pernet *pernet,
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
- if (entry->addr.family != sk->sk_family) {
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if ((entry->addr.family == AF_INET &&
- !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
- (sk->sk_family == AF_INET &&
- !ipv6_addr_v4mapped(&entry->addr.addr6)))
-#endif
- continue;
- }
-
ret = entry;
break;
}
@@ -423,7 +412,9 @@ static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned
/* Fill all the remote addresses into the array addrs[],
* and return the array size.
*/
-static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh,
+static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk,
+ struct mptcp_addr_info *local,
+ bool fullmesh,
struct mptcp_addr_info *addrs)
{
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
@@ -443,6 +434,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
if (deny_id0)
return 0;
+ if (!mptcp_pm_addr_families_match(sk, local, &remote))
+ return 0;
+
msk->pm.subflows++;
addrs[i++] = remote;
} else {
@@ -453,6 +447,9 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
if (deny_id0 && !addrs[i].id)
continue;
+ if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
+ continue;
+
if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
msk->pm.subflows < subflows_max) {
msk->pm.subflows++;
@@ -603,9 +600,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
msk->pm.local_addr_used++;
- nr = fill_remote_addresses_vec(msk, fullmesh, addrs);
- if (nr)
- __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+ nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs);
+ if (nr == 0)
+ continue;
+
spin_unlock_bh(&msk->pm.lock);
for (i = 0; i < nr; i++)
__mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
@@ -628,11 +627,11 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
* and return the array size.
*/
static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
struct mptcp_addr_info *addrs)
{
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info local;
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
int i = 0;
@@ -645,15 +644,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
- if (entry->addr.family != sk->sk_family) {
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if ((entry->addr.family == AF_INET &&
- !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
- (sk->sk_family == AF_INET &&
- !ipv6_addr_v4mapped(&entry->addr.addr6)))
-#endif
- continue;
- }
+ if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+ continue;
if (msk->pm.subflows < subflows_max) {
msk->pm.subflows++;
@@ -666,8 +658,18 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
* 'IPADDRANY' local address
*/
if (!i) {
+ struct mptcp_addr_info local;
+
memset(&local, 0, sizeof(local));
- local.family = msk->pm.remote.family;
+ local.family =
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ remote->family == AF_INET6 &&
+ ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
+#endif
+ remote->family;
+
+ if (!mptcp_pm_addr_families_match(sk, &local, remote))
+ return 0;
msk->pm.subflows++;
addrs[i++] = local;
@@ -706,7 +708,9 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
- nr = fill_local_addresses_vec(msk, addrs);
+ nr = fill_local_addresses_vec(msk, &remote, addrs);
+ if (nr == 0)
+ return;
msk->pm.add_addr_accepted++;
if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
@@ -993,13 +997,17 @@ out:
return ret;
}
+static struct lock_class_key mptcp_slock_keys[2];
+static struct lock_class_key mptcp_keys[2];
+
static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
struct mptcp_pm_addr_entry *entry)
{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
- struct mptcp_sock *msk;
struct socket *ssock;
+ struct sock *newsk;
int backlog = 1024;
int err;
@@ -1008,11 +1016,25 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
if (err)
return err;
- msk = mptcp_sk(entry->lsk->sk);
- if (!msk)
+ newsk = entry->lsk->sk;
+ if (!newsk)
return -EINVAL;
- ssock = __mptcp_nmpc_socket(msk);
+ /* The subflow socket lock is acquired in a nested to the msk one
+ * in several places, even by the TCP stack, and this msk is a kernel
+ * socket: lockdep complains. Instead of propagating the _nested
+ * modifiers in several places, re-init the lock class for the msk
+ * socket to an mptcp specific one.
+ */
+ sock_lock_init_class_and_name(newsk,
+ is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET",
+ &mptcp_slock_keys[is_ipv6],
+ is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET",
+ &mptcp_keys[is_ipv6]);
+
+ lock_sock(newsk);
+ ssock = __mptcp_nmpc_socket(mptcp_sk(newsk));
+ release_sock(newsk);
if (!ssock)
return -EINVAL;
@@ -1143,7 +1165,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss
if (!tcp_rtx_and_write_queues_empty(ssk)) {
subflow->stale = 1;
__mptcp_retransmit_pending_data(sk);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE);
+ MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
}
unlock_sock_fast(ssk, slow);
@@ -1903,8 +1925,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
}
if (token)
- return mptcp_userspace_pm_set_flags(sock_net(skb->sk),
- token, &addr, &remote, bkup);
+ return mptcp_userspace_pm_set_flags(net, token, &addr, &remote, bkup);
spin_lock_bh(&pernet->lock);
entry = __lookup_addr(pernet, &addr.addr, lookup_by_id);
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index ea6ad9da7493..a02d3cbf2a1b 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -59,8 +59,8 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
*/
e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC);
if (!e) {
- spin_unlock_bh(&msk->pm.lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto append_err;
}
*e = *entry;
@@ -74,6 +74,7 @@ int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
ret = entry->addr.id;
}
+append_err:
spin_unlock_bh(&msk->pm.lock);
return ret;
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8cd6cc67c2c5..60b23b2716c4 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -825,7 +825,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
if (sk->sk_socket && !ssk->sk_socket)
mptcp_sock_graft(ssk, sk->sk_socket);
- mptcp_propagate_sndbuf((struct sock *)msk, ssk);
mptcp_sockopt_sync_locked(msk, ssk);
return true;
}
@@ -923,9 +922,8 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk)
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- sock_owned_by_me(sk);
+ msk_owned_by_me(msk);
mptcp_for_each_subflow(msk, subflow) {
if (READ_ONCE(subflow->data_avail))
@@ -1408,7 +1406,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
u64 linger_time;
long tout = 0;
- sock_owned_by_me(sk);
+ msk_owned_by_me(msk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
@@ -1890,7 +1888,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
u32 time, advmss = 1;
u64 rtt_us, mstamp;
- sock_owned_by_me(sk);
+ msk_owned_by_me(msk);
if (copied <= 0)
return;
@@ -2217,7 +2215,7 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
struct mptcp_subflow_context *subflow;
int min_stale_count = INT_MAX;
- sock_owned_by_me((const struct sock *)msk);
+ msk_owned_by_me(msk);
if (__mptcp_check_fallback(msk))
return NULL;
@@ -2344,7 +2342,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
goto out;
}
- sock_orphan(ssk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2352,15 +2349,25 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
* reference owned by msk;
*/
if (!inet_csk(ssk)->icsk_ulp_ops) {
+ WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
kfree_rcu(subflow, rcu);
+ } else if (msk->in_accept_queue && msk->first == ssk) {
+ /* if the first subflow moved to a close state, e.g. due to
+ * incoming reset and we reach here before inet_child_forget()
+ * the TCP stack could later try to close it via
+ * inet_csk_listen_stop(), or deliver it to the user space via
+ * accept().
+ * We can't delete the subflow - or risk a double free - nor let
+ * the msk survive - or will be leaked in the non accept scenario:
+ * fallback and let TCP cope with the subflow cleanup.
+ */
+ WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD));
+ mptcp_subflow_drop_ctx(ssk);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
- if (ssk->sk_state == TCP_LISTEN) {
- tcp_set_state(ssk, TCP_CLOSE);
- mptcp_subflow_queue_clean(sk, ssk);
- inet_csk_listen_stop(ssk);
+ if (ssk->sk_state == TCP_LISTEN)
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
- }
+
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -2400,9 +2407,10 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
-static void __mptcp_close_subflow(struct mptcp_sock *msk)
+static void __mptcp_close_subflow(struct sock *sk)
{
struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
might_sleep();
@@ -2416,7 +2424,15 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
if (!skb_queue_empty_lockless(&ssk->sk_receive_queue))
continue;
- mptcp_close_ssk((struct sock *)msk, ssk, subflow);
+ mptcp_close_ssk(sk, ssk, subflow);
+ }
+
+ /* if the MPC subflow has been closed before the msk is accepted,
+ * msk will never be accept-ed, close it now
+ */
+ if (!msk->first && msk->in_accept_queue) {
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_sk_state_store(sk, TCP_CLOSE);
}
}
@@ -2625,6 +2641,9 @@ static void mptcp_worker(struct work_struct *work)
__mptcp_check_send_data_fin(sk);
mptcp_check_data_fin(sk);
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(sk);
+
/* There is no point in keeping around an orphaned sk timedout or
* closed, but we need the msk around to reply to incoming DATA_FIN,
* even if it is orphaned and in FIN_WAIT2 state
@@ -2640,9 +2659,6 @@ static void mptcp_worker(struct work_struct *work)
}
}
- if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
- __mptcp_close_subflow(msk);
-
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
@@ -2724,8 +2740,8 @@ static int mptcp_init_sock(struct sock *sk)
mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
- sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
- sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
+ sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
return 0;
}
@@ -2876,7 +2892,6 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
sock_put(sk);
}
@@ -2892,15 +2907,23 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
return EPOLLIN | EPOLLRDNORM;
}
+static void mptcp_listen_inuse_dec(struct sock *sk)
+{
+ if (inet_sk_state_load(sk) == TCP_LISTEN)
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+}
+
bool __mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
+ int subflows_alive = 0;
sk->sk_shutdown = SHUTDOWN_MASK;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
+ mptcp_listen_inuse_dec(sk);
inet_sk_state_store(sk, TCP_CLOSE);
goto cleanup;
}
@@ -2922,6 +2945,8 @@ cleanup:
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ subflows_alive += ssk->sk_state != TCP_CLOSE;
+
/* since the close timeout takes precedence on the fail one,
* cancel the latter
*/
@@ -2937,6 +2962,12 @@ cleanup:
}
sock_orphan(sk);
+ /* all the subflows are closed, only timeout can change the msk
+ * state, let's not keep resources busy for no reasons
+ */
+ if (subflows_alive == 0)
+ inet_sk_state_store(sk, TCP_CLOSE);
+
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (msk->token)
@@ -3001,6 +3032,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
if (msk->fastopening)
return 0;
+ mptcp_listen_inuse_dec(sk);
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_stop_timer(sk);
@@ -3064,6 +3096,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->local_key = subflow_req->local_key;
msk->token = subflow_req->token;
msk->subflow = NULL;
+ msk->in_accept_queue = 1;
WRITE_ONCE(msk->fully_established, false);
if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(msk->csum_enabled, true);
@@ -3081,8 +3114,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
security_inet_csk_clone(nsk, req);
bh_unlock_sock(nsk);
- /* keep a single reference */
- __sock_put(nsk);
+ /* note: the newly allocated socket refcount is 2 now */
return nsk;
}
@@ -3138,8 +3170,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
goto out;
}
- /* acquire the 2nd reference for the owning socket */
- sock_hold(new_mptcp_sock);
newsk = new_mptcp_sock;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
} else {
@@ -3639,12 +3669,13 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
static int mptcp_listen(struct socket *sock, int backlog)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct sock *sk = sock->sk;
struct socket *ssock;
int err;
pr_debug("msk=%p", msk);
- lock_sock(sock->sk);
+ lock_sock(sk);
ssock = __mptcp_nmpc_socket(msk);
if (!ssock) {
err = -EINVAL;
@@ -3652,18 +3683,20 @@ static int mptcp_listen(struct socket *sock, int backlog)
}
mptcp_token_destroy(msk);
- inet_sk_state_store(sock->sk, TCP_LISTEN);
- sock_set_flag(sock->sk, SOCK_RCU_FREE);
+ inet_sk_state_store(sk, TCP_LISTEN);
+ sock_set_flag(sk, SOCK_RCU_FREE);
err = ssock->ops->listen(ssock, backlog);
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
- if (!err)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ if (!err) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ mptcp_copy_inaddrs(sk, ssock->sk);
+ }
mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED);
unlock:
- release_sock(sock->sk);
+ release_sock(sk);
return err;
}
@@ -3687,25 +3720,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
struct sock *newsk = newsock->sk;
set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+ msk->in_accept_queue = 0;
lock_sock(newsk);
- /* PM/worker can now acquire the first subflow socket
- * lock without racing with listener queue cleanup,
- * we can notify it, if needed.
- *
- * Even if remote has reset the initial subflow by now
- * the refcnt is still at least one.
- */
- subflow = mptcp_subflow_ctx(msk->first);
- list_add(&subflow->node, &msk->conn_list);
- sock_hold(msk->first);
- if (mptcp_is_fully_established(newsk))
- mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
-
- mptcp_rcv_space_init(msk, msk->first);
- mptcp_propagate_sndbuf(newsk, msk->first);
-
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 601469249da8..339a6f072989 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -295,7 +295,8 @@ struct mptcp_sock {
u8 recvmsg_inq:1,
cork:1,
nodelay:1,
- fastopening:1;
+ fastopening:1,
+ in_accept_queue:1;
int connect_flags;
struct work_struct work;
struct sk_buff *ooo_last_skb;
@@ -628,7 +629,6 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow);
void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
-void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
@@ -666,6 +666,8 @@ void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
+void mptcp_subflow_drop_ctx(struct sock *ssk);
+
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
{
@@ -755,7 +757,7 @@ static inline void mptcp_token_init_request(struct request_sock *req)
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(struct request_sock *req);
-int mptcp_token_new_connect(struct sock *sk);
+int mptcp_token_new_connect(struct sock *ssk);
void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
struct mptcp_sock *msk);
bool mptcp_token_exists(u32 token);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index d4b1e6ec1b36..8a9656248b0f 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -18,7 +18,7 @@
static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
- sock_owned_by_me((const struct sock *)msk);
+ msk_owned_by_me(msk);
if (likely(!__mptcp_check_fallback(msk)))
return NULL;
@@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
+ struct sock *sk = (struct sock *)msk;
struct socket *sock;
+ int ret = -EINVAL;
/* Limit to first subflow, before the connection establishment */
+ lock_sock(sk);
sock = __mptcp_nmpc_socket(msk);
if (!sock)
- return -EINVAL;
+ goto unlock;
+
+ ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen);
- return tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+unlock:
+ release_sock(sk);
+ return ret;
}
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
@@ -1255,6 +1262,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
ssk->sk_priority = sk->sk_priority;
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
+ ssk->sk_ipv6only = sk->sk_ipv6only;
__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
if (sk->sk_userlocks & tx_rx_locks) {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index ec54413fb31f..a0041360ee9d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -26,6 +26,7 @@
#include "mib.h"
#include <trace/events/mptcp.h>
+#include <trace/events/sock.h>
static void mptcp_subflow_ops_undo_override(struct sock *ssk);
@@ -396,10 +397,15 @@ void mptcp_subflow_reset(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct sock *sk = subflow->conn;
+ /* mptcp_mp_fail_no_response() can reach here on an already closed
+ * socket
+ */
+ if (ssk->sk_state == TCP_CLOSE)
+ return;
+
/* must hold: tcp_done() could drop last reference on parent */
sock_hold(sk);
- tcp_set_state(ssk, TCP_CLOSE);
tcp_send_active_reset(ssk, GFP_ATOMIC);
tcp_done(ssk);
if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
@@ -621,7 +627,7 @@ static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
-static struct proto tcpv6_prot_override;
+static struct proto tcpv6_prot_override __ro_after_init;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
@@ -692,9 +698,10 @@ static bool subflow_hmac_valid(const struct request_sock *req,
static void mptcp_force_close(struct sock *sk)
{
- /* the msk is not yet exposed to user-space */
+ /* the msk is not yet exposed to user-space, and refcount is 2 */
inet_sk_state_store(sk, TCP_CLOSE);
sk_common_release(sk);
+ sock_put(sk);
}
static void subflow_ulp_fallback(struct sock *sk,
@@ -710,7 +717,7 @@ static void subflow_ulp_fallback(struct sock *sk,
mptcp_subflow_ops_undo_override(sk);
}
-static void subflow_drop_ctx(struct sock *ssk)
+void mptcp_subflow_drop_ctx(struct sock *ssk)
{
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
@@ -749,6 +756,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_options_received mp_opt;
bool fallback, fallback_is_fatal;
struct sock *new_msk = NULL;
+ struct mptcp_sock *owner;
struct sock *child;
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
@@ -815,7 +823,7 @@ create_child:
if (new_msk)
mptcp_copy_inaddrs(new_msk, child);
- subflow_drop_ctx(child);
+ mptcp_subflow_drop_ctx(child);
goto out;
}
@@ -823,6 +831,8 @@ create_child:
ctx->setsockopt_seq = listener->setsockopt_seq;
if (ctx->mp_capable) {
+ owner = mptcp_sk(new_msk);
+
/* this can't race with mptcp_close(), as the msk is
* not yet exposted to user-space
*/
@@ -831,14 +841,14 @@ create_child:
/* record the newly created socket as the first msk
* subflow, but don't link it yet into conn_list
*/
- WRITE_ONCE(mptcp_sk(new_msk)->first, child);
+ WRITE_ONCE(owner->first, child);
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
- mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
- mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
+ mptcp_pm_new_connection(owner, child, 1);
+ mptcp_token_accept(subflow_req, owner);
ctx->conn = new_msk;
new_msk = NULL;
@@ -846,15 +856,21 @@ create_child:
* uses the correct data
*/
mptcp_copy_inaddrs(ctx->conn, child);
+ mptcp_propagate_sndbuf(ctx->conn, child);
+
+ mptcp_rcv_space_init(owner, child);
+ list_add(&ctx->node, &owner->conn_list);
+ sock_hold(child);
/* with OoO packets we can reach here without ingress
* mpc option
*/
- if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)
+ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
mptcp_subflow_fully_established(ctx, &mp_opt);
+ mptcp_pm_fully_established(owner, child, GFP_ATOMIC);
+ ctx->pm_notified = 1;
+ }
} else if (ctx->mp_join) {
- struct mptcp_sock *owner;
-
owner = subflow_req->msk;
if (!owner) {
subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
@@ -898,7 +914,7 @@ out:
return child;
dispose_child:
- subflow_drop_ctx(child);
+ mptcp_subflow_drop_ctx(child);
tcp_rsk(req)->drop_req = true;
inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
@@ -909,7 +925,7 @@ dispose_child:
}
static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
-static struct proto tcp_prot_override;
+static struct proto tcp_prot_override __ro_after_init;
enum mapping_status {
MAPPING_OK,
@@ -1399,6 +1415,7 @@ void __mptcp_error_report(struct sock *sk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int err = sock_error(ssk);
+ int ssk_state;
if (!err)
continue;
@@ -1409,7 +1426,14 @@ void __mptcp_error_report(struct sock *sk)
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
continue;
- inet_sk_state_store(sk, inet_sk_state_load(ssk));
+ /* We need to propagate only transition to CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ inet_sk_state_store(sk, ssk_state);
sk->sk_err = -err;
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
@@ -1423,6 +1447,13 @@ static void subflow_error_report(struct sock *ssk)
{
struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+ /* bail early if this is a no-op, so that we avoid introducing a
+ * problematic lockdep dependency between TCP accept queue lock
+ * and msk socket spinlock
+ */
+ if (!sk->sk_socket)
+ return;
+
mptcp_data_lock(sk);
if (!sock_owned_by_user(sk))
__mptcp_error_report(sk);
@@ -1438,6 +1469,8 @@ static void subflow_data_ready(struct sock *sk)
struct sock *parent = subflow->conn;
struct mptcp_sock *msk;
+ trace_sk_data_ready(sk);
+
msk = mptcp_sk(parent);
if (state & TCPF_LISTEN) {
/* MPJ subflow are removed from accept queue before reaching here,
@@ -1679,7 +1712,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
if (err)
return err;
- lock_sock(sf->sk);
+ lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);
/* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk);
@@ -1792,79 +1825,6 @@ static void subflow_state_change(struct sock *sk)
}
}
-void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
-{
- struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
- struct mptcp_sock *msk, *next, *head = NULL;
- struct request_sock *req;
-
- /* build a list of all unaccepted mptcp sockets */
- spin_lock_bh(&queue->rskq_lock);
- for (req = queue->rskq_accept_head; req; req = req->dl_next) {
- struct mptcp_subflow_context *subflow;
- struct sock *ssk = req->sk;
- struct mptcp_sock *msk;
-
- if (!sk_is_mptcp(ssk))
- continue;
-
- subflow = mptcp_subflow_ctx(ssk);
- if (!subflow || !subflow->conn)
- continue;
-
- /* skip if already in list */
- msk = mptcp_sk(subflow->conn);
- if (msk->dl_next || msk == head)
- continue;
-
- msk->dl_next = head;
- head = msk;
- }
- spin_unlock_bh(&queue->rskq_lock);
- if (!head)
- return;
-
- /* can't acquire the msk socket lock under the subflow one,
- * or will cause ABBA deadlock
- */
- release_sock(listener_ssk);
-
- for (msk = head; msk; msk = next) {
- struct sock *sk = (struct sock *)msk;
- bool do_cancel_work;
-
- sock_hold(sk);
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- next = msk->dl_next;
- msk->first = NULL;
- msk->dl_next = NULL;
-
- do_cancel_work = __mptcp_close(sk, 0);
- release_sock(sk);
- if (do_cancel_work) {
- /* lockdep will report a false positive ABBA deadlock
- * between cancel_work_sync and the listener socket.
- * The involved locks belong to different sockets WRT
- * the existing AB chain.
- * Using a per socket key is problematic as key
- * deregistration requires process context and must be
- * performed at socket disposal time, in atomic
- * context.
- * Just tell lockdep to consider the listener socket
- * released here.
- */
- mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
- mptcp_cancel_work(sk);
- mutex_acquire(&listener_sk->sk_lock.dep_map,
- SINGLE_DEPTH_NESTING, 0, _RET_IP_);
- }
- sock_put(sk);
- }
-
- /* we are still under the listener msk socket lock */
- lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
-}
-
static int subflow_ulp_init(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1921,6 +1881,13 @@ static void subflow_ulp_release(struct sock *ssk)
* when the subflow is still unaccepted
*/
release = ctx->disposable || list_empty(&ctx->node);
+
+ /* inet_child_forget() does not call sk_state_change(),
+ * explicitly trigger the socket close machinery
+ */
+ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
+ &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
sock_put(sk);
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 65430f314a68..5bb924534387 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -134,7 +134,7 @@ int mptcp_token_new_request(struct request_sock *req)
/**
* mptcp_token_new_connect - create new key/idsn/token for subflow
- * @sk: the socket that will initiate a connection
+ * @ssk: the socket that will initiate a connection
*
* This function is called when a new outgoing mptcp connection is
* initiated.
@@ -148,11 +148,12 @@ int mptcp_token_new_request(struct request_sock *req)
*
* returns 0 on success.
*/
-int mptcp_token_new_connect(struct sock *sk)
+int mptcp_token_new_connect(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
int retries = MPTCP_TOKEN_MAX_RETRIES;
+ struct sock *sk = subflow->conn;
struct token_bucket *bucket;
again:
@@ -169,12 +170,13 @@ again:
}
pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
- sk, subflow->local_key, subflow->token, subflow->idsn);
+ ssk, subflow->local_key, subflow->token, subflow->idsn);
WRITE_ONCE(msk->token, subflow->token);
__sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
bucket->chain_len++;
spin_unlock_bh(&bucket->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -190,8 +192,10 @@ void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
struct mptcp_sock *msk)
{
struct mptcp_subflow_request_sock *pos;
+ struct sock *sk = (struct sock *)msk;
struct token_bucket *bucket;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
bucket = token_bucket(req->token);
spin_lock_bh(&bucket->lock);
@@ -370,12 +374,14 @@ void mptcp_token_destroy_request(struct request_sock *req)
*/
void mptcp_token_destroy(struct mptcp_sock *msk)
{
+ struct sock *sk = (struct sock *)msk;
struct token_bucket *bucket;
struct mptcp_sock *pos;
if (sk_unhashed((struct sock *)msk))
return;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
bucket = token_bucket(msk->token);
spin_lock_bh(&bucket->lock);
pos = __token_lookup_msk(bucket, msk->token);
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
index 5d984bec1cd8..0758865ab658 100644
--- a/net/mptcp/token_test.c
+++ b/net/mptcp/token_test.c
@@ -57,6 +57,9 @@ static struct mptcp_sock *build_msk(struct kunit *test)
KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk);
refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
sock_net_set((struct sock *)msk, &init_net);
+
+ /* be sure the token helpers can dereference sk->sk_prot */
+ ((struct sock *)msk)->sk_prot = &tcp_prot;
return msk;
}