tcp: fix sk_rcvbuf overshoot

Current autosizing in tcp_rcv_space_adjust() is too aggressive. Instead of betting on possible losses and over estimate BDP, it is better to only account for slow start. The following patch is then adding a more precise tuning in the events of packet losses. Signed-off-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250513193919.1089692-3-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
author: Eric Dumazet <edumazet@google.com> 2025-05-13 19:39:10 +0000
committer: Jakub Kicinski <kuba@kernel.org> 2025-05-15 11:30:08 -0700
commit: 65c5287892e9a881e41758cbf071df6ec9c24a76 (patch)
tree: 7776d47aeb29ac97b639bb9ebd326c2a98d368e8 /net/ipv4/tcp_input.c
parent: c1269d3d12b88151ee4c109624b5022d53a11738 (diff)
1 files changed, 25 insertions, 34 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88beb6d0f7b5..89e886bb0fa1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -747,6 +747,29 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	}
 }
 
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along.  */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.
@@ -771,42 +794,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	trace_tcp_rcvbuf_grow(sk, time);
 
-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along.  */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;
 
+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;
author	Eric Dumazet <edumazet@google.com>	2025-05-13 19:39:10 +0000
committer	Jakub Kicinski <kuba@kernel.org>	2025-05-15 11:30:08 -0700
commit	65c5287892e9a881e41758cbf071df6ec9c24a76 (patch)
tree	7776d47aeb29ac97b639bb9ebd326c2a98d368e8 /net/ipv4/tcp_input.c
parent	c1269d3d12b88151ee4c109624b5022d53a11738 (diff)