diff options
author | Eric Dumazet <edumazet@google.com> | 2025-05-13 19:39:10 +0000 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2025-05-15 11:30:08 -0700 |
commit | 65c5287892e9a881e41758cbf071df6ec9c24a76 (patch) | |
tree | 7776d47aeb29ac97b639bb9ebd326c2a98d368e8 /net/ipv4/tcp_input.c | |
parent | c1269d3d12b88151ee4c109624b5022d53a11738 (diff) |
tcp: fix sk_rcvbuf overshoot
Current autosizing in tcp_rcv_space_adjust() is too aggressive.
Instead of betting on possible losses and overestimating BDP,
it is better to only account for slow start.
The following patch then adds more precise tuning
in the event of packet losses.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250513193919.1089692-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 59 |
1 files changed, 25 insertions, 34 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88beb6d0f7b5..89e886bb0fa1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -747,6 +747,29 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	}
 }
 
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.
@@ -771,42 +794,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	trace_tcp_rcvbuf_grow(sk, time);
 
-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;
 
+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;