From bc9f38c8328e10c22cb2016d6131ea36141c8d11 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:13 -0700 Subject: [PATCH 1/8] tcp: avoid unconditional congestion window undo on SYN retransmit Previously if an active TCP open has SYN timeout, it always undo the cwnd upon receiving the SYNACK. This is because tcp_clean_rtx_queue would reset tp->retrans_stamp when SYN is acked, which fools then tcp_try_undo_loss and tcp_packet_delayed. Addressing this issue is required to properly support undo for spurious SYN timeout. Fixing this is tricky -- for active TCP open tp->retrans_stamp records the time when the handshake starts, not the first retransmission time as the name may suggest. The simplest fix is for tcp_packet_delayed to ensure it is valid before comparing with other timestamp. One side effect of this change is active TCP Fast Open that incurred SYN timeout. Upon receiving a SYN-ACK that only acknowledged the SYN, it would immediately retransmit unacknowledged data in tcp_ack() because the data is marked lost after SYN timeout. But the retransmission would have an incorrect ack sequence number since rcv_nxt has not been updated yet tcp_rcv_synsent_state_process(), the retransmission needs to properly handed by tcp_rcv_fastopen_synack() like before. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 97671bff597a..e2cbfc3ffa3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2252,7 +2252,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { - return !tp->retrans_stamp || + return tp->retrans_stamp && tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } @@ -3521,7 +3521,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (rexmit == REXMIT_NONE) + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; if (unlikely(rexmit == 2)) { From 7c1f08154c4e34d10be41156375ce2b8ab591b0f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:14 -0700 Subject: [PATCH 2/8] tcp: undo initial congestion window on false SYN timeout Linux implements RFC6298 and use an initial congestion window of 1 upon establishing the connection if the SYN packet is retransmitted 2 or more times. In cellular networks SYN timeouts are often spurious if the wireless radio was dormant or idle. Also some network path is longer than the default SYN timeout. Having a minimal cwnd on both cases are detrimental to TCP startup performance. This patch extends TCP undo feature (RFC3522 aka TCP Eifel) to detect spurious SYN timeout via TCP timestamps. Since tp->retrans_stamp records the initial SYN timestamp instead of first retransmission, we have to implement a different undo code additionally. The detection also must happen before tcp_ack() as retrans_stamp is reset when SYN is acknowledged. Note this patch covers both active regular and fast open. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 16 ++++++++++++++++ net/ipv4/tcp_metrics.c | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2cbfc3ffa3f..695f840acc14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5748,6 +5748,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp) #endif } +static void tcp_try_undo_spurious_syn(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 syn_stamp; + + /* undo_marker is set when SYN or SYNACK times out. The timeout is + * spurious if the ACK's timestamp option echo value matches the + * original SYN timestamp. + */ + syn_stamp = tp->retrans_stamp; + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && + syn_stamp == tp->rx_opt.rcv_tsecr) + tp->undo_marker = 0; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5815,6 +5830,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f262f2cace29..d4d687330e2b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -517,7 +517,7 @@ reset: * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK * retransmission has occurred. */ - if (tp->total_retrans > 1) + if (tp->total_retrans > 1 && tp->undo_marker) tp->snd_cwnd = 1; else tp->snd_cwnd = tcp_init_cwnd(tp, dst); From 9e450c1ecb027417c99eba651413d2a6ba6ffc1f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:15 -0700 Subject: [PATCH 3/8] tcp: better SYNACK sent timestamp Detecting spurious SYNACK timeout using timestamp option requires recording the exact SYNACK skb timestamp. Previously the SYNACK sent timestamp was stamped slightly earlier before the skb was transmitted. This patch uses the SYNACK skb transmission timestamp directly. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 695f840acc14..30c6a42b1f5b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6319,7 +6319,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32061928b054..0c4ed66dc1bf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3247,7 +3247,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif + { skb->skb_mstamp_ns = tcp_clock_ns(); + if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ + tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); + } #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); From 336c39a0315139103712d04b9bfaf0215df23b8e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:16 -0700 Subject: [PATCH 4/8] tcp: undo init congestion window on false SYNACK timeout Linux implements RFC6298 and use an initial congestion window of 1 upon establishing the connection if the SYNACK packet is retransmitted 2 or more times. In cellular networks SYNACK timeouts are often spurious if the wireless radio was dormant or idle. Also some network path is longer than the default SYNACK timeout. In both cases falsely starting with a minimal cwnd are detrimental to performance. This patch avoids doing so when the final ACK's TCP timestamp indicates the original SYNACK was delivered. It remembers the original SYNACK timestamp when SYNACK timeout has occurred and re-uses the function to detect spurious SYN timeout conveniently. Note that a server may receives multiple SYNs from and immediately retransmits SYNACKs without any SYNACK timeout. This often happens on when the client SYNs have timed out due to wireless delay above. In this case since the server will still use the default initial congestion (e.g. 10) because tp->undo_marker is reset in tcp_init_metrics(). This is an intentional design because packets are not lost but delayed. This patch only covers regular TCP passive open. Fast Open is supported in the next patch. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ net/ipv4/tcp_minisocks.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 30c6a42b1f5b..53b4c5a3113b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6101,6 +6101,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ tcp_rearm_rto(sk); } else { + tcp_try_undo_spurious_syn(sk); + tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79900f783e0d..9c2a0d36fb20 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -522,6 +522,11 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + if (req->num_timeout) { + newtp->undo_marker = treq->snt_isn; + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ From 8c3cfe19feac41065bb88bc14b36c318b26847a9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:17 -0700 Subject: [PATCH 5/8] tcp: lower congestion window on Fast Open SYNACK timeout TCP sender would use congestion window of 1 packet on the second SYN and SYNACK timeout except passive TCP Fast Open. This makes passive TFO too aggressive and unfair during congestion at handshake. This patch fixes this issue so TCP (fast open or not, passive or active) always conforms to the RFC6298. Note that tcp_enter_loss() is called only once during recurring timeouts. This is because during handshake, high_seq and snd_una are the same so tcp_enter_loss() would incorrect set the undo state variables multiple times. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f0c86398e6a7..2ac23da42dd2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -393,6 +393,9 @@ static void tcp_fastopen_synack_timer(struct sock *sk) tcp_write_err(sk); return; } + /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ + if (icsk->icsk_retransmits == 1) + tcp_enter_loss(sk); /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error * returned from rtx_syn_ack() to make it more persistent like * regular retransmit because if the child socket has been accepted From 794200d66273cbfa32cab2dbcd59a5db6b57a5d1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:18 -0700 Subject: [PATCH 6/8] tcp: undo cwnd on Fast Open spurious SYNACK retransmit This patch makes passive Fast Open reverts the cwnd to default initial cwnd (10 packets) if the SYNACK timeout is spurious. Passive Fast Open uses a full socket during handshake so it can use the existing undo logic to detect spurious retransmission by recording the first SYNACK timeout in key state variable retrans_stamp. Upon receiving the ACK of the SYNACK, if the socket has sent some data before the timeout, the spurious timeout is detected by tcp_try_undo_recovery() in tcp_process_loss() in tcp_ack(). But if the socket has not send any data yet, tcp_ack() does not execute the undo code since no data is acknowledged. The fix is to check such case explicitly after tcp_ack() during the ACK processing in SYN_RECV state. In addition this is checked in FIN_WAIT_1 state in case the server closes the socket before handshake completes. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53b4c5a3113b..3a40584cb473 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6089,6 +6089,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * so release it. */ if (req) { + tcp_try_undo_loss(sk, false); inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. @@ -6143,6 +6144,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * our SYNACK so stop the SYNACK timer. */ if (req) { + tcp_try_undo_loss(sk, false); + inet_csk(sk)->icsk_retransmits = 0; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); From 6b94b1c88b660a786fdb1c22d8a0d3529fe40f8c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:19 -0700 Subject: [PATCH 7/8] tcp: refactor to consolidate TFO passive open code Use a helper to consolidate two identical code block for passive TFO. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 52 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a40584cb473..706a99ec73f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5989,6 +5989,27 @@ reset_and_undo: return 1; } +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) +{ + tcp_try_undo_loss(sk, false); + inet_csk(sk)->icsk_retransmits = 0; + + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, + * we no longer need req so release it. + */ + reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); +} + /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -6085,22 +6106,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); - /* Once we leave TCP_SYN_RECV, we no longer need req - * so release it. - */ if (req) { - tcp_try_undo_loss(sk, false); - inet_csk(sk)->icsk_retransmits = 0; - reqsk_fastopen_remove(sk, req, false); - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); + tcp_rcv_synrecv_state_fastopen(sk); } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; @@ -6138,18 +6145,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_FIN_WAIT1: { int tmo; - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. - */ - if (req) { - tcp_try_undo_loss(sk, false); - inet_csk(sk)->icsk_retransmits = 0; - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } + if (req) + tcp_rcv_synrecv_state_fastopen(sk); + if (tp->snd_una != tp->write_seq) break; From 98fa6271cfcb1de873b3fe0caf48d9daa1bcc0ac Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 29 Apr 2019 15:46:20 -0700 Subject: [PATCH 8/8] tcp: refactor setting the initial congestion window Relocate the congestion window initialization from tcp_init_metrics() to tcp_init_transfer() to improve code readability. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ------------ net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++++++ net/ipv4/tcp_metrics.c | 10 ---------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f7567a3698eb..1fa15beb8380 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,18 +457,6 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -void tcp_init_transfer(struct sock *sk, int bpf_op) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_mtup_init(sk); - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op, 0, NULL); - tcp_init_congestion_control(sk); - tcp_init_buffer_space(sk); -} - static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 706a99ec73f6..077d9abdfcf5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5647,6 +5647,32 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + + /* Initialize the congestion window to start the transfer. + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1 && tp->undo_marker) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); + tp->snd_cwnd_stamp = tcp_jiffies32; + + tcp_call_bpf(sk, bpf_op, 0, NULL); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d4d687330e2b..c4848e7a0aad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -512,16 +512,6 @@ reset: inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. - */ - if (tp->total_retrans > 1 && tp->undo_marker) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)