From a7eea416cb08a514f94c0ca5ff30c18783fab054 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Jun 2015 09:15:15 -0700
Subject: [PATCH 1/5] tcp: reserve tcp_skb_mss() to tcp stack

tcp_gso_segment() and tcp_gro_receive() are not strictly part of the
TCP stack, so they should not assume tcp_skb_mss(skb) is in fact
skb_shinfo(skb)->gso_size.

This will allow us to change tcp_skb_mss() in the following patches.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_offload.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 3f7c2fca5431..9864a2dbadce 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	oldlen = (u16)~skb->len;
 	__skb_pull(skb, thlen);

-	mss = tcp_skb_mss(skb);
+	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
 		goto out;

@@ -242,7 +242,7 @@ found:
 	flush |= *(u32 *)((u8 *)th + i) ^
 		 *(u32 *)((u8 *)th2 + i);

-	mss = tcp_skb_mss(p);
+	mss = skb_shinfo(p)->gso_size;

 	flush |= (len - 1) >= mss;
 	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
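[Editor's sketch] To make the distinction patch 1 draws concrete, here is a
minimal standalone C model (not kernel code; the struct layouts and field
names are illustrative). After patch 4 below, tcp_skb_mss() reads a copy of
the MSS kept in TCP's private control block, which is only meaningful while
the skb sits in the TCP write queue; offload code must keep reading the
authoritative shinfo->gso_size:

/* Standalone model, not kernel code. Layouts are illustrative only. */
#include <stdint.h>
#include <stdio.h>

struct shared_info {            /* models skb_shinfo(skb) */
	uint16_t gso_size;
};

struct tcp_cb {                 /* models TCP_SKB_CB(skb) */
	uint16_t tcp_gso_segs;
	uint16_t tcp_gso_size;  /* valid only while in the write queue */
};

struct skb {
	struct tcp_cb cb;
	struct shared_info shinfo;
};

/* TCP-stack accessor: reads the cache-hot control-block copy. */
static uint16_t tcp_skb_mss(const struct skb *skb)
{
	return skb->cb.tcp_gso_size;
}

/* Offload (GSO/GRO) accessor: must use shinfo, the authoritative field. */
static uint16_t offload_mss(const struct skb *skb)
{
	return skb->shinfo.gso_size;
}

int main(void)
{
	/* An skb outside the write queue: the cb copy may be stale. */
	struct skb skb = { .cb = { 2, 0 }, .shinfo = { 1448 } };

	printf("offload sees mss=%u, tcp_skb_mss() would see %u\n",
	       (unsigned)offload_mss(&skb), (unsigned)tcp_skb_mss(&skb));
	return 0;
}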
From 51466a7545b73b7ad7bcfb33410d2823ccfaa501 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Jun 2015 09:15:16 -0700
Subject: [PATCH 2/5] tcp: fill shinfo->gso_type at last moment

Our goal is to touch skb_shinfo(skb) only when absolutely needed, to
avoid two cache line misses in the TCP output path for the last skb
that is considered but not sent because of various conditions (cwnd,
tso defer, receiver window, TSQ...)

A packet is GSO only when skb_shinfo(skb)->gso_size is not zero.

We can set skb_shinfo(skb)->gso_type to sk->sk_gso_type even for
non-GSO packets.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c  | 8 ++------
 net/ipv4/tcp_output.c | 4 +---
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d4f76ab6e136..70a6fa8ecbd3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * code can come after this skb later on it's better to keep
 	 * setting gso_size to something.
 	 */
-	if (!skb_shinfo(prev)->gso_size) {
+	if (!skb_shinfo(prev)->gso_size)
 		skb_shinfo(prev)->gso_size = mss;
-		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
-	}

 	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
-	if (tcp_skb_pcount(skb) <= 1) {
+	if (tcp_skb_pcount(skb) <= 1)
 		skb_shinfo(skb)->gso_size = 0;
-		skb_shinfo(skb)->gso_type = 0;
-	}

 	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
 	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index eeb59befaf06..a51f7aab27d6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -412,7 +412,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	tcp_skb_pcount_set(skb, 1);
 	shinfo->gso_size = 0;
-	shinfo->gso_type = 0;

 	TCP_SKB_CB(skb)->seq = seq;
 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -1003,6 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	}

 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
 	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
 		tcp_ecn_send(sk, skb, tcp_header_size);

@@ -1080,11 +1080,9 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 		 */
 		tcp_skb_pcount_set(skb, 1);
 		shinfo->gso_size = 0;
-		shinfo->gso_type = 0;
 	} else {
 		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
 		shinfo->gso_size = mss_now;
-		shinfo->gso_type = sk->sk_gso_type;
 	}
 }
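[Editor's sketch] The "fill at last moment" pattern in patch 2 can be modeled
outside the kernel as follows (a standalone sketch, not kernel code; all
names here are illustrative). Per-packet metadata lives in a hot control
block while the packet waits in the queue, and the cold shared-info area is
written only when the packet is actually handed to the device:

/* Standalone sketch, not kernel code. Names are illustrative only. */
#include <stdio.h>

struct shared_info { unsigned int gso_type; };

struct packet {
	unsigned int len;
	unsigned int gso_type_for_dev; /* hot copy, e.g. sk->sk_gso_type */
	struct shared_info shinfo;     /* cold: imagine a separate cache line */
};

/* Queueing a packet no longer writes shinfo at all... */
static void queue_packet(struct packet *p, unsigned int gso_type)
{
	p->gso_type_for_dev = gso_type;
}

/* ...only the transmit path, for packets that really go out, fills it in. */
static void transmit_packet(struct packet *p)
{
	p->shinfo.gso_type = p->gso_type_for_dev;
	printf("sent %u bytes, gso_type=%u\n", p->len, p->shinfo.gso_type);
}

int main(void)
{
	struct packet p = { .len = 1448 };

	queue_packet(&p, 1 /* say, a TCPv4 GSO type */);
	/* A packet blocked by cwnd/TSQ would stop here: shinfo untouched. */
	transmit_packet(&p);
	return 0;
}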
From 5bbb432c896d23ce8f41f38e88dbd38982df99f9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Jun 2015 09:15:17 -0700
Subject: [PATCH 3/5] tcp: tcp_set_skb_tso_segs() no longer needs struct sock parameter

tcp_set_skb_tso_segs() & tcp_init_tso_segs() no longer use the sock
pointer.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a51f7aab27d6..d12888581337 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1066,8 +1066,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 }

 /* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
-				 unsigned int mss_now)
+static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);

@@ -1214,8 +1213,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	old_factor = tcp_skb_pcount(skb);

 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb, mss_now);
-	tcp_set_skb_tso_segs(sk, buff, mss_now);
+	tcp_set_skb_tso_segs(skb, mss_now);
+	tcp_set_skb_tso_segs(buff, mss_now);

 	/* If this packet has been sent out already, we must
 	 * adjust the various packet counters.
@@ -1295,7 +1294,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	/* Any change of skb->len requires recalculation of tso factor. */
 	if (tcp_skb_pcount(skb) > 1)
-		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

 	return 0;
 }
@@ -1627,13 +1626,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  * This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
-			     unsigned int mss_now)
+static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);

 	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
-		tcp_set_skb_tso_segs(sk, skb, mss_now);
+		tcp_set_skb_tso_segs(skb, mss_now);
 		tso_segs = tcp_skb_pcount(skb);
 	}
 	return tso_segs;
@@ -1688,7 +1686,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cwnd_quota;

-	tcp_init_tso_segs(sk, skb, cur_mss);
+	tcp_init_tso_segs(skb, cur_mss);

 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
 		return 0;
@@ -1757,8 +1755,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	tcp_fragment_tstamp(skb, buff);

 	/* Fix up tso_factor for both original and new SKB. */
-	tcp_set_skb_tso_segs(sk, skb, mss_now);
-	tcp_set_skb_tso_segs(sk, buff, mss_now);
+	tcp_set_skb_tso_segs(skb, mss_now);
+	tcp_set_skb_tso_segs(buff, mss_now);

 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
@@ -1992,7 +1990,7 @@ static int tcp_mtu_probe(struct sock *sk)
 						 skb->len, 0);
 		} else {
 			__pskb_trim_head(skb, copy);
-			tcp_set_skb_tso_segs(sk, skb, mss_now);
+			tcp_set_skb_tso_segs(skb, mss_now);
 		}
 		TCP_SKB_CB(skb)->seq += copy;
 	}
@@ -2002,7 +2000,7 @@ static int tcp_mtu_probe(struct sock *sk)
 		if (len >= probe_size)
 			break;
 	}
-	tcp_init_tso_segs(sk, nskb, nskb->len);
+	tcp_init_tso_segs(nskb, nskb->len);

 	/* We're ready to send. If this fails, the probe will
 	 * be resegmented into mss-sized pieces by tcp_write_xmit().
@@ -2064,7 +2062,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;

-		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);

 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
@@ -2618,7 +2616,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (unlikely(oldpcount > 1)) {
 			if (skb_unclone(skb, GFP_ATOMIC))
 				return -ENOMEM;
-			tcp_init_tso_segs(sk, skb, cur_mss);
+			tcp_init_tso_segs(skb, cur_mss);
 			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
 		}
 	}
@@ -3455,7 +3453,7 @@ int tcp_write_wakeup(struct sock *sk, int mib)
 			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 	} else if (!tcp_skb_pcount(skb))
-		tcp_set_skb_tso_segs(sk, skb, mss);
+		tcp_set_skb_tso_segs(skb, mss);

 	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
 	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
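[Editor's sketch] Patch 3 can drop the sock argument because the two helpers
depend only on the skb and the current MSS. A standalone model of the
segment accounting they perform (not kernel code; types and names are
simplified for illustration):

/* Standalone sketch, not kernel code. Structures are simplified. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct skb { unsigned int len; int pcount; unsigned int mss; };

static void set_tso_segs(struct skb *skb, unsigned int mss_now)
{
	if (skb->len <= mss_now) {
		skb->pcount = 1;   /* avoid the divide in the non-TSO case */
		skb->mss = 0;
	} else {
		skb->pcount = DIV_ROUND_UP(skb->len, mss_now);
		skb->mss = mss_now;
	}
}

static int init_tso_segs(struct skb *skb, unsigned int mss_now)
{
	/* Recompute only if unset, or computed against a stale MSS. */
	if (!skb->pcount || (skb->pcount > 1 && skb->mss != mss_now))
		set_tso_segs(skb, mss_now);
	return skb->pcount;
}

int main(void)
{
	struct skb skb = { .len = 10000 };

	printf("segs=%d\n", init_tso_segs(&skb, 1448)); /* ceil(10000/1448) = 7 */
	return 0;
}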
From f69ad292cfd13aa7ee00847320c6bb9ba2154e87 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Jun 2015 09:15:18 -0700
Subject: [PATCH 4/5] tcp: fill shinfo->gso_size at last moment

In commit cd7d8498c9a5 ("tcp: change tcp_skb_pcount() location") we
stored gso_segs in a temporary cache hot location. This patch does the
same for gso_size.

This allows us to save two cache line misses in the TCP xmit path for
the last packet that is considered but not sent because of various
conditions (cwnd, tso defer, receiver window, TSQ...)

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/net/tcp.h     | 13 ++++++++-----
 net/ipv4/tcp_input.c  |  8 ++++----
 net/ipv4/tcp_output.c | 12 ++++--------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 978cebedd3fc..950cfecaad3c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -730,11 +730,14 @@ struct tcp_skb_cb {
 	/* Note : tcp_tw_isn is used in input path only
 	 *	  (isn chosen by tcp_timewait_state_process())
 	 *
-	 * tcp_gso_segs is used in write queue only,
-	 * cf tcp_skb_pcount()
+	 * tcp_gso_segs/size are used in write queue only,
+	 * cf tcp_skb_pcount()/tcp_skb_mss()
 	 */
 	__u32 tcp_tw_isn;
-	__u32 tcp_gso_segs;
+	struct {
+		u16 tcp_gso_segs;
+		u16 tcp_gso_size;
+	};
 	};
 	__u8 tcp_flags;	/* TCP header flags. (tcp[13]) */
@@ -790,10 +793,10 @@ static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
 	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
 }

-/* This is valid iff tcp_skb_pcount() > 1. */
+/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
 static inline int tcp_skb_mss(const struct sk_buff *skb)
 {
-	return skb_shinfo(skb)->gso_size;
+	return TCP_SKB_CB(skb)->tcp_gso_size;
 }

 /* Events passed to congestion control interface */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 70a6fa8ecbd3..684f095d196e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1316,12 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * code can come after this skb later on it's better to keep
 	 * setting gso_size to something.
 	 */
-	if (!skb_shinfo(prev)->gso_size)
-		skb_shinfo(prev)->gso_size = mss;
+	if (!TCP_SKB_CB(prev)->tcp_gso_size)
+		TCP_SKB_CB(prev)->tcp_gso_size = mss;

 	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
 	if (tcp_skb_pcount(skb) <= 1)
-		skb_shinfo(skb)->gso_size = 0;
+		TCP_SKB_CB(skb)->tcp_gso_size = 0;

 	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
 	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -2248,7 +2248,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 		    (oldcnt >= packets))
 			break;

-		mss = skb_shinfo(skb)->gso_size;
+		mss = tcp_skb_mss(skb);
 		err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
 				   mss, GFP_ATOMIC);
 		if (err < 0)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d12888581337..787f57ff87c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -402,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
-	struct skb_shared_info *shinfo = skb_shinfo(skb);
-
 	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->csum = 0;

@@ -411,7 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	TCP_SKB_CB(skb)->sacked = 0;

 	tcp_skb_pcount_set(skb, 1);
-	shinfo->gso_size = 0;

 	TCP_SKB_CB(skb)->seq = seq;
 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -1028,8 +1025,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			    tcp_skb_pcount(skb));

 	tp->segs_out += tcp_skb_pcount(skb);
-	/* OK, its time to fill skb_shinfo(skb)->gso_segs */
+	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

 	/* Our usage of tstamp should remain private */
 	skb->tstamp.tv64 = 0;
@@ -1068,8 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 /* Initialize TSO segments for a packet. */
 static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
-	struct skb_shared_info *shinfo = skb_shinfo(skb);
-
 	/* Make sure we own this skb before messing gso_size/gso_segs */
 	WARN_ON_ONCE(skb_cloned(skb));

@@ -1078,10 +1074,10 @@ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 	 * non-TSO case.
 	 */
 		tcp_skb_pcount_set(skb, 1);
-		shinfo->gso_size = 0;
+		TCP_SKB_CB(skb)->tcp_gso_size = 0;
 	} else {
 		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
-		shinfo->gso_size = mss_now;
+		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
 	}
 }
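[Editor's sketch] The layout trick in patch 4's include/net/tcp.h hunk is
that the two u16 fields are overlaid on the slot the input-path-only __u32
tcp_tw_isn already occupies, so caching gso_size in the control block costs
no extra room in skb->cb[]. A standalone C model with a compile-time size
check (not kernel code; the union name is illustrative):

/* Standalone sketch, not kernel code. "gso_slot" is an illustrative name. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

union gso_slot {
	uint32_t tcp_tw_isn;           /* input path only */
	struct {                       /* write queue only */
		uint16_t tcp_gso_segs;
		uint16_t tcp_gso_size;
	};
};

int main(void)
{
	/* The anonymous struct must not grow the slot beyond the __u32. */
	static_assert(sizeof(union gso_slot) == sizeof(uint32_t),
		      "the u16 pair must fit in the u32 slot");

	union gso_slot s = { .tcp_gso_segs = 7 };

	s.tcp_gso_size = 1448;
	printf("segs=%u size=%u\n",
	       (unsigned)s.tcp_gso_segs, (unsigned)s.tcp_gso_size);
	return 0;
}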
From b5e2c45783aa785cbb195e43d5f0c0c6b228bde4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 11 Jun 2015 09:15:19 -0700
Subject: [PATCH 5/5] tcp: remove obsolete check in tcp_set_skb_tso_segs()

We had various issues in the past when the TCP stack was modifying
gso_size/gso_segs while clones were in flight.

Commit c52e2421f73 ("tcp: must unclone packets before mangling them")
fixed these bugs and added a WARN_ON_ONCE(skb_cloned(skb)) in
tcp_set_skb_tso_segs().

With those bugs fixed, and because the TCP stack now only sets
shinfo->gso_size|segs on the clone itself, the check can be removed.

As a result of this change, the compiler inlines tcp_set_skb_tso_segs()
into tcp_init_tso_segs().

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_output.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 787f57ff87c4..b1c218df2c85 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1066,9 +1066,6 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 /* Initialize TSO segments for a packet. */
 static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
-	/* Make sure we own this skb before messing gso_size/gso_segs */
-	WARN_ON_ONCE(skb_cloned(skb));
-
 	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
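[Editor's sketch] The invariant that makes patch 5's WARN_ON_ONCE removable
is "unclone before mangling": shared data is made private before anyone
writes to it, so an "am I cloned?" check at write time can never fire. A
standalone model of that pattern (not kernel code; the refcounting and all
names are illustrative):

/* Standalone sketch, not kernel code. Refcounting is illustrative only. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct shared_data { int refs; unsigned int gso_size; };

struct skb {
	struct shared_data *data;  /* models the shared skb data area */
};

/* Models skb_unclone(): give the skb a private copy if data is shared. */
static int unclone(struct skb *skb)
{
	if (skb->data->refs > 1) {
		struct shared_data *priv = malloc(sizeof(*priv));

		if (!priv)
			return -1;
		memcpy(priv, skb->data, sizeof(*priv));
		priv->refs = 1;
		skb->data->refs--;
		skb->data = priv;
	}
	return 0;
}

int main(void)
{
	struct shared_data d = { .refs = 2, .gso_size = 0 }; /* skb + 1 clone */
	struct skb skb = { .data = &d };

	if (unclone(&skb))          /* callers unclone before mangling... */
		return 1;
	skb.data->gso_size = 1448;  /* ...so this write never hits shared data */

	printf("private gso_size=%u, original still %u\n",
	       skb.data->gso_size, d.gso_size);
	free(skb.data);
	return 0;
}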