From a014bc8f0f0a3a0cac4fef656f101cdfb77b71eb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 4 Dec 2007 00:43:24 -0800 Subject: [PATCH 1/8] [IPVS]: Don't leak sysctl tables if the scheduler registration fails. In case we load lblc or lblcr module we can leak some sysctl tables if the call to register_ip_vs_scheduler() fails. I've looked at the register_ip_vs_scheduler() code and saw, that the only reason to fail is the name collision, so I think that with some 3rd party schedulers this becomes a relevant issue. No? Signed-off-by: Pavel Emelyanov Acked-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_lblc.c | 7 ++++++- net/ipv4/ipvs/ip_vs_lblcr.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b843a11d7cf7..ad89644ef5d2 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -580,9 +580,14 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = static int __init ip_vs_lblc_init(void) { + int ret; + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); sysctl_header = register_sysctl_table(lblc_root_table); - return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; } diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index e5b323a6b2f7..2a5ed85a3352 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -769,9 +769,14 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = static int __init ip_vs_lblcr_init(void) { + int ret; + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); sysctl_header = register_sysctl_table(lblcr_root_table); - return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; } From 4ac63ad6c52e9cdefbcb54ec4575ab12b78b49d9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 4 Dec 2007 00:45:06 -0800 Subject: [PATCH 2/8] [IPVS]: Fix sched registration race when checking for name collision. The register_ip_vs_scheduler() checks for the scheduler with the same name under the read-locked __ip_vs_sched_lock, then drops, takes it for writing and puts the scheduler in list. This is racy, since we can have a race window between the lock being re-locked for writing. The fix is to search the scheduler with the given name right under the write-locked __ip_vs_sched_lock. Signed-off-by: Pavel Emelyanov Acked-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_sched.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 1602304abbf9..432235861908 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -183,19 +183,6 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - sched = ip_vs_sched_getbyname(scheduler->name); - if (sched) { - ip_vs_scheduler_put(sched); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", scheduler->name); - return -EINVAL; - } - write_lock_bh(&__ip_vs_sched_lock); if (scheduler->n_list.next != &scheduler->n_list) { @@ -206,6 +193,20 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", + scheduler->name); + return -EINVAL; + } + } /* * Add it into the d-linked scheduler list */ From 78f150bf94f5430fe8c34edeafe8d01706f38148 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 5 Dec 2007 02:18:15 -0800 Subject: [PATCH 3/8] [ROSE]: Trivial compilation CONFIG_INET=n case The rose_rebuild_header() consists only of some variables in case INET=n, and gcc will warn us about it. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/rose/rose_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 1b6741f1d746..12cfcf09556b 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -55,13 +55,13 @@ static int rose_header(struct sk_buff *skb, struct net_device *dev, static int rose_rebuild_header(struct sk_buff *skb) { +#ifdef CONFIG_INET struct net_device *dev = skb->dev; struct net_device_stats *stats = netdev_priv(dev); unsigned char *bp = (unsigned char *)skb->data; struct sk_buff *skbn; unsigned int len; -#ifdef CONFIG_INET if (arp_find(bp + 7, skb)) { return 1; } From 92b05e13f16a41405c4f6c953c47b6c4bcf82d30 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 5 Dec 2007 02:18:48 -0800 Subject: [PATCH 4/8] [IRDA]: Move ircomm_tty_line_info() under #ifdef CONFIG_PROC_FS The function in question is called only from ircomm_tty_read_proc, which is under this option. Move this helper to the same place. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/irda/ircomm/ircomm_tty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 1120b150e211..be627e1f04d8 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -1245,6 +1245,7 @@ static void ircomm_tty_flow_indication(void *instance, void *sap, self->flow = cmd; } +#ifdef CONFIG_PROC_FS static int ircomm_tty_line_info(struct ircomm_tty_cb *self, char *buf) { int ret=0; @@ -1354,7 +1355,6 @@ static int ircomm_tty_line_info(struct ircomm_tty_cb *self, char *buf) * * */ -#ifdef CONFIG_PROC_FS static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused) { From 3e6f049e0c4cf0606207c1a210abf50b436e9adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=E4rvinen?= Date: Wed, 5 Dec 2007 02:20:21 -0800 Subject: [PATCH 5/8] [TCP] FRTO: Use of existing funcs make code more obvious & robust MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Though there's little need for everything that tcp_may_send_now does (actually, even the state had to be adjusted to pass some checks FRTO does not want to occur), it's more robust to let it make the decision if sending is allowed. State adjustments needed: - Make sure snd_cwnd limit is not hit in there - Disable nagle (if necessary) through the frto_counter == 2 The result of check for frto_counter in argument to call for tcp_enter_frto_loss can just be open coded, therefore there isn't need to store the previous frto_counter past tcp_may_send_now. In addition, returns can then be combined. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0f0c1c9829a1..094f8fafaf53 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3003,17 +3003,13 @@ static int tcp_process_frto(struct sock *sk, int flag) } if (tp->frto_counter == 1) { - /* Sending of the next skb must be allowed or no F-RTO */ - if (!tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), - flag); - return 1; - } - + /* tcp_may_send_now needs to see updated state */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; + + if (!tcp_may_send_now(sk)) + tcp_enter_frto_loss(sk, 2, flag); + return 1; } else { switch (sysctl_tcp_frto_response) { From 52d3408150858a301a84bcbfe2f323d90d71d2ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=E4rvinen?= Date: Wed, 5 Dec 2007 02:21:35 -0800 Subject: [PATCH 6/8] [TCP]: Move prior_in_flight collect to more robust place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous location is after sacktag processing, which affects counters tcp_packets_in_flight depends on. This may manifest as wrong behavior if new SACK blocks are present and all is clear for call to tcp_cong_avoid, which in the case of tcp_reno_cong_avoid bails out early because it thinks that TCP is not limited by cwnd. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 094f8fafaf53..b9e429d2d1de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3065,6 +3065,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + prior_in_flight = tcp_packets_in_flight(tp); if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. @@ -3104,8 +3105,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (!prior_packets) goto no_queue; - prior_in_flight = tcp_packets_in_flight(tp); - /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets); From 4e67d876ce07471e02be571038d5435a825f0215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=E4rvinen?= Date: Wed, 5 Dec 2007 02:25:32 -0800 Subject: [PATCH 7/8] [TCP]: NAGLE_PUSH seems to be a wrong way around MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in tcp_nagle_test suggests that. This bug is very very old, even 2.4.0 seems to have it. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e5130a7fe181..f4c1eef89af0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1162,8 +1162,7 @@ int tcp_may_send_now(struct sock *sk) return (skb && tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), (tcp_skb_is_last(sk, skb) ? - TCP_NAGLE_PUSH : - tp->nonagle))); + tp->nonagle : TCP_NAGLE_PUSH))); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet From 621544eb8c3beaa859c75850f816dd9b056a00a3 Mon Sep 17 00:00:00 2001 From: Andrew Gallatin Date: Wed, 5 Dec 2007 02:31:42 -0800 Subject: [PATCH 8/8] [LRO]: fix lro_gen_skb() alignment Add a field to the lro_mgr struct so that drivers can specify how much padding is required to align layer 3 headers when a packet is copied into a freshly allocated skb by inet_lro.c:lro_gen_skb(). Without padding, skbs generated by LRO will cause alignment warnings on architectures which require strict alignment (seen on sparc64). Myri10GE is updated to use this field. Signed-off-by: Andrew Gallatin Signed-off-by: David S. Miller --- drivers/net/myri10ge/myri10ge.c | 1 + include/linux/inet_lro.h | 3 +++ net/ipv4/inet_lro.c | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 0f306ddb5630..8def8657251f 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1979,6 +1979,7 @@ static int myri10ge_open(struct net_device *dev) lro_mgr->lro_arr = mgp->rx_done.lro_desc; lro_mgr->get_frag_header = myri10ge_get_frag_header; lro_mgr->max_aggr = myri10ge_lro_max_pkts; + lro_mgr->frag_align_pad = 2; if (lro_mgr->max_aggr > MAX_SKB_FRAGS) lro_mgr->max_aggr = MAX_SKB_FRAGS; diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h index 1246d46abbc0..80335b7d77c5 100644 --- a/include/linux/inet_lro.h +++ b/include/linux/inet_lro.h @@ -91,6 +91,9 @@ struct net_lro_mgr { int max_desc; /* Max number of LRO descriptors */ int max_aggr; /* Max number of LRO packets to be aggregated */ + int frag_align_pad; /* Padding required to properly align layer 3 + * headers in generated skb when using frags */ + struct net_lro_desc *lro_arr; /* Array of LRO descriptors */ /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index ac3b1d3dba2e..9a96c277393d 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -401,10 +401,11 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, int data_len = len; int hdr_len = min(len, hlen); - skb = netdev_alloc_skb(lro_mgr->dev, hlen); + skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); if (!skb) return NULL; + skb_reserve(skb, lro_mgr->frag_align_pad); skb->len = len; skb->data_len = len - hdr_len; skb->truesize += true_size;