diff -ru linux-2.6.14.4/include/linux/sysctl.h linux-2.6.14.4-iip/include/linux/sysctl.h
--- linux-2.6.14.4/include/linux/sysctl.h	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/include/linux/sysctl.h	Thu May 18 15:48:54 2006
@@ -353,6 +353,19 @@
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
 	NET_TCP_CONG_CONTROL=110,
+
+	NET_IPV4_TCP_INITIAL_WINDOW,
+	NET_IPV4_TCP_APPLICATION_LIMITED,
+	NET_IPV4_TCP_DELAYED_ACK,
+	NET_IPV4_TCP_QUICKACKS,
+	NET_IPV4_TCP_RFC3517_PIPE,
+	NET_IPV4_TCP_RATE_HALVING,
+	NET_IPV4_TCP_LIMITED_TRANSMIT,
+	NET_IPV4_TCP_CBI_REUSE_SSTHRESH,
+	NET_IPV4_TCP_CBI_REUSE_RTT,
+	NET_IPV4_TCP_CBI_REUSE_REORDER,
+	NET_IPV4_TCP_RFC2988,
+	NET_IPV4_TCP_SACKFRTO
 };
 
 enum {
diff -ru linux-2.6.14.4/include/linux/tcp.h linux-2.6.14.4-iip/include/linux/tcp.h
--- linux-2.6.14.4/include/linux/tcp.h	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/include/linux/tcp.h	Thu May 18 14:58:37 2006
@@ -263,6 +263,7 @@
 	__u32	rcv_ssthresh;	/* Current window clamp */
 
 	__u32	frto_highmark;	/* snd_nxt when RTO occurred */
+	int	frto_origsacked;/* SACK for not retransmitted data arrived after RTO */
 	__u8	reordering;	/* Packet reordering metric. */
 	__u8	frto_counter;	/* Number of new acks after RTO */
 	__u8	nonagle;	/* Disable Nagle algorithm? */
diff -ru linux-2.6.14.4/include/net/tcp.h linux-2.6.14.4-iip/include/net/tcp.h
--- linux-2.6.14.4/include/net/tcp.h	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/include/net/tcp.h	Thu May 18 15:38:06 2006
@@ -184,6 +184,17 @@
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
+extern int sysctl_tcp_iw;
+extern int sysctl_tcp_application_limited;
+extern int sysctl_tcp_delack;
+extern int sysctl_tcp_quickacks;
+extern int sysctl_tcp_rfc3517_pipe;
+extern int sysctl_tcp_rate_halving;
+extern int sysctl_tcp_limited_transmit;
+extern int sysctl_tcp_cbi_reuse_ssthresh;
+extern int sysctl_tcp_cbi_reuse_rtt;
+extern int sysctl_tcp_cbi_reuse_reorder;
+extern int sysctl_tcp_rfc2988;
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
@@ -213,6 +224,7 @@
 extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
+extern int sysctl_tcp_sackfrto;
 extern int sysctl_tcp_low_latency;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
@@ -744,9 +756,18 @@
  *	"Packets left network, but not honestly ACKed yet" PLUS
  *	"Packets fast retransmitted"
  */
-static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
+static __inline__ unsigned int tcp_packets_in_flight(const struct sock *sk)
 {
-	return (tp->packets_out - tp->left_out + tp->retrans_out);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	/* RFC3517 (Conservative SACK-based Loss Recovery) considers losses
+	 * in the pipe calculation only during recovery */
+	if (!sysctl_tcp_rfc3517_pipe ||
+	    /* IsReno || IsFack */
+	    tp->rx_opt.sack_ok == 0 || tp->rx_opt.sack_ok & 2 ||
+	    ((1 << inet_csk(sk)->icsk_ca_state) & ((1 << TCP_CA_Loss) | (1 << TCP_CA_Recovery))))
+		return (tp->packets_out - tp->left_out + tp->retrans_out);
+	else
+		return (tp->packets_out - tp->sacked_out + tp->retrans_out);
 }
 
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
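For illustration, here is a minimal user-space sketch of the two pipe formulas the hunk above switches between. The variable names mirror the tcp_sock fields, but the values are invented; this is not kernel code:

/* Stand-alone sketch of the two pipe estimates, illustrative only. */
#include <stdio.h>

int main(void)
{
	unsigned int packets_out = 20;	/* sent, not cumulatively ACKed */
	unsigned int sacked_out  = 4;	/* SACKed by the receiver */
	unsigned int lost_out    = 2;	/* marked lost on the scoreboard */
	unsigned int retrans_out = 1;	/* retransmitted */
	unsigned int left_out    = sacked_out + lost_out;

	/* Stock Linux: losses always reduce the pipe estimate. */
	printf("linux pipe:   %u\n", packets_out - left_out + retrans_out);

	/* RFC 3517 outside recovery: only SACKed segments leave the pipe. */
	printf("rfc3517 pipe: %u\n", packets_out - sacked_out + retrans_out);
	return 0;
}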
@@ -781,7 +802,7 @@
 	tp->undo_marker = 0;
 	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + 1U);
+			   tcp_packets_in_flight(sk) + 1U);
 	tp->snd_cwnd_cnt = 0;
 	tp->high_seq = tp->snd_nxt;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -899,10 +920,16 @@
 		tp->ucopy.memory = 0;
 	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 		wake_up_interruptible(sk->sk_sleep);
-		if (!inet_csk_ack_scheduled(sk))
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  (3 * TCP_RTO_MIN) / 4,
-						  TCP_RTO_MAX);
+		if (!inet_csk_ack_scheduled(sk)) {
+			if (!sysctl_tcp_delack)
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+							  (3 * TCP_RTO_MIN) / 4,
+							  TCP_RTO_MAX);
+			else
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+							  sysctl_tcp_delack * HZ / 1000,
+							  TCP_RTO_MAX);
+		}
 	}
 	return 1;
 }
diff -ru linux-2.6.14.4/net/ipv4/sysctl_net_ipv4.c linux-2.6.14.4-iip/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.14.4/net/ipv4/sysctl_net_ipv4.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/sysctl_net_ipv4.c	Tue May 23 13:32:06 2006
@@ -129,6 +129,102 @@
 
 ctl_table ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4_TCP_INITIAL_WINDOW,
+		.procname	= "tcp_iw",
+		.data		= &sysctl_tcp_iw,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_APPLICATION_LIMITED,
+		.procname	= "tcp_appl_limited",
+		.data		= &sysctl_tcp_application_limited,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_DELAYED_ACK,
+		.procname	= "tcp_delayed_ack",
+		.data		= &sysctl_tcp_delack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_QUICKACKS,
+		.procname	= "tcp_quickacks",
+		.data		= &sysctl_tcp_quickacks,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_RFC3517_PIPE,
+		.procname	= "tcp_rfc3517_pipe",
+		.data		= &sysctl_tcp_rfc3517_pipe,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_LIMITED_TRANSMIT,
+		.procname	= "tcp_limited_transmit",
+		.data		= &sysctl_tcp_limited_transmit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_RATE_HALVING,
+		.procname	= "tcp_rate_halving",
+		.data		= &sysctl_tcp_rate_halving,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_CBI_REUSE_SSTHRESH,
+		.procname	= "tcp_cbi_reuse_ssthresh",
+		.data		= &sysctl_tcp_cbi_reuse_ssthresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_CBI_REUSE_RTT,
+		.procname	= "tcp_cbi_reuse_rtt",
+		.data		= &sysctl_tcp_cbi_reuse_rtt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_CBI_REUSE_REORDER,
+		.procname	= "tcp_cbi_reuse_reorder",
+		.data		= &sysctl_tcp_cbi_reuse_reorder,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_RFC2988,
+		.procname	= "tcp_rfc2988",
+		.data		= &sysctl_tcp_rfc2988,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_SACKFRTO,
+		.procname	= "tcp_sackfrto",
+		.data		= &sysctl_tcp_sackfrto,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{
 		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
 		.procname	= "tcp_timestamps",
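The tcp_delayed_ack knob is in milliseconds, and every use in the patch converts it with ms * HZ / 1000. A small stand-alone sketch of that conversion (HZ = 100 is an assumption for illustration; real kernels vary) shows how integer division truncates to whole jiffies:

/* Sketch of the ms-to-jiffies arithmetic used by the patch. */
#include <stdio.h>

#define HZ 100			/* assumed kernel tick rate */

static unsigned long delack_ms_to_jiffies(int ms)
{
	return (unsigned long)ms * HZ / 1000;	/* same arithmetic as above */
}

int main(void)
{
	int ms;

	/* 1 ms truncates to 0 jiffies at HZ=100; 15 ms to 1 jiffy (10 ms). */
	for (ms = 1; ms <= 200; ms *= 10)
		printf("%3d ms -> %lu jiffies\n", ms, delack_ms_to_jiffies(ms));
	return 0;
}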
diff -ru linux-2.6.14.4/net/ipv4/tcp.c linux-2.6.14.4-iip/net/ipv4/tcp.c
--- linux-2.6.14.4/net/ipv4/tcp.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/tcp.c	Tue May 16 13:35:21 2006
@@ -960,7 +960,8 @@
 		    * receive buffer and there was a small segment
 		    * in queue.
 		    */
-		   (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		   (!sysctl_tcp_delack &&
+		    copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
 		    !icsk->icsk_ack.pingpong &&
 		    !atomic_read(&sk->sk_rmem_alloc)))
 			time_to_ack = 1;
 	}
diff -ru linux-2.6.14.4/net/ipv4/tcp_input.c linux-2.6.14.4-iip/net/ipv4/tcp_input.c
--- linux-2.6.14.4/net/ipv4/tcp_input.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/tcp_input.c	Tue May 23 13:31:17 2006
@@ -72,6 +72,16 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 
+int sysctl_tcp_iw = 0;
+int sysctl_tcp_application_limited = 1;	/* RFC2861 */
+int sysctl_tcp_quickacks = 1;		/* 1 = linux way */
+int sysctl_tcp_rfc3517_pipe = 0;	/* 0 = linux way */
+int sysctl_tcp_rate_halving = 1;	/* 0 = rate halving hackaround. buggered with ECN */
+int sysctl_tcp_limited_transmit = 1;	/* 0 = limited transmit hackaround. TODO: CHECK IT WORKS */
+int sysctl_tcp_cbi_reuse_ssthresh = 1;	/* CBI: use stored ssthresh for new connection */
+int sysctl_tcp_cbi_reuse_rtt = 1;	/* CBI: use stored rtt-variables for new connection */
+int sysctl_tcp_cbi_reuse_reorder = 1;	/* CBI: use stored reorder-variable for new connection */
+int sysctl_tcp_rfc2988 = 0;		/* 1 = RFC2988 RTT estimator, 0 = linux. TODO: CHECK IT WORKS */
 int sysctl_tcp_timestamps = 1;
 int sysctl_tcp_window_scaling = 1;
 int sysctl_tcp_sack = 1;
@@ -86,6 +96,7 @@
 int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
+int sysctl_tcp_sackfrto;	/* 1 = SACK enhanced F-RTO, 0 = default. TODO: CHECK IT WORKS */
 int sysctl_tcp_nometrics_save;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
@@ -185,7 +196,7 @@
 static inline int tcp_in_quickack_mode(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+	return sysctl_tcp_quickacks && icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 }
 
 /* Buffer size and advertised window tuning.
@@ -615,6 +626,38 @@
 		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }
 
+static void tcp_rfc2988_rtt(struct sock *sk, __u32 mrtt, u32 *usrtt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	long m = mrtt;
+
+	if (m == 0)
+		m = 1;
+
+	if (tp->srtt != 0) {
+		m -= (tp->srtt >> 3);		/* m is now error in rtt est */
+		tp->srtt += m;			/* rtt = 7/8 rtt + 1/8 new */
+		if (m < 0) {
+			m = -m;			/* m is now abs(error) */
+			m -= (tp->mdev >> 2);	/* similar update on mdev */
+		} else {
+			m -= (tp->mdev >> 2);	/* similar update on mdev */
+		}
+		tp->mdev += m;
+		tp->rttvar = tp->mdev;
+	} else {
+		/* no previous measure. */
+		tp->srtt = m << 3;	/* take the measured time to be rtt */
+		tp->mdev = m << 1;	/* make sure rto = 3*rtt */
+		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+
+	if (icsk->icsk_ca_ops->rtt_sample)
+		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
+}
+
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -647,6 +690,13 @@
 {
 	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
 		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
+
+	/*
+	 * RFC2988 2.4: if RTO goes below 1 second, round it up.
+	 */
+	if (sysctl_tcp_rfc2988)
+		if (inet_csk(sk)->icsk_rto < HZ)
+			inet_csk(sk)->icsk_rto = HZ;
 }
 
 /* Save metrics learned by this TCP session.
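The following stand-alone sketch replays the estimator above outside the kernel. It keeps the same fixed-point convention (srtt stored left-shifted by 3) and applies the tcp_bound_rto() floor; the final RTO formula is simplified to srtt + rttvar, and the samples and HZ value are invented:

/* User-space sketch of the RFC 2988 path, simplified and illustrative. */
#include <stdio.h>

#define HZ 1000			/* assumed: 1 jiffy = 1 ms */

static long srtt;		/* smoothed RTT, stored << 3 */
static long mdev;		/* mean deviation */

/* Feed one RTT sample (in jiffies), return the bounded RTO. */
static long rfc2988_sample(long m)
{
	long rto;

	if (m == 0)
		m = 1;
	if (srtt != 0) {
		m -= srtt >> 3;		/* error in the estimate */
		srtt += m;		/* srtt = 7/8 srtt + 1/8 sample */
		if (m < 0)
			m = -m;
		m -= mdev >> 2;
		mdev += m;		/* mdev = 3/4 mdev + 1/4 |error| */
	} else {
		srtt = m << 3;		/* first sample */
		mdev = m << 1;		/* so the first rto = 3 * rtt */
	}
	rto = (srtt >> 3) + mdev;	/* simplified: rto = srtt + rttvar */
	if (rto < HZ)			/* RFC 2988 2.4: 1 second floor */
		rto = HZ;
	return rto;
}

int main(void)
{
	long samples[] = { 40, 60, 35, 500 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("sample %3ld ms -> rto %ld ms\n",
		       samples[i], rfc2988_sample(samples[i]));
	return 0;
}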
@@ -748,7 +798,9 @@
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd) {
+	if (sysctl_tcp_iw) {
+		cwnd = sysctl_tcp_iw;
+	} else if (!cwnd) {
 		if (tp->mss_cache > 1460)
 			cwnd = 2;
 		else
@@ -771,18 +823,18 @@
 	if (dst_metric_locked(dst, RTAX_CWND))
 		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-	if (dst_metric(dst, RTAX_SSTHRESH)) {
+	if (sysctl_tcp_cbi_reuse_ssthresh && dst_metric(dst, RTAX_SSTHRESH)) {
 		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 			tp->snd_ssthresh = tp->snd_cwnd_clamp;
 	}
-	if (dst_metric(dst, RTAX_REORDERING) &&
+	if (sysctl_tcp_cbi_reuse_reorder && dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
 		tp->rx_opt.sack_ok &= ~2;
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
-	if (dst_metric(dst, RTAX_RTT) == 0)
+	if (!sysctl_tcp_cbi_reuse_rtt || dst_metric(dst, RTAX_RTT) == 0)
 		goto reset;
 
 	if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
@@ -1058,6 +1110,14 @@
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 				tp->lost_out -= tcp_skb_pcount(skb);
 			}
+			/*
+			 * F-RTO: SACK for non-retransmitted
+			 * segment after RTO is a sign of
+			 * spurious RTO
+			 */
+			if (sysctl_tcp_sackfrto)
+				if (before(TCP_SKB_CB(skb)->seq, tp->frto_highmark) && tp->frto_counter > 1)
+					tp->frto_origsacked = 1;
 		}
 
 		TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
@@ -1119,6 +1179,7 @@
 	tp->left_out = tp->sacked_out + tp->lost_out;
 
+	if (!sysctl_tcp_frto)
 	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
 		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
 
@@ -1126,7 +1187,7 @@
 	BUG_TRAP((int)tp->sacked_out >= 0);
 	BUG_TRAP((int)tp->lost_out >= 0);
 	BUG_TRAP((int)tp->retrans_out >= 0);
-	BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
+	BUG_TRAP((int)tcp_packets_in_flight(sk) >= 0);
 #endif
 	return flag;
 }
@@ -1142,6 +1203,7 @@
 	struct sk_buff *skb;
 
 	tp->frto_counter = 1;
+	tp->frto_origsacked = 0;
 
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
 	    tp->snd_una == tp->high_seq ||
@@ -1203,7 +1265,7 @@
 	}
 	tcp_sync_left_out(tp);
 
-	tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp) + 1;
+	tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(sk) + 1;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 	tp->undo_marker = 0;
@@ -1554,10 +1616,11 @@
 /* CWND moderation, preventing bursts due to too big ACKs
  * in dubious situations.
  */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+static inline void tcp_moderate_cwnd(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + tcp_max_burst(tp));
+			   tcp_packets_in_flight(sk) + tcp_max_burst(tp));
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
@@ -1574,7 +1637,7 @@
 	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
 		tp->snd_cwnd -= decr;
 
-	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
+	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(sk) + 1);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
@@ -1624,7 +1687,7 @@
 	} else {
 		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
 	}
-	tcp_moderate_cwnd(tp);
+	tcp_moderate_cwnd(sk);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
@@ -1653,7 +1716,7 @@
 		/* Hold old state until something *above* high_seq
 		 * is ACKed. For Reno it is MUST to prevent false
 		 * fast retransmits (RFC2582). SACK TCP is safe. */
-		tcp_moderate_cwnd(tp);
+		tcp_moderate_cwnd(sk);
 		return 1;
 	}
 	tcp_set_ca_state(sk, TCP_CA_Open);
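A compact sketch of the resulting initial-window policy from the tcp_init_cwnd() hunk above, assuming the stock 2.6.14 MSS-based defaults for the branch the hunk leaves in place (names and sample values are illustrative):

/* Stand-alone sketch: tcp_iw wins over route metric and MSS defaults. */
#include <stdio.h>

static unsigned int init_cwnd(int sysctl_iw, unsigned int route_metric,
			      unsigned int mss)
{
	unsigned int cwnd = route_metric;	/* 0 if no cached metric */

	if (sysctl_iw)
		cwnd = sysctl_iw;		/* operator override */
	else if (!cwnd)
		cwnd = (mss > 1460) ? 2 : (mss > 1095 ? 3 : 4);
	return cwnd;
}

int main(void)
{
	printf("default, mss 1460: %u segments\n", init_cwnd(0, 0, 1460));
	printf("tcp_iw = 10:       %u segments\n", init_cwnd(10, 0, 1460));
	return 0;
}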
@@ -1751,7 +1814,11 @@
 			tcp_set_ca_state(sk, state);
 			tp->high_seq = tp->snd_nxt;
 		}
-		tcp_moderate_cwnd(tp);
+		if (!sysctl_tcp_limited_transmit)
+			if (state == TCP_CA_Disorder && tp->snd_cwnd > 1)
+				tp->snd_cwnd--;
+
+		tcp_moderate_cwnd(sk);
 	} else {
 		tcp_cwnd_down(sk);
 	}
@@ -1866,7 +1933,7 @@
 		if (flag&FLAG_DATA_ACKED)
 			icsk->icsk_retransmits = 0;
 		if (!tcp_try_undo_loss(sk, tp)) {
-			tcp_moderate_cwnd(tp);
+			tcp_moderate_cwnd(sk);
 			tcp_xmit_retransmit_queue(sk);
 			return;
 		}
@@ -1910,11 +1977,47 @@
 		tp->snd_cwnd_cnt = 0;
 		tcp_set_ca_state(sk, TCP_CA_Recovery);
+
+		if (!sysctl_tcp_rate_halving) {
+			struct sk_buff *skb;
+			/*
+			 * From RFC 2581
+			 * 2. Retransmit the lost segment ...
+			 */
+			sk_stream_for_retrans_queue(skb, sk) {
+				if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) {
+					printk(KERN_DEBUG "Fast retransmit (BUG): head sacked?\n");
+					break;
+				}
+				tcp_retransmit_skb(sk, skb);
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+							  icsk->icsk_rto, TCP_RTO_MAX);
+				break;
+			}
+			/*
+			 * ... and set cwnd to ssthresh plus 3*SMSS.
+			 * This artificially "inflates" the congestion
+			 * window by the number of segments (three)
+			 * that have left the network and which the
+			 * receiver has buffered. Inflated segments
+			 * are already included due to limited xmit.
+			 *
+			 * This also conforms to RFC 3517 (Conservative
+			 * SACK-based recovery).
+			 *
+			 * TODO: limited transmit disabled with RFC 3517 enabled
+			 */
+			if (sysctl_tcp_limited_transmit)
+				tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+			else
+				tp->snd_cwnd = min(tp->snd_cwnd,
+						   tp->snd_ssthresh + sysctl_tcp_reordering);
+		}
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
-	tcp_cwnd_down(sk);
+	if (sysctl_tcp_rate_halving)
+		tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -1940,7 +2043,10 @@
 	 */
 	struct tcp_sock *tp = tcp_sk(sk);
 	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	if (sysctl_tcp_rfc2988)
+		tcp_rfc2988_rtt(sk, seq_rtt, usrtt);
+	else
+		tcp_rtt_estimator(sk, seq_rtt, usrtt);
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 	tcp_bound_rto(sk);
@@ -1960,7 +2066,11 @@
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	if (sysctl_tcp_rfc2988)
+		tcp_rfc2988_rtt(sk, seq_rtt, usrtt);
+	else
+		tcp_rtt_estimator(sk, seq_rtt, usrtt);
+
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 	tcp_bound_rto(sk);
@@ -2261,6 +2371,7 @@
 	tcp_sync_left_out(tp);
 
 	if (tp->snd_una == prior_snd_una ||
+	    (tp->rx_opt.sack_ok && !tp->frto_origsacked && tp->snd_una == prior_snd_una && tp->frto_counter > 1) ||
 	    !before(tp->snd_una, tp->frto_highmark)) {
 		/* RTO was caused by loss, start retransmitting in
 		 * go-back-N slow start
@@ -2273,14 +2384,14 @@
 		/* First ACK after RTO advances the window: allow two new
 		 * segments out.
 		 */
-		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+		tp->snd_cwnd = tcp_packets_in_flight(sk) + 2;
 	} else {
 		/* Also the second ACK after RTO advances the window.
 		 * The RTO was likely spurious. Reduce cwnd and continue
 		 * in congestion avoidance
 		 */
 		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-		tcp_moderate_cwnd(tp);
+		tcp_moderate_cwnd(sk);
 	}
 
 	/* F-RTO affects on two new ACKs following RTO.
@@ -2349,7 +2460,7 @@
 	if (!prior_packets)
 		goto no_queue;
 
-	prior_in_flight = tcp_packets_in_flight(tp);
+	prior_in_flight = tcp_packets_in_flight(sk);
 
 	/* See if we can take anything off of the retransmit queue.
 	 */
 	flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
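The arithmetic of the non-rate-halving branch, sketched stand-alone: RFC 2581 sets ssthresh to max(FlightSize/2, 2) on entering recovery, and the patch reuses sysctl_tcp_reordering (default 3) as the 3*SMSS inflation when limited transmit has not already let those segments out. Values here are invented:

/* Arithmetic sketch of the RFC 2581 fast-retransmit window setting. */
#include <stdio.h>

int main(void)
{
	unsigned int flight = 20;	/* FlightSize at loss detection */
	unsigned int reordering = 3;	/* stands in for 3*SMSS */
	unsigned int ssthresh = flight / 2 > 2 ? flight / 2 : 2;

	printf("ssthresh:                %u\n", ssthresh);
	printf("cwnd, limited transmit:  %u\n", ssthresh);
	printf("cwnd, no limited xmit:   %u\n", ssthresh + reordering);
	return 0;
}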
@@ -3346,7 +3457,8 @@
 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
 	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 		/* Limited by application or receiver window. */
-		u32 win_used = max(tp->snd_cwnd_used, 2U);
+		u32 path_iw = tcp_init_cwnd(tp, __sk_dst_get(sk));
+		u32 win_used = max(tp->snd_cwnd_used, path_iw);
 		if (win_used < tp->snd_cwnd) {
 			tp->snd_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
diff -ru linux-2.6.14.4/net/ipv4/tcp_ipv4.c linux-2.6.14.4-iip/net/ipv4/tcp_ipv4.c
--- linux-2.6.14.4/net/ipv4/tcp_ipv4.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/tcp_ipv4.c	Tue May 16 13:35:21 2006
@@ -1425,7 +1425,10 @@
 	 * algorithms that we must have the following bandaid to talk
 	 * efficiently to them.  -DaveM
 	 */
-	tp->snd_cwnd = 2;
+	if (sysctl_tcp_iw)
+		tp->snd_cwnd = sysctl_tcp_iw;
+	else
+		tp->snd_cwnd = 2;
 
 	/* See draft-stevens-tcpca-spec-01 for discussion of the
 	 * initialization of these values.
diff -ru linux-2.6.14.4/net/ipv4/tcp_minisocks.c linux-2.6.14.4-iip/net/ipv4/tcp_minisocks.c
--- linux-2.6.14.4/net/ipv4/tcp_minisocks.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/tcp_minisocks.c	Tue May 16 13:35:21 2006
@@ -378,7 +378,10 @@
 		 * algorithms that we must have the following bandaid to talk
 		 * efficiently to them.  -DaveM
 		 */
-		newtp->snd_cwnd = 2;
+		if (sysctl_tcp_iw)
+			newtp->snd_cwnd = sysctl_tcp_iw;
+		else
+			newtp->snd_cwnd = 2;
 		newtp->snd_cwnd_cnt = 0;
 
 		newtp->frto_counter = 0;
diff -ru linux-2.6.14.4/net/ipv4/tcp_output.c linux-2.6.14.4-iip/net/ipv4/tcp_output.c
--- linux-2.6.14.4/net/ipv4/tcp_output.c	Thu Dec 15 01:50:41 2005
+++ linux-2.6.14.4-iip/net/ipv4/tcp_output.c	Tue May 16 13:35:21 2006
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 
+int sysctl_tcp_delack = 0;	/* Zero = linux way, non-zero = ms */
+
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
@@ -308,7 +310,7 @@
 			  (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 	}
 
-	if (tcp_packets_in_flight(tp) == 0)
+	if (tcp_packets_in_flight(sk) == 0)
 		tcp_ca_event(sk, CA_EVENT_TX_START);
 
 	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
@@ -717,8 +719,11 @@
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 
-		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
-			tcp_cwnd_application_limited(sk);
+		if (sysctl_tcp_application_limited) {
+			if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+				tcp_cwnd_application_limited(sk);
+		} else
+			tp->snd_cwnd_used = 0;
 	}
 }
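A sketch of the changed cwnd-validation step (RFC 2861): after an application-limited period the window is averaged toward what was actually used, and the tcp_input.c hunk above raises the floor of "used" from 2 segments to the initial window. Stand-alone code, values invented:

/* Sketch of application-limited cwnd decay with a configurable floor. */
#include <stdio.h>

static unsigned int validate(unsigned int cwnd, unsigned int used,
			     unsigned int floor)
{
	unsigned int win_used = used > floor ? used : floor;

	return win_used < cwnd ? (cwnd + win_used) >> 1 : cwnd;
}

int main(void)
{
	/* cwnd grew to 40, but the application only ever used 4 segments */
	printf("floor 2 (stock):      cwnd -> %u\n", validate(40, 4, 2));
	printf("floor 10 (tcp_iw=10): cwnd -> %u\n", validate(40, 4, 10));
	return 0;
}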
@@ -734,15 +739,16 @@
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules?  If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+static inline unsigned int tcp_cwnd_test(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN.
 	 */
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 1;
 
-	in_flight = tcp_packets_in_flight(tp);
+	in_flight = tcp_packets_in_flight(sk);
 	cwnd = tp->snd_cwnd;
 	if (in_flight < cwnd)
 		return (cwnd - in_flight);
@@ -843,7 +849,7 @@
 	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
 		return 0;
 
-	cwnd_quota = tcp_cwnd_test(tp, skb);
+	cwnd_quota = tcp_cwnd_test(sk, skb);
 	if (cwnd_quota &&
 	    !tcp_snd_wnd_test(tp, skb, cur_mss))
 		cwnd_quota = 0;
@@ -935,7 +941,7 @@
 	if (icsk->icsk_ca_state != TCP_CA_Open)
 		return 0;
 
-	in_flight = tcp_packets_in_flight(tp);
+	in_flight = tcp_packets_in_flight(sk);
 
 	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
 	       (tp->snd_cwnd <= in_flight));
@@ -998,7 +1004,7 @@
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		cwnd_quota = tcp_cwnd_test(tp, skb);
+		cwnd_quota = tcp_cwnd_test(sk, skb);
 		if (!cwnd_quota)
 			break;
@@ -1482,7 +1488,7 @@
 		 * packet to be MSS sized and all the
 		 * packet counting works out.
 		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		if (tcp_packets_in_flight(sk) >= tp->snd_cwnd)
 			return;
 
 		if (sacked&TCPCB_LOST) {
@@ -1540,7 +1546,7 @@
 			if (++packet_cnt > tp->fackets_out)
 				break;
 
-			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+			if (tcp_packets_in_flight(sk) >= tp->snd_cwnd)
 				break;
 
 			if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
@@ -1848,7 +1854,9 @@
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	if (ato > TCP_DELACK_MIN) {
+	if (sysctl_tcp_delack)
+		ato = sysctl_tcp_delack * HZ / 1000;
+	else if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ/2;
@@ -1880,7 +1888,8 @@
 	 * send ACK now.
 	 */
 	if (icsk->icsk_ack.blocked ||
-	    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+	    (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2)) &&
+	     !sysctl_tcp_delack)) {
 		tcp_send_ack(sk);
 		return;
 	}
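Finally, a hypothetical usage sketch. The patch ships no configuration tool; this stand-alone helper simply writes the new /proc/sys/net/ipv4 entries the way "sysctl -w" would. The paths match the .procname fields registered above; the chosen values are examples only:

/* Hypothetical helper: flip a few of the patch's knobs via /proc. */
#include <stdio.h>

static int set_knob(const char *name, int value)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;			/* knob absent or no permission */
	fprintf(f, "%d\n", value);
	return fclose(f);
}

int main(void)
{
	set_knob("tcp_iw", 4);			/* fixed initial window */
	set_knob("tcp_delayed_ack", 100);	/* 100 ms fixed delayed ACK */
	set_knob("tcp_rfc3517_pipe", 1);	/* RFC 3517 pipe outside recovery */
	set_knob("tcp_sackfrto", 1);		/* SACK-enhanced F-RTO */
	return 0;
}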