diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 90ed781..fc59f41 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -349,7 +349,10 @@ tcp_frto - BOOLEAN Enables F-RTO, an enhanced recovery algorithm for TCP retransmission timeouts. It is particularly beneficial in wireless environments where packet loss is typically due to random radio interference - rather than intermediate router congestion. + rather than intermediate router congestion. If set to 1, basic + version is enabled. 2 enables SACK enhanced FRTO, which is + EXPERIMENTAL. The basic version can be used also when SACK is + enabled for a flow through tcp_sack sysctl. tcp_congestion_control - STRING Set the congestion control algorithm to be used for new diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e4b1a4d..fe76b3a 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -411,6 +411,28 @@ enum NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, NET_TCP_DMA_COPYBREAK=116, NET_TCP_SLOW_START_AFTER_IDLE=117, + + NET_IPV4_TCP_INITIAL_WINDOW, + NET_IPV4_TCP_APPLICATION_LIMITED, + NET_IPV4_TCP_DELAYED_ACK, + NET_IPV4_TCP_QUICKACKS, + NET_IPV4_TCP_RATE_HALVING, + NET_IPV4_TCP_LIMITED_TRANSMIT, + NET_IPV4_TCP_CBI_REUSE_SSTHRESH, + NET_IPV4_TCP_CBI_REUSE_RTT, + NET_IPV4_TCP_CBI_REUSE_REORDER, + NET_TCP_FRTO_RESPONSE, + NET_IPV4_TCP_RFC2988_RTT, + NET_IPV4_TCP_RTO_MAX, + NET_IPV4_TCP_RTO_MIN, + NET_IPV4_TCP_HEAD_TIMEOUTS, + NET_TCP_DUPACKS_TO_RECOVERY, + NET_TCP_LOST_REXMIT_DETECT, + NET_TCP_CONST_DUPTHRESH, + NET_TCP_RFC3517_PIPE, + NET_TCP_MAXIMIZE_ADV_WIN, + NET_TCP_NO_CWND_LOSSES, + NET_TCP_FASTREC_RTO_REDUCES, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8ebf497..c950721 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -309,6 +309,8 @@ struct tcp_sock { int lost_cnt_hint; int retransmit_cnt_hint; int forward_cnt_hint; + + int dupacks; /* Number of arrived duplicate ACKs */ __u16 advmss; /* Advertised MSS */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a093d0..39a05a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -120,7 +120,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) +#define TCP_RTO_MAX ((unsigned)(60*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */ @@ -188,6 +188,26 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ +extern int sysctl_tcp_iw; +extern int sysctl_tcp_application_limited; +extern int sysctl_tcp_delack; +extern int sysctl_tcp_quickacks; +extern int sysctl_tcp_rate_halving; +extern int sysctl_tcp_limited_transmit; +extern int sysctl_tcp_cbi_reuse_ssthresh; +extern int sysctl_tcp_cbi_reuse_rtt; +extern int sysctl_tcp_cbi_reuse_reorder; +extern int sysctl_tcp_rfc2988_rtt; +extern u32 sysctl_tcp_rto_max; +extern u32 sysctl_tcp_rto_min; +extern int sysctl_tcp_head_timeouts; +extern int sysctl_tcp_dupacks_to_recovery; +extern int sysctl_tcp_lost_rexmit_detect; +extern int sysctl_tcp_const_dupthresh; +extern int sysctl_tcp_rfc3517_pipe; +extern int sysctl_tcp_maximize_adv_win; +extern int sysctl_tcp_no_cwnd_losses; +extern int sysctl_tcp_fastrec_rto_reduces; extern int sysctl_tcp_timestamps; 
extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; @@ -217,6 +237,7 @@ extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; +extern int sysctl_tcp_frto_response; extern int sysctl_tcp_low_latency; extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; @@ -341,6 +362,7 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, extern int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); +extern int tcp_use_frto(struct sock *sk); extern void tcp_enter_frto(struct sock *sk); extern void tcp_enter_loss(struct sock *sk, int how); extern void tcp_clear_retrans(struct tcp_sock *tp); @@ -597,7 +619,7 @@ static inline void tcp_packets_out_inc(struct sock *sk, tp->packets_out += tcp_skb_pcount(skb); if (!orig) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + inet_csk(sk)->icsk_rto, sysctl_tcp_rto_max); } static inline void tcp_packets_out_dec(struct tcp_sock *tp, @@ -729,7 +751,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -extern void tcp_enter_cwr(struct sock *sk); +extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); /* Slow start with delack produces 3 packets of burst, so that @@ -773,7 +795,7 @@ static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) const struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out && !icsk->icsk_pending) inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - icsk->icsk_rto, TCP_RTO_MAX); + icsk->icsk_rto, sysctl_tcp_rto_max); } static inline void tcp_push_pending_frames(struct sock *sk, @@ -857,10 +879,16 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { wake_up_interruptible(sk->sk_sleep); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * TCP_RTO_MIN) / 4, - TCP_RTO_MAX); + if (!inet_csk_ack_scheduled(sk)) { + if (!sysctl_tcp_delack) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * sysctl_tcp_rto_min) / 4, + sysctl_tcp_rto_max); + else + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + sysctl_tcp_delack * HZ / 1000, + sysctl_tcp_rto_max); + } } return 1; } @@ -1027,19 +1055,6 @@ static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int #define TCP_CHECK_TIMER(sk) do { } while (0) -static inline int tcp_use_frto(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - /* F-RTO must be activated in sysctl and there must be some - * unsent new data, and the advertised window should allow - * sending it. 
- */ - return (sysctl_tcp_frto && sk->sk_send_head && - !after(TCP_SKB_CB(sk->sk_send_head)->end_seq, - tp->snd_una + tp->snd_wnd)); -} - static inline void tcp_mib_init(void) { /* See RFC 2012 */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 70cea9d..844c3b2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -130,6 +130,160 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4_TCP_INITIAL_WINDOW, + .procname = "tcp_iw", + .data = &sysctl_tcp_iw, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_APPLICATION_LIMITED, + .procname = "tcp_appl_limited", + .data = &sysctl_tcp_application_limited, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_DELAYED_ACK, + .procname = "tcp_delayed_ack", + .data = &sysctl_tcp_delack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_QUICKACKS, + .procname = "tcp_quickacks", + .data = &sysctl_tcp_quickacks, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_RATE_HALVING, + .procname = "tcp_rate_halving", + .data = &sysctl_tcp_rate_halving, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_LIMITED_TRANSMIT, + .procname = "tcp_limited_transmit", + .data = &sysctl_tcp_limited_transmit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_CBI_REUSE_SSTHRESH, + .procname = "tcp_cbi_reuse_ssthresh", + .data = &sysctl_tcp_cbi_reuse_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_CBI_REUSE_RTT, + .procname = "tcp_cbi_reuse_rtt", + .data = &sysctl_tcp_cbi_reuse_rtt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_CBI_REUSE_REORDER, + .procname = "tcp_cbi_reuse_reorder", + .data = &sysctl_tcp_cbi_reuse_reorder, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_RFC2988_RTT, + .procname = "tcp_rfc2988_rtt", + .data = &sysctl_tcp_rfc2988_rtt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_RTO_MAX, + .procname = "tcp_rto_max", + .data = &sysctl_tcp_rto_max, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_RTO_MIN, + .procname = "tcp_rto_min", + .data = &sysctl_tcp_rto_min, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_HEAD_TIMEOUTS, + .procname = "tcp_head_timeouts", + .data = &sysctl_tcp_head_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_DUPACKS_TO_RECOVERY, + .procname = "tcp_dupacks_to_recovery", + .data = &sysctl_tcp_dupacks_to_recovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_LOST_REXMIT_DETECT, + .procname = "tcp_lost_rexmit_detect", + .data = &sysctl_tcp_lost_rexmit_detect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = 
NET_TCP_CONST_DUPTHRESH, + .procname = "tcp_const_dupthresh", + .data = &sysctl_tcp_const_dupthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAXIMIZE_ADV_WIN, + .procname = "tcp_maximize_adv_win", + .data = &sysctl_tcp_maximize_adv_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_NO_CWND_LOSSES, + .procname = "tcp_no_cwnd_losses", + .data = &sysctl_tcp_no_cwnd_losses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_FASTREC_RTO_REDUCES, + .procname = "tcp_fastrec_rto_reduces", + .data = &sysctl_tcp_fastrec_rto_reduces, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, .procname = "tcp_timestamps", @@ -494,6 +648,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_TCP_RFC3517_PIPE, + .procname = "tcp_rfc3517_pipe", + .data = &sysctl_tcp_rfc3517_pipe, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_TCP_REORDERING, .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, @@ -590,6 +752,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_TCP_FRTO_RESPONSE, + .procname = "tcp_frto_response", + .data = &sysctl_tcp_frto_response, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_TCP_LOW_LATENCY, .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 934396b..8eb2993 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -955,7 +955,8 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) * receive buffer and there was a small segment * in queue. 
*/ - (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + (!sysctl_tcp_delack && + copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 159fa3f..5a359cb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,10 +72,26 @@ #include #include +int sysctl_tcp_iw = 0; +int sysctl_tcp_application_limited = 1; /* RFC2861 */ +int sysctl_tcp_quickacks = 1; /* 1 = linux way */ +int sysctl_tcp_rate_halving = 1; /* 1 = linux way */ +int sysctl_tcp_limited_transmit = 1; /* 1 = linux way, 0 = disable RFC3042 */ +int sysctl_tcp_cbi_reuse_ssthresh = 1; /* CBI: use stored ssthresh for new connection */ +int sysctl_tcp_cbi_reuse_rtt = 1; /* CBI: use stored rtt-variables for new connection */ +int sysctl_tcp_cbi_reuse_reorder = 1; /* CBI: use stored reorder-variable for new connection */ +int sysctl_tcp_rfc2988_rtt = 0; +int sysctl_tcp_head_timeouts = 1; /* 1 = linux way */ +int sysctl_tcp_dupacks_to_recovery = 0; /* 1 = trigger recovery only at nth dupACK */ +int sysctl_tcp_lost_rexmit_detect = 1; /* 1 = linux way */ +int sysctl_tcp_const_dupthresh = 0; /* 0 = linux way (=max reord detection) */ +int sysctl_tcp_no_cwnd_losses = 0; /* 1 = don't moderate cwnd using lossy way */ +int sysctl_tcp_fastrec_rto_reduces = 0; /* 1 = RTO in fastrecovery => halved ssthresh */ int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; int sysctl_tcp_sack = 1; int sysctl_tcp_fack = 1; +int sysctl_tcp_rfc3517_pipe = 0; int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; int sysctl_tcp_ecn; int sysctl_tcp_dsack = 1; @@ -85,7 +101,8 @@ int sysctl_tcp_adv_win_scale = 2; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; -int sysctl_tcp_frto; +int sysctl_tcp_frto; /* bit 1 enabled = SACK enhanced F-RTO */ +int sysctl_tcp_frto_response; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; @@ -100,6 +117,7 @@ int sysctl_tcp_abc; #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -109,6 +127,9 @@ int sysctl_tcp_abc; #define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define Is3517Sack(tp) ((tp)->rx_opt.sack_ok & 8) /* RFC3517 recovery */ + +#define IsSackFrto() (sysctl_tcp_frto & 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -186,7 +207,7 @@ void tcp_enter_quickack_mode(struct sock *sk) static inline int tcp_in_quickack_mode(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; + return sysctl_tcp_quickacks && icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } /* Buffer size and advertised window tuning. 
@@ -295,7 +316,8 @@ static void tcp_init_buffer_space(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int maxwin; - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !sysctl_tcp_maximize_adv_win) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_fixup_sndbuf(sk); @@ -331,6 +353,7 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) icsk->icsk_ack.quick = 0; if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !sysctl_tcp_maximize_adv_win && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { @@ -457,6 +480,7 @@ void tcp_rcv_space_adjust(struct sock *sk) tp->rcvq_space.space = space; if (sysctl_tcp_moderate_rcvbuf && + !sysctl_tcp_maximize_adv_win && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int new_clamp = space; @@ -603,17 +627,44 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = TCP_RTO_MIN; + tp->mdev_max = sysctl_tcp_rto_min; } } else { /* no previous measure. */ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, sysctl_tcp_rto_min); tp->rtt_seq = tp->snd_nxt; } } + +static void tcp_rfc2988_rtt(struct sock *sk, __u32 mrtt) +{ + struct tcp_sock *tp = tcp_sk(sk); + long m = mrtt; + + if (m == 0) + m = 1; + + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* -beta * mdev */ + tp->mdev += m; + tp->rttvar = tp->mdev; + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->mdev_max = tp->rttvar = max(tp->mdev, sysctl_tcp_rto_min); + tp->rtt_seq = tp->snd_nxt; + } + +} + /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ @@ -644,8 +695,12 @@ static inline void tcp_set_rto(struct sock *sk) */ static inline void tcp_bound_rto(struct sock *sk) { - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + if (inet_csk(sk)->icsk_rto > sysctl_tcp_rto_max) + inet_csk(sk)->icsk_rto = sysctl_tcp_rto_max; + + /* RFC2988 2.4: if RTO goes below 1 second, round it up. */ + if (sysctl_tcp_rfc2988_rtt && inet_csk(sk)->icsk_rto < sysctl_tcp_rto_min) + inet_csk(sk)->icsk_rto = sysctl_tcp_rto_min; } /* Save metrics learned by this TCP session. @@ -747,7 +802,9 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? 
dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { + if (sysctl_tcp_iw) { + cwnd = sysctl_tcp_iw; + } else if (!cwnd) { if (tp->mss_cache > 1460) cwnd = 2; else @@ -757,15 +814,17 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) } /* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk) +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + if (set_ssthresh) + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -791,18 +850,18 @@ static void tcp_init_metrics(struct sock *sk) if (dst_metric_locked(dst, RTAX_CWND)) tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); - if (dst_metric(dst, RTAX_SSTHRESH)) { + if (sysctl_tcp_cbi_reuse_ssthresh && dst_metric(dst, RTAX_SSTHRESH)) { tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } - if (dst_metric(dst, RTAX_REORDERING) && + if (sysctl_tcp_cbi_reuse_reorder && dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tp->rx_opt.sack_ok &= ~2; tp->reordering = dst_metric(dst, RTAX_REORDERING); } - if (dst_metric(dst, RTAX_RTT) == 0) + if (!sysctl_tcp_cbi_reuse_rtt || dst_metric(dst, RTAX_RTT) == 0) goto reset; if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) @@ -828,7 +887,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { tp->mdev = dst_metric(dst, RTAX_RTTVAR); - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, sysctl_tcp_rto_min); } tcp_set_rto(sk); tcp_bound_rto(sk); @@ -855,7 +914,8 @@ static void tcp_update_reordering(struct sock *sk, const int metric, { struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { - tp->reordering = min(TCP_MAX_REORDERING, metric); + if (!sysctl_tcp_const_dupthresh) + tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) @@ -1038,7 +1098,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) + if (sysctl_tcp_lost_rexmit_detect && + after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; sk_stream_for_retrans_queue_from(skb, sk) { @@ -1141,12 +1202,31 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). 
+ * Clearing is correct due to the in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; tp->sacked_out += tcp_skb_pcount(skb); + /* Clear if sacked beyond mark_head_lost hint */ + if (Is3517Sack(tp) && + tp->lost_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_skb_hint = NULL; + if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; } else { @@ -1174,7 +1254,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * we have to account for reordering! Ugly, * but should help. */ - if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { + if (sysctl_tcp_lost_rexmit_detect && + lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { @@ -1206,7 +1287,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 @@ -1218,9 +1300,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* RTO occurred, but do not yet enter loss state. Instead, transmit two new - * segments to see from the next ACKs whether any data was really missing. - * If the RTO was spurious, new ACKs should arrive. +/* If cwnd > ssthresh and we might undo, don't limit undoing to 3/4*cwnd + * as done in tcp_current_ssthresh. + */ +static inline __u32 tcp_undo_ssthresh(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + if ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_CWR | TCPF_CA_Recovery)) + return tp->snd_ssthresh; + else + return max(tp->snd_ssthresh, tp->snd_cwnd); +} + +/* F-RTO can only be used if TCP has never retransmitted anything other than + * the head (the SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ +int tcp_use_frto(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (!sysctl_tcp_frto) + return 0; + + if (IsSackFrto()) + return 1; + + /* Avoid expensive walking of rexmit queue if possible */ + if (tp->retrans_out > 1) + return 0; + + skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ + sk_stream_for_retrans_queue_from(skb, sk) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + return 0; + /* Short-circuit when first non-SACKed skb has been checked */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + break; + } + return 1; +} + +/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO + * recovery a bit and use heuristics in tcp_process_frto() to detect if + * the RTO was spurious. Only clear SACKED_RETRANS of the head here to + * keep retrans_out counting accurate (with SACK F-RTO, segments other than + * the head may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS + * bits are handled if the Loss state is really to be entered (in + * tcp_enter_frto_loss). + * + * Do like tcp_enter_loss() would; when RTO expires the second time it + * does: + * "Reduce ssthresh if it has not yet been made inside this window."
*/ void tcp_enter_frto(struct sock *sk) { @@ -1228,39 +1359,71 @@ struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - tp->frto_counter = 1; - - if (icsk->icsk_ca_state <= TCP_CA_Disorder || + if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || - (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + (sysctl_tcp_fastrec_rto_reduces && + (icsk->icsk_ca_state == TCP_CA_Recovery)) || + ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && + !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_undo_ssthresh(sk); + /* Our state is too optimistic in the ssthresh() call because cwnd + * is not reduced until tcp_enter_frto_loss() when previous FRTO + * recovery has not yet completed. The pattern would be this: RTO, + * Cumulative ACK, RTO (2xRTO for the same segment does not end + * up here twice). + * RFC4138 should be more specific on what to do, even though + * RTO is quite unlikely to occur after the first Cumulative ACK + * due to back-off and complexity of triggering events ... + */ + if (tp->frto_counter) { + u32 stored_cwnd; + stored_cwnd = tp->snd_cwnd; + tp->snd_cwnd = 2; + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = stored_cwnd; + } else { + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + } + /* ... in theory, a cong.control module could do "any tricks" in + * ssthresh(), which means that ca_state, lost bits and lost_out + * counter would have to be faked before the call occurs. We + * consider that too expensive, unlikely and hacky, so modules + * using these in ssthresh() must deal with these incompatibility + * issues if they receive CA_EVENT_FRTO and frto_counter != 0 + */ tcp_ca_event(sk, CA_EVENT_FRTO); } - /* Have to clear retransmission markers here to keep the bookkeeping - * in shape, even though we are not yet in Loss state. - * If something was really lost, it is eventually caught up - * in tcp_enter_frto_loss. - */ - tp->retrans_out = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - sk_stream_for_retrans_queue(skb, sk) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + skb = skb_peek(&sk->sk_write_queue); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); } tcp_sync_left_out(tp); - tcp_set_ca_state(sk, TCP_CA_Open); - tp->frto_highmark = tp->snd_nxt; + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in the tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } + tcp_set_ca_state(sk, TCP_CA_Disorder); + tp->high_seq = tp->snd_nxt; + tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission.
*/ -static void tcp_enter_frto_loss(struct sock *sk) +static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1269,10 +1432,21 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->sacked_out = 0; tp->lost_out = 0; tp->fackets_out = 0; + tp->retrans_out = 0; sk_stream_for_retrans_queue(skb, sk) { cnt += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + /* + * Count the retransmission made on RTO correctly (only when + * waiting for the first ACK and did not get it)... + */ + if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + tp->retrans_out += tcp_skb_pcount(skb); + /* ...enter this if branch just for the first segment */ + flag |= FLAG_DATA_ACKED; + } else { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + } if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { /* Do not mark those segments lost that were @@ -1290,7 +1464,7 @@ static void tcp_enter_frto_loss(struct sock *sk) } tcp_sync_left_out(tp); - tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->undo_marker = 0; @@ -1331,8 +1505,10 @@ void tcp_enter_loss(struct sock *sk, int how) /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (sysctl_tcp_fastrec_rto_reduces && + (icsk->icsk_ca_state == TCP_CA_Recovery)) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = tcp_undo_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); } @@ -1369,6 +1545,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + /* Abort FRTO algorithm if one is in progress */ + tp->frto_counter = 0; clear_all_retrans_hints(tp); } @@ -1392,12 +1570,24 @@ static int tcp_check_sack_reneging(struct sock *sk) icsk->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, TCP_RTO_MAX); + icsk->icsk_rto, sysctl_tcp_rto_max); return 1; } return 0; } +static inline int tcp_dup_fackets_out(struct tcp_sock *tp) +{ + /* We use sacked_out also for RFC3517 like recovery even though it + * violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, they + * differ. Since either of them occur due to loss, we should really + * ignore them + */ + return (IsReno(tp) || Is3517Sack(tp)) ? tp->sacked_out + 1 : + tp->fackets_out; +} + static inline int tcp_fackets_out(struct tcp_sock *tp) { return IsReno(tp) ? 
tp->sacked_out+1 : tp->fackets_out; @@ -1410,7 +1600,7 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { - return tp->packets_out && + return sysctl_tcp_head_timeouts && tp->packets_out && tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); } @@ -1511,12 +1701,20 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out; + /* Do not perform any recovery during the F-RTO algorithm */ + if (tp->frto_counter) + return 0; + + /* No tricks if sysctled to use only dupACKs as recovery trigger */ + if (sysctl_tcp_dupacks_to_recovery) + return (tp->dupacks >= tp->reordering); + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_fackets_out(tp) > tp->reordering) + if (tcp_dup_fackets_out(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast @@ -1590,7 +1788,9 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->left_out = tp->lost_out; } -/* Mark head of queue up as lost. */ +/* Mark head of queue up as lost. With RFC3517 SACK, the packet count + * is checked against sacked "cnt", otherwise against facked "cnt" + */ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { @@ -1611,10 +1811,16 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - cnt += tcp_skb_pcount(skb); - if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + if (!Is3517Sack(tp) || + (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + cnt += tcp_skb_pcount(skb); + if ((!sysctl_tcp_dupacks_to_recovery || + (tp->undo_marker != tp->snd_una) || (tp->lost_out > 0)) && + ((cnt > packets) || + after(TCP_SKB_CB(skb)->end_seq, high_seq))) break; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + if (!(TCP_SKB_CB(skb)->sacked & + (TCPCB_TAGBITS & (~TCPCB_SACKED_RETRANS)))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1635,13 +1841,18 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) { - if (IsFack(tp)) { + if (IsReno(tp)) { + tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + } else if (Is3517Sack(tp)) { + int sacked_upto = tp->sacked_out - tp->reordering; + if (sacked_upto < 0) + sacked_upto = 0; + tcp_mark_head_lost(sk, tp, sacked_upto, tp->high_seq); + } else { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; tcp_mark_head_lost(sk, tp, lost, tp->high_seq); - } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1683,8 +1894,13 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) */ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) { - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + int new_inflight = tcp_packets_in_flight(tp)+tcp_max_burst(tp); + if (new_inflight < tp->snd_cwnd) { + /* In case of reduction, slow start to the previous cwnd */ + if (sysctl_tcp_no_cwnd_losses) + tp->snd_ssthresh = max(tp->snd_ssthresh, tp->snd_cwnd); + tp->snd_cwnd = new_inflight; + } tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -1797,6 +2013,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) return 1; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->dupacks = 0; return
0; } @@ -1882,7 +2099,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -1894,6 +2111,9 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } + if (!sysctl_tcp_limited_transmit && + (state == TCP_CA_Disorder && tp->snd_cwnd > 1)) + tp->snd_cwnd--; tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk); @@ -1979,8 +2199,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - if (!sysctl_tcp_frto) - BUG_TRAP(tp->retrans_out == 0); + BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -2055,8 +2274,15 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (icsk->icsk_ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk, tp); + if (is_dupack) + tp->dupacks++; + if (!tcp_time_to_recover(sk, tp)) { tcp_try_to_open(sk, tp, flag); + + if (icsk->icsk_ca_state == TCP_CA_Open) + tp->dupacks = 0; + return; } @@ -2085,7 +2311,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = tcp_undo_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } @@ -2093,11 +2319,43 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); + + if (!sysctl_tcp_rate_halving) { + /* + * From RFC 2581 + * 2. Retransmit the lost segment ... + * Also RFC3517 rexmits the first segment, even though + * with SACK it may not be marked lost yet. + */ + struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + + WARN_ON(TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS); + + tcp_retransmit_skb(sk, skb); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, sysctl_tcp_rto_max); + /* + * ... and set cwnd to ssthresh plus 3*SMSS. This + * artificially "inflates" the congestion window by the + * number of segments (three) that have left the + * network and which the receiver has buffered. + * Inflated segments are already included due to + * limited xmit. Snd_cwnd_cnt is already reset. + */ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh + + (sysctl_tcp_limited_transmit ?
+ 0 : sysctl_tcp_reordering)); + } + + /* Force scoreboard update when entering to RFC3517 recovery */ + flag |= FLAG_DATA_SACKED; } - if (is_dupack || tcp_head_timedout(sk, tp)) + if ((!Is3517Sack(tp) && (is_dupack || tcp_head_timedout(sk, tp))) || + (Is3517Sack(tp) && (flag&FLAG_DATA_SACKED))) tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(sk); + if (sysctl_tcp_rate_halving) + tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -2123,7 +2381,10 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt); + if (sysctl_tcp_rfc2988_rtt) + tcp_rfc2988_rtt(sk, seq_rtt); + else + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -2143,7 +2404,11 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt); + if (sysctl_tcp_rfc2988_rtt) + tcp_rfc2988_rtt(sk, seq_rtt); + else + tcp_rtt_estimator(sk, seq_rtt); + tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -2177,7 +2442,7 @@ static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, sysctl_tcp_rto_max); } } @@ -2381,8 +2646,8 @@ static void tcp_ack_probe(struct sock *sk) */ } else { inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + min(icsk->icsk_rto << icsk->icsk_backoff, sysctl_tcp_rto_max), + sysctl_tcp_rto_max); } } @@ -2449,39 +2714,146 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, return flag; } -static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +/* A very conservative spurious RTO response algorithm: reduce cwnd and + * continue in congestion avoidance. + */ +static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; + TCP_ECN_queue_cwr(tp); + tcp_moderate_cwnd(tp); +} + +/* A conservative spurious RTO response algorithm: reduce cwnd using + * rate halving and continue in congestion avoidance. + */ +static void tcp_ratehalving_spur_to_response(struct sock *sk) +{ + tcp_enter_cwr(sk, 0); +} + +/* Restore prior ssthresh and reduce cwnd, possibly continues in slow + * start + */ +static void tcp_middleground_spur_to_response(struct tcp_sock *tp) +{ + tp->snd_ssthresh = tp->prior_ssthresh; + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_cnt = 0; + tcp_moderate_cwnd(tp); +} + +static void tcp_undo_spur_to_response(struct sock *sk, int flag) +{ + if (flag&FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, 1); +} + +/* F-RTO spurious RTO detection algorithm (RFC4138) + * + * F-RTO affects during two new ACKs following RTO (well, almost, see inline + * comments). State (ACK number) is kept in frto_counter. When ACK advances + * window (but not to or beyond highest sequence sent before RTO): + * On First ACK, send two new segments out. + * On Second ACK, RTO was likely spurious. Do spurious response (response + * algorithm is not part of the F-RTO detection algorithm + * given in RFC4138 but can be selected separately). 
+ * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when something + * is sent, state 2 is upgraded to 3. + * + * Rationale: if the RTO was spurious, new ACKs should arrive from the + * original window even after we transmit two new data segments. + * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * + * F-RTO is implemented (mainly) in four functions: + * - tcp_use_frto() is used to determine if TCP is can use F-RTO + * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is + * called when tcp_use_frto() showed green light + * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm + * - tcp_enter_frto_loss() is called if there is not enough evidence + * to prove that the RTO is indeed spurious. It transfers the control + * from F-RTO to the conventional RTO recovery + */ +static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_sync_left_out(tp); + + /* Duplicate the behavior from Loss state (fastretrans_alert) */ + if (flag&FLAG_DATA_ACKED) + inet_csk(sk)->icsk_retransmits = 0; + + if (!before(tp->snd_una, tp->frto_highmark)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); + return 1; + } - if (tp->snd_una == prior_snd_una || - !before(tp->snd_una, tp->frto_highmark)) { - /* RTO was caused by loss, start retransmitting in - * go-back-N slow start + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate */ - tcp_enter_frto_loss(sk); - return; + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; + + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. */ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter >= 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { - /* First ACK after RTO advances the window: allow two new - * segments out. - */ + /* Sending of the next skb must be allowed or no FRTO */ + if (!sk->sk_send_head || + after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + tp->frto_counter = 2; + return 1; } else { - /* Also the second ACK after RTO advances the window. - * The RTO was likely spurious. 
Reduce cwnd and continue - * in congestion avoidance - */ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tcp_moderate_cwnd(tp); + switch (sysctl_tcp_frto_response) { + case 2: tcp_undo_spur_to_response(sk, flag); break; + case 3: tcp_middleground_spur_to_response(tp); break; + case 0: tcp_ratehalving_spur_to_response(sk); break; + default: tcp_conservative_spur_to_response(tp); break; + } + tp->frto_counter = 0; } - - /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavior is back to normal. - */ - tp->frto_counter = (tp->frto_counter + 1) % 3; + return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ @@ -2495,6 +2867,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 prior_in_flight; s32 seq_rtt; int prior_packets; + int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -2557,15 +2930,16 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - tcp_process_frto(sk, prior_snd_una); + frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && + tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) + if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } @@ -4214,8 +4588,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tp->rx_opt.sack_ok && sysctl_tcp_fack) - tp->rx_opt.sack_ok |= 2; + if (tp->rx_opt.sack_ok) { + if (sysctl_tcp_rfc3517_pipe) { + tp->rx_opt.sack_ok |= 8; + } else if (sysctl_tcp_fack) { + tp->rx_opt.sack_ok |= 2; + } + } tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -4271,7 +4650,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + TCP_DELACK_MAX, sysctl_tcp_rto_max); discard: __kfree_skb(skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4b04c3e..8176d92 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1282,7 +1282,10 @@ static int tcp_v4_init_sock(struct sock *sk) * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = 2; + if (sysctl_tcp_iw) + tp->snd_cwnd = sysctl_tcp_iw; + else + tp->snd_cwnd = 2; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -1740,7 +1743,7 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) } sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %p %u %u %u %u %d", + "%08X %5d %8d %lu %d %p %u %u %u %u %d %u %u", i, src, srcp, dest, destp, sp->sk_state, tp->write_seq - tp->snd_una, (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), @@ -1755,7 +1758,8 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) icsk->icsk_ack.ato, (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tp->snd_ssthresh >= 0xFFFF ? 
-1 : tp->snd_ssthresh); + tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh, + tcp_packets_in_flight(tp), icsk->icsk_ca_state); } static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 624e2b2..bd59af0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -379,10 +379,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - newtp->snd_cwnd = 2; + if (sysctl_tcp_iw) + newtp->snd_cwnd = sysctl_tcp_iw; + else + newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; newtp->bytes_acked = 0; + newtp->dupacks = 0; + newtp->frto_counter = 0; newtp->frto_highmark = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b4f3ffe..b4dc8f9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -42,6 +42,9 @@ #include #include +int sysctl_tcp_delack = 0; /* Zero = linux way, non-zero = ms */ +int sysctl_tcp_maximize_adv_win = 0; + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; @@ -68,6 +71,9 @@ static void update_send_head(struct sock *sk, struct tcp_sock *tp, sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) sk->sk_send_head = NULL; + /* Don't override Nagle indefinitely with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tcp_packets_out_inc(sk, tp, skb); } @@ -221,6 +227,9 @@ void tcp_select_initial_window(int __space, __u32 mss, if (*rcv_wnd > init_cwnd*mss) *rcv_wnd = init_cwnd*mss; } + + if (sysctl_tcp_maximize_adv_win) + *rcv_wnd = space; /* Set the clamp no higher than max representable value */ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); @@ -474,7 +483,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(err <= 0)) return err; - tcp_enter_cwr(sk); + tcp_enter_cwr(sk, 1); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. It tells that device @@ -873,8 +882,11 @@ static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) - tcp_cwnd_application_limited(sk); + if (sysctl_tcp_application_limited) { + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) + tcp_cwnd_application_limited(sk); + } else + tp->snd_cwnd_used = 0; } } @@ -962,8 +974,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (RFC4138).
+ */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1489,6 +1503,9 @@ u32 __tcp_select_window(struct sock *sk) if (mss > full_space) mss = full_space; + if (sysctl_tcp_maximize_adv_win) + free_space = full_space; + if (free_space < full_space/2) { icsk->icsk_ack.quick = 0; @@ -1499,7 +1516,7 @@ u32 __tcp_select_window(struct sock *sk) return 0; } - if (free_space > tp->rcv_ssthresh) + if (!sysctl_tcp_maximize_adv_win && free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the @@ -1826,7 +1843,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + sysctl_tcp_rto_max); } packet_cnt += tcp_skb_pcount(skb); @@ -1892,7 +1909,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + sysctl_tcp_rto_max); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -2181,7 +2198,7 @@ int tcp_connect(struct sock *sk) /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + inet_csk(sk)->icsk_rto, sysctl_tcp_rto_max); return 0; } @@ -2195,7 +2212,9 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - if (ato > TCP_DELACK_MIN) { + if (sysctl_tcp_delack) + ato = sysctl_tcp_delack * HZ / 1000; + else if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ/2; @@ -2227,7 +2246,8 @@ void tcp_send_delayed_ack(struct sock *sk) * send ACK now. */ if (icsk->icsk_ack.blocked || - time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2)) && + !sysctl_tcp_delack)) { tcp_send_ack(sk); return; } @@ -2257,7 +2277,7 @@ void tcp_send_ack(struct sock *sk) inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + TCP_DELACK_MAX, sysctl_tcp_rto_max); return; } @@ -2385,8 +2405,8 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_backoff++; icsk->icsk_probes_out++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + min(icsk->icsk_rto << icsk->icsk_backoff, sysctl_tcp_rto_max), + sysctl_tcp_rto_max); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. 
@@ -2399,7 +2419,7 @@ void tcp_send_probe0(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + sysctl_tcp_rto_max); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 7c1bde3..5f4a0e5 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -31,6 +31,8 @@ int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 = TCP_RETR1; int sysctl_tcp_retries2 = TCP_RETR2; int sysctl_tcp_orphan_retries; +u32 sysctl_tcp_rto_max = TCP_RTO_MAX; +u32 sysctl_tcp_rto_min = TCP_RTO_MIN; static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); @@ -71,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*sysctl_tcp_rto_max || !do_reset) orphans <<= 1; /* If some dubious ICMP arrived, penalize even more. */ @@ -149,7 +151,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + const int alive = (icsk->icsk_rto < sysctl_tcp_rto_max); retry_until = tcp_orphan_retries(sk, alive); @@ -256,7 +258,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < sysctl_tcp_rto_max); max_probes = tcp_orphan_retries(sk, alive); @@ -301,7 +303,7 @@ static void tcp_retransmit_timer(struct sock *sk) inet->num, tp->snd_una, tp->snd_nxt); } #endif - if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_time_stamp - tp->rcv_tstamp > sysctl_tcp_rto_max) { tcp_write_err(sk); goto out; } @@ -349,7 +351,7 @@ static void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + sysctl_tcp_rto_max); goto out; } @@ -372,8 +374,8 @@ static void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; out_reset_timer: - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + icsk->icsk_rto = min(icsk->icsk_rto << 1, sysctl_tcp_rto_max); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, sysctl_tcp_rto_max); if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); @@ -428,7 +430,7 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); + TCP_TIMEOUT_INIT, sysctl_tcp_rto_max); } void tcp_set_keepalive(struct sock *sk, int val) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 5af8a59..346d845 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -31,7 +31,7 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) #else #define DPRINTK(format,args...)
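
Usage note (not part of the patch): the knobs added above are plain integer sysctls under /proc/sys/net/ipv4/, matching the procnames registered in ipv4_table. Below is a minimal userspace sketch for turning on SACK-enhanced F-RTO and choosing a spurious-RTO response; the value meanings are taken from the ip-sysctl.txt hunk and the tcp_process_frto() response switch, and the write_ipv4_sysctl() helper is our own illustration, not something the patch provides.

#include <stdio.h>

/* Write one integer sysctl under /proc/sys/net/ipv4/.
 * Assumes the patch above is applied so these files exist.
 */
static int write_ipv4_sysctl(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "w");
	if (f == NULL) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* 2 = SACK-enhanced F-RTO (marked EXPERIMENTAL in the doc hunk),
	 * 1 = basic F-RTO, 0 = disabled.
	 */
	write_ipv4_sysctl("tcp_frto", "2");
	/* Response to a detected spurious RTO, per tcp_process_frto():
	 * 0 = rate halving, 2 = undo cwnd/ssthresh, 3 = restore prior
	 * ssthresh, anything else (e.g. 1) = conservative response.
	 */
	write_ipv4_sysctl("tcp_frto_response", "1");
	return 0;
}

The same effect can be had with sysctl -w net.ipv4.tcp_frto=2 and sysctl -w net.ipv4.tcp_frto_response=1; leaving tcp_frto at 0 keeps F-RTO disabled.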