diff -r -u -N linux-2.4.19-clean/include/linux/iip.h linux-2.4.19-p1/include/linux/iip.h
--- linux-2.4.19-clean/include/linux/iip.h	Thu Jan 1 02:00:00 1970
+++ linux-2.4.19-p1/include/linux/iip.h	Thu Nov 7 10:58:54 2002
@@ -0,0 +1,23 @@
+/* IIP-MOBILE/IWTCP related definitions
+ * for IIP-MOBILE project at dept. of Computer Science
+ * at the University of Helsinki
+ *
+ * Version: $Id: iwtcp.h,v 1.1 2000/06/06 12:59:30 sarolaht Exp $
+ *
+ * Authors: Pasi Sarolahti
+ */
+
+#ifndef __LINUX_IWTCP_H
+#define __LINUX_IWTCP_H
+
+#include
+#include
+
+/* Functions for receiver window sharing */
+extern int srwnd_is_shared(const struct sock *sk);
+extern u32 srwnd_get_share(const struct tcp_opt *tp);
+extern void srwnd_adj_connections(const struct sock *sk, int adjustment);
+extern void srwnd_use_window(const struct sock *sk, int adjustment);
+extern void srwnd_free_window(const struct sock *sk, int adjustment);
+
+#endif
diff -r -u -N linux-2.4.19-clean/include/linux/sysctl.h linux-2.4.19-p1/include/linux/sysctl.h
--- linux-2.4.19-clean/include/linux/sysctl.h	Sat Aug 3 03:39:46 2002
+++ linux-2.4.19-p1/include/linux/sysctl.h	Thu Nov 7 10:55:44 2002
@@ -291,7 +291,21 @@
 	NET_IPV4_NONLOCAL_BIND=88,
 	NET_IPV4_ICMP_RATELIMIT=89,
 	NET_IPV4_ICMP_RATEMASK=90,
-	NET_TCP_TW_REUSE=91
+	NET_TCP_TW_REUSE=91,
+
+	/* IIP-Mobile parameters from 95 onwards */
+	NET_IIP_RTO_BEHAVIOUR=95,
+	NET_IIP_CBI=96,
+	/* NET_IIP_RFC2988=97, (not used anymore) */
+	NET_IIP_SRWND_ADDR=98,
+	NET_IIP_SRWND_SIZE=99,
+	NET_IIP_RATEHALVING=100,
+	NET_IIP_LIMITEDXMIT=101,
+	NET_IIP_DELACK_MODE=102,
+	NET_IIP_SRWND_MIN=103,
+	NET_IIP_SRWND_MAX=104,
+	NET_IIP_IW=105
+
 };
 
 enum {
diff -r -u -N linux-2.4.19-clean/include/linux/tcp.h linux-2.4.19-p1/include/linux/tcp.h
--- linux-2.4.19-clean/include/linux/tcp.h	Thu Nov 22 21:47:11 2001
+++ linux-2.4.19-p1/include/linux/tcp.h	Thu Nov 7 10:55:28 2002
@@ -128,6 +128,10 @@
 #define TCP_INFO	11	/* Information about this connection. */
 #define TCP_QUICKACK	12	/* Block/reenable quick acks */
 
+/* IIP */
+#define TCP_IIP_HACKCODE 13	/* For switching TCP enhancements on a
+				 * per-socket basis */
+
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK	2
 #define TCPI_OPT_WSCALE	4
diff -r -u -N linux-2.4.19-clean/include/net/sock.h linux-2.4.19-p1/include/net/sock.h
--- linux-2.4.19-clean/include/net/sock.h	Sat Aug 3 03:39:46 2002
+++ linux-2.4.19-p1/include/net/sock.h	Thu Nov 7 10:54:23 2002
@@ -418,7 +418,23 @@
 	int	linger2;
 
 	unsigned long last_synq_overflow;
+
+	/* IIP (PS) */
+	int	iip_rtoflag;	/* set on RTO, cleared when the first ack
+				 * arrives after RTO */
+	__u32	iip_rtoseq;	/* Segment that triggered RTO */
+	__u32	iip_rtohighseq;	/* Highest segment transmitted when
+				 * RTO occurred */
+	int	iip_dupacks;	/* counts successive dupacks */
+	int	iip_hackcode;	/* Tune TCP enhancements on a per-socket
+				 * basis. See info below */
 };
+
+/* IIP: Possible flags in hackcode field */
+#define IIP_HCODE_DISABLE_FRTO	1	/* Use regular RTO recovery for this
+					 * socket */
+
+/*
diff -r -u -N linux-2.4.19-clean/include/net/tcp.h linux-2.4.19-p1/include/net/tcp.h
--- linux-2.4.19-clean/include/net/tcp.h	Sat Aug 3 03:39:46 2002
+++ linux-2.4.19-p1/include/net/tcp.h	Thu Nov 7 10:54:02 2002
@@ -30,6 +30,118 @@
 #include
 #include
 
+/* IIP-Mobile sysctls use these */
+extern int sysctl_iip_cbi;	/* Control Block Interdependence */
+extern int sysctl_iip_ratehalving;
+extern int sysctl_iip_limitedxmit;
+extern int sysctl_iip_iw;
+extern int sysctl_iip_rto_behaviour;
+enum {
+	IIP_RTO_LINUX = 1,	/* Default Linux behaviour */
+	IIP_RTO_PASIX = 2,	/* Brave variant of F-RTO */
+	IIP_RTO_CF_RTO = 3	/* Conservative (and simpler) variant
+				 * of F-RTO */
+};
+
+/* IIP: These flags are used with F-RTOs */
+enum {
+	IIP_RTOF_OPEN,		/* Normal state */
+	IIP_RTOF_RTO,		/* RTO occurred, no acks yet received */
+	IIP_RTOF_RECOVERING	/* Recovering from RTO, send_high not yet
+				   reached */
+};
+
+extern int sysctl_iip_delack_mode;
+enum {
+	IIP_DELACK_STANDARD = 1,
+	IIP_DELACK_LINUX = 2
+};
+
+/* IIP (PS): The scariest experimental things are applied only to a subset of
+ * connections. Currently connections destined to 10.*.*.* are considered
+ * test connections for experimental things.
+ */
+static __inline__ int iip_is_test_connection(const struct sock *sk)
+{
+	return ((sk->daddr & 0x000000ff) == 0x0000000a);
+}
+
+/* IIP (PS): Convenience func to check if F-RTOs should be applied for the
+ * connection
+ */
+static __inline__ int iip_use_pasix_rto(const struct sock *sk)
+{
+	const struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+	/* This must be destined to 10.*.*.*, F-RTO rto_behaviour must be
+	 * activated in sysctl, and there must be some unsent new data
+	 */
+	return (iip_is_test_connection(sk) &&
+		sysctl_iip_rto_behaviour == IIP_RTO_PASIX &&
+		tp->send_head &&
+		!(tp->iip_hackcode & IIP_HCODE_DISABLE_FRTO));
+}
+
+
+static __inline__ int iip_use_cf_rto(const struct sock *sk)
+{
+	const struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+	/* This must be destined to 10.*.*.*, F-RTO rto_behaviour must be
+	 * activated in sysctl, and there must be some unsent new data
+	 */
+	return (iip_is_test_connection(sk) &&
+		sysctl_iip_rto_behaviour == IIP_RTO_CF_RTO &&
+		tp->send_head &&
+		!(tp->iip_hackcode & IIP_HCODE_DISABLE_FRTO));
+}
+
+
+static __inline__ int iip_use_any_frto(const struct sock *sk)
+{
+	return (iip_use_pasix_rto(sk) || iip_use_cf_rto(sk));
+}
+
+
+/* IIP (PS): Some modifications are still incorrectly made, causing the
+ * sacked_out, lost_out, etc. counters to have invalid, even negative values.
+ * If such values are encountered, fix them and hope that the situation will
+ * normalize eventually :-). Yes, this function will disappear ASAP, when the
+ * problem is really fixed.
+ */
+static __inline__ void iip_fake_counters(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+	if ((int)tp->sacked_out < 0) {
+		printk(KERN_DEBUG "sacked_out = %d. Faking...\n",
+		       tp->sacked_out);
+		tp->sacked_out = 0;
+	}
+	if ((int)tp->lost_out < 0) {
+		printk(KERN_DEBUG "lost_out = %d. Faking...\n",
+		       tp->lost_out);
+		tp->lost_out = 0;
+	}
+	if ((int)tp->retrans_out < 0) {
+		printk(KERN_DEBUG "retrans_out = %d. Faking...\n",
+		       tp->retrans_out);
+		tp->retrans_out = 0;
+	}
+}
+
+/* IIP: Place this check here and there to make sure that cwnd does not fall
+ * below 1
+ */
+static __inline__ void iip_cwnd_check(struct tcp_opt *tp, const char *text)
+{
+	if ((int)tp->snd_cwnd < 1) {
+		printk(KERN_DEBUG "warning: %s: cwnd = %d. Fixing...\n",
+		       text, tp->snd_cwnd);
+		tp->snd_cwnd = 1;
+	}
+}
+
 /* This is for all connections with a full identity, no wildcards.
  * New scheme, half the table is for TIME_WAIT, the other half is
  * for the rest. I'll experiment with dynamic table growth later.
@@ -881,6 +993,8 @@
 		break;
 	case TCP_TIME_DACK:
+		/* IIP debug */
+/*		SOCK_DEBUG(sk, "resetting DACK timer\n"); */
 		tp->ack.pending |= TCP_ACK_TIMER;
 		tp->ack.timeout = jiffies+when;
 		if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
@@ -1069,7 +1183,19 @@
  */
 static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp)
 {
-	return tp->packets_out - tp->left_out + tp->retrans_out;
+	/* IIP (PS): Follow Allman-SACK draft's "pipe", if pure SACK
+	 * The statement below is (the defines were in tcp_input.c):
+	 *	if (IsReno || IsFack)
+	 * Note:
+	 * - If in Loss state, we will also have to consider lost_out in
+	 *   order to do go-back-N correctly.
+	 * - If using F-RTOs and RTO has occurred, assume FACK behaviour
+	 */
+	if (tp->sack_ok == 0 || tp->sack_ok & 2 ||
+	    tp->ca_state == TCP_CA_Loss || tp->iip_rtoflag != IIP_RTOF_OPEN)
+		return tp->packets_out - tp->left_out + tp->retrans_out;
+	else
+		return tp->packets_out - tp->sacked_out + tp->retrans_out;
 }
 
 /* Recalculate snd_ssthresh, we want to set it to:
@@ -1343,11 +1469,16 @@
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sleep);
-			if (!tcp_ack_scheduled(tp))
-				tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
-		}
-		return 1;
-	}
+			if (!tcp_ack_scheduled(tp)) {
+				if (sysctl_iip_delack_mode != IIP_DELACK_STANDARD)
+					tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+				else
+					tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			}
+		}
+		return 1;
+	}
+
 	return 0;
 }
diff -r -u -N linux-2.4.19-clean/net/ipv4/Makefile linux-2.4.19-p1/net/ipv4/Makefile
--- linux-2.4.19-clean/net/ipv4/Makefile	Fri Dec 21 19:42:05 2001
+++ linux-2.4.19-p1/net/ipv4/Makefile	Thu Nov 7 10:59:11 2002
@@ -15,8 +15,10 @@
 		ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 		ip_output.o ip_sockglue.o \
 		tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
-		tcp_diag.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-		sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+		tcp_diag.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+		sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o \
+		share_rwnd.o
+
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_ROUTE_NAT) += ip_nat_dumb.o
diff -r -u -N linux-2.4.19-clean/net/ipv4/ip_input.c linux-2.4.19-p1/net/ipv4/ip_input.c
--- linux-2.4.19-clean/net/ipv4/ip_input.c	Sat Aug 3 03:39:46 2002
+++ linux-2.4.19-p1/net/ipv4/ip_input.c	Thu Nov 7 11:00:46 2002
@@ -438,6 +438,7 @@
 		       ip_rcv_finish);
 
 inhdr_error:
+	printk(KERN_DEBUG "IP Receive error\n");
 	IP_INC_STATS_BH(IpInHdrErrors);
 drop:
 	kfree_skb(skb);
diff -r -u -N linux-2.4.19-clean/net/ipv4/share_rwnd.c linux-2.4.19-p1/net/ipv4/share_rwnd.c
--- linux-2.4.19-clean/net/ipv4/share_rwnd.c	Thu Jan 1 02:00:00 1970
+++ linux-2.4.19-p1/net/ipv4/share_rwnd.c	Thu Nov 7 11:01:15 2002
@@ -0,0 +1,126 @@
+/* Receiver window sharing implementation
+ * for IWTCP/IIP-MOBILE projects at dept. of Computer Science
+ * at the University of Helsinki
+ *
+ * Version: $Id: share_rwnd.c,v 1.6 2001/08/02 14:09:24 sarolaht Exp $
+ *
+ * Authors: Pasi Sarolahti
+ */
+
+#include
+#include
+
+
+static struct shared_cb {
+	u32	srw_size;	/* Size of shared window (bytes) */
+	u32	srw_free;	/* Free shared window (bytes) */
+	int	srw_conn_count;	/* Total number of connections using shared
+				 * window */
+} srw = {
+	0,	/* Size is initialized from sysctl when the first shared
+		 * connection appears */
+	0,
+	0
+};
+
+/* Read also the note in srwnd_is_shared() */
+/* First 8 bits of the address to be shared. If 0, no addresses will be shared.
+ * Addresses 10.*.*.* share the window space. If 255, all connections
+ * share the same address space.
+ */
+int sysctl_iip_srwnd_addr = 0;
+/* Initial size of shared receiver window in bytes */
+int sysctl_iip_srwnd_size = 16384;
+
+/* Maximum window advertised for an individual
+ * connection at a time (bytes) */
+int sysctl_iip_srwnd_max = 16384;
+/* Minimum window advertised for an individual connection, regardless of
+ * the srw_size or the segment size (given in bytes)
+ */
+int sysctl_iip_srwnd_min = 1024;	/* bytes */
+
+
+/* Only socks connected to certain address masks (determined by the most
+ * significant 8 bits in a sysctl variable) use the shared receiver window
+ */
+__inline__ int srwnd_is_shared(const struct sock *sk)
+{
+	if (!sysctl_iip_srwnd_addr)
+		return 0;
+	else if (sysctl_iip_srwnd_addr == 0xff)
+		return 1;
+	else
+		/* The right alternative below was commented out
+		 * because it isn't practical in IWTCP tests.
+		 * Now the srwnd sysctl defines the least significant byte of
+		 * the IP. Only addresses in the 10.0.0.* family can be
+		 * shared. Note that now only connections to one IP address
+		 * can be shared. Not usable in the real world.
+		 */
+		return ((sk->daddr & 0x00ffffff) == 0x0000000a)
+			&& ((sk->daddr >> 24) == sysctl_iip_srwnd_addr);
+/*		return ((sk->daddr & 0xff) == sysctl_iwtcp_srwnd_addr); */
+}
+
+
+__inline__ u32 srwnd_get_share(const struct tcp_opt *tp)
+{
+	u32 wnd, maxfree;
+
+	if (srw.srw_conn_count < 1) {
+		printk(KERN_DEBUG
+		       "srwnd_get_share: invalid connection count: %d\n",
+		       srw.srw_conn_count);
+		return srw.srw_free;	/* fallback */
+	}
+
+	/* Truncate free window to be divisible by mss. Make sure that
+	 * negative values are not returned. Also check that min and max
+	 * boundaries are not exceeded. */
+	maxfree = min(((u32)srw.srw_free / srw.srw_conn_count),
+		      sysctl_iip_srwnd_max);
+	wnd = max((maxfree/tp->mss_cache) * tp->mss_cache, 0);
+	return max(wnd, sysctl_iip_srwnd_min);
+}
+
+
+void srwnd_adj_connections(const struct sock *sk, int adjustment)
+{
+	/* printk( KERN_DEBUG "adj_connections: daddr: %08x shared: %d\n",
+	   sk->daddr, srwnd_is_shared(sk)); */
+	if (!srwnd_is_shared(sk)) return;
+
+	/* Make sure the shared window size is initialized according to the
+	 * sysctl variable. To reinitialize, all connections using it must
+	 * be closed. */
+	if (srw.srw_conn_count <= 0)
+		srw.srw_size = srw.srw_free = sysctl_iip_srwnd_size;
+
+	srw.srw_conn_count += adjustment;
+
+	/* Just to cover possible window leaks, hopefully never used */
+	if (srw.srw_conn_count <= 0) {
+		if (srw.srw_free < srw.srw_size)
+			printk(KERN_DEBUG
+			       "share_rwnd.c: No connections but %d bytes missing from window\n",
+			       srw.srw_size - srw.srw_free);
+		srw.srw_free = srw.srw_size;
+		srw.srw_conn_count = 0;	/* Fallback: remove when sure
+					 * the accounting works properly?
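+					 *
+					 * For reference, the intended pairing of
+					 * this module's calls, as wired up
+					 * elsewhere in this patch (illustrative
+					 * summary, not new code):
+					 *
+					 *  srwnd_adj_connections(sk, 1);  connect/accept
+					 *  srwnd_use_window(sk, rcvd);    data queued to rcvbuf
+					 *  srwnd_free_window(sk, copied); data read by the user
+					 *  srwnd_adj_connections(sk, -1); socket destroy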
*/ + } +} + + +__inline__ void srwnd_use_window(const struct sock *sk, int adjustment) +{ + if (!srwnd_is_shared(sk)) return; + srw.srw_free -= adjustment; +} + + +__inline__ void srwnd_free_window(const struct sock *sk, int adjustment) +{ + if (!srwnd_is_shared(sk)) return; + srw.srw_free += adjustment; +} diff -r -u -N linux-2.4.19-clean/net/ipv4/sysctl_net_ipv4.c linux-2.4.19-p1/net/ipv4/sysctl_net_ipv4.c --- linux-2.4.19-clean/net/ipv4/sysctl_net_ipv4.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/sysctl_net_ipv4.c Thu Nov 7 11:01:36 2002 @@ -45,6 +45,12 @@ extern int inet_peer_gc_mintime; extern int inet_peer_gc_maxtime; +/* IIP: From share_rwnd.c */ +extern int sysctl_iip_srwnd_addr; +extern int sysctl_iip_srwnd_size; +extern int sysctl_iip_srwnd_min; +extern int sysctl_iip_srwnd_max; + #ifdef CONFIG_SYSCTL static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -221,6 +227,28 @@ &sysctl_icmp_ratemask, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_TW_REUSE, "tcp_tw_reuse", &sysctl_tcp_tw_reuse, sizeof(int), 0644, NULL, &proc_dointvec}, + + /* IIP-Mobile parameters */ + {NET_IIP_RTO_BEHAVIOUR, "iip_rto_behaviour", + &sysctl_iip_rto_behaviour, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_CBI, "iip_cbi", + &sysctl_iip_cbi, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_SRWND_ADDR, "iip_srwnd_addr", + &sysctl_iip_srwnd_addr, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_SRWND_SIZE, "iip_srwnd_size", + &sysctl_iip_srwnd_size, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_SRWND_MIN, "iip_srwnd_min", + &sysctl_iip_srwnd_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_SRWND_MAX, "iip_srwnd_max", + &sysctl_iip_srwnd_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_RATEHALVING, "iip_ratehalving", + &sysctl_iip_ratehalving, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_LIMITEDXMIT, "iip_limitedxmit", + &sysctl_iip_limitedxmit, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_DELACK_MODE, "iip_delack_mode", + &sysctl_iip_delack_mode, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IIP_IW, "iip_iw", + &sysctl_iip_iw, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp.c linux-2.4.19-p1/net/ipv4/tcp.c --- linux-2.4.19-clean/net/ipv4/tcp.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/tcp.c Thu Nov 7 11:01:54 2002 @@ -259,6 +259,8 @@ #include #include +#include + int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; struct tcp_mib tcp_statistics[NR_CPUS*2]; @@ -270,7 +272,10 @@ atomic_t tcp_orphan_count = ATOMIC_INIT(0); int sysctl_tcp_mem[3]; -int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 }; +/* IIP: Changed wmem[1] from 16 KB to 32 KB by default, because 16 KB blocked + * the sender on some cases + */ +int sysctl_tcp_wmem[3] = { 4*1024, 32*1024, 128*1024 }; int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 }; atomic_t tcp_memory_allocated; /* Current allocated memory. */ @@ -1290,6 +1295,16 @@ if (tcp_ack_scheduled(tp)) { /* Delayed ACKs frequently hit locked sockets during bulk receive. 
*/ + /* IIP debug */ +/* SOCK_DEBUG(sk, + "cleanup a: blocked: %d 2*mss: %d third: %d\n", + tp->ack.blocked, + tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss, + (copied > 0 && + (tp->ack.pending&TCP_ACK_PUSHED) && + !tp->ack.pingpong && + atomic_read(&sk->rmem_alloc) == 0)); */ + if (tp->ack.blocked /* Once-per-two-segments ACK was not sent by tcp_input.c */ || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss @@ -1299,7 +1314,11 @@ * receive buffer and there was a small segment * in queue. */ - || (copied > 0 && + /* IIP (PS): The last condition does not apply on + * "STANDARD" delack mode + */ + || (sysctl_iip_delack_mode != IIP_DELACK_STANDARD && + copied > 0 && (tp->ack.pending&TCP_ACK_PUSHED) && !tp->ack.pingpong && atomic_read(&sk->rmem_alloc) == 0)) { @@ -1307,6 +1326,11 @@ } } + if (copied > 0) { + /* IIP (PS): Give space in shared window, if it is in use */ + srwnd_free_window(sk, copied); + } + /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * @@ -1329,8 +1353,9 @@ time_to_ack = 1; } } - if (time_to_ack) + if (time_to_ack) { tcp_send_ack(sk); + } } /* Now socket state including sk->err is changed only under lock, @@ -2391,6 +2416,15 @@ } break; + case TCP_IIP_HACKCODE: + /* IIP: Hackcode allows user to tune TCP enhancements on per + * socket basis. Further info of the codes and flags for this + * field in include/net/sock.h + */ + tp->iip_hackcode = val; + SOCK_DEBUG(sk, "Setting hackcode: %d\n", tp->iip_hackcode); + break; + default: err = -ENOPROTOOPT; break; diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp_input.c linux-2.4.19-p1/net/ipv4/tcp_input.c --- linux-2.4.19-clean/net/ipv4/tcp_input.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/tcp_input.c Thu Nov 7 11:02:07 2002 @@ -69,17 +69,18 @@ #include #include -int sysctl_tcp_timestamps = 1; +int sysctl_tcp_timestamps = 0; int sysctl_tcp_window_scaling = 1; -int sysctl_tcp_sack = 1; -int sysctl_tcp_fack = 1; +int sysctl_tcp_sack = 0; +/* IIP (PS): Changed FACK to be disabled by default */ +int sysctl_tcp_fack = 0; int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; #ifdef CONFIG_INET_ECN int sysctl_tcp_ecn = 1; #else -int sysctl_tcp_ecn = 0; +int sysctl_tcp_ecn = 0; /* IIP (PS): For the present ECN is off by default */ #endif -int sysctl_tcp_dsack = 1; +int sysctl_tcp_dsack = 0; /* IIP (PS): Off by default */ int sysctl_tcp_app_win = 31; int sysctl_tcp_adv_win_scale = 2; @@ -87,6 +88,13 @@ int sysctl_tcp_rfc1337 = 0; int sysctl_tcp_max_orphans = NR_FILE; +/* IIP-Mobile variables */ +int sysctl_iip_cbi = 0; /* Control Block Interdependence */ +int sysctl_iip_ratehalving = 0; +int sysctl_iip_limitedxmit = 0; +int sysctl_iip_delack_mode = IIP_DELACK_STANDARD; +int sysctl_iip_iw = 2; /* Initial window */ + #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ @@ -177,7 +185,9 @@ static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) { - return (tp->ack.quick && !tp->ack.pingpong); + /* IIP (PS): In STANDARD delayed ACK mode quickacks are not used */ + return (!(sysctl_iip_delack_mode == IIP_DELACK_STANDARD) && + tp->ack.quick && !tp->ack.pingpong); } /* Buffer size and advertised window tuning. @@ -376,6 +386,10 @@ } else { int m = now - tp->ack.lrcvtime; + /* IIP debug */ +/* SOCK_DEBUG(sk, "ato calculation: m: %d ato: %d\n", + m, tp->ack.ato); */ + if (m <= TCP_ATO_MIN/2) { /* The fastest case is the first. 
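 *
 * (Note that in IIP_DELACK_STANDARD mode this adaptive ato is
 * effectively unused: tcp_in_quickack_mode() above always returns 0,
 * and tcp_send_delayed_ack() in tcp_output.c pins the timer, in essence
 *
 *	ato = (sysctl_iip_delack_mode == IIP_DELACK_STANDARD)
 *		? TCP_DELACK_MAX : ato;
 *
 * so the standard 200 ms delayed-ACK behaviour is emulated.)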
*/ tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; @@ -484,14 +498,14 @@ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. - */ + */ tp->rto = (tp->srtt >> 3) + tp->rttvar; - + /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exaclty, which we pretend to do. - */ + */ } /* NOTE: clamping at TCP_RTO_MIN is not required, current algo @@ -605,8 +619,9 @@ cwnd = (tp->mss_cache > 1095) ? 3 : 4; + /* IIP (PS): Initial window is sysctl variable */ if (!tp->srtt || (tp->snd_ssthresh >= 0xFFFF && tp->srtt > ((HZ/50)<<3))) - cwnd = 2; + cwnd = sysctl_iip_iw; else if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh; @@ -620,7 +635,7 @@ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst == NULL) + if (dst == NULL || !sysctl_iip_cbi) goto reset; dst_confirm(dst); @@ -941,6 +956,8 @@ } } + /* IIP (PS) */ + iip_fake_counters(sk); tp->left_out = tp->sacked_out + tp->lost_out; if (reord < tp->fackets_out && tp->ca_state != TCP_CA_Loss) @@ -972,39 +989,111 @@ * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ -void tcp_enter_loss(struct sock *sk, int how) +void tcp_enter_loss2(struct sock *sk, int how, int loss_postponed) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff *skb; int cnt = 0; + if (!loss_postponed) { + tp->iip_rtoseq = + TCP_SKB_CB(skb_peek(&sk->write_queue))->end_seq; + } + /* Reduce ssthresh if it has not yet been made inside this window. */ - if (tp->ca_state <= TCP_CA_Disorder || - tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + /* IIP (PS): If F-RTOs are used, ssthresh is adjusted later. If there + * are any packet losses, ssthresh is adjusted at FR. If not, ssthresh + * does not need to be adjusted. + */ + /* IIP: Nasty exception to the nice F-RTO principle happens + * when RTO is triggered in Recovery state. Then it is probably + * another indication of congestion, hence ssthresh needs to + * be adjusted another time. + */ + /* IIP (14.11.2001): pasix rtos (=2) are not correct currently. + * They do not adjust ssthresh at all after rto. With pasix rtos the + * ssthresh should be adjusted when loss_postponed is set + */ + if (!iip_use_pasix_rto(sk) && !loss_postponed && + (tp->ca_state != TCP_CA_Loss || !tp->retransmits)) { + if (tp->ca_state <= TCP_CA_Disorder || + tp->snd_una == tp->high_seq) { + /* PS: Not yet in recovery, follow RFC 2582 */ + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } else { + /* PS: Already in recovery, RFC 2582 fails here. + * Don't use cwnd for ssthresh calculation + */ + tp->snd_ssthresh = max((tp->snd_ssthresh >> 1), 2); + } } - tp->snd_cwnd = 1; + + /* IIP (PS): F-RTOs do not reduce congestion window immediately after + * RTO. If packets were lost, recovery state is eventually entered + * and cwnd is adjusted appropriately */ + if (!iip_use_any_frto(sk)) + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing * was retransmitted. 
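 *
 * (For orientation, a condensed model of the F-RTO behaviour that this
 * function and tcp_ack() implement together; step numbers refer to the
 * F-RTO paper, and this is an illustrative summary, not literal code:
 *
 *	RTO:      retransmit only the head segment, keep cwnd/ssthresh
 *	1st ACK:  dupack   -> 2b: declare real loss, do go-back-N
 *	          new data -> 2a: send two new segments, keep waiting
 *	2nd ACK:  dupack   -> 3b: enter conventional loss recovery
 *	          new data -> 3a: timeout was spurious, only halve cwnd)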
*/ - if (!how) + /* IIP (PS): Eifel does not work if backed off RTOs */ + if (!how && tp->ca_state != TCP_CA_Recovery) { tp->undo_marker = tp->snd_una; + } + + /* IIP debug */ +/* if (loss_postponed) { + SOCK_DEBUG(sk, "RTO postponed: rtoseq: %u highseq: %u\n", + tp->iip_rtoseq, tp->iip_rtohighseq); + } */ for_retrans_queue(skb, sk, tp) { cnt++; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + /* IIP (PS): Eifel does not work if there are backed off RTO */ +/* if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + tp->undo_marker = 0; */ + + if (!loss_postponed) { + if (tp->iip_rtohighseq == 0 || + after(TCP_SKB_CB(skb)->end_seq, + tp->iip_rtohighseq)) + tp->iip_rtohighseq = TCP_SKB_CB(skb)->end_seq; + } + + /* IIP: This clears retrans flags. Don't want to do that on the + * recently retransmitted segments on postponed loss state + */ + if (!loss_postponed) { + TCP_SKB_CB(skb)->sacked &= + (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + } + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out++; + + /* IIP (PS): Only account for one loss if Pasix RTOs */ + if (!iip_use_any_frto(sk) || loss_postponed) { + /* IIP: Don't want to retransmit the recently + * triggered retransmission or the two new + * segments + */ + if (loss_postponed && + (!after(TCP_SKB_CB(skb)->end_seq, + tp->iip_rtoseq) || + after(TCP_SKB_CB(skb)->end_seq, + tp->iip_rtohighseq))) { + /* continue; */ + + } else { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } + } } else { tp->sacked_out++; tp->fackets_out = cnt; @@ -1012,10 +1101,28 @@ } tcp_sync_left_out(tp); - tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tp->ca_state = TCP_CA_Loss; - tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); + /* IIP (PS): I suspect this is not actually needed */ + if (loss_postponed && tp->iip_rtoflag != IIP_RTOF_OPEN) { + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + } + + /* IIP (PS): If F-RTOs are used, don't move yet on Loss or Recover + * states, but continue in the present state. Fast recovery + * mechanism will take care of further packet losses */ + if (!iip_use_any_frto(sk) || loss_postponed) { + tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); + tp->ca_state = TCP_CA_Loss; + if (loss_postponed) + tp->high_seq = tp->iip_rtohighseq; + else + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); + } +} + +void tcp_enter_loss(struct sock *sk, int how) +{ + tcp_enter_loss2(sk,how,0); } static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp) @@ -1043,7 +1150,13 @@ static inline int tcp_fackets_out(struct tcp_opt *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + /* IIP (PS): This used to be IsReno() check, which was wrong, IMO. + * However, brave variant of F-RTO uses FACK after rto, whatever the + * sysctl. */ +/* return !IsFack(tp) ? tp->sacked_out+1 : tp->fackets_out; */ + return (!IsFack(tp) && (tp->iip_rtoflag == IIP_RTOF_OPEN || + sysctl_iip_rto_behaviour != IIP_RTO_PASIX)) + ? tp->sacked_out+1 : tp->fackets_out; } static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb) @@ -1152,6 +1265,16 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp) { + /* IIP (PS): No tricks if FACK is not used - If number of dupacks is + * simply larger than 3, enter fast recovery, otherwise don't. 
+ */ + if (!IsFack(tp) && tp->iip_rtoflag == IIP_RTOF_OPEN) { + if (tp->iip_dupacks < sysctl_tcp_reordering) + return 0; + else + return 1; + } + /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; @@ -1252,7 +1375,12 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp) { - if (IsFack(tp)) { + /* IIP (PS): Mark all unacked segments in the "fack" range lost. + * If Allman-SACK is followed, in_flight does not count lost + * segments + */ +/* if (IsFack(tp)) { */ + if (!IsReno(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; @@ -1266,7 +1394,11 @@ * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (tcp_head_timedout(sk, tp)) { + /* IIP (PS): Suspecting that this is the place causing mismatch in + * lost_out counter (it gets even negative values) and halts + * the connection */ +/* if (tcp_head_timedout(sk, tp)) { */ + if (tcp_head_timedout(sk, tp) && !IsReno(tp)) { struct sk_buff *skb; for_retrans_queue(skb, sk, tp) { @@ -1287,6 +1419,8 @@ { tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + /* IIP (PS): Ensure that cwnd does not get negative value. */ + iip_cwnd_check(tp, "tcp_moderate_cwnd"); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -1299,10 +1433,18 @@ tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->snd_ssthresh/2) + /* IIP (PS): cwnd is decreased only if ratehalving sysctl is set */ + if (sysctl_iip_ratehalving && decr && + tp->snd_cwnd > tp->snd_ssthresh/2) tp->snd_cwnd -= decr; - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + /* IIP: Allow faster increase of cwnd if recovering from RTO in F-RTO + * policy */ + if (tp->iip_rtoflag != IIP_RTOF_OPEN) + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + + tcp_max_burst(tp)); + else + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -1368,6 +1510,9 @@ NET_INC_STATS_BH(TCPFullUndo); tp->undo_marker = 0; } + /* IIP (PS): Ensure that cwnd is never smaller than 1 */ + iip_cwnd_check(tp, "undo_recovery"); + if (tp->snd_una == tp->high_seq && IsReno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false @@ -1376,6 +1521,16 @@ return 1; } tp->ca_state = TCP_CA_Open; + + /* IIP (PS): retransmits has to be cleared at some point, otherwise + * we will have problems in the end of the test. This looks like a + * good place for this. */ + if (iip_use_any_frto(sk)) + tp->retransmits = 0; + + /* IIP (PS) */ + tp->iip_dupacks = 0; + return 0; } @@ -1407,6 +1562,8 @@ tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); DBGUNDO(sk, tp, "Hoe"); + /* IIP debug */ + SOCK_DEBUG(sk, "undo_partial: Hoe\n"); tcp_undo_cwr(tp, 0); NET_INC_STATS_BH(TCPPartialUndo); @@ -1422,7 +1579,14 @@ /* Undo during loss recovery after partial ACK. 
*/ static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp) { - if (tcp_may_undo(tp)) { + /* IIP debug */ +/* SOCK_DEBUG(sk, "undo loss: marker: %ul delayed: %d\n", + tp->undo_marker, tcp_packet_delayed(tp)); + SOCK_DEBUG(sk, "saw_ts: %d tsecr: %u secr-retrans: %d\n", + tp->saw_tstamp, tp->rcv_tsecr, + (__s32)(tp->rcv_tsecr - tp->retrans_stamp)); */ + + if (tcp_may_undo(tp) && !iip_use_cf_rto(sk)) { struct sk_buff *skb; for_retrans_queue(skb, sk, tp) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; @@ -1434,13 +1598,25 @@ NET_INC_STATS_BH(TCPLossUndo); tp->retransmits = 0; tp->undo_marker = 0; - if (!IsReno(tp)) + /* IIP (PS): This cannot be necessary condition, removing... */ +/* if (!IsReno(tp)) */ tp->ca_state = TCP_CA_Open; return 1; } return 0; } +/* IIP (PS) */ +static void tcp_try_undo_open(struct sock *sk, struct tcp_opt *tp) +{ + SOCK_DEBUG(sk, "undo_open: may_undo: %d undo_marker: %d undo_retrans: %d delayed: %d\n", + tcp_may_undo(tp), tp->undo_marker, tp->undo_retrans, + tcp_packet_delayed(tp)); + if (tcp_may_undo(tp)) { + tcp_undo_cwr(tp, 1); + } +} + static __inline__ void tcp_complete_cwr(struct tcp_opt *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); @@ -1470,6 +1646,15 @@ tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); + + /* IIP (PS): If limited xmit is not enabled, cwnd has to be + * reduced in order to avoid transmissions on Disorder state. + */ + if (!sysctl_iip_limitedxmit && state == TCP_CA_Disorder && + tp->snd_cwnd > 1) { + tp->snd_cwnd--; + } + } else { tcp_cwnd_down(tp); } @@ -1492,6 +1677,10 @@ { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + int iip_fastrxmit_done = 0; + + /* IIP debug */ +/* SOCK_DEBUG(sk, "enter fastretrans_alert\n"); */ /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ @@ -1525,7 +1714,8 @@ /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (tp->ca_state == TCP_CA_Open) { - BUG_TRAP(tp->retrans_out == 0); + /* IIP (PS): This bug trap is not valid with f-rtos */ +/* BUG_TRAP(tp->retrans_out == 0); */ tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (tp->ca_state) { @@ -1573,10 +1763,19 @@ tcp_add_reno_sack(tp); } else { int acked = prior_packets - tp->packets_out; - if (IsReno(tp)) + if (IsReno(tp)) { tcp_remove_reno_sacks(sk, tp, acked); + /* PS: quick hack. fix properly */ +/* tcp_retransmit_skb(sk, (sk)->write_queue.next); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + tp->rto); */ + } is_dupack = tcp_try_undo_partial(sk, tp, acked); } + + /* IIP (PS): Make sure that cwnd is never smaller than 1 at + * this point. Unfortunately it seems to be possible. */ + tp->snd_cwnd = max(tp->snd_cwnd, 1); break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) @@ -1591,7 +1790,9 @@ /* Loss is undone; fall through to processing in Open state. */ default: if (IsReno(tp)) { - if (tp->snd_una != prior_snd_una) + /* IIP experimental */ + if (tp->snd_una != prior_snd_una && + !iip_use_pasix_rto(sk)) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(tp); @@ -1600,8 +1801,15 @@ if (tp->ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk, tp); + /* IIP (PS): We should be in Disorder state */ + /* PS 8.11.2001: not necessarily... 
*/
+	if (is_dupack) tp->iip_dupacks++;
+
 	if (!tcp_time_to_recover(sk, tp)) {
 		tcp_try_to_open(sk, tp, flag);
+
+		if (tp->ca_state == TCP_CA_Open)
+			tp->iip_dupacks = 0;
 		return;
 	}
 
@@ -1626,11 +1834,58 @@
 	tp->snd_cwnd_cnt = 0;
 	tp->ca_state = TCP_CA_Recovery;
+
+	/* IIP (PS): If ratehalving is not in use, cwnd needs to be
+	 * decreased now and fast retransmit needs to be done.
+	 */
+	if (!sysctl_iip_ratehalving) {
+		if (sysctl_iip_limitedxmit) {
+			/* IIP (PS): No need to "artificially inflate"
+			 * cwnd, like RFC 2581 suggests in Sec. 3.2,
+			 * because dupacks are accounted in sacked_out.
+			 */
+			tp->snd_cwnd = min(tp->snd_cwnd,
+					   tp->snd_ssthresh);
+		} else {
+			/* IIP (PS): my limited xmit avoidance hack
+			 * artificially deflates cwnd, therefore it
+			 * needs to be increased back here
+			 */
+			tp->snd_cwnd = min(tp->snd_cwnd,
+					   tp->snd_ssthresh +
+					   sysctl_tcp_reordering);
+		}
+
+		/* IIP: First ack after rto: ensure that two segments
+		 * are transmitted */
+		if (tp->iip_rtoflag == IIP_RTOF_RTO) {
+			tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+		}
+
+		if ((sk)->write_queue.next != (tp)->send_head &&
+		    (struct sk_buff *)&(sk)->write_queue &&
+		    !(TCP_SKB_CB((sk)->write_queue.next)->sacked & TCPCB_RETRANS)) {
+			tcp_retransmit_skb(sk, (sk)->write_queue.next);
+			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
+					     tp->rto);
+			iip_fastrxmit_done = 1;
+		}
+	}
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
+	/* IIP (PS): Making sure that the left_out counter has a valid value */
+	if (tp->iip_rtoflag != IIP_RTOF_OPEN)
+		tcp_sync_left_out(tp);
 	tcp_cwnd_down(tp);
+	/* IIP: The fast retransmit a few lines earlier slightly messes up the
+	 * burst prevention, causing occasional bursts of four segments.
+	 * Therefore we do this stupid trick here.
+	 */
+	if (tp->iip_rtoflag != IIP_RTOF_OPEN && iip_use_pasix_rto(sk) &&
+	    iip_fastrxmit_done)
+		tp->snd_cwnd = max(tp->snd_cwnd - 1, 1);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -1717,7 +1972,7 @@
 }
 
 /* Restart timer after forward progress on connection.
- * RFC2988 recommends to restart timer to now+rto.
+ * RFC2988 recommends to restart timer to now+rto.
  */
 static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
@@ -1794,6 +2049,9 @@
 		tcp_ack_packets_out(sk, tp);
 	}
 
+	/* IIP (PS) */
+	iip_fake_counters(sk);
+
 #if FASTRETRANS_DEBUG > 0
 	BUG_TRAP((int)tp->sacked_out >= 0);
 	BUG_TRAP((int)tp->lost_out >= 0);
@@ -1843,7 +2101,8 @@
 static __inline__ int tcp_may_raise_cwnd(struct tcp_opt *tp, int flag)
 {
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+		((!((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR))) ||
+		 tp->iip_rtoflag != IIP_RTOF_OPEN);
 }
 
 /* Check that window update is acceptable.
@@ -1949,20 +2208,103 @@
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk);
 
-	if (tcp_ack_is_dubious(tp, flag)) {
-		/* Advanve CWND, if state allows this. */
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd &&
-		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp);
-		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
-	} else {
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
-			tcp_cong_avoid(tp);
+	/* IIP: A new undo method. If an ACK for send_high or below, and a
+	 * notification for a spurious retransmission arrives, undo the
+	 * congestion window state
+	 */
+	if (tp->iip_rtohighseq && !after(ack, tp->iip_rtohighseq) &&
+	    tp->ca_state == TCP_CA_Open) {
+		tcp_try_undo_open(sk, tp);
+	}
+
+	/* IIP: If a dupack follows RTO, do not send two new data segments in
+	 * F-RTO. It triggered more data in the network than was appropriate.
+	 * Act as the traditional TCP in that case. (2b in the F-RTO paper)
+	 */
+	if (iip_use_cf_rto(sk) && tp->iip_rtoflag == IIP_RTOF_RTO &&
+	    tp->snd_una == prior_snd_una) {
+		tp->iip_rtoflag = IIP_RTOF_OPEN;
+		tp->snd_cwnd = 1;
+		tcp_enter_loss2(sk, 0, 1);
+	}
+
+	/* IIP: Branch for careful F-RTO when the second ack comes in: If any
+	 * packets are detected lost, revert to the good old go-back-N
+	 */
+	if (iip_use_cf_rto(sk) && tp->iip_rtoflag == IIP_RTOF_RECOVERING) {
+		tp->iip_rtoflag = IIP_RTOF_OPEN;
+		/* Enter traditional loss state if the incoming segment
+		 * did not ack new data (3b in the F-RTO paper)
+		 */
+		if (tp->snd_una == prior_snd_una) {
+			tcp_enter_loss2(sk, 0, 1);
+			tcp_moderate_cwnd(tp);
+		} else {
+			/* IIP: ssthresh was already adjusted in enter_loss,
+			 * as required by The Principle.
+			 * We have to halve cwnd in order to be careful. It
+			 * is possible that one segment loss is masked by a
+			 * delay that immediately follows the segment loss
+			 * and then triggers a timeout (3a in the F-RTO paper)
+			 */
+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+		}
+	}
+
+	/* IIP: force two segments out on the first ACK after RTO (2a in the
+	 * F-RTO paper)
+	 */
+	if (tp->iip_rtoflag == IIP_RTOF_RTO) {
+		/* If RTO happens in Recovery and fastretrans_alert already
+		 * did more than one retransmission, we cannot increase cwnd
+		 * here.
+		 */
+		tcp_sync_left_out(tp);
+		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+	}
+
+	if (tp->iip_rtoflag != IIP_RTOF_RTO || !iip_use_cf_rto(sk)) {
+		if (tcp_ack_is_dubious(tp, flag)) {
+			/* Advance CWND, if state allows this. */
+			if (((tp->iip_rtoflag != IIP_RTOF_OPEN &&
+			      iip_use_pasix_rto(sk)) ||
+			     (flag&FLAG_DATA_ACKED)) &&
+			    prior_in_flight >= tp->snd_cwnd &&
+			    tcp_may_raise_cwnd(tp, flag))
+				tcp_cong_avoid(tp);
+			tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
+		} else {
+			if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
+				tcp_cong_avoid(tp);
+		}
+	}
+
+	if (tp->iip_rtoflag == IIP_RTOF_RTO) {
+		tp->iip_rtoflag = IIP_RTOF_RECOVERING;
+	}
+
+	/* IIP (PS): Never send more than three packets into the network when
+	 * F-RTO is recovering from RTO. It is up to the standard Linux code
+	 * to check the other cases
+	 */
+	/* TODO: This can be removed maybe */
+	if (tp->iip_rtoflag != IIP_RTOF_OPEN)
+		tcp_moderate_cwnd(tp);
+
+	/* IIP (PS): If recovering from RTO and passing rto-highseq,
+	 * the recovery is finished.
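+	 *
+	 * A worked trace of the conservative variant, with illustrative
+	 * numbers: segments 1..5 are outstanding when the RTO fires, so
+	 * iip_rtohighseq covers segment 5 and segment 1 is retransmitted.
+	 * If the first ACK advances snd_una, two new segments are forced
+	 * out (2a above); if the second ACK is again a dupack, real loss
+	 * is assumed and tcp_enter_loss2() falls back to go-back-N (3b),
+	 * whereas an ACK for new data would mean the timeout was spurious
+	 * and only cwnd is halved (3a).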
+ */ + if (tp->iip_rtoflag == IIP_RTOF_RECOVERING && + !before(ack, tp->iip_rtohighseq)) { + tp->iip_rtoflag = IIP_RTOF_OPEN; } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) dst_confirm(sk->dst_cache); + /* IIP: Make sure that cwnd has a proper value here */ + iip_cwnd_check(tp, "tcp_ack"); + return 1; no_queue: @@ -2994,6 +3336,15 @@ { struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; + /* IIP (PS) */ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tcp_sync_left_out(tp); + /* IIP debug */ + SOCK_DEBUG(sk, "send_check: cwnd: %d in_flight: %d = packets_out: %d - sacked_out: %d - lost_out: %d + retrans_out: %d pending: %d rto: %d rtoflag: %d\n", + tp->snd_cwnd, tcp_packets_in_flight(tp), tp->packets_out, + tp->sacked_out, tp->lost_out, tp->retrans_out, tp->pending, + tp->rto, tp->iip_rtoflag); + if (skb != NULL) __tcp_data_snd_check(sk, skb); tcp_check_space(sk); @@ -3006,7 +3357,7 @@ { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* More than one full frame received... */ + /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... @@ -3028,10 +3379,18 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!tcp_ack_scheduled(tp)) { + /* IIP (PS): Originally this place occasionally stalled the connection, + * when the very last ACK for FIN segment was lost in the + * network. The sender does RTO-triggered retransmissions of the last + * segment, but the receiver never ACKs them in this case. */ + if (!tcp_ack_scheduled(tp) && (sk->state == TCP_ESTABLISHED || + sk->state == TCP_FIN_WAIT1 || + sk->state == TCP_CLOSING || + sk->state == TCP_LAST_ACK)) { /* We sent a data segment already. */ return; } + /* SOCK_DEBUG(sk, "ack_check: scheduled: %d\n", tcp_ack_scheduled(tp)); */ __tcp_ack_snd_check(sk, 1); } @@ -3291,6 +3650,9 @@ } else { int eaten = 0; + /* IIP (PS) */ + u32 old_wnd = tp->rcv_nxt; + if (tp->ucopy.task == current && tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len && @@ -3320,6 +3682,9 @@ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } + /* IIP (PS): Use shared window if needed */ + srwnd_use_window(sk, tp->rcv_nxt - old_wnd); + tcp_event_data_recv(sk, tp, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { @@ -3331,7 +3696,12 @@ } if (eaten) { - if (tcp_in_quickack_mode(tp)) { + /* IIP (PS): ACK-on-every second segment policy + * should also be followed here + */ + if (tcp_in_quickack_mode(tp) || + (tp->rcv_nxt - tp->rcv_wup) > + tp->ack.rcv_mss) { tcp_send_ack(sk); } else { tcp_send_delayed_ack(sk); diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp_ipv4.c linux-2.4.19-p1/net/ipv4/tcp_ipv4.c --- linux-2.4.19-clean/net/ipv4/tcp_ipv4.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/tcp_ipv4.c Thu Nov 7 11:02:57 2002 @@ -63,6 +63,8 @@ #include #include +#include + extern int sysctl_ip_dynaddr; extern int sysctl_ip_default_ttl; int sysctl_tcp_tw_reuse = 0; @@ -822,6 +824,10 @@ tp->mss_clamp = 536; + /* IIP (PS): Adjust connections in shared rcv_wnd module */ + srwnd_adj_connections(sk, 1); + + /* Socket identity is still unknown (sport may be zero). 
* However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and @@ -1567,6 +1573,13 @@ newtp->advmss = dst->advmss; tcp_initialize_rcv_mss(newsk); + /* IIP (PS): Adjust connection counter and use shared window if + * necessary */ + srwnd_adj_connections(newsk, 1); + if (srwnd_is_shared(newsk)) + newtp->rcv_wnd = min(newtp->rcv_wnd, srwnd_get_share(newtp)); + + __tcp_v4_hash(newsk, 0); __tcp_inherit_port(sk, newsk); @@ -2015,7 +2028,15 @@ * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = 2; + /* IIP (PS): Initial window is sysctl'ed. retrans_stamp is also + * initialized because I feel it is the right thing */ + tp->snd_cwnd = sysctl_iip_iw; + tp->iip_rtoflag = IIP_RTOF_OPEN; + tp->iip_rtohighseq = 0; + tp->iip_hackcode = 0; + tp->retrans_stamp = 0; + + tp->iip_dupacks = 0; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -2045,6 +2066,9 @@ { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* IIP (PS) */ + srwnd_adj_connections(sk, -1); + tcp_clear_xmit_timers(sk); /* Cleanup up the write buffer. */ diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp_minisocks.c linux-2.4.19-p1/net/ipv4/tcp_minisocks.c --- linux-2.4.19-clean/net/ipv4/tcp_minisocks.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/tcp_minisocks.c Thu Nov 7 11:03:17 2002 @@ -712,8 +712,15 @@ * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - newtp->snd_cwnd = 2; + /* IIP (PS): Initial window is sysctl */ + newtp->snd_cwnd = sysctl_iip_iw; + newtp->iip_rtoflag = IIP_RTOF_OPEN; + newtp->iip_rtohighseq = 0; newtp->snd_cwnd_cnt = 0; + newtp->retrans_stamp = 0; + + /* IIP (PS) */ + newtp->iip_dupacks = 0; newtp->ca_state = TCP_CA_Open; tcp_init_xmit_timers(newsk); diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp_output.c linux-2.4.19-p1/net/ipv4/tcp_output.c --- linux-2.4.19-clean/net/ipv4/tcp_output.c Sat Aug 3 03:39:46 2002 +++ linux-2.4.19-p1/net/ipv4/tcp_output.c Thu Nov 7 11:03:31 2002 @@ -41,6 +41,8 @@ #include #include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; @@ -175,6 +177,47 @@ } +/* IIP */ +static __inline__ const char *iip_tcp_ca_state(const struct tcp_opt *tp) +{ + switch (tp->ca_state) { + case TCP_CA_Open: return "Open"; + case TCP_CA_Loss: return "Loss"; + case TCP_CA_Recovery: return "Recovery"; + case TCP_CA_Disorder: return "Disorder"; + case TCP_CA_CWR: return "CWR"; + default: return "???"; + } +} + +/* IIP */ +static __inline__ void iip_print_packet(struct sock *sk, + struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct timeval ts; + + do_gettimeofday(&ts); + + SOCK_DEBUG(sk, + "PKT_SENT: %lu.%lu %d.%d.%d.%d.%u: %u:%u, cwnd %u, ssthresh %u, rto %u, high_seq %u, retransmits %u, in_flight %u, ca_state: %s\n", + ts.tv_sec, ts.tv_usec, + (sk->rcv_saddr<<24)>>24, + (sk->rcv_saddr<<16)>>24, + (sk->rcv_saddr<<8)>>24, + sk->rcv_saddr>>24, + sk->num, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tp->snd_cwnd, + tp->snd_ssthresh, + tp->rto, + tp->high_seq, + tp->retransmits, + tcp_packets_in_flight(tp), + iip_tcp_ca_state(tp)); +} + + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
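 *
 * (A note on iip_print_packet() above: sk->rcv_saddr is a __u32 in
 * network byte order, so on a little-endian host the shift pairs
 * (x<<24)>>24, (x<<16)>>24, ... pick out the dotted-quad octets in
 * order. An equivalent, endianness-neutral sketch:
 *
 *	unsigned char *p = (unsigned char *)&sk->rcv_saddr;
 *	printk(KERN_DEBUG "%u.%u.%u.%u\n", p[0], p[1], p[2], p[3]);
 *
 * would print the same address on either byte order.)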
@@ -275,6 +318,9 @@ TCP_INC_STATS(TcpOutSegs); + /* IIP */ + iip_print_packet(sk, skb); + err = tp->af_specific->queue_xmit(skb); if (err <= 0) return err; @@ -683,6 +729,11 @@ if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; + /* IIP (PS): share the destinations with selected address mask */ + if (srwnd_is_shared(sk)) { + window = min(srwnd_get_share(tp), window); + } + return window; } @@ -914,6 +965,10 @@ struct sk_buff *skb; int packet_cnt = tp->lost_out; + /* IIP debug */ +/* SOCK_DEBUG(sk, "enter xmit_retransmit_queue - packet_cnt: %d\n", + packet_cnt); */ + /* First pass: retransmit lost packets. */ if (packet_cnt) { for_retrans_queue(skb, sk, tp) { @@ -923,9 +978,15 @@ return; if (sacked&TCPCB_LOST) { + /* IIP debug */ +/* SOCK_DEBUG(sk, + "retransmitting lost: sacked = %x\n", + sacked); */ if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) return; + /* IIP (PS) */ +/* SOCK_DEBUG(sk, "retransmitting. in_flight: %d cwnd: %d\n", tcp_packets_in_flight(tp), tp->snd_cwnd); */ if (tp->ca_state != TCP_CA_Loss) NET_INC_STATS_BH(TCPFastRetrans); else @@ -1141,6 +1202,12 @@ req->wscale_ok, &rcv_wscale); req->rcv_wscale = rcv_wscale; + + /* IIP (PS): If needed, use shared rcv_wnd */ + if (srwnd_is_shared(sk)) + req->rcv_wnd = + min(req->rcv_wnd, + srwnd_get_share(&(sk->tp_pinfo.af_tcp))); } /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ @@ -1190,6 +1257,10 @@ sysctl_tcp_window_scaling, &tp->rcv_wscale); + /* IIP (PS): Use shared rcv_wnd, if necessary */ + if (srwnd_is_shared(sk)) + tp->rcv_wnd = min(tp->rcv_wnd, srwnd_get_share(tp)); + tp->rcv_ssthresh = tp->rcv_wnd; sk->err = 0; @@ -1235,7 +1306,11 @@ /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->retrans_stamp = TCP_SKB_CB(buff)->when; + /* IIP (PS): I suspect there is a bug here. Why retrans_stamp is not + * set to 0? + */ +/* tp->retrans_stamp = TCP_SKB_CB(buff)->when; */ + tp->retrans_stamp = 0; __skb_queue_tail(&sk->write_queue, buff); tcp_charge_skb(sk, buff); tp->packets_out++; @@ -1279,6 +1354,12 @@ ato = min(ato, max_ato); } + /* IIP (PS): If in STANDARD delack mode, always set 200 ms for + * delayed ack timer + */ + if (sysctl_iip_delack_mode == IIP_DELACK_STANDARD) + ato = TCP_DELACK_MAX; + /* Stay within the limit we were given */ timeout = jiffies + ato; @@ -1287,7 +1368,12 @@ /* If delack timer was blocked or is about to expire, * send ACK now. 
*/ - if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + /* IIP (PS): Simpler delack rule */ + if (tp->ack.blocked || + (time_before_eq(tp->ack.timeout, jiffies+(ato>>2)) && + sysctl_iip_delack_mode != IIP_DELACK_STANDARD)) { + + /* SOCK_DEBUG(sk, "send_ack: delayed_ack\n"); */ tcp_send_ack(sk); return; } @@ -1297,6 +1383,9 @@ } tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; tp->ack.timeout = timeout; + /* IIP debug */ +/* SOCK_DEBUG(sk, "set timer: timeout: %ld ato: %d\n", + timeout - jiffies, ato); */ if (!mod_timer(&tp->delack_timer, timeout)) sock_hold(sk); } diff -r -u -N linux-2.4.19-clean/net/ipv4/tcp_timer.c linux-2.4.19-p1/net/ipv4/tcp_timer.c --- linux-2.4.19-clean/net/ipv4/tcp_timer.c Mon Oct 1 19:19:57 2001 +++ linux-2.4.19-p1/net/ipv4/tcp_timer.c Thu Nov 7 11:03:48 2002 @@ -27,10 +27,16 @@ int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; -int sysctl_tcp_retries1 = TCP_RETR1; +/* IIP (PS): For test purposes 3 retries is not enough. I want to finish the + * test at any cost + */ +/* int sysctl_tcp_retries1 = TCP_RETR1; */ +int sysctl_tcp_retries1 = 10; int sysctl_tcp_retries2 = TCP_RETR2; int sysctl_tcp_orphan_retries; +int sysctl_iip_rto_behaviour = IIP_RTO_LINUX; + static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); @@ -256,11 +262,16 @@ tp->ack.pingpong = 0; tp->ack.ato = TCP_ATO_MIN; } + tcp_send_ack(sk); NET_INC_STATS_BH(DelayedACKs); } TCP_CHECK_TIMER(sk); + /* IIP debug */ + /*SOCK_DEBUG(sk, "Delayed ACK timer expired, pending: %d\n", + tp->ack.pending&TCP_ACK_TIMER); */ + out: if (tcp_memory_pressure) tcp_mem_reclaim(sk); @@ -372,6 +383,10 @@ } } + /* IIP (PS) */ + if (iip_use_any_frto(sk)) + tp->iip_rtoflag = IIP_RTOF_RTO; + tcp_enter_loss(sk, 0); if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { @@ -385,6 +400,9 @@ goto out; } + /* IIP debug */ + SOCK_DEBUG(sk, "RTO occurred. backoff: %d\n", tp->backoff); + /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
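 *
 * (A usage sketch for the knobs this patch adds, assuming procfs is
 * mounted and the sysctl table from sysctl_net_ipv4.c is in place;
 * values come from include/net/tcp.h and include/linux/tcp.h above.
 * Selecting the conservative F-RTO variant globally, in userspace C:
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/iip_rto_behaviour", "w");
 *	if (f) { fprintf(f, "%d\n", 3); fclose(f); }	3 == IIP_RTO_CF_RTO
 *
 * and vetoing F-RTO for one socket via the new socket option:
 *
 *	int val = 1;				1 == IIP_HCODE_DISABLE_FRTO
 *	setsockopt(fd, IPPROTO_TCP, 13, &val, sizeof(val));	13 == TCP_IIP_HACKCODE
 *
 * The F-RTO paths additionally require a destination in 10.*.*.*, see
 * iip_is_test_connection().)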