diff options
author | Linus Torvalds <torvalds@linuxfoundation.org> | 2007-11-23 15:12:38 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linuxfoundation.org> | 2007-11-23 15:12:38 -0500 |
commit | 0f9cac5b27076f801b29a0867868e1bce7310e00 (patch) | |
tree | 2a9892afdfd043fd3be3857fe321dc26fdb5fd5b /net/ipv4/tcp_input.c | |
parent | 6b4869cbb33316bd08fbc121ed4c647dfc736492 (diff) | |
download | history-0f9cac5b27076f801b29a0867868e1bce7310e00.tar.gz |
Import 2.1.8
Notice: this object is not reachable from any branch.
Import 2.1.8 — Notice: this object is not reachable from any branch.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 2323 |
1 file changed, 992 insertions, 1331 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb615e001be6a8..d9188b18f1b640 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -18,81 +18,85 @@ * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * TODO + * - A better sock cache + * + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. * - * FIXES - * Pedro Roque : Double ACK bug + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. * Eric Schenk : Fixes to slow start algorithm. * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. - * Eric Schenk : Skip fast retransmit on small windows. - * Eric schenk : Fixes to retransmission code to - * : avoid extra retransmission. - * Theodore Ts'o : Do secure TCP sequence numbers. */ #include <linux/config.h> -#include <linux/types.h> -#include <linux/random.h> #include <net/tcp.h> + /* - * Policy code extracted so it's now separate + * Policy code extracted so it's now seperate */ /* * Called each time to estimate the delayed ack timeout. This is - * how it should be done so a fast link isn't impacted by ack delay. + * how it should be done so a fast link isnt impacted by ack delay. + * + * I think we need a medium deviation here also... + * The estimated value is changing to fast */ -extern __inline__ void tcp_delack_estimator(struct sock *sk) +extern __inline__ void tcp_delack_estimator(struct tcp_opt *tp) { + int m; + /* * Delayed ACK time estimator. 
*/ - if (sk->lrcvtime == 0) - { - sk->lrcvtime = jiffies; - sk->ato = HZ/3; - } - else - { - int m; - - m = jiffies - sk->lrcvtime; + m = jiffies - tp->lrcvtime; - sk->lrcvtime = jiffies; + tp->lrcvtime = jiffies; - if (m <= 0) - m = 1; + if (m < 0) + return; - /* This used to test against sk->rtt. - * On a purely receiving link, there is no rtt measure. - * The result is that we lose delayed ACKs on one-way links. - * Therefore we test against sk->rto, which will always - * at least have a default value. - */ - if (m > sk->rto) - { - sk->ato = sk->rto; /* - * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato); + * if the mesured value is bigger than + * twice the round trip time ignore it. */ - } - else + if ((m << 2) <= tp->srtt) { - /* - * Very fast acting estimator. - * May fluctuate too much. Probably we should be - * doing something like the rtt estimator here. - */ - sk->ato = (sk->ato >> 1) + m; - /* - * printk(KERN_DEBUG "ato: m %lu\n", sk->ato); - */ - } + m -= (tp->iat >> 3); + tp->iat += m; + + if (m <0) + m = -m; + + m -= (tp->iat_mdev >> 2); + tp->iat_mdev += m; + + tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2); + + if (tp->ato < HZ/50) + tp->ato = HZ/50; } + else + tp->ato = 0; } /* @@ -101,7 +105,7 @@ extern __inline__ void tcp_delack_estimator(struct sock *sk) * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. */ -extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb) +extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m; /* @@ -111,130 +115,72 @@ extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb) * This is designed to be as fast as possible * m stands for "measurement". */ + /* + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + */ - m = jiffies - oskb->when; /* RTT */ + m = mrtt; /* RTT */ - if (sk->rtt != 0) { + if (tp->srtt != 0) { if(m<=0) m=1; /* IS THIS RIGHT FOR <0 ??? 
*/ - m -= (sk->rtt >> 3); /* m is now error in rtt est */ - sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) m = -m; /* m is now abs(error) */ - m -= (sk->mdev >> 2); /* similar update on mdev */ - sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ } else { /* no previous measure. */ - sk->rtt = m<<3; /* take the measured time to be rtt */ - sk->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<2; /* make sure rto = 3*rtt */ } + /* * Now update timeout. Note that this removes any backoff. */ - /* Jacobson's algorithm calls for rto = R + 4V. - * We diverge from Jacobson's algorithm here. See the commentary - * in tcp_ack to understand why. - */ - sk->rto = (sk->rtt >> 3) + sk->mdev; - sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1)); - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < HZ/5) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ - sk->rto = HZ/5; - sk->backoff = 0; -} + tp->rto = (tp->srtt >> 3) + tp->mdev; -/* - * Cached last hit socket - */ + if (tp->rto > 120*HZ) + tp->rto = 120*HZ; -static volatile unsigned long th_cache_saddr, th_cache_daddr; -static volatile unsigned short th_cache_dport, th_cache_sport; -static volatile struct sock *th_cache_sk; + /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ + if (tp->rto < HZ/5) + tp->rto = HZ/5; -void tcp_cache_zap(void) -{ - th_cache_sk=NULL; + tp->backoff = 0; } -/* - * Find the socket, using the last hit cache if applicable. The cache is not quite - * right... 
- */ - -static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 pport) -{ - struct sock * sk; - - sk = (struct sock *) th_cache_sk; - if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr || - sport != th_cache_sport || dport != th_cache_dport) { - sk = get_sock(&tcp_prot, dport, saddr, sport, daddr, paddr, pport); - if (sk) { - th_cache_saddr=saddr; - th_cache_daddr=daddr; - th_cache_dport=dport; - th_cache_sport=sport; - th_cache_sk=sk; - } - } - return sk; -} /* - * React to an out-of-window TCP sequence number in an incoming packet - */ - -static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq, - struct device *dev) -{ - if (th->rst) - return; - - /* - * Send a reset if we get something not ours and we are - * unsynchronized. Note: We don't do anything to our end. We - * are just killing the bogus remote connection then we will - * connect again and it will work (with luck). + * This functions checks to see if the tcp header is actually acceptable. */ - if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) { - tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl); - return; - } + u32 end_window = tp->rcv_wup + tp->rcv_wnd; + u32 end_seq = seg_nxt; /* - * This packet is old news. Usually this is just a resend - * from the far end, but sometimes it means the far end lost - * an ACK we sent, so we better send an ACK. - */ - /* - * BEWARE! Unconditional answering by ack to out-of-window ack - * can result in infinite exchange of empty acks. - * This check cures bug, found by Michiel Boland, but - * not another possible cases. - * If we are in TCP_TIME_WAIT, we have already received - * FIN, so that our peer need not window update. If our - * ACK were lost, peer would retransmit his FIN anyway. 
--ANK + * When the window is open (most common case) + * we want to accept segments if they have yet unseen data + * or in the case of a dataless segment if seg.seq == rcv.nxt + * this means: + * + * if (seq == end_seq) + * end_seq >= rcv.nxt + * else + * end_seq > rcv.nxt */ - if (sk->state != TCP_TIME_WAIT || ntohl(th->seq) != end_seq) - tcp_send_ack(sk); -} -/* - * This functions checks to see if the tcp header is actually acceptable. - */ + if (seq == end_seq) + end_seq++; -extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq) -{ - u32 end_window = sk->lastwin_seq + sk->window; - return /* if start is at end of window, end must be too (zero window) */ - (seq == end_window && seq == end_seq) || - /* if start is before end of window, check for interest */ - (before(seq, end_window) && !before(end_seq, sk->acked_seq)); + return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) || + (seq == end_window && seq == end_seq)); } /* @@ -273,7 +219,7 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) #endif if (!sk->dead) sk->state_change(sk); - kfree_skb(skb, FREE_READ); + return(0); } @@ -289,11 +235,11 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) * as Linux gets deployed on 100Mb/sec networks. 
*/ -static void tcp_options(struct sock *sk, struct tcphdr *th) +int tcp_parse_options(struct tcphdr *th) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); - int mss_seen = 0; + int mss = 0; ptr = (unsigned char *)(th + 1); @@ -304,7 +250,7 @@ static void tcp_options(struct sock *sk, struct tcphdr *th) switch(opcode) { case TCPOPT_EOL: - return; + return 0; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; ptr--; /* the opsize=*ptr++ above was a mistake */ @@ -312,14 +258,13 @@ static void tcp_options(struct sock *sk, struct tcphdr *th) default: if(opsize<=2) /* Avoid silly options looping forever */ - return; + return 0; switch(opcode) { case TCPOPT_MSS: - if(opsize==4 && th->syn) + if(opsize==TCPOLEN_MSS && th->syn) { - sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); - mss_seen = 1; + mss = ntohs(*(unsigned short *)ptr); } break; /* Add other options here as people feel the urge to implement stuff like large windows */ @@ -328,505 +273,316 @@ static void tcp_options(struct sock *sk, struct tcphdr *th) length-=opsize; } } - if (th->syn) - { - if (! mss_seen) - sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ - } -#ifdef CONFIG_INET_PCTCP - sk->mss = min(sk->max_window >> 1, sk->mtu); -#else - sk->mss = min(sk->max_window, sk->mtu); - sk->max_unacked = 2 * sk->mss; -#endif + + return mss; } /* - * This routine handles a connection request. - * It should make sure we haven't already responded. - * Because of the way BSD works, we have to send a syn/ack now. - * This also means it will be harder to close a socket which is - * listening. + * See draft-stevens-tcpca-spec-01 for documentation. 
*/ -static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, - u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq) +static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { - struct sock *newsk; - struct tcphdr *th; - struct rtable *rt; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - th = skb->h.th; + /* + * An ACK is a duplicate if: + * (1) it has the same sequence number as the largest number we've + * seen, + * (2) it has the same window as the last ACK, + * (3) we have outstanding data that has not been ACKed + * (4) The packet was not carrying any data. + * (5) [From Floyds paper on fast retransmit wars] + * The packet acked data after high_seq; + */ - /* If the socket is dead, don't accept the connection. */ - if (!sk->dead) - { - sk->data_ready(sk,0); - } - else + if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) && + after(ack, tp->high_seq)) { - if(sk->debug) - printk("Reset on %p: Connect on dead socket.\n",sk); - tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } + + sk->dup_acks++; + /* - * Make sure we can accept more. This will prevent a - * flurry of syns from eating up all our memory. - * - * BSD does some funnies here and allows 3/2 times the - * set backlog as a fudge factor. That's just too gross. + * 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. */ - if (sk->ack_backlog >= sk->max_ack_backlog) + if (sk->dup_acks == 3) { - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; + sk->ssthresh = max(sk->cong_window >> 1, 2); + sk->cong_window = sk->ssthresh + 3; + tcp_do_retransmit(sk, 0); } /* - * We need to build a new sock struct. 
- * It is sort of bad to have a socket without an inode attached - * to it, but the wake_up's will just wake up the listening socket, - * and if the listening socket is destroyed before this is taken - * off of the queue, this will take care of it. + * 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... + * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode" */ - newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); - if (newsk == NULL) + if (sk->dup_acks > 3) { - /* just ignore the syn. It will get retransmitted. */ - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; + sk->cong_window++; } - - memcpy(newsk, sk, sizeof(*newsk)); - newsk->opt = NULL; - newsk->ip_route_cache = NULL; - if (opt && opt->optlen) - { - sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC); - if (!sk->opt) - { - kfree_s(newsk, sizeof(struct sock)); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; } - if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) + else { - kfree_s(sk->opt, sizeof(struct options)+opt->optlen); - kfree_s(newsk, sizeof(struct sock)); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } - } - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - newsk->send_head = NULL; - newsk->send_tail = NULL; - newsk->send_next = NULL; - skb_queue_head_init(&newsk->back_log); - newsk->rtt = 0; - newsk->rto = TCP_TIMEOUT_INIT; - newsk->mdev = TCP_TIMEOUT_INIT; - newsk->max_window = 0; - /* - * See draft-stevens-tcpca-spec-01 for discussion of the - * initialization of these values. 
- */ - newsk->cong_window = 1; - newsk->cong_count = 0; - newsk->ssthresh = 0x7fffffff; - - newsk->lrcvtime = 0; - newsk->idletime = 0; - newsk->high_seq = 0; - newsk->backoff = 0; - newsk->blog = 0; - newsk->intr = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->partial = NULL; - newsk->pair = NULL; - newsk->wmem_alloc = 0; - newsk->rmem_alloc = 0; - newsk->localroute = sk->localroute; - - newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - newsk->acked_seq = skb->seq+1; - newsk->lastwin_seq = skb->seq+1; - newsk->delay_acks = 1; - newsk->copied_seq = skb->seq+1; - newsk->fin_seq = skb->seq; - newsk->syn_seq = skb->seq; - newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - newsk->ip_xmit_timeout = 0; - newsk->write_seq = seq; - newsk->window_seq = newsk->write_seq; - newsk->rcv_ack_seq = newsk->write_seq; - newsk->urg_data = 0; - newsk->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long)newsk; - newsk->timer.function = &net_timer; - init_timer(&newsk->delack_timer); - newsk->delack_timer.data = (unsigned long)newsk; - newsk->delack_timer.function = tcp_delack_timer; - init_timer(&newsk->retransmit_timer); - newsk->retransmit_timer.data = (unsigned long)newsk; - newsk->retransmit_timer.function = tcp_retransmit_timer; - newsk->dummy_th.source = skb->h.th->dest; - newsk->dummy_th.dest = skb->h.th->source; - -#ifdef CONFIG_IP_TRANSPARENT_PROXY - /* - * Deal with possibly redirected traffic by setting num to - * the intended destination port of the received packet. - */ - newsk->num = ntohs(skb->h.th->dest); - -#endif /* - * Swap these two, they are from our point of view. + * 3. 
When the next ACK arrives that acknowledges new data, + * set cwnd to ssthresh */ - newsk->daddr = saddr; - newsk->saddr = daddr; - newsk->rcv_saddr = daddr; + if (sk->dup_acks >= 3) + { + sk->tp_pinfo.af_tcp.retrans_head = NULL; + sk->cong_window = sk->ssthresh; + sk->retransmits = 0; + } + sk->dup_acks = 0; + } + +} - put_sock(newsk->num,newsk); - newsk->acked_seq = skb->seq + 1; - newsk->copied_seq = skb->seq + 1; - newsk->socket = NULL; +int sysctl_tcp_vegas_cong_avoidance = 1; /* - * Grab the ttl and tos values and use them + * TCP slow start and congestion avoidance in two flavors: + * RFC 1122 and TCP Vegas. + * + * This is a /proc/sys configurable option. */ - newsk->ip_ttl=sk->ip_ttl; - newsk->ip_tos=skb->ip_hdr->tos; +#define SHIFT_FACTOR 12 +static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, + u32 seq_rtt) +{ /* - * Use 512 or whatever user asked for + * From: + * TCP Vegas: New Techniques for Congestion + * Detection and Avoidance. + * + * + * Warning: This code is a scratch implementation taken + * from the paper only. The code they distribute seams + * to have improved several things over the initial spec. */ - /* - * Note use of sk->user_mss, since user has no direct access to newsk - */ + u32 Actual, Expected; + u32 snt_bytes; + struct tcp_opt * tp; - rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0); - newsk->ip_route_cache = rt; + tp = &(sk->tp_pinfo.af_tcp); - if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) - newsk->window_clamp = rt->rt_window; - else - newsk->window_clamp = 0; + if (!seq_rtt) + seq_rtt = 1; - if (sk->user_mss) - newsk->mtu = sk->user_mss; - else if (rt) - newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + if (tp->basertt) + tp->basertt = min(seq_rtt, tp->basertt); else - newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); + tp->basertt = seq_rtt; + /* - * But not bigger than device MTU + * + * Actual = throughput for this segment. 
+ * Expected = number_of_bytes in transit / BaseRTT + * */ - newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + snt_bytes = (ack - seq) << SHIFT_FACTOR; -#ifdef CONFIG_SKIP + Actual = snt_bytes / seq_rtt; + Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) / tp->basertt; /* - * SKIP devices set their MTU to 65535. This is so they can take packets - * unfragmented to security process then fragment. They could lie to the - * TCP layer about a suitable MTU, but it's easier to let skip sort it out - * simply because the final package we want unfragmented is going to be - * - * [IPHDR][IPSP][Security data][Modified TCP data][Security data] + printk(KERN_DEBUG "A:%x E:%x rtt:%x srtt:%x win: %d\n", + Actual, Expected, seq_rtt, tp->srtt, sk->cong_window); */ - - if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */ - sk->mtu=skip_pick_mtu(sk->mtu,dev); -#endif /* - * This will min with what arrived in the packet + * Slow Start */ - tcp_options(newsk,skb->h.th); - - tcp_cache_zap(); - tcp_send_synack(newsk, sk, skb); -} - + if (sk->cong_window < sk->ssthresh && + (seq == tp->snd_nxt || + (((Expected - Actual) <= + ((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss / tp->basertt)) + ) + )) + { /* - * Handle a TCP window that shrunk on us. It shouldn't happen, - * but.. - * - * We may need to move packets from the send queue - * to the write queue, if the window has been shrunk on us. - * The RFC says you are not allowed to shrink your window - * like this, but if the other end does, you must be able - * to deal with it. 
+ * "Vegas allows exponential growth only every other + * RTT" */ -void tcp_window_shrunk(struct sock * sk, u32 window_seq) -{ - struct sk_buff *skb; - struct sk_buff *skb2; - struct sk_buff *wskb = NULL; - - skb2 = sk->send_head; - sk->send_head = NULL; - sk->send_tail = NULL; - sk->send_next = NULL; + if (sk->cong_count || sk->cong_window <= 2) + { + sk->cong_window++; + sk->cong_count = 0; + } + else + sk->cong_count++; + } + else + { /* - * This is an artifact of a flawed concept. We want one - * queue and a smarter send routine when we send all. + * Congestion Avoidance */ - cli(); - while (skb2 != NULL) - { - skb = skb2; - skb2 = skb->link3; - skb->link3 = NULL; - if (after(skb->end_seq, window_seq)) + + if (Expected - Actual <= + ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss / tp->basertt)) { - if (sk->packets_out > 0) - sk->packets_out--; - /* We may need to remove this from the dev send list. */ - if (skb->next != NULL) + /* Increase Linearly */ + + if (sk->cong_count >= sk->cong_window) { - skb_unlink(skb); + sk->cong_window++; + sk->cong_count = 0; } - /* Now add it to the write_queue. */ - if (wskb == NULL) - skb_queue_head(&sk->write_queue,skb); else - skb_append(wskb,skb); - wskb = skb; + sk->cong_count++; } - else + + if (Expected - Actual >= + ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss / tp->basertt)) { - if (sk->send_head == NULL) + /* Decrease Linearly */ + + if (sk->cong_count >= sk->cong_window) { - sk->send_head = skb; - sk->send_tail = skb; - sk->send_next = skb; + sk->cong_window--; + sk->cong_count = 0; } else + sk->cong_count++; + + + /* Never less than 2 segments */ + if (sk->cong_window < 2) + sk->cong_window = 2; + } + + + } +} + +static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) +{ + + /* + * This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. Because we keep cong_window in + * integral mss's, we can't do cwnd += 1 / cwnd. 
+ * Instead, maintain a counter and increment it once every + * cwnd times. + */ + + if (sk->cong_window <= sk->ssthresh) { - sk->send_tail->link3 = skb; - sk->send_tail = skb; + /* + * In "safe" area, increase + */ + + sk->cong_window++; } - skb->link3 = NULL; + else + { + /* + * In dangerous area, increase slowly. + * In theory this is + * sk->cong_window += 1 / sk->cong_window + */ + + if (sk->cong_count >= sk->cong_window) { + + sk->cong_window++; + sk->cong_count = 0; } + else + sk->cong_count++; } - sti(); } +#define FLAG_DATA 0x01 +#define FLAG_WIN_UPDATE 0x02 +#define FLAG_DATA_ACKED 0x04 + /* * This routine deals with incoming acks, but not outgoing ones. - * - * This routine is totally _WRONG_. The list structuring is wrong, - * the algorithm is wrong, the code is wrong. */ -static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len) +static int tcp_ack(struct sock *sk, struct tcphdr *th, + u32 ack_seq, u32 ack, int len) { int flag = 0; - u32 window_seq; + u32 seq = 0; + u32 seq_rtt = 0; + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * 1 - there was data in packet as well as ack or new data is sent or - * in shutdown state - * 2 - data from retransmit queue was acked and removed - * 4 - window shrunk or data from retransmit queue was acked and removed - */ if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ - /* - * We have dropped back to keepalive timeouts. Thus we have - * no retransmits pending. - */ - if (sk->ip_xmit_timeout == TIME_KEEPOPEN) - sk->retransmits = 0; + if (tp->pending == TIME_KEEPOPEN) + { + tp->probes_out = 0; + } + + tp->rcv_tstamp = jiffies; /* * If the ack is newer than sent or older than previous acks * then we can probably ignore it. 
*/ - if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) + if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; /* - * Have we discovered a larger window + * If there is data set flag 1 */ - window_seq = ntohs(th->window); - if (window_seq > sk->max_window) + + if (len != th->doff*4) { - sk->max_window = window_seq; -#ifdef CONFIG_INET_PCTCP - /* Hack because we don't send partial packets to non SWS - handling hosts */ - sk->mss = min(window_seq>>1, sk->mtu); -#else - sk->mss = min(window_seq, sk->mtu); -#endif + flag |= FLAG_DATA; + tcp_delack_estimator(tp); } - window_seq += ack; - - /* - * See if our window has been shrunk. - */ - if (after(sk->window_seq, window_seq)) - tcp_window_shrunk(sk, window_seq); /* - * Pipe has emptied + * Update our send window */ - if (sk->send_tail == NULL || sk->send_head == NULL) - { - sk->send_head = NULL; - sk->send_tail = NULL; - sk->send_next = NULL; - sk->packets_out= 0; - } /* - * We don't want too many packets out there. + * This is the window update code as per RFC 793 + * snd_wl{1,2} are used to prevent unordered + * segments from shrinking the window */ - if (sk->ip_xmit_timeout == TIME_WRITE && - sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) + if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) || + (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = ack_seq; + tp->snd_wl2 = ack; - /* - * This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. Because we keep cong_window in integral - * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a - * counter and increment it once every cwnd times. It's possible - * that this should be done only if sk->retransmits == 0. I'm - * interpreting "new data is acked" as including data that has - * been retransmitted but is just now being acked. 
- */ - if (sk->cong_window <= sk->ssthresh) - /* - * In "safe" area, increase - */ - sk->cong_window++; - else - { - /* - * In dangerous area, increase slowly. In theory this is - * sk->cong_window += 1 / sk->cong_window - */ - if (sk->cong_count >= sk->cong_window) - { - sk->cong_window++; - sk->cong_count = 0; - } - else - sk->cong_count++; - } - } - - /* - * Remember the highest ack received and update the - * right hand window edge of the host. - * We do a bit of work here to track number of times we've - * seen this ack without a change in the right edge of the - * window and no data in the packet. - * This will allow us to do fast retransmits. - */ + flag |= FLAG_WIN_UPDATE; - /* We are looking for duplicate ACKs here. - * An ACK is a duplicate if: - * (1) it has the same sequence number as the largest number we've seen, - * (2) it has the same window as the last ACK, - * (3) we have outstanding data that has not been ACKed - * (4) The packet was not carrying any data. - * (5) [From Floyd's paper on fast retransmit wars] - * The packet acked data after high_seq; - * I've tried to order these in occurrence of most likely to fail - * to least likely to fail. - * [These are an extension of the rules BSD stacks use to - * determine if an ACK is a duplicate.] - */ - - if (sk->rcv_ack_seq == ack - && sk->window_seq == window_seq - && len != th->doff*4 - && before(ack, sk->sent_seq) - && after(ack, sk->high_seq)) - { - /* Prevent counting of duplicate ACKs if the congestion - * window is smaller than 3. Note that since we reduce - * the congestion window when we do a fast retransmit, - * we must be careful to keep counting if we were already - * counting. The idea behind this is to avoid doing - * fast retransmits if the congestion window is so small - * that we cannot get 3 ACKs due to the loss of a packet - * unless we are getting ACKs for retransmitted packets. 
- */ - if (sk->cong_window >= 3 || sk->rcv_ack_cnt > MAX_DUP_ACKS+1) - sk->rcv_ack_cnt++; - /* See draft-stevens-tcpca-spec-01 for explanation - * of what we are doing here. - */ - if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) { - int tmp; - - /* We need to be a bit careful to preserve the - * count of packets that are out in the system here. - */ - sk->ssthresh = max(sk->cong_window >> 1, 2); - sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1; - tmp = sk->packets_out; - tcp_do_retransmit(sk,0); - sk->packets_out = tmp; - } else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) { - sk->cong_window++; - /* - * At this point we are suppose to transmit a NEW - * packet (not retransmit the missing packet, - * this would only get us into a retransmit war.) - * I think that having just adjusted cong_window - * we will transmit the new packet below. - */ - } - } - else + if (tp->snd_wnd > sk->max_window) { - if (sk->rcv_ack_cnt > MAX_DUP_ACKS) { - sk->cong_window = sk->ssthresh; + sk->max_window = tp->snd_wnd; } - sk->window_seq = window_seq; - sk->rcv_ack_seq = ack; - sk->rcv_ack_cnt = 1; } + /* * We passed data and got it acked, remove any soft error * log. Something worked... @@ -840,55 +596,29 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len) * it needs to be for normal retransmission. */ - if (sk->ip_xmit_timeout == TIME_PROBE0) + if (tp->pending == TIME_PROBE0) { - sk->retransmits = 0; /* Our probe was answered */ + tp->probes_out = 0; /* Our probe was answered */ /* * Was it a usable window open ? */ - if (!skb_queue_empty(&sk->write_queue) && /* should always be true */ - ! before (sk->window_seq, sk->write_queue.next->end_seq)) + /* should always be non-null */ + if (tp->send_head != NULL && + !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { - sk->backoff = 0; + tp->backoff = 0; + tp->pending = 0; - /* - * Recompute rto from rtt. this eliminates any backoff. 
- */ + tcp_clear_xmit_timer(sk, TIME_PROBE0); - /* - * Appendix C of Van Jacobson's final version of - * the SIGCOMM 88 paper states that although - * the original paper suggested that - * RTO = R*2V - * was the correct calculation experience showed - * better results using - * RTO = R*4V - * In particular this gives better performance over - * slow links, and should not effect fast links. - * - * Note: Jacobson's algorithm is fine on BSD which - * has a 1/2 second granularity clock, but with our - * 1/100 second granularity clock we become too - * sensitive to minor changes in the round trip time. - * We add in two compensating factors. - * First we multiply by 5/4. For large congestion - * windows this allows us to tolerate burst traffic - * delaying up to 1/4 of our packets. - * We also add in a rtt / cong_window term. - * For small congestion windows this allows - * a single packet delay, but has negligible effect - * on the compensation for large windows. - */ - sk->rto = (sk->rtt >> 3) + sk->mdev; - sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1)); - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < HZ/5) /* Was 1*HZ, then 1 - turns out we must allow about - .2 of a second because of BSD delayed acks - on a 100Mb/sec link - .2 of a second is going to need huge windows (SIGH) */ - sk->rto = HZ/5; + } + else + { + tcp_reset_xmit_timer(sk, TIME_PROBE0, + min(tp->rto << tp->backoff, + 120*HZ)); } } @@ -896,44 +626,67 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len) * See if we can take anything off of the retransmit queue. */ - for (;;) { - struct sk_buff * skb = sk->send_head; - if (!skb) - break; + start_bh_atomic(); + while(((skb=skb_peek(&sk->write_queue)) != NULL) && + (skb != tp->send_head)) + { /* Check for a bug. 
*/ - if (skb->link3 && after(skb->end_seq, skb->link3->end_seq)) - printk("INET: tcp.c: *** bug send_list out of order.\n"); + + if (skb->next != (struct sk_buff*) &sk->write_queue && + after(skb->end_seq, skb->next->seq)) + printk("INET: tcp_input.c: *** " + "bug send_list out of order.\n"); /* * If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the other end. + * discard it as it's confirmed to have arrived the + * other end. */ - if (after(skb->end_seq, ack)) - break; - - if (sk->retransmits) + if (!after(skb->end_seq, ack)) { - /* - * We were retransmitting. don't count this in RTT est - */ - flag |= 2; + if (sk->debug) + { + printk(KERN_DEBUG "removing seg %x-%x from " + "retransmit queue\n", + skb->seq, skb->end_seq); } - if ((sk->send_head = skb->link3) == NULL) + tp->retrans_head = NULL; + + flag |= FLAG_DATA_ACKED; + seq = skb->seq; + seq_rtt = jiffies - skb->when; + + skb_unlink(skb); + atomic_dec(&sk->packets_out); + skb->free = 1; + + kfree_skb(skb, FREE_WRITE); + + if (!sk->dead) + sk->write_space(sk); + } + else { - sk->send_tail = NULL; - sk->send_next = NULL; - sk->retransmits = 0; + break; + } } + end_bh_atomic(); + /* - * advance the send_next pointer if needed. + * if we where retransmiting don't count rtt estimate */ - if (sk->send_next == skb) - sk->send_next = sk->send_head; + if (sk->retransmits) + { + if (sk->packets_out == 0) + sk->retransmits = 0; + } + else + { /* * Note that we only reset backoff and rto in the * rtt recomputation code. And that doesn't happen @@ -946,274 +699,89 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len) * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ - /* - * We have one less packet out there. - */ - - if (sk->packets_out > 0) - sk->packets_out --; - - /* This is really only supposed to be called when we - * are actually ACKing new data, which should exclude - * the ACK handshake on an initial SYN packet as well. 
- * Rather than introducing a new test here for this - * special case, we just reset the initial values for - * rtt immediately after we move to the established state. - */ - if (!(flag&2)) /* Not retransmitting */ - tcp_rtt_estimator(sk,skb); - IS_SKB(skb); - - /* - * We may need to remove this from the dev send list. - */ - cli(); - if (skb->next) - skb_unlink(skb); - sti(); - kfree_skb(skb, FREE_WRITE); /* write. */ - if (!sk->dead) - sk->write_space(sk); + if (flag & FLAG_DATA_ACKED) + { + tcp_rtt_estimator(tp, seq_rtt); + if (sysctl_tcp_vegas_cong_avoidance) + { + tcp_cong_avoid_vegas(sk, seq, ack, seq_rtt); } - - /* - * Maybe we can take some stuff off of the write queue, - * and put it onto the xmit queue. - * There is bizarre case being tested here, to check if - * the data at the head of the queue ends before the start of - * the sequence we already ACKed. This is not an error, - * it can occur when we send a packet directly off of the write_queue - * in a zero window probe. - */ - - if (!skb_queue_empty(&sk->write_queue) && - !before(sk->window_seq, sk->write_queue.next->end_seq) && - (sk->retransmits == 0 || - sk->ip_xmit_timeout != TIME_WRITE || - !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq)) && - sk->packets_out < sk->cong_window) + else { - /* - * Add more data to the send queue. - */ - tcp_write_xmit(sk); + tcp_cong_avoid_vanj(sk, seq, ack, seq_rtt); } - - /* - * Reset timers to reflect the new state. - * - * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets - * from TCP_CLOSE we don't do anything - * - * from anything else, if there is queued data (or fin) pending, - * we use a TIME_WRITE timeout, if there is data to write but - * no room in the window we use TIME_PROBE0, else if keepalive - * we reset to a KEEPALIVE timeout, else we delete the timer. - * - * We do not set flag for nominal write data, otherwise we may - * force a state where we start to write itsy bitsy tidbits - * of data. 
- */ - - switch(sk->state) { - case TCP_TIME_WAIT: - /* - * keep us in TIME_WAIT until we stop getting packets, - * reset the timeout. - */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - break; - case TCP_CLOSE: - /* - * don't touch the timer. - */ - break; - default: - /* - * Must check send_head and write_queue - * to determine which timeout to use. - */ - if (sk->send_head) { - tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } else if (!skb_queue_empty(&sk->write_queue) - && sk->ack_backlog == 0) - { - /* - * if the write queue is not empty when we get here - * then we failed to move any data to the retransmit - * queue above. (If we had send_head would be non-NULL). - * Furthermore, since the send_head is NULL here - * we must not be in retransmit mode at this point. - * This implies we have no packets in flight, - * hence sk->packets_out < sk->cong_window. - * Examining the conditions for the test to move - * data to the retransmission queue we find that - * we must therefore have a zero window. - * Hence, if the ack_backlog is 0 we should initiate - * a zero probe. - * We don't do a zero probe if we have a delayed - * ACK in hand since the other side may have a - * window opening, but they are waiting to hear - * from us before they tell us about it. - * (They are applying Nagle's rule). - * So, we don't set up the zero window probe - * just yet. We do have to clear the timer - * though in this case... - */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto); - } else if (sk->keepopen) { - tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } else { - del_timer(&sk->retransmit_timer); - sk->ip_xmit_timeout = 0; } - break; } - /* - * We have nothing queued but space to send. Send any partial - * packets immediately (end of Nagle rule application). 
- */ - - if (sk->packets_out == 0 - && sk->partial != NULL - && skb_queue_empty(&sk->write_queue) - && sk->send_head == NULL) - { - tcp_send_partial(sk); - } - /* - * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and - * we are now waiting for an acknowledge to our FIN. The other end is - * already in TIME_WAIT. - * - * Move to TCP_CLOSE on success. - */ - if (sk->state == TCP_LAST_ACK) + /* Sanity check out packets_out counter */ + if (skb_queue_len(&sk->write_queue) == 0 || + ack == tp->snd_nxt ) { - if (!sk->dead) - sk->state_change(sk); - if(sk->debug) - printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n", - sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq); - if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) + if (sk->packets_out) { - sk->shutdown = SHUTDOWN_MASK; - tcp_set_state(sk,TCP_CLOSE); - return 1; + printk(KERN_DEBUG "tcp_ack: packets_out %d\n", + sk->packets_out); + sk->packets_out = 0; } } - /* - * Incoming ACK to a FIN we sent in the case of our initiating the close. - * - * Move to FIN_WAIT2 to await a FIN from the other end. Set - * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. - */ - if (sk->state == TCP_FIN_WAIT1) + if (sk->packets_out) { - - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) + if (flag & FLAG_DATA_ACKED) { - sk->shutdown |= SEND_SHUTDOWN; - tcp_set_state(sk, TCP_FIN_WAIT2); - /* If the socket is dead, then there is no - * user process hanging around using it. - * We want to set up a FIN_WAIT2 timeout ala BSD. - */ - if (sk->dead) - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); - } - } + long when; - /* - * Incoming ACK to a FIN we sent in the case of a simultaneous close. 
- * - * Move to TIME_WAIT - */ + skb = skb_peek(&sk->write_queue); - if (sk->state == TCP_CLOSING) - { + when = tp->rto - (jiffies - skb->when); - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) + if (when <= 0) { - tcp_time_wait(sk); + tp->retrans_head = NULL; + /* + * This is tricky. We are retransmiting a + * segment of a window when congestion occured. + */ + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, + tp->rto); + } + else + tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } + else + tcp_clear_xmit_timer(sk, TIME_RETRANS); + /* - * Final ack of a three way shake + * Remember the highest ack received. */ - if (sk->state==TCP_SYN_RECV) - { - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); - sk->dummy_th.dest=th->source; - sk->copied_seq = sk->acked_seq; - if(!sk->dead) - sk->state_change(sk); - if(sk->max_window==0) - { - sk->max_window=32; /* Sanity check */ - sk->mss=min(sk->max_window,sk->mtu); - } - /* Reset the RTT estimator to the initial - * state rather than testing to avoid - * updating it on the ACK to the SYN packet. - */ - sk->rtt = 0; - sk->rto = TCP_TIMEOUT_INIT; - sk->mdev = TCP_TIMEOUT_INIT; - } + tp->snd_una = ack; + + tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); + /* - * The following code has been greatly simplified from the - * old hacked up stuff. The wonders of properly setting the - * retransmission timeouts. - * - * If we are retransmitting, and we acked a packet on the retransmit - * queue, and there is still something in the retransmit queue, - * then we can output some retransmission packets. + * Maybe we can take some stuff off of the write queue, + * and put it onto the xmit queue. */ - if (sk->send_head != NULL && (flag&2) && sk->retransmits) - { - tcp_do_retransmit(sk, 1); - } return 1; uninteresting_ack: - if(sk->debug) - printk("Ack ignored %u %u\n",ack,sk->sent_seq); - /* - * Keepalive processing. 
- */ - - if (after(ack, sk->sent_seq)) - { - return 0; - } + tcp_fast_retrans(sk, ack, 0); - /* - * Restart the keepalive timer. - */ + if(sk->debug) + printk("Ack ignored %u %u\n",ack,tp->snd_nxt); - if (sk->keepopen) - { - if(sk->ip_xmit_timeout==TIME_KEEPOPEN) - tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } - return 1; + return 0; } @@ -1237,6 +805,8 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { sk->fin_seq = skb->end_seq; + tcp_send_ack(sk); + if (!sk->dead) { sk->state_change(sk); @@ -1249,10 +819,11 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_SYN_SENT: case TCP_ESTABLISHED: /* - * move to CLOSE_WAIT, tcp_data() already handled - * sending the ack. + * move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; @@ -1280,26 +851,10 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * This causes a WRITE timeout, which will either * move on to TIME_WAIT when we timeout, or resend * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already correct - * for handling this timeout. + * FIN lost hang). The TIME_WRITE code is already + * correct for handling this timeout. */ - if (sk->ip_xmit_timeout != TIME_WRITE) { - if (sk->send_head) - tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto); - else if (sk->ip_xmit_timeout != TIME_PROBE0 - || skb_queue_empty(&sk->write_queue)) { - /* BUG check case. - * We have a problem here if there - * is no timer running [leads to - * frozen socket] or no data in the - * write queue [means we sent a fin - * and lost it from the queue before - * changing the ack properly]. 
- */ - printk(KERN_ERR "Lost timer or fin packet in tcp_fin.\n"); - } - } tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: @@ -1326,155 +881,175 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) return(0); } + + /* - * Add a sk_buff to the TCP receive queue, calculating - * the ACK sequence as we go.. + * This one checks to see if we can put data from the + * out_of_order queue into the receive_queue */ -static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list) + +static __inline__ void tcp_ofo_queue(struct sock *sk) { - struct sk_buff * prev, * next; - u32 seq; + struct sk_buff * skb; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - /* - * Find where the new skb goes.. (This goes backwards, - * on the assumption that we get the packets in order) - */ - seq = skb->seq; - prev = list->prev; - next = (struct sk_buff *) list; - for (;;) { - if (prev == (struct sk_buff *) list || !after(prev->seq, seq)) + while ((skb = skb_peek(&sk->out_of_order_queue))) { + + if (after(skb->seq, tp->rcv_nxt)) break; - next = prev; - prev = prev->prev; + + if (!after(skb->end_seq, tp->rcv_nxt)) { + + if (sk->debug) + printk("ofo packet was allready received \n"); + + skb_unlink(skb); + kfree_skb(skb, FREE_READ); + + continue; + } + + if (sk->debug) + printk("ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + skb_unlink(skb); + + + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; } - __skb_insert(skb, prev, next, list); } -/* - * Called for each packet when we find a new ACK endpoint sequence in it - */ -static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk) +static __inline__ void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { + struct sk_buff * skb1; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + /* - * When we ack the fin, we do the FIN - * processing. 
+ * Queue data for delivery to the user + * Packets in sequence go to the receive queue + * Out of sequence packets to out_of_order_queue */ - skb->acked = 1; - if (skb->h.th->fin) - tcp_fin(skb,sk,skb->h.th); - return skb->end_seq; -} -static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th) -{ - u32 ack_seq; - tcp_insert_skb(skb, &sk->receive_queue); + if (skb->seq == tp->rcv_nxt) { /* - * Did we get anything new to ack? + * Ok. In sequence. */ - ack_seq = sk->acked_seq; - if (!after(skb->seq, ack_seq)) { - if (after(skb->end_seq, ack_seq)) { - /* the packet straddles our window end */ - struct sk_buff_head * list = &sk->receive_queue; - struct sk_buff * next; - ack_seq = tcp_queue_ack(skb, sk); + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; + + tcp_ofo_queue(sk); + + if (skb_queue_len(&sk->out_of_order_queue) == 0) + tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + + return; + } /* - * Do we have any old packets to ack that the above - * made visible? (Go forward from skb) + * Not in sequence + * either a retransmit or some packet got lost */ - next = skb->next; - while (next != (struct sk_buff *) list) { - if (after(next->seq, ack_seq)) - break; - if (after(next->end_seq, ack_seq)) - ack_seq = tcp_queue_ack(next, sk); - next = next->next; - } + + if (!after(skb->end_seq, tp->rcv_nxt)) { /* - * Ok, we found new data, update acked_seq as - * necessary (and possibly send the actual - * ACK packet). + * A retransmit. + * 2nd most common case. + * force an imediate ack */ - sk->acked_seq = ack_seq; - } else { if (sk->debug) - printk("Ack duplicate packet.\n"); - tcp_send_ack(sk); + printk("retransmit received: seq %X\n", skb->seq); + + sk->delayed_acks = MAX_DELAY_ACK; + kfree_skb(skb, FREE_READ); + return; } + if (before(skb->seq, tp->rcv_nxt)) { + /* - * Delay the ack if possible. Send ack's to - * fin frames immediately as there shouldn't be - * anything more to come. 
+ * Partial packet + * seq < rcv_next < end_seq */ - if (!sk->delay_acks || th->fin) { - tcp_send_ack(sk); - } else { + + if (sk->debug) + printk("partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; + + tcp_ofo_queue(sk); + + if (skb_queue_len(&sk->out_of_order_queue) == 0) + tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + + return; + } + /* - * If psh is set we assume it's an - * interactive session that wants quick - * acks to avoid nagling too much. + * Ok. This is an out_of_order segment */ - int delay = HZ/2; - if (th->psh) - delay = HZ/50; - tcp_send_delayed_ack(sk, delay, sk->ato); - } + + /* Force an ack */ + + sk->delayed_acks = MAX_DELAY_ACK; /* - * Tell the user we have some more data. + * disable header predition */ - if (!sk->dead) - sk->data_ready(sk,0); + tp->pred_flags = 0; + if (sk->debug) + printk("out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + if (skb_peek(&sk->out_of_order_queue) == NULL) { + skb_queue_head(&sk->out_of_order_queue,skb); } else + for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { + + /* allready there */ + if (skb->seq==skb1->seq && skb->len>=skb1->len) { - /* - * If we've missed a packet, send an ack. - * Also start a timer to send another. - * - * 4.3reno machines look for these kind of acks so - * they can do fast recovery. Three identical 'old' - * acks lets it know that one frame has been lost - * and should be resent. Because this is before the - * whole window of data has timed out it can take - * one lost frame per window without stalling. - * [See Jacobson RFC1323, Stevens TCP/IP illus vol2] - * - * We also should be spotting triple bad sequences. - * [We now do this.] 
- * - */ + skb_append(skb1,skb); + skb_unlink(skb1); + kfree_skb(skb1,FREE_READ); + break; + } - if (!skb->acked) + if (after(skb->seq, skb1->seq)) { - if(sk->debug) - printk("Ack past end of seq packet.\n"); - tcp_send_ack(sk); + skb_append(skb1,skb); + break; + } + /* - * We need to be very careful here. We must - * not violate Jacobsons packet conservation condition. - * This means we should only send an ACK when a packet - * leaves the network. We can say a packet left the - * network when we see a packet leave the network, or - * when an rto measure expires. + * See if we've hit the start. If so insert. */ - tcp_send_delayed_ack(sk,sk->rto,sk->rto); + if (skb1 == skb_peek(&sk->out_of_order_queue)) { + skb_queue_head(&sk->out_of_order_queue,skb); + break; } } + } @@ -1484,103 +1059,110 @@ static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th) * room, then we will just have to discard the packet. */ -static int tcp_data(struct sk_buff *skb, struct sock *sk, - unsigned long saddr, unsigned int len) +static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; - u32 new_seq, shut_seq; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); th = skb->h.th; skb_pull(skb,th->doff*4); skb_trim(skb,len-(th->doff*4)); + if (skb->len == 0 && !th->fin) + { + return(0); + } + /* - * The bytes in the receive read/assembly queue has increased. Needed for the - * low memory discard algorithm + * FIXME: don't accept data after the receved fin */ - sk->bytes_rcv += skb->len; - - if (skb->len == 0 && !th->fin) - { /* - * Don't want to keep passing ack's back and forth. - * (someone sent us dataless, boring frame) + * The bytes in the receive read/assembly queue has increased. + * Needed for the low memory discard algorithm */ - if (!th->ack) - tcp_send_ack(sk); - kfree_skb(skb, FREE_READ); - return(0); - } + sk->bytes_rcv += skb->len; /* * We no longer have anyone receiving data on this connection. 
*/ -#ifndef TCP_DONT_RST_SHUTDOWN + tcp_data_queue(sk, skb); - if(sk->shutdown & RCV_SHUTDOWN) + if (before(tp->rcv_nxt, sk->copied_seq)) { + printk("*** tcp.c:tcp_data bug acked < copied\n"); + tp->rcv_nxt = sk->copied_seq; + } + + sk->delayed_acks++; + + /* - * FIXME: BSD has some magic to avoid sending resets to - * broken 4.2 BSD keepalives. Much to my surprise a few non - * BSD stacks still have broken keepalives so we want to - * cope with it. + * Now tell the user we may have some data. */ - if(skb->len) /* We don't care if it's just an ack or - a keepalive/window probe */ + if (!sk->dead) { - new_seq = skb->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */ + if(sk->debug) + printk("Data wakeup.\n"); + sk->data_ready(sk,0); + } + return(1); +} - /* Do this the way 4.4BSD treats it. Not what I'd - regard as the meaning of the spec but it's what BSD - does and clearly they know everything 8) */ +static void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if ((skb = tp->send_head)) + { + if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && + sk->packets_out < sk->cong_window ) + { /* - * This is valid because of two things - * - * a) The way tcp_data behaves at the bottom. - * b) A fin takes effect when read not when received. + * Add more data to the send queue. 
*/ - shut_seq = sk->acked_seq+1; /* Last byte */ - - if(after(new_seq,shut_seq)) - { - if(sk->debug) - printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n", - sk, new_seq, shut_seq, sk->blog); - if(sk->dead) - { - sk->acked_seq = new_seq + th->fin; - tcp_send_reset(sk->saddr, sk->daddr, skb->h.th, - sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl); - tcp_statistics.TcpEstabResets++; - sk->err = EPIPE; - sk->error_report(sk); - sk->shutdown = SHUTDOWN_MASK; - tcp_set_state(sk,TCP_CLOSE); - kfree_skb(skb, FREE_READ); - return 0; + tcp_write_xmit(sk); + wake_up_interruptible(sk->sleep); } + else if (sk->packets_out == 0 && !tp->pending) + { + /* + * Data to queue but no room. + */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } } } -#endif - +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ /* - * We should only call this if there is data in the frame. + * This also takes care of updating the window. + * This if statement needs to be simplified. + * + * rules for delaying an ack: + * - delay time <= 0.5 HZ + * - we don't have a window update to send + * - must send at least every 2 full sized packets */ - tcp_delack_estimator(sk); - tcp_queue(skb, sk, th); + if (sk->delayed_acks == 0) + return; - return(0); + if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) + { + tcp_send_ack(sk); + } + else + { + tcp_send_delayed_ack(sk, HZ/2); + } } - /* * This routine is only called when we have urgent data @@ -1588,13 +1170,13 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. - * * For 1003.1g we should support a new option TCP_STDURG to permit * either form. 
*/ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 ptr = ntohs(th->urg_ptr); if (ptr) @@ -1628,6 +1210,9 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) sk->copied_seq++; /* Move the copied sequence on correctly */ sk->urg_data = URG_NOTYET; sk->urg_seq = ptr; + + /* disable header prediction */ + tp->pred_flags = 0; } /* @@ -1662,429 +1247,430 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } + +static __inline__ void prune_queue(struct sock *sk) +{ + struct sk_buff * skb; + /* - * This should be a bit smarter and remove partially - * overlapping stuff too, but this should be good - * enough for any even remotely normal case (and the - * worst that can happen is that we have a few - * unnecessary packets in the receive queue). - * - * This function is never called with an empty list.. + * clean the out_of_order queue */ -static inline void tcp_remove_dups(struct sk_buff_head * list) -{ - struct sk_buff * next = list->next; - for (;;) { - struct sk_buff * skb = next; - next = next->next; - if (next == (struct sk_buff *) list) - break; - if (before(next->end_seq, skb->end_seq)) { - __skb_unlink(next, list); - kfree_skb(next, FREE_READ); - next = skb; - continue; - } - if (next->seq != skb->seq) - continue; - __skb_unlink(skb, list); + while ((skb = skb_dequeue(&sk->out_of_order_queue))) + { kfree_skb(skb, FREE_READ); } } + +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, __u16 len) +{ + struct tcp_opt *tp; + int queued = 0; + u32 flg; + /* - * Throw out all unnecessary packets: we've gone over the - * receive queue limit. This shouldn't happen in a normal - * TCP connection, but we might have gotten duplicates etc. + * Header prediction. + * The code follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. 
+ * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. */ -static void prune_queue(struct sk_buff_head * list) + + tp = &(sk->tp_pinfo.af_tcp); + flg = *(((u32 *)th) + 3); + + /* + * pred_flags is 0x5?10 << 16 + snd_wnd + * if header_predition is to be made + * ? will be 0 else it will be !0 + * (when there are holes in the receive + * space for instance) + */ + + if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) + { + if (len <= sizeof(struct tcphdr)) { - for (;;) { - struct sk_buff * skb = list->prev; + if (len == sizeof(struct tcphdr)) + { + tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + } + + tcp_data_snd_check(sk); - /* gone through it all? */ - if (skb == (struct sk_buff *) list) - break; - if (!skb->acked) { - __skb_unlink(skb, list); kfree_skb(skb, FREE_READ); - continue; - } - tcp_remove_dups(list); - break; - } -} + return; -#ifdef CONFIG_IP_TRANSPARENT_PROXY + } + else if (skb->ack_seq == tp->snd_una) + { /* - * Check whether a received TCP packet might be for one of our - * connections. 
+ * Bulk data transfer: receiver */ -int tcp_chkaddr(struct sk_buff *skb) + skb_pull(skb,sizeof(struct tcphdr)); + + skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = skb->end_seq; + sk->bytes_rcv += len - sizeof(struct tcphdr); + + sk->data_ready(sk, 0); + tcp_delack_estimator(tp); + + if (sk->delayed_acks++) { - struct iphdr *iph = skb->h.iph; - struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4); - struct sock *sk; + tcp_send_delayed_ack(sk, HZ/2); + } + else + tcp_send_ack(sk); - sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr, 0, 0); + return; + } + } - if (!sk) return 0; - /* 0 means accept all LOCAL addresses here, not all the world... */ - if (sk->rcv_saddr == 0) return 0; - return 1; + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) + { + if (!th->rst) + { + if (after(skb->seq, tp->rcv_nxt)) + { + printk(KERN_DEBUG "->seq:%d end:%d " + "wup:%d wnd:%d\n", + skb->seq, skb->end_seq, + tp->rcv_wup, tp->rcv_wnd); + } + tcp_send_ack(sk); + kfree_skb(skb, FREE_READ); + return; + } } -#endif -/* - * A TCP packet has arrived. - * skb->h.raw is the TCP header. - */ + if(th->syn && skb->seq != sk->syn_seq) + { + printk(KERN_DEBUG "syn in established state\n"); + tcp_reset(sk, skb); + kfree_skb(skb, FREE_READ); + return; + } -int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - __u32 daddr, unsigned short len, - __u32 saddr, int redo, struct inet_protocol * protocol) + if(th->rst) { - struct tcphdr *th; - struct sock *sk; - __u32 seq; -#ifdef CONFIG_IP_TRANSPARENT_PROXY - int r; -#endif + tcp_reset(sk,skb); + kfree_skb(skb, FREE_READ); + return; + } + + if(th->ack) + { + tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + } + /* - * "redo" is 1 if we have already seen this skb but couldn't - * use it at that time (the socket was locked). In that case - * we have already done a lot of the work (looked up the socket - * etc). 
+ * Process urgent data */ - th = skb->h.th; - sk = skb->sk; - if (!redo) { - tcp_statistics.TcpInSegs++; - if (skb->pkt_type!=PACKET_HOST) - goto discard_it; + + tcp_urg(sk, th, len); /* - * Pull up the IP header. + * step 7: process the segment text */ - skb_pull(skb, skb->h.raw-skb->data); + + queued = tcp_data(skb, sk, len); /* - * Try to use the device checksum if provided. + * step 8: check the FIN bit */ - switch (skb->ip_summed) + + if (th->fin) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)th, len, 0); - case CHECKSUM_HW: - if (tcp_check(th, len, saddr, daddr, skb->csum)) - goto discard_it; - default: - /* CHECKSUM_UNNECESSARY */ + tcp_fin(skb, sk, th); } - sk = get_tcp_sock(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport); - if (!sk) - goto no_tcp_socket; - skb->sk = sk; - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; - skb->ack_seq = ntohl(th->ack_seq); - skb->acked = 0; - skb->used = 0; - skb->free = 1; - skb->saddr = daddr; - skb->daddr = saddr; + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); /* - * We may need to add it to the backlog here. + * If our receive queue has grown past its limits, + * try to prune away duplicates etc.. */ - if (sk->users) - { - __skb_queue_tail(&sk->back_log, skb); - return(0); - } - } + if (sk->rmem_alloc > sk->rcvbuf) + prune_queue(sk); /* - * If this socket has got a reset it's to all intents and purposes - * really dead. Count closed sockets as dead. - * - * Note: BSD appears to have a bug here. A 'closed' TCP in BSD - * simply drops data. This seems incorrect as a 'closed' TCP doesn't - * exist so should cause resets as if the port was unreachable. + * And done */ - if (sk->zapped || sk->state==TCP_CLOSE) - goto no_tcp_socket; + if (queued) + return; - if (!sk->prot) - { - printk(KERN_CRIT "IMPOSSIBLE 3\n"); - return(0); + kfree_skb(skb, FREE_READ); } /* - * Charge the memory to the socket. 
+ * This function implements the receiving procedure of RFC 793. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. */ - skb->sk=sk; - atomic_add(skb->truesize, &sk->rmem_alloc); +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, void *opt, __u16 len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int queued = 0; + int rcv_mss; /* - * Mark the time of the last received packet. + * state == CLOSED + * tested in tcp_v{4,6}_rcv */ - sk->idletime = jiffies; - /* - * We should now do header prediction. - */ + switch (sk->state) { - /* - * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We - * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug - * compatibility. We also set up variables more thoroughly [Karn notes in the - * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths]. - */ - if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */ - { + case TCP_LISTEN: + + if (th->rst) + goto discard; /* - * Now deal with unusual cases. + * These use the socket TOS.. + * might want to be the received TOS */ - if(sk->state==TCP_LISTEN) + if(th->ack) { - if(th->ack) /* These use the socket TOS.. might want to be the received TOS */ - tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl); - /* - * We don't care for RST, and non SYN are absorbed (old segments) - * Broadcast/multicast SYN isn't allowed. Note - bug if you change the - * netmask on a running connection it can go broadcast. Even Sun's have - * this problem so I'm ignoring it + * send reset */ -#ifdef CONFIG_IP_TRANSPARENT_PROXY - /* - * We may get non-local addresses and still want to - * handle them locally, due to transparent proxying. - * Thus, narrow down the test to what is really meant. 
- */ - if(th->rst || !th->syn || th->ack || (r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST) -#else - if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR) -#endif - { - kfree_skb(skb, FREE_READ); - return 0; + return 1; } - /* - * Guess we need to make a new socket up - */ - seq = secure_tcp_sequence_number(saddr, daddr, - skb->h.th->dest, - skb->h.th->source); - tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq); + + if(th->syn) + { + int err; + __u32 isn; + + isn = tp->af_specific->init_sequence(sk, skb); + err = tp->af_specific->conn_request(sk, skb, opt, isn); + + if (err < 0) + return 1; /* - * Now we have several options: In theory there is nothing else - * in the frame. KA9Q has an option to send data with the syn, - * BSD accepts data with the syn up to the [to be] advertised window - * and Solaris 2.1 gives you a protocol error. For now we just ignore - * it, that fits the spec precisely and avoids incompatibilities. It - * would be nice in future to drop through and process the data. + * Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. * - * Now TTCP is starting to use we ought to queue this data. + * Now that TTCP is starting to be used we ought to + * queue this data. */ return 0; } - /* - * Retransmitted SYN for our socket. This is uninteresting. 
If sk->state==TCP_LISTEN - * then it's a new connection - */ + goto discard; + break; - if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq) - { - kfree_skb(skb, FREE_READ); - return 0; - } + case TCP_SYN_SENT: /* - * SYN sent means we have to look for a suitable ack and either reset - * for bad matches or go to connected. The SYN_SENT case is unusual and should + * SYN sent means we have to look for a suitable ack and + * either reset for bad matches or go to connected. + * The SYN_SENT case is unusual and should * not be in line code. [AC] */ - if(sk->state==TCP_SYN_SENT) - { - /* Crossed SYN or previous junk segment */ if(th->ack) { - /* We got an ack, but it's not a good ack. - * We used to test this with a call to tcp_ack, - * but this loses, because it takes the SYN - * packet out of the send queue, even if - * the ACK doesn't have the SYN bit sent, and - * therefore isn't the one we are waiting for. - */ - if (after(skb->ack_seq, sk->sent_seq) || before(skb->ack_seq, sk->rcv_ack_seq)) + /* We got an ack, but it's not a good ack */ + if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { - /* Reset the ack - it's an ack from a - different connection [ th->rst is checked in tcp_send_reset()] */ tcp_statistics.TcpAttemptFails++; - tcp_send_reset(daddr, saddr, th, - sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl); - kfree_skb(skb, FREE_READ); - return(0); + return 1; } + if(th->rst) - return tcp_reset(sk,skb); - if(!th->syn) { - /* A valid ack from a different connection - start. Shouldn't happen but cover it */ - tcp_statistics.TcpAttemptFails++; - tcp_send_reset(daddr, saddr, th, - sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl); - kfree_skb(skb, FREE_READ); - return 0; + tcp_reset(sk,skb); + goto discard; } - /* process the ACK, get the SYN packet out - * of the send queue, do other initial - * processing stuff. [We know it's good, and - * we know it's the SYN,ACK we want.] + if(!th->syn) + { + /* + * A valid ack from a different connection + * start. 
Shouldn't happen but cover it */ - tcp_ack(sk,th,skb->ack_seq,len); - + tcp_statistics.TcpAttemptFails++; + return 1; + } /* - * Ok.. it's good. Set up sequence numbers and + * Ok.. it's good. Set up sequence + * numbers and * move to established. */ - sk->acked_seq = skb->seq+1; - sk->lastwin_seq = skb->seq+1; + + tp->rcv_nxt = skb->seq+1; + tp->rcv_wnd = 0; + tp->rcv_wup = skb->seq+1; + + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + tp->snd_wl2 = skb->ack_seq; + sk->fin_seq = skb->seq; tcp_send_ack(sk); + tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); + rcv_mss = tcp_parse_options(th); + + if (rcv_mss == 0) + { + rcv_mss = 536; + } + + sk->mss = min(sk->mss, rcv_mss); + sk->dummy_th.dest = th->source; - sk->copied_seq = sk->acked_seq; + sk->copied_seq = tp->rcv_nxt; + if(!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 0); } - if(sk->max_window==0) - { - sk->max_window = 32; - sk->mss = min(sk->max_window, sk->mtu); - } - /* Reset the RTT estimator to the initial - * state rather than testing to avoid - * updating it on the ACK to the SYN packet. - */ - sk->rtt = 0; - sk->rto = TCP_TIMEOUT_INIT; - sk->mdev = TCP_TIMEOUT_INIT; + + /* Drop through step 6 */ + goto step6; } else { - /* See if SYN's cross. Drop if boring */ if(th->syn && !th->rst) { - /* Crossed SYN's are fine - but talking to - yourself is right out... */ - if(sk->saddr==saddr && sk->daddr==daddr && - sk->dummy_th.source==th->source && - sk->dummy_th.dest==th->dest) - { - tcp_statistics.TcpAttemptFails++; - return tcp_reset(sk,skb); - } - tcp_set_state(sk,TCP_SYN_RECV); - /* - * FIXME: - * Must send SYN|ACK here + * the previous version of the code + * checked for "connecting to self" + * here. 
that check is done now in + * tcp_connect */ + + tcp_set_state(sk, TCP_SYN_RECV); + + tp->rcv_nxt = skb->seq + 1; + tp->rcv_wup = skb->seq + 1; + + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + + tcp_send_synack(sk); + goto discard; } - /* Discard junk segment */ - kfree_skb(skb, FREE_READ); - return 0; - } - /* - * SYN_RECV with data maybe.. drop through - */ - goto rfc_step6; + } + break; + case TCP_TIME_WAIT: /* - * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is - * a more complex suggestion for fixing these reuse issues in RFC1644 - * but not yet ready for general use. Also see RFC1379. + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: * - * Note the funny way we go back to the top of this function for - * this case ("goto try_next_socket"). That also takes care of - * checking "sk->users" for the new socket as well as doing all - * the normal tests on the packet. + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". 
*/ -#define BSD_TIME_WAIT -#ifdef BSD_TIME_WAIT - if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && - after(skb->seq, sk->acked_seq) && !th->rst) + if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) { - u32 seq = sk->write_seq; - if(sk->debug) - printk("Doing a BSD time wait\n"); - tcp_statistics.TcpEstabResets++; + __u32 isn; + int err; + atomic_sub(skb->truesize, &sk->rmem_alloc); skb->sk = NULL; sk->err = ECONNRESET; tcp_set_state(sk, TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; - sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr, dev->pa_addr, skb->redirport); - /* this is not really correct: we should check sk->users */ - if (sk && sk->state==TCP_LISTEN) - { + + isn = tp->rcv_nxt + 128000; + + sk = tp->af_specific->get_sock(skb, th); + + if (sk == NULL) + goto discard; + skb->sk = sk; + tp = &sk->tp_pinfo.af_tcp; atomic_add(skb->truesize, &sk->rmem_alloc); - tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000); - return 0; - } - kfree_skb(skb, FREE_READ); + + err = tp->af_specific->conn_request(sk, skb, opt, isn); + + if (err < 0) + return 1; + return 0; } -#endif + + break; + } /* - * We are now in normal data flow (see the step list in the RFC) - * Note most of these are inline now. I'll inline the lot when - * I have time to test it hard and look at what gcc outputs + * step 1: check sequence number */ - if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn)) + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { - bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev); - kfree_skb(skb, FREE_READ); - return 0; + if (!th->rst) + { + tcp_send_ack(sk); + goto discard; + } } + + /* + * step 2: check RST bit + */ + if(th->rst) - return tcp_reset(sk,skb); + { + tcp_reset(sk,skb); + goto discard; + } /* + * step 3: check security and precedence + * [ignored] + */ + + /* + * step 4: + * * Check for a SYN, and ensure it matches the SYN we were * first sent. 
We have to handle the rather unusual (but valid) * sequence that KA9Q derived products may generate of @@ -2101,74 +1687,149 @@ int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, if (th->syn && skb->seq!=sk->syn_seq) { - tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255); - return tcp_reset(sk,skb); + tcp_reset(sk, skb); + return 1; } /* - * Process the ACK + * step 5: check the ACK field */ + if (th->ack) + { + int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); - if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len)) + switch(sk->state) { + case TCP_SYN_RECV: + if (acceptable) { - /* - * Our three way handshake failed. - */ + tcp_set_state(sk, TCP_ESTABLISHED); + sk->dummy_th.dest=th->source; + sk->copied_seq = tp->rcv_nxt; + + if(!sk->dead) + sk->state_change(sk); + + tp->snd_una = skb->ack_seq; + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + tp->snd_wl2 = skb->ack_seq; + + } + else + return 1; + break; + + case TCP_FIN_WAIT1: - if(sk->state==TCP_SYN_RECV) + if (tp->snd_una == sk->write_seq) { - tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl); + sk->shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + if (!sk->dead) + sk->state_change(sk); } - kfree_skb(skb, FREE_READ); - return 0; + break; + + case TCP_CLOSING: + + if (tp->snd_una == sk->write_seq) + { + tcp_time_wait(sk); + if (!sk->dead) + sk->state_change(sk); } + break; -rfc_step6: /* I'll clean this up later */ + case TCP_LAST_ACK: + + if (tp->snd_una == sk->write_seq) + { + sk->shutdown = SHUTDOWN_MASK; + tcp_set_state(sk,TCP_CLOSE); + if (!sk->dead) + sk->state_change(sk); + goto discard; + } + break; + case TCP_TIME_WAIT: /* - * If the accepted buffer put us over our queue size we - * now drop it (we must process the ack first to avoid - * deadlock cases). + * keep us in TIME_WAIT until we stop getting + * packets, reset the timeout. 
*/ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + break; + + } + } + else + goto discard; + + step6: /* - * Process urgent data + * step 6: check the URG bit */ tcp_urg(sk, th, len); /* - * Process the encapsulated data + * step 7: process the segment text */ - if(tcp_data(skb,sk, saddr, len)) - kfree_skb(skb, FREE_READ); + switch (sk->state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + if (!before(skb->seq, sk->fin_seq)) + break; - /* - * If our receive queue has grown past its limits, - * try to prune away duplicates etc.. - */ - if (sk->rmem_alloc > sk->rcvbuf) - prune_queue(&sk->receive_queue); + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: /* - * And done + * RFC 793 says to queue data in this states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. */ - return 0; + if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) + { + if (after(skb->end_seq - th->fin, tp->rcv_nxt)) + { + tcp_reset(sk, skb); + return 1; + } + } -no_tcp_socket: - /* - * No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset) - */ - tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255); + case TCP_ESTABLISHED: + queued = tcp_data(skb, sk, len); + break; + } -discard_it: /* - * Discard frame + * step 8: check the FIN bit */ - skb->sk = NULL; + + if (th->fin) + { + tcp_fin(skb, sk, th); + } + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + + if (queued) + return 0; + discard: + kfree_skb(skb, FREE_READ); return 0; } + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp_input.o tcp_input.c" + * c-file-style: "Linux" + * End: + */ |