/* qemu/slirp/tcp_output.c */
   1/*
   2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3 *      The Regents of the University of California.  All rights reserved.
   4 *
   5 * Redistribution and use in source and binary forms, with or without
   6 * modification, are permitted provided that the following conditions
   7 * are met:
   8 * 1. Redistributions of source code must retain the above copyright
   9 *    notice, this list of conditions and the following disclaimer.
  10 * 2. Redistributions in binary form must reproduce the above copyright
  11 *    notice, this list of conditions and the following disclaimer in the
  12 *    documentation and/or other materials provided with the distribution.
  13 * 3. Neither the name of the University nor the names of its contributors
  14 *    may be used to endorse or promote products derived from this software
  15 *    without specific prior written permission.
  16 *
  17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27 * SUCH DAMAGE.
  28 *
  29 *      @(#)tcp_output.c        8.3 (Berkeley) 12/30/93
  30 * tcp_output.c,v 1.3 1994/09/15 10:36:55 davidg Exp
  31 */
  32
  33/*
  34 * Changes and additions relating to SLiRP
  35 * Copyright (c) 1995 Danny Gasparovski.
  36 *
  37 * Please read the file COPYRIGHT for the
  38 * terms and conditions of the copyright.
  39 */
  40
  41#include <slirp.h>
  42
  43static const u_char  tcp_outflags[TCP_NSTATES] = {
  44        TH_RST|TH_ACK, 0,      TH_SYN,        TH_SYN|TH_ACK,
  45        TH_ACK,        TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK,
  46        TH_FIN|TH_ACK, TH_ACK, TH_ACK,
  47};
  48
  49
  50#undef MAX_TCPOPTLEN
  51#define MAX_TCPOPTLEN   32      /* max # bytes that go in options */
  52
  53/*
  54 * Tcp output routine: figure out what should be sent and send it.
  55 */
  56int
  57tcp_output(struct tcpcb *tp)
  58{
  59        register struct socket *so = tp->t_socket;
  60        register long len, win;
  61        int off, flags, error;
  62        register struct mbuf *m;
  63        register struct tcpiphdr *ti;
  64        u_char opt[MAX_TCPOPTLEN];
  65        unsigned optlen, hdrlen;
  66        int idle, sendalot;
  67
  68        DEBUG_CALL("tcp_output");
  69        DEBUG_ARG("tp = %lx", (long )tp);
  70
  71        /*
  72         * Determine length of data that should be transmitted,
  73         * and flags that will be used.
  74         * If there is some data or critical controls (SYN, RST)
  75         * to send, then transmit; otherwise, investigate further.
  76         */
  77        idle = (tp->snd_max == tp->snd_una);
  78        if (idle && tp->t_idle >= tp->t_rxtcur)
  79                /*
  80                 * We have been idle for "a while" and no acks are
  81                 * expected to clock out any data we send --
  82                 * slow start to get ack "clock" running again.
  83                 */
  84                tp->snd_cwnd = tp->t_maxseg;
  85again:
  86        sendalot = 0;
  87        off = tp->snd_nxt - tp->snd_una;
  88        win = min(tp->snd_wnd, tp->snd_cwnd);
  89
  90        flags = tcp_outflags[tp->t_state];
  91
  92        DEBUG_MISC((dfd, " --- tcp_output flags = 0x%x\n",flags));
  93
  94        /*
  95         * If in persist timeout with window of 0, send 1 byte.
  96         * Otherwise, if window is small but nonzero
  97         * and timer expired, we will send what we can
  98         * and go to transmit state.
  99         */
 100        if (tp->t_force) {
 101                if (win == 0) {
 102                        /*
 103                         * If we still have some data to send, then
 104                         * clear the FIN bit.  Usually this would
 105                         * happen below when it realizes that we
 106                         * aren't sending all the data.  However,
 107                         * if we have exactly 1 byte of unset data,
 108                         * then it won't clear the FIN bit below,
 109                         * and if we are in persist state, we wind
 110                         * up sending the packet without recording
 111                         * that we sent the FIN bit.
 112                         *
 113                         * We can't just blindly clear the FIN bit,
 114                         * because if we don't have any more data
 115                         * to send then the probe will be the FIN
 116                         * itself.
 117                         */
 118                        if (off < so->so_snd.sb_cc)
 119                                flags &= ~TH_FIN;
 120                        win = 1;
 121                } else {
 122                        tp->t_timer[TCPT_PERSIST] = 0;
 123                        tp->t_rxtshift = 0;
 124                }
 125        }
 126
 127        len = min(so->so_snd.sb_cc, win) - off;
 128
 129        if (len < 0) {
 130                /*
 131                 * If FIN has been sent but not acked,
 132                 * but we haven't been called to retransmit,
 133                 * len will be -1.  Otherwise, window shrank
 134                 * after we sent into it.  If window shrank to 0,
 135                 * cancel pending retransmit and pull snd_nxt
 136                 * back to (closed) window.  We will enter persist
 137                 * state below.  If the window didn't close completely,
 138                 * just wait for an ACK.
 139                 */
 140                len = 0;
 141                if (win == 0) {
 142                        tp->t_timer[TCPT_REXMT] = 0;
 143                        tp->snd_nxt = tp->snd_una;
 144                }
 145        }
 146
 147        if (len > tp->t_maxseg) {
 148                len = tp->t_maxseg;
 149                sendalot = 1;
 150        }
 151        if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 152                flags &= ~TH_FIN;
 153
 154        win = sbspace(&so->so_rcv);
 155
 156        /*
 157         * Sender silly window avoidance.  If connection is idle
 158         * and can send all data, a maximum segment,
 159         * at least a maximum default-size segment do it,
 160         * or are forced, do it; otherwise don't bother.
 161         * If peer's buffer is tiny, then send
 162         * when window is at least half open.
 163         * If retransmitting (possibly after persist timer forced us
 164         * to send into a small window), then must resend.
 165         */
 166        if (len) {
 167                if (len == tp->t_maxseg)
 168                        goto send;
 169                if ((1 || idle || tp->t_flags & TF_NODELAY) &&
 170                    len + off >= so->so_snd.sb_cc)
 171                        goto send;
 172                if (tp->t_force)
 173                        goto send;
 174                if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 175                        goto send;
 176                if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 177                        goto send;
 178        }
 179
 180        /*
 181         * Compare available window to amount of window
 182         * known to peer (as advertised window less
 183         * next expected input).  If the difference is at least two
 184         * max size segments, or at least 50% of the maximum possible
 185         * window, then want to send a window update to peer.
 186         */
 187        if (win > 0) {
 188                /*
 189                 * "adv" is the amount we can increase the window,
 190                 * taking into account that we are limited by
 191                 * TCP_MAXWIN << tp->rcv_scale.
 192                 */
 193                long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
 194                        (tp->rcv_adv - tp->rcv_nxt);
 195
 196                if (adv >= (long) (2 * tp->t_maxseg))
 197                        goto send;
 198                if (2 * adv >= (long) so->so_rcv.sb_datalen)
 199                        goto send;
 200        }
 201
 202        /*
 203         * Send if we owe peer an ACK.
 204         */
 205        if (tp->t_flags & TF_ACKNOW)
 206                goto send;
 207        if (flags & (TH_SYN|TH_RST))
 208                goto send;
 209        if (SEQ_GT(tp->snd_up, tp->snd_una))
 210                goto send;
 211        /*
 212         * If our state indicates that FIN should be sent
 213         * and we have not yet done so, or we're retransmitting the FIN,
 214         * then we need to send.
 215         */
 216        if (flags & TH_FIN &&
 217            ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 218                goto send;
 219
 220        /*
 221         * TCP window updates are not reliable, rather a polling protocol
 222         * using ``persist'' packets is used to insure receipt of window
 223         * updates.  The three ``states'' for the output side are:
 224         *      idle                    not doing retransmits or persists
 225         *      persisting              to move a small or zero window
 226         *      (re)transmitting        and thereby not persisting
 227         *
 228         * tp->t_timer[TCPT_PERSIST]
 229         *      is set when we are in persist state.
 230         * tp->t_force
 231         *      is set when we are called to send a persist packet.
 232         * tp->t_timer[TCPT_REXMT]
 233         *      is set when we are retransmitting
 234         * The output side is idle when both timers are zero.
 235         *
 236         * If send window is too small, there is data to transmit, and no
 237         * retransmit or persist is pending, then go to persist state.
 238         * If nothing happens soon, send when timer expires:
 239         * if window is nonzero, transmit what we can,
 240         * otherwise force out a byte.
 241         */
 242        if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 243            tp->t_timer[TCPT_PERSIST] == 0) {
 244                tp->t_rxtshift = 0;
 245                tcp_setpersist(tp);
 246        }
 247
 248        /*
 249         * No reason to send a segment, just return.
 250         */
 251        return (0);
 252
 253send:
 254        /*
 255         * Before ESTABLISHED, force sending of initial options
 256         * unless TCP set not to do any options.
 257         * NOTE: we assume that the IP/TCP header plus TCP options
 258         * always fit in a single mbuf, leaving room for a maximum
 259         * link header, i.e.
 260         *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
 261         */
 262        optlen = 0;
 263        hdrlen = sizeof (struct tcpiphdr);
 264        if (flags & TH_SYN) {
 265                tp->snd_nxt = tp->iss;
 266                if ((tp->t_flags & TF_NOOPT) == 0) {
 267                        uint16_t mss;
 268
 269                        opt[0] = TCPOPT_MAXSEG;
 270                        opt[1] = 4;
 271                        mss = htons((uint16_t) tcp_mss(tp, 0));
 272                        memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
 273                        optlen = 4;
 274                }
 275        }
 276
 277        hdrlen += optlen;
 278
 279        /*
 280         * Adjust data length if insertion of options will
 281         * bump the packet length beyond the t_maxseg length.
 282         */
 283         if (len > tp->t_maxseg - optlen) {
 284                len = tp->t_maxseg - optlen;
 285                sendalot = 1;
 286         }
 287
 288        /*
 289         * Grab a header mbuf, attaching a copy of data to
 290         * be transmitted, and initialize the header from
 291         * the template for sends on this connection.
 292         */
 293        if (len) {
 294                m = m_get(so->slirp);
 295                if (m == NULL) {
 296                        error = 1;
 297                        goto out;
 298                }
 299                m->m_data += IF_MAXLINKHDR;
 300                m->m_len = hdrlen;
 301
 302                sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
 303                m->m_len += len;
 304
 305                /*
 306                 * If we're sending everything we've got, set PUSH.
 307                 * (This will keep happy those implementations which only
 308                 * give data to the user when a buffer fills or
 309                 * a PUSH comes in.)
 310                 */
 311                if (off + len == so->so_snd.sb_cc)
 312                        flags |= TH_PUSH;
 313        } else {
 314                m = m_get(so->slirp);
 315                if (m == NULL) {
 316                        error = 1;
 317                        goto out;
 318                }
 319                m->m_data += IF_MAXLINKHDR;
 320                m->m_len = hdrlen;
 321        }
 322
 323        ti = mtod(m, struct tcpiphdr *);
 324
 325        memcpy((caddr_t)ti, &tp->t_template, sizeof (struct tcpiphdr));
 326
 327        /*
 328         * Fill in fields, remembering maximum advertised
 329         * window for use in delaying messages about window sizes.
 330         * If resending a FIN, be sure not to use a new sequence number.
 331         */
 332        if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 333            tp->snd_nxt == tp->snd_max)
 334                tp->snd_nxt--;
 335        /*
 336         * If we are doing retransmissions, then snd_nxt will
 337         * not reflect the first unsent octet.  For ACK only
 338         * packets, we do not want the sequence number of the
 339         * retransmitted packet, we want the sequence number
 340         * of the next unsent octet.  So, if there is no data
 341         * (and no SYN or FIN), use snd_max instead of snd_nxt
 342         * when filling in ti_seq.  But if we are in persist
 343         * state, snd_max might reflect one byte beyond the
 344         * right edge of the window, so use snd_nxt in that
 345         * case, since we know we aren't doing a retransmission.
 346         * (retransmit and persist are mutually exclusive...)
 347         */
 348        if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
 349                ti->ti_seq = htonl(tp->snd_nxt);
 350        else
 351                ti->ti_seq = htonl(tp->snd_max);
 352        ti->ti_ack = htonl(tp->rcv_nxt);
 353        if (optlen) {
 354                memcpy((caddr_t)(ti + 1), (caddr_t)opt, optlen);
 355                ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
 356        }
 357        ti->ti_flags = flags;
 358        /*
 359         * Calculate receive window.  Don't shrink window,
 360         * but avoid silly window syndrome.
 361         */
 362        if (win < (long)(so->so_rcv.sb_datalen / 4) && win < (long)tp->t_maxseg)
 363                win = 0;
 364        if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 365                win = (long)TCP_MAXWIN << tp->rcv_scale;
 366        if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
 367                win = (long)(tp->rcv_adv - tp->rcv_nxt);
 368        ti->ti_win = htons((uint16_t) (win>>tp->rcv_scale));
 369
 370        if (SEQ_GT(tp->snd_up, tp->snd_una)) {
 371                ti->ti_urp = htons((uint16_t)(tp->snd_up - ntohl(ti->ti_seq)));
 372                ti->ti_flags |= TH_URG;
 373        } else
 374                /*
 375                 * If no urgent pointer to send, then we pull
 376                 * the urgent pointer to the left edge of the send window
 377                 * so that it doesn't drift into the send window on sequence
 378                 * number wraparound.
 379                 */
 380                tp->snd_up = tp->snd_una;               /* drag it along */
 381
 382        /*
 383         * Put TCP length in extended header, and then
 384         * checksum extended header and data.
 385         */
 386        if (len + optlen)
 387                ti->ti_len = htons((uint16_t)(sizeof (struct tcphdr) +
 388                    optlen + len));
 389        ti->ti_sum = cksum(m, (int)(hdrlen + len));
 390
 391        /*
 392         * In transmit state, time the transmission and arrange for
 393         * the retransmit.  In persist state, just set snd_max.
 394         */
 395        if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
 396                tcp_seq startseq = tp->snd_nxt;
 397
 398                /*
 399                 * Advance snd_nxt over sequence space of this segment.
 400                 */
 401                if (flags & (TH_SYN|TH_FIN)) {
 402                        if (flags & TH_SYN)
 403                                tp->snd_nxt++;
 404                        if (flags & TH_FIN) {
 405                                tp->snd_nxt++;
 406                                tp->t_flags |= TF_SENTFIN;
 407                        }
 408                }
 409                tp->snd_nxt += len;
 410                if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 411                        tp->snd_max = tp->snd_nxt;
 412                        /*
 413                         * Time this transmission if not a retransmission and
 414                         * not currently timing anything.
 415                         */
 416                        if (tp->t_rtt == 0) {
 417                                tp->t_rtt = 1;
 418                                tp->t_rtseq = startseq;
 419                        }
 420                }
 421
 422                /*
 423                 * Set retransmit timer if not currently set,
 424                 * and not doing an ack or a keep-alive probe.
 425                 * Initial value for retransmit timer is smoothed
 426                 * round-trip time + 2 * round-trip time variance.
 427                 * Initialize shift counter which is used for backoff
 428                 * of retransmit time.
 429                 */
 430                if (tp->t_timer[TCPT_REXMT] == 0 &&
 431                    tp->snd_nxt != tp->snd_una) {
 432                        tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 433                        if (tp->t_timer[TCPT_PERSIST]) {
 434                                tp->t_timer[TCPT_PERSIST] = 0;
 435                                tp->t_rxtshift = 0;
 436                        }
 437                }
 438        } else
 439                if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
 440                        tp->snd_max = tp->snd_nxt + len;
 441
 442        /*
 443         * Fill in IP length and desired time to live and
 444         * send to IP level.  There should be a better way
 445         * to handle ttl and tos; we could keep them in
 446         * the template, but need a way to checksum without them.
 447         */
 448        m->m_len = hdrlen + len; /* XXX Needed? m_len should be correct */
 449
 450    {
 451
 452        ((struct ip *)ti)->ip_len = m->m_len;
 453
 454        ((struct ip *)ti)->ip_ttl = IPDEFTTL;
 455        ((struct ip *)ti)->ip_tos = so->so_iptos;
 456
 457        error = ip_output(so, m);
 458    }
 459        if (error) {
 460out:
 461                return (error);
 462        }
 463
 464        /*
 465         * Data sent (as far as we can tell).
 466         * If this advertises a larger window than any other segment,
 467         * then remember the size of the advertised window.
 468         * Any pending ACK has now been sent.
 469         */
 470        if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
 471                tp->rcv_adv = tp->rcv_nxt + win;
 472        tp->last_ack_sent = tp->rcv_nxt;
 473        tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
 474        if (sendalot)
 475                goto again;
 476
 477        return (0);
 478}
 479
 480void
 481tcp_setpersist(struct tcpcb *tp)
 482{
 483    int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 484
 485        /*
 486         * Start/restart persistence timer.
 487         */
 488        TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
 489            t * tcp_backoff[tp->t_rxtshift],
 490            TCPTV_PERSMIN, TCPTV_PERSMAX);
 491        if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 492                tp->t_rxtshift++;
 493}
 494