qemu/slirp/tcp_output.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
   3 *      The Regents of the University of California.  All rights reserved.
   4 *
   5 * Redistribution and use in source and binary forms, with or without
   6 * modification, are permitted provided that the following conditions
   7 * are met:
   8 * 1. Redistributions of source code must retain the above copyright
   9 *    notice, this list of conditions and the following disclaimer.
  10 * 2. Redistributions in binary form must reproduce the above copyright
  11 *    notice, this list of conditions and the following disclaimer in the
  12 *    documentation and/or other materials provided with the distribution.
  13 * 3. Neither the name of the University nor the names of its contributors
  14 *    may be used to endorse or promote products derived from this software
  15 *    without specific prior written permission.
  16 *
  17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27 * SUCH DAMAGE.
  28 *
  29 *      @(#)tcp_output.c        8.3 (Berkeley) 12/30/93
  30 * tcp_output.c,v 1.3 1994/09/15 10:36:55 davidg Exp
  31 */
  32
  33/*
  34 * Changes and additions relating to SLiRP
  35 * Copyright (c) 1995 Danny Gasparovski.
  36 *
  37 * Please read the file COPYRIGHT for the
  38 * terms and conditions of the copyright.
  39 */
  40
  41#include "qemu/osdep.h"
  42#include "slirp.h"
  43
  44static const u_char  tcp_outflags[TCP_NSTATES] = {
  45        TH_RST|TH_ACK, 0,      TH_SYN,        TH_SYN|TH_ACK,
  46        TH_ACK,        TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK,
  47        TH_FIN|TH_ACK, TH_ACK, TH_ACK,
  48};
  49
  50
  51#undef MAX_TCPOPTLEN
  52#define MAX_TCPOPTLEN   32      /* max # bytes that go in options */
  53
  54/*
  55 * Tcp output routine: figure out what should be sent and send it.
     *
     * tp is the TCP control block of the connection; the socket whose
     * send and receive buffers are examined is tp->t_socket.
     *
     * One call may emit several segments: whenever the pending data had
     * to be cut down to a single segment, sendalot is set and the function
     * jumps back to "again" after that segment was passed to the IP layer.
     *
     * Along the way the connection state in *tp is updated: snd_nxt,
     * snd_max, the retransmit/persist timers, rcv_adv and last_ack_sent;
     * TF_ACKNOW and TF_DELACK are cleared once a segment went out.
     *
     * Returns 0 if there was nothing to send or every segment was accepted
     * by ip_output()/ip6_output().  Returns 1 if m_get() could not supply
     * an mbuf, otherwise the non-zero result of ip_output()/ip6_output().
  56 */
  57int
  58tcp_output(struct tcpcb *tp)
  59{
  60        register struct socket *so = tp->t_socket;
  61        register long len, win;
  62        int off, flags, error;
  63        register struct mbuf *m;
  64        register struct tcpiphdr *ti, tcpiph_save;
  65        struct ip *ip;
  66        struct ip6 *ip6;
  67        u_char opt[MAX_TCPOPTLEN];
  68        unsigned optlen, hdrlen;
  69        int idle, sendalot;
  70
  71        DEBUG_CALL("tcp_output");
  72        DEBUG_ARG("tp = %p", tp);
  73
  74        /*
  75         * Determine length of data that should be transmitted,
  76         * and flags that will be used.
  77         * If there is some data or critical controls (SYN, RST)
  78         * to send, then transmit; otherwise, investigate further.
  79         */
            /* idle: snd_max equals snd_una, i.e. nothing sent is still unacknowledged. */
  80        idle = (tp->snd_max == tp->snd_una);
  81        if (idle && tp->t_idle >= tp->t_rxtcur)
  82                /*
  83                 * We have been idle for "a while" and no acks are
  84                 * expected to clock out any data we send --
  85                 * slow start to get ack "clock" running again.
  86                 */
  87                tp->snd_cwnd = tp->t_maxseg;
            /* Re-entered from the bottom of the function while sendalot is set. */
  88again:
  89        sendalot = 0;
            /*
             * off: distance from snd_una to snd_nxt; used below as the
             *      offset into the send buffer of the first byte to send.
             * win: the smaller of snd_wnd and snd_cwnd.
             */
  90        off = tp->snd_nxt - tp->snd_una;
  91        win = MIN(tp->snd_wnd, tp->snd_cwnd);
  92
            /* Default flags for the current connection state. */
  93        flags = tcp_outflags[tp->t_state];
  94
  95        DEBUG_MISC((dfd, " --- tcp_output flags = 0x%x\n",flags));
  96
  97        /*
  98         * If in persist timeout with window of 0, send 1 byte.
  99         * Otherwise, if window is small but nonzero
 100         * and timer expired, we will send what we can
 101         * and go to transmit state.
 102         */
 103        if (tp->t_force) {
 104                if (win == 0) {
 105                        /*
 106                         * If we still have some data to send, then
 107                         * clear the FIN bit.  Usually this would
 108                         * happen below when it realizes that we
 109                         * aren't sending all the data.  However,
 110                         * if we have exactly 1 byte of unsent data,
 111                         * then it won't clear the FIN bit below,
 112                         * and if we are in persist state, we wind
 113                         * up sending the packet without recording
 114                         * that we sent the FIN bit.
 115                         *
 116                         * We can't just blindly clear the FIN bit,
 117                         * because if we don't have any more data
 118                         * to send then the probe will be the FIN
 119                         * itself.
 120                         */
 121                        if (off < so->so_snd.sb_cc)
 122                                flags &= ~TH_FIN;
 123                        win = 1;
 124                } else {
 125                        tp->t_timer[TCPT_PERSIST] = 0;
 126                        tp->t_rxtshift = 0;
 127                }
 128        }
 129
            /*
             * Bytes that may go out now: from offset off up to the smaller
             * of the buffered data and the window.  Can be negative; that
             * case is handled right below.
             */
 130        len = MIN(so->so_snd.sb_cc, win) - off;
 131
 132        if (len < 0) {
 133                /*
 134                 * If FIN has been sent but not acked,
 135                 * but we haven't been called to retransmit,
 136                 * len will be -1.  Otherwise, window shrank
 137                 * after we sent into it.  If window shrank to 0,
 138                 * cancel pending retransmit and pull snd_nxt
 139                 * back to (closed) window.  We will enter persist
 140                 * state below.  If the window didn't close completely,
 141                 * just wait for an ACK.
 142                 */
 143                len = 0;
 144                if (win == 0) {
 145                        tp->t_timer[TCPT_REXMT] = 0;
 146                        tp->snd_nxt = tp->snd_una;
 147                }
 148        }
 149
            /* Never put more than one maximum segment into a packet; loop for the rest. */
 150        if (len > tp->t_maxseg) {
 151                len = tp->t_maxseg;
 152                sendalot = 1;
 153        }
            /* Data remains in the buffer after this segment, so this one must not carry FIN. */
 154        if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
 155                flags &= ~TH_FIN;
 156
            /*
             * win is reused: from here on it holds the value sbspace()
             * reports for the receive buffer and feeds the window that is
             * advertised to the peer, no longer the send window.
             */
 157        win = sbspace(&so->so_rcv);
 158
 159        /*
 160         * Sender silly window avoidance.  Send if we have a maximum
 161         * size segment, if this segment carries everything that is
 162         * left in the send buffer, or if we are forced; otherwise
 163         * don't bother.
 164         * If peer's buffer is tiny, then send
 165         * when window is at least half open.
 166         * If retransmitting (possibly after persist timer forced us
 167         * to send into a small window), then must resend.
 168         */
 169        if (len) {
 170                if (len == tp->t_maxseg)
 171                        goto send;
                    /*
                     * The leading "1 ||" makes the first clause always true,
                     * so neither idle nor TF_NODELAY is consulted here: a
                     * segment that reaches the end of the buffered data is
                     * always sent.
                     */
 172                if ((1 || idle || tp->t_flags & TF_NODELAY) &&
 173                    len + off >= so->so_snd.sb_cc)
 174                        goto send;
 175                if (tp->t_force)
 176                        goto send;
 177                if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 178                        goto send;
 179                if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 180                        goto send;
 181        }
 182
 183        /*
 184         * Compare available window to amount of window
 185         * known to peer (as advertised window less
 186         * next expected input).  If the difference is at least two
 187         * max size segments, or at least 50% of the maximum possible
 188         * window, then want to send a window update to peer.
 189         */
 190        if (win > 0) {
 191                /*
 192                 * "adv" is the amount we can increase the window,
 193                 * taking into account that we are limited by
 194                 * TCP_MAXWIN << tp->rcv_scale.
 195                 */
 196                long adv = MIN(win, (long)TCP_MAXWIN << tp->rcv_scale) -
 197                        (tp->rcv_adv - tp->rcv_nxt);
 198
 199                if (adv >= (long) (2 * tp->t_maxseg))
 200                        goto send;
 201                if (2 * adv >= (long) so->so_rcv.sb_datalen)
 202                        goto send;
 203        }
 204
 205        /*
 206         * Send if we owe peer an ACK.
 207         */
 208        if (tp->t_flags & TF_ACKNOW)
 209                goto send;
            /* SYN and RST are never held back. */
 210        if (flags & (TH_SYN|TH_RST))
 211                goto send;
            /* Urgent pointer is ahead of snd_una: send so it is announced. */
 212        if (SEQ_GT(tp->snd_up, tp->snd_una))
 213                goto send;
 214        /*
 215         * If our state indicates that FIN should be sent
 216         * and we have not yet done so, or we're retransmitting the FIN,
 217         * then we need to send.
 218         */
 219        if (flags & TH_FIN &&
 220            ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 221                goto send;
 222
 223        /*
 224         * TCP window updates are not reliable, rather a polling protocol
 225         * using ``persist'' packets is used to ensure receipt of window
 226         * updates.  The three ``states'' for the output side are:
 227         *      idle                    not doing retransmits or persists
 228         *      persisting              to move a small or zero window
 229         *      (re)transmitting        and thereby not persisting
 230         *
 231         * tp->t_timer[TCPT_PERSIST]
 232         *      is set when we are in persist state.
 233         * tp->t_force
 234         *      is set when we are called to send a persist packet.
 235         * tp->t_timer[TCPT_REXMT]
 236         *      is set when we are retransmitting
 237         * The output side is idle when both timers are zero.
 238         *
 239         * If send window is too small, there is data to transmit, and no
 240         * retransmit or persist is pending, then go to persist state.
 241         * If nothing happens soon, send when timer expires:
 242         * if window is nonzero, transmit what we can,
 243         * otherwise force out a byte.
 244         */
 245        if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
 246            tp->t_timer[TCPT_PERSIST] == 0) {
 247                tp->t_rxtshift = 0;
 248                tcp_setpersist(tp);
 249        }
 250
 251        /*
 252         * No reason to send a segment, just return.
 253         */
 254        return (0);
 255
 256send:
 257        /*
 258         * Before ESTABLISHED, force sending of initial options
 259         * unless TCP set not to do any options.
 260         * NOTE: we assume that the IP/TCP header plus TCP options
 261         * always fit in a single mbuf, leaving room for a maximum
 262         * link header, i.e.
 263         *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
 264         */
 265        optlen = 0;
 266        hdrlen = sizeof (struct tcpiphdr);
 267        if (flags & TH_SYN) {
                    /* A SYN always starts at the initial send sequence number. */
 268                tp->snd_nxt = tp->iss;
 269                if ((tp->t_flags & TF_NOOPT) == 0) {
 270                        uint16_t mss;
 271
                            /*
                             * The only option built here: kind TCPOPT_MAXSEG,
                             * length 4, followed by the 16-bit value from
                             * tcp_mss() in network byte order.
                             */
 272                        opt[0] = TCPOPT_MAXSEG;
 273                        opt[1] = 4;
 274                        mss = htons((uint16_t) tcp_mss(tp, 0));
 275                        memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
 276                        optlen = 4;
 277                }
 278        }
 279
 280        hdrlen += optlen;
 281
 282        /*
 283         * Adjust data length if insertion of options will
 284         * bump the packet length beyond the t_maxseg length.
 285         */
 286         if (len > tp->t_maxseg - optlen) {
 287                len = tp->t_maxseg - optlen;
 288                sendalot = 1;
 289         }
 290
 291        /*
 292         * Grab a header mbuf, attaching a copy of data to
 293         * be transmitted, and initialize the header from
 294         * the template for sends on this connection.
 295         */
 296        if (len) {
 297                m = m_get(so->slirp);
 298                if (m == NULL) {
 299                        error = 1;
 300                        goto out;
 301                }
                    /* Skip IF_MAXLINKHDR bytes at the front (room for the link header, per the NOTE above). */
 302                m->m_data += IF_MAXLINKHDR;
 303                m->m_len = hdrlen;
 304
                    /* Copy len bytes starting at offset off of the send buffer to just behind the headers. */
 305                sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
 306                m->m_len += len;
 307
 308                /*
 309                 * If we're sending everything we've got, set PUSH.
 310                 * (This will keep happy those implementations which only
 311                 * give data to the user when a buffer fills or
 312                 * a PUSH comes in.)
 313                 */
 314                if (off + len == so->so_snd.sb_cc)
 315                        flags |= TH_PUSH;
 316        } else {
                    /* No payload: same mbuf setup as above, headers only. */
 317                m = m_get(so->slirp);
 318                if (m == NULL) {
 319                        error = 1;
 320                        goto out;
 321                }
 322                m->m_data += IF_MAXLINKHDR;
 323                m->m_len = hdrlen;
 324        }
 325
            /* The combined IP/TCP overlay header sits at the start of the mbuf data. */
 326        ti = mtod(m, struct tcpiphdr *);
 327
 328        memcpy((caddr_t)ti, &tp->t_template, sizeof (struct tcpiphdr));
 329
 330        /*
 331         * Fill in fields, remembering maximum advertised
 332         * window for use in delaying messages about window sizes.
 333         * If resending a FIN, be sure not to use a new sequence number.
 334         */
 335        if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 336            tp->snd_nxt == tp->snd_max)
 337                tp->snd_nxt--;
 338        /*
 339         * If we are doing retransmissions, then snd_nxt will
 340         * not reflect the first unsent octet.  For ACK only
 341         * packets, we do not want the sequence number of the
 342         * retransmitted packet, we want the sequence number
 343         * of the next unsent octet.  So, if there is no data
 344         * (and no SYN or FIN), use snd_max instead of snd_nxt
 345         * when filling in ti_seq.  But if we are in persist
 346         * state, snd_max might reflect one byte beyond the
 347         * right edge of the window, so use snd_nxt in that
 348         * case, since we know we aren't doing a retransmission.
 349         * (retransmit and persist are mutually exclusive...)
 350         */
 351        if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
 352                ti->ti_seq = htonl(tp->snd_nxt);
 353        else
 354                ti->ti_seq = htonl(tp->snd_max);
 355        ti->ti_ack = htonl(tp->rcv_nxt);
 356        if (optlen) {
                    /* Options go directly behind the fixed header; ti_off is the TCP header size including options, divided by 4. */
 357                memcpy((caddr_t)(ti + 1), (caddr_t)opt, optlen);
 358                ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
 359        }
 360        ti->ti_flags = flags;
 361        /*
 362         * Calculate receive window.  Don't shrink window,
 363         * but avoid silly window syndrome.
 364         */
 365        if (win < (long)(so->so_rcv.sb_datalen / 4) && win < (long)tp->t_maxseg)
 366                win = 0;
 367        if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 368                win = (long)TCP_MAXWIN << tp->rcv_scale;
            /* Never advertise less than what was already promised (rcv_adv - rcv_nxt). */
 369        if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
 370                win = (long)(tp->rcv_adv - tp->rcv_nxt);
 371        ti->ti_win = htons((uint16_t) (win>>tp->rcv_scale));
 372
 373        if (SEQ_GT(tp->snd_up, tp->snd_una)) {
                    /* Urgent pointer is sent relative to the sequence number of this segment. */
 374                ti->ti_urp = htons((uint16_t)(tp->snd_up - ntohl(ti->ti_seq)));
 375                ti->ti_flags |= TH_URG;
 376        } else
 377                /*
 378                 * If no urgent pointer to send, then we pull
 379                 * the urgent pointer to the left edge of the send window
 380                 * so that it doesn't drift into the send window on sequence
 381                 * number wraparound.
 382                 */
 383                tp->snd_up = tp->snd_una;               /* drag it along */
 384
 385        /*
 386         * Put TCP length in extended header, and then
 387         * checksum extended header and data.
 388         */
            /* With neither payload nor options, ti_len keeps the value copied from the template. */
 389        if (len + optlen)
 390                ti->ti_len = htons((uint16_t)(sizeof (struct tcphdr) +
 391                    optlen + len));
 392        ti->ti_sum = cksum(m, (int)(hdrlen + len));
 393
 394        /*
 395         * In transmit state, time the transmission and arrange for
 396         * the retransmit.  In persist state, just set snd_max.
 397         */
 398        if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
 399                tcp_seq startseq = tp->snd_nxt;
 400
 401                /*
 402                 * Advance snd_nxt over sequence space of this segment.
 403                 */
 404                if (flags & (TH_SYN|TH_FIN)) {
 405                        if (flags & TH_SYN)
 406                                tp->snd_nxt++;
 407                        if (flags & TH_FIN) {
 408                                tp->snd_nxt++;
 409                                tp->t_flags |= TF_SENTFIN;
 410                        }
 411                }
 412                tp->snd_nxt += len;
 413                if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 414                        tp->snd_max = tp->snd_nxt;
 415                        /*
 416                         * Time this transmission if not a retransmission and
 417                         * not currently timing anything.
 418                         */
 419                        if (tp->t_rtt == 0) {
 420                                tp->t_rtt = 1;
 421                                tp->t_rtseq = startseq;
 422                        }
 423                }
 424
 425                /*
 426                 * Set retransmit timer if not currently set,
 427                 * and not doing an ack or a keep-alive probe.
 428                 * Initial value for retransmit timer is smoothed
 429                 * round-trip time + 2 * round-trip time variance.
 430                 * Initialize shift counter which is used for backoff
 431                 * of retransmit time.
 432                 */
 433                if (tp->t_timer[TCPT_REXMT] == 0 &&
 434                    tp->snd_nxt != tp->snd_una) {
 435                        tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
 436                        if (tp->t_timer[TCPT_PERSIST]) {
 437                                tp->t_timer[TCPT_PERSIST] = 0;
 438                                tp->t_rxtshift = 0;
 439                        }
 440                }
 441        } else
                    /* Forced persist probe: snd_nxt stays put, only snd_max may move forward. */
 442                if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
 443                        tp->snd_max = tp->snd_nxt + len;
 444
 445        /*
 446         * Fill in IP length and desired time to live and
 447         * send to IP level.  There should be a better way
 448         * to handle ttl and tos; we could keep them in
 449         * the template, but need a way to checksum without them.
 450         */
 451        m->m_len = hdrlen + len; /* XXX Needed? m_len should be correct */
            /*
             * Keep a copy of the overlay header: the address/protocol
             * fields are read from this copy while the IP header is
             * written below (presumably because the IP header occupies
             * bytes of the overlay -- confirm against struct tcpiphdr).
             */
 452        tcpiph_save = *mtod(m, struct tcpiphdr *);
 453
 454        switch (so->so_ffamily) {
 455        case AF_INET:
                /* Move the start of the data forward so that an IPv4 header ends right where the TCP header begins. */
 456            m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr)
 457                                                 - sizeof(struct ip);
 458            m->m_len  -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr)
 459                                                 - sizeof(struct ip);
 460            ip = mtod(m, struct ip *);
 461
                /* NOTE(review): ip_len is stored without byte-swapping; presumably ip_output() converts it -- confirm. */
 462            ip->ip_len = m->m_len;
 463            ip->ip_dst = tcpiph_save.ti_dst;
 464            ip->ip_src = tcpiph_save.ti_src;
 465            ip->ip_p = tcpiph_save.ti_pr;
 466
 467            ip->ip_ttl = IPDEFTTL;
 468            ip->ip_tos = so->so_iptos;
 469            error = ip_output(so, m);
 470            break;
 471
 472        case AF_INET6:
                /* Same adjustment as for IPv4, with the size of the IPv6 header. */
 473            m->m_data += sizeof(struct tcpiphdr) - sizeof(struct tcphdr)
 474                                                 - sizeof(struct ip6);
 475            m->m_len  -= sizeof(struct tcpiphdr) - sizeof(struct tcphdr)
 476                                                 - sizeof(struct ip6);
 477            ip6 = mtod(m, struct ip6 *);
 478
                /* Payload length is taken from ti_len as saved above (set with htons() when there is payload or options). */
 479            ip6->ip_pl = tcpiph_save.ti_len;
 480            ip6->ip_dst = tcpiph_save.ti_dst6;
 481            ip6->ip_src = tcpiph_save.ti_src6;
 482            ip6->ip_nh = tcpiph_save.ti_nh6;
 483
 484            error = ip6_output(so, m, 0);
 485            break;
 486
 487        default:
 488            g_assert_not_reached();
 489        }
 490
            /*
             * The "out" label sits inside the conditional: it is also the
             * target of the m_get() failure paths above, which arrive here
             * with error already set to 1.
             */
 491        if (error) {
 492out:
 493                return (error);
 494        }
 495
 496        /*
 497         * Data sent (as far as we can tell).
 498         * If this advertises a larger window than any other segment,
 499         * then remember the size of the advertised window.
 500         * Any pending ACK has now been sent.
 501         */
 502        if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
 503                tp->rcv_adv = tp->rcv_nxt + win;
 504        tp->last_ack_sent = tp->rcv_nxt;
 505        tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
            /* The data was truncated to one segment above: go round again for the remainder. */
 506        if (sendalot)
 507                goto again;
 508
 509        return (0);
 510}
 511
 512void
 513tcp_setpersist(struct tcpcb *tp)
 514{
 515    int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
 516
 517        /*
 518         * Start/restart persistence timer.
 519         */
 520        TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
 521            t * tcp_backoff[tp->t_rxtshift],
 522            TCPTV_PERSMIN, TCPTV_PERSMAX);
 523        if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 524                tp->t_rxtshift++;
 525}
 526