linux/net/ipv4/tcp_input.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  11 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  14 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  15 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  16 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  17 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18 *              Jorge Cwik, <jorge@laser.satlink.net>
  19 */
  20
  21/*
  22 * Changes:
  23 *              Pedro Roque     :       Fast Retransmit/Recovery.
  24 *                                      Two receive queues.
  25 *                                      Retransmit queue handled by TCP.
  26 *                                      Better retransmit timer handling.
  27 *                                      New congestion avoidance.
  28 *                                      Header prediction.
  29 *                                      Variable renaming.
  30 *
  31 *              Eric            :       Fast Retransmit.
  32 *              Randy Scott     :       MSS option defines.
  33 *              Eric Schenk     :       Fixes to slow start algorithm.
  34 *              Eric Schenk     :       Yet another double ACK bug.
  35 *              Eric Schenk     :       Delayed ACK bug fixes.
  36 *              Eric Schenk     :       Floyd style fast retrans war avoidance.
  37 *              David S. Miller :       Don't allow zero congestion window.
  38 *              Eric Schenk     :       Fix retransmitter so that it sends
  39 *                                      next packet on ack of previous packet.
  40 *              Andi Kleen      :       Moved open_request checking here
  41 *                                      and process RSTs for open_requests.
  42 *              Andi Kleen      :       Better prune_queue, and other fixes.
  43 *              Andrey Savochkin:       Fix RTT measurements in the presence of
  44 *                                      timestamps.
  45 *              Andrey Savochkin:       Check sequence numbers correctly when
  46 *                                      removing SACKs due to in sequence incoming
  47 *                                      data segments.
  48 *              Andi Kleen:             Make sure we never ack data there is not
  49 *                                      enough room for. Also make this condition
  50 *                                      a fatal error if it might still happen.
  51 *              Andi Kleen:             Add tcp_measure_rcv_mss to make
  52 *                                      connections with MSS<min(MTU,ann. MSS)
  53 *                                      work without delayed acks.
  54 *              Andi Kleen:             Process packets with PSH set in the
  55 *                                      fast path.
  56 *              J Hadi Salim:           ECN support
  57 *              Andrei Gurtov,
  58 *              Pasi Sarolahti,
  59 *              Panu Kuhlberg:          Experimental audit of TCP (re)transmission
  60 *                                      engine. Lots of bugs were found.
  61 *              Pasi Sarolahti:         F-RTO for dealing with spurious RTOs
  62 */
  63
  64#define pr_fmt(fmt) "TCP: " fmt
  65
  66#include <linux/mm.h>
  67#include <linux/slab.h>
  68#include <linux/module.h>
  69#include <linux/sysctl.h>
  70#include <linux/kernel.h>
  71#include <net/dst.h>
  72#include <net/tcp.h>
  73#include <net/inet_common.h>
  74#include <linux/ipsec.h>
  75#include <asm/unaligned.h>
  76
  77int sysctl_tcp_timestamps __read_mostly = 1;
  78int sysctl_tcp_window_scaling __read_mostly = 1;
  79int sysctl_tcp_sack __read_mostly = 1;
  80int sysctl_tcp_fack __read_mostly = 1;
  81int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
  82EXPORT_SYMBOL(sysctl_tcp_reordering);
  83int sysctl_tcp_dsack __read_mostly = 1;
  84int sysctl_tcp_app_win __read_mostly = 31;
  85int sysctl_tcp_adv_win_scale __read_mostly = 1;
  86EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
  87
  88/* rfc5961 challenge ack rate limiting */
  89int sysctl_tcp_challenge_ack_limit = 1000;
  90
  91int sysctl_tcp_stdurg __read_mostly;
  92int sysctl_tcp_rfc1337 __read_mostly;
  93int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
  94int sysctl_tcp_frto __read_mostly = 2;
  95
  96int sysctl_tcp_thin_dupack __read_mostly;
  97
  98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
  99int sysctl_tcp_early_retrans __read_mostly = 3;
 100int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 101
 102#define FLAG_DATA               0x01 /* Incoming frame contained data.          */
 103#define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
 104#define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
 105#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.  */
 106#define FLAG_SYN_ACKED          0x10 /* This ACK acknowledged SYN.              */
 107#define FLAG_DATA_SACKED        0x20 /* New SACK.                               */
 108#define FLAG_ECE                0x40 /* ECE in this ACK                         */
 109#define FLAG_SLOWPATH           0x100 /* Do not skip RFC checks for window update.*/
 110#define FLAG_ORIG_SACK_ACKED    0x200 /* Never retransmitted data are (s)acked  */
 111#define FLAG_SND_UNA_ADVANCED   0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 112#define FLAG_DSACKING_ACK       0x800 /* SACK blocks contained D-SACK info */
 113#define FLAG_SACK_RENEGING      0x2000 /* snd_una advanced to a sacked seq */
 114#define FLAG_UPDATE_TS_RECENT   0x4000 /* tcp_replace_ts_recent() */
 115
 116#define FLAG_ACKED              (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 117#define FLAG_NOT_DUP            (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
 118#define FLAG_CA_ALERT           (FLAG_DATA_SACKED|FLAG_ECE)
 119#define FLAG_FORWARD_PROGRESS   (FLAG_ACKED|FLAG_DATA_SACKED)
 120
 121#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 122#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 123
 124static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
 125                             unsigned int len)
 126{
 127        static bool __once __read_mostly;
 128
 129        if (!__once) {
 130                struct net_device *dev;
 131
 132                __once = true;
 133
 134                rcu_read_lock();
 135                dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
 136                if (!dev || len >= dev->mtu)
 137                        pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
 138                                dev ? dev->name : "Unknown driver");
 139                rcu_read_unlock();
 140        }
 141}
 142
 143/* Adapt the MSS value used to make delayed ack decision to the
 144 * real world.
 145 */
 146static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 147{
 148        struct inet_connection_sock *icsk = inet_csk(sk);
 149        const unsigned int lss = icsk->icsk_ack.last_seg_size;
 150        unsigned int len;
 151
 152        icsk->icsk_ack.last_seg_size = 0;
 153
 154        /* skb->len may jitter because of SACKs, even if peer
 155         * sends good full-sized frames.
 156         */
 157        len = skb_shinfo(skb)->gso_size ? : skb->len;
 158        if (len >= icsk->icsk_ack.rcv_mss) {
 159                icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 160                                               tcp_sk(sk)->advmss);
 161                /* Account for possibly-removed options */
 162                if (unlikely(len > icsk->icsk_ack.rcv_mss +
 163                                   MAX_TCP_OPTION_SPACE))
 164                        tcp_gro_dev_warn(sk, skb, len);
 165        } else {
  166                /* Otherwise, we make a more careful check, taking into
  167                 * account that the size of a SACK block is variable.
  168                 *
  169                 * "len" is the invariant segment length, including the TCP header.
  170                 */
 171                len += skb->data - skb_transport_header(skb);
 172                if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
  173                    /* If PSH is not set, the packet should be
  174                     * full sized, provided the peer TCP is not badly broken.
  175                     * This observation (if it is correct 8)) allows us
  176                     * to handle super-low MTU links fairly.
  177                     */
 178                    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
 179                     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
  180                        /* Also subtract the invariant part (if the peer is RFC
  181                         * compliant): the TCP header plus the fixed timestamp
  182                         * option length. The resulting "len" is the MSS, free of SACK jitter.
  183                         */
 184                        len -= tcp_sk(sk)->tcp_header_len;
 185                        icsk->icsk_ack.last_seg_size = len;
 186                        if (len == lss) {
 187                                icsk->icsk_ack.rcv_mss = len;
 188                                return;
 189                        }
 190                }
 191                if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
 192                        icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
 193                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 194        }
 195}
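/*
 * Worked example (illustrative; values assumed, not taken from a trace):
 * suppose our advmss is 1460 and the peer sends full segments carrying
 * 1448 bytes of payload (e.g. because it uses timestamps). On the fast
 * path len = 1448 >= rcv_mss, so rcv_mss becomes min(1448, 1460) = 1448.
 * The GRO warning above fires only if len exceeds rcv_mss by more than
 * MAX_TCP_OPTION_SPACE (40 bytes), i.e. a single "segment" larger than
 * any real on-wire packet could have been.
 */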
 196
 197static void tcp_incr_quickack(struct sock *sk)
 198{
 199        struct inet_connection_sock *icsk = inet_csk(sk);
 200        unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 201
 202        if (quickacks == 0)
 203                quickacks = 2;
 204        if (quickacks > icsk->icsk_ack.quick)
 205                icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 206}
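/*
 * Worked example (illustrative, with assumed values): rcv_wnd = 65535 and
 * rcv_mss = 1460 give quickacks = 65535 / 2920 = 22, which is then capped
 * at TCP_MAX_QUICKACKS. A tiny window or a large rcv_mss can make the
 * division yield 0, hence the floor of 2 quick ACKs.
 */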
 207
 208static void tcp_enter_quickack_mode(struct sock *sk)
 209{
 210        struct inet_connection_sock *icsk = inet_csk(sk);
 211        tcp_incr_quickack(sk);
 212        icsk->icsk_ack.pingpong = 0;
 213        icsk->icsk_ack.ato = TCP_ATO_MIN;
 214}
 215
 216/* Send ACKs quickly, if "quick" count is not exhausted
 217 * and the session is not interactive.
 218 */
 219
 220static bool tcp_in_quickack_mode(struct sock *sk)
 221{
 222        const struct inet_connection_sock *icsk = inet_csk(sk);
 223        const struct dst_entry *dst = __sk_dst_get(sk);
 224
 225        return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
 226                (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
 227}
 228
 229static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
 230{
 231        if (tp->ecn_flags & TCP_ECN_OK)
 232                tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
 233}
 234
 235static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
 236{
 237        if (tcp_hdr(skb)->cwr)
 238                tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 239}
 240
 241static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
 242{
 243        tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 244}
 245
 246static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 247{
 248        if (!(tp->ecn_flags & TCP_ECN_OK))
 249                return;
 250
 251        switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 252        case INET_ECN_NOT_ECT:
 253                /* Funny extension: if ECT is not set on a segment,
 254                 * and we already seen ECT on a previous segment,
 255                 * it is probably a retransmit.
 256                 */
 257                if (tp->ecn_flags & TCP_ECN_SEEN)
 258                        tcp_enter_quickack_mode((struct sock *)tp);
 259                break;
 260        case INET_ECN_CE:
 261                if (tcp_ca_needs_ecn((struct sock *)tp))
 262                        tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
 263
 264                if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 265                        /* Better not delay acks, sender can have a very low cwnd */
 266                        tcp_enter_quickack_mode((struct sock *)tp);
 267                        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 268                }
 269                tp->ecn_flags |= TCP_ECN_SEEN;
 270                break;
 271        default:
 272                if (tcp_ca_needs_ecn((struct sock *)tp))
 273                        tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
 274                tp->ecn_flags |= TCP_ECN_SEEN;
 275                break;
 276        }
 277}
 278
 279static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
 280{
 281        if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
 282                tp->ecn_flags &= ~TCP_ECN_OK;
 283}
 284
 285static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
 286{
 287        if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
 288                tp->ecn_flags &= ~TCP_ECN_OK;
 289}
 290
 291static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
 292{
 293        if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
 294                return true;
 295        return false;
 296}
 297
 298/* Buffer size and advertised window tuning.
 299 *
 300 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 301 */
 302
 303static void tcp_sndbuf_expand(struct sock *sk)
 304{
 305        const struct tcp_sock *tp = tcp_sk(sk);
 306        int sndmem, per_mss;
 307        u32 nr_segs;
 308
 309        /* Worst case is non GSO/TSO : each frame consumes one skb
 310         * and skb->head is kmalloced using power of two area of memory
 311         */
 312        per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
 313                  MAX_TCP_HEADER +
 314                  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 315
 316        per_mss = roundup_pow_of_two(per_mss) +
 317                  SKB_DATA_ALIGN(sizeof(struct sk_buff));
 318
 319        nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
 320        nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
 321
 322        /* Fast Recovery (RFC 5681 3.2) :
 323         * Cubic needs 1.7 factor, rounded to 2 to include
 324         * extra cushion (application might react slowly to POLLOUT)
 325         */
 326        sndmem = 2 * nr_segs * per_mss;
 327
 328        if (sk->sk_sndbuf < sndmem)
 329                sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
 330}
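/*
 * Rough worked example (illustrative; the exact constants depend on the
 * architecture and config): with mss_cache = 1460, per_mss is 1460 plus
 * MAX_TCP_HEADER plus the aligned skb_shared_info, rounded up to a power
 * of two, plus an aligned struct sk_buff, i.e. a little over 2KB per
 * in-flight segment on common 64-bit builds. With the default
 * TCP_INIT_CWND of 10 and reordering of 3, nr_segs = 10, so the target
 * sndbuf is about 2 * 10 * per_mss, clamped by sysctl_tcp_wmem[2].
 */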
 331
 332/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 333 *
 334 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 335 * forward and advertised in receiver window (tp->rcv_wnd) and
 336 * "application buffer", required to isolate scheduling/application
 337 * latencies from network.
 338 * window_clamp is maximal advertised window. It can be less than
 339 * tcp_full_space(), in this case tcp_full_space() - window_clamp
  340 * is reserved for "application" buffer. The smaller window_clamp is,
  341 * the smoother our behaviour from the viewpoint of the network, but the lower
  342 * the throughput and the higher the sensitivity of the connection to losses. 8)
 343 *
  344 * rcv_ssthresh is a stricter window_clamp used during the "slow start"
 345 * phase to predict further behaviour of this connection.
 346 * It is used for two goals:
 347 * - to enforce header prediction at sender, even when application
 348 *   requires some significant "application buffer". It is check #1.
 349 * - to prevent pruning of receive queue because of misprediction
 350 *   of receiver window. Check #2.
 351 *
 352 * The scheme does not work when sender sends good segments opening
 353 * window and then starts to feed us spaghetti. But it should work
 354 * in common situations. Otherwise, we have to rely on queue collapsing.
 355 */
 356
 357/* Slow part of check#2. */
 358static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 359{
 360        struct tcp_sock *tp = tcp_sk(sk);
 361        /* Optimize this! */
 362        int truesize = tcp_win_from_space(skb->truesize) >> 1;
 363        int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
 364
 365        while (tp->rcv_ssthresh <= window) {
 366                if (truesize <= skb->len)
 367                        return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
 368
 369                truesize >>= 1;
 370                window >>= 1;
 371        }
 372        return 0;
 373}
 374
 375static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 376{
 377        struct tcp_sock *tp = tcp_sk(sk);
 378
 379        /* Check #1 */
 380        if (tp->rcv_ssthresh < tp->window_clamp &&
 381            (int)tp->rcv_ssthresh < tcp_space(sk) &&
 382            !tcp_under_memory_pressure(sk)) {
 383                int incr;
 384
 385                /* Check #2. Increase window, if skb with such overhead
  386                 * will fit into the rcvbuf in the future.
 387                 */
 388                if (tcp_win_from_space(skb->truesize) <= skb->len)
 389                        incr = 2 * tp->advmss;
 390                else
 391                        incr = __tcp_grow_window(sk, skb);
 392
 393                if (incr) {
 394                        incr = max_t(int, incr, 2 * skb->len);
 395                        tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
 396                                               tp->window_clamp);
 397                        inet_csk(sk)->icsk_ack.quick |= 1;
 398                }
 399        }
 400}
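/*
 * Illustrative numbers (assumed, with the default tcp_adv_win_scale of 1,
 * so tcp_win_from_space() returns half of the space): a well-behaved
 * full-sized skb with len = 1460 and truesize around 2304 satisfies
 * check #2 directly (2304 / 2 = 1152 <= 1460), so rcv_ssthresh may grow
 * by 2 * advmss. A bloated skb (say 500 bytes of data in a 4KB truesize)
 * fails it and falls back to __tcp_grow_window(), which only allows
 * growth while rcv_ssthresh is still small relative to tcp_rmem[2].
 */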
 401
 402/* 3. Tuning rcvbuf, when connection enters established state. */
 403static void tcp_fixup_rcvbuf(struct sock *sk)
 404{
 405        u32 mss = tcp_sk(sk)->advmss;
 406        int rcvmem;
 407
 408        rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
 409                 tcp_default_init_rwnd(mss);
 410
  411        /* Dynamic Right Sizing (DRS) has a 2 to 3 RTT latency.
  412         * Allow enough cushion so that the sender is not limited by our window.
 413         */
 414        if (sysctl_tcp_moderate_rcvbuf)
 415                rcvmem <<= 2;
 416
 417        if (sk->sk_rcvbuf < rcvmem)
 418                sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 419}
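/*
 * Worked example (illustrative; assumes advmss = 1460, for which
 * tcp_default_init_rwnd() returns roughly 2 * TCP_INIT_CWND segments):
 * rcvmem starts as 2 * SKB_TRUESIZE(1460 + MAX_TCP_HEADER) * ~20, and is
 * quadrupled when tcp_moderate_rcvbuf is on to cover DRS's 2 to 3 RTT
 * reaction time, giving an initial rcvbuf target of a few hundred KB,
 * clamped by sysctl_tcp_rmem[2].
 */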
 420
 421/* 4. Try to fixup all. It is made immediately after connection enters
 422 *    established state.
 423 */
 424void tcp_init_buffer_space(struct sock *sk)
 425{
 426        struct tcp_sock *tp = tcp_sk(sk);
 427        int maxwin;
 428
 429        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
 430                tcp_fixup_rcvbuf(sk);
 431        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
 432                tcp_sndbuf_expand(sk);
 433
 434        tp->rcvq_space.space = tp->rcv_wnd;
 435        tp->rcvq_space.time = tcp_time_stamp;
 436        tp->rcvq_space.seq = tp->copied_seq;
 437
 438        maxwin = tcp_full_space(sk);
 439
 440        if (tp->window_clamp >= maxwin) {
 441                tp->window_clamp = maxwin;
 442
 443                if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
 444                        tp->window_clamp = max(maxwin -
 445                                               (maxwin >> sysctl_tcp_app_win),
 446                                               4 * tp->advmss);
 447        }
 448
 449        /* Force reservation of one segment. */
 450        if (sysctl_tcp_app_win &&
 451            tp->window_clamp > 2 * tp->advmss &&
 452            tp->window_clamp + tp->advmss > maxwin)
 453                tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
 454
 455        tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
 456        tp->snd_cwnd_stamp = tcp_time_stamp;
 457}
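/*
 * Note on the app_win arithmetic above (illustrative): with the default
 * sysctl_tcp_app_win of 31, maxwin >> 31 is essentially zero, so when
 * window_clamp starts at or above maxwin the first clamp just pins it to
 * maxwin. It is the "force reservation of one segment" rule that actually
 * bites, keeping at least one advmss of tcp_full_space() out of the
 * advertised window.
 */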
 458
 459/* 5. Recalculate window clamp after socket hit its memory bounds. */
 460static void tcp_clamp_window(struct sock *sk)
 461{
 462        struct tcp_sock *tp = tcp_sk(sk);
 463        struct inet_connection_sock *icsk = inet_csk(sk);
 464
 465        icsk->icsk_ack.quick = 0;
 466
 467        if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 468            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 469            !tcp_under_memory_pressure(sk) &&
 470            sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 471                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 472                                    sysctl_tcp_rmem[2]);
 473        }
 474        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 475                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
 476}
 477
 478/* Initialize RCV_MSS value.
  479 * RCV_MSS is our guess about the MSS used by the peer.
  480 * We don't have any direct information about the MSS.
  481 * It's better to underestimate the RCV_MSS rather than overestimate it.
  482 * Overestimation makes us ACK less frequently than needed.
  483 * Underestimation is easier to detect and fix by tcp_measure_rcv_mss().
 484 */
 485void tcp_initialize_rcv_mss(struct sock *sk)
 486{
 487        const struct tcp_sock *tp = tcp_sk(sk);
 488        unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
 489
 490        hint = min(hint, tp->rcv_wnd / 2);
 491        hint = min(hint, TCP_MSS_DEFAULT);
 492        hint = max(hint, TCP_MIN_MSS);
 493
 494        inet_csk(sk)->icsk_ack.rcv_mss = hint;
 495}
 496EXPORT_SYMBOL(tcp_initialize_rcv_mss);
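/*
 * Worked example (illustrative): with advmss = mss_cache = 1460 and an
 * initial rcv_wnd of 29200, hint = min(1460, 14600) = 1460, which the
 * TCP_MSS_DEFAULT clamp then lowers to 536. The initial guess is thus
 * deliberately conservative; tcp_measure_rcv_mss() raises it as soon as
 * real full-sized segments arrive.
 */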
 497
 498/* Receiver "autotuning" code.
 499 *
 500 * The algorithm for RTT estimation w/o timestamps is based on
 501 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 502 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 503 *
 504 * More detail on this code can be found at
 505 * <http://staff.psc.edu/jheffner/>,
 506 * though this reference is out of date.  A new paper
 507 * is pending.
 508 */
 509static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 510{
 511        u32 new_sample = tp->rcv_rtt_est.rtt;
 512        long m = sample;
 513
 514        if (m == 0)
 515                m = 1;
 516
 517        if (new_sample != 0) {
 518                /* If we sample in larger samples in the non-timestamp
 519                 * case, we could grossly overestimate the RTT especially
 520                 * with chatty applications or bulk transfer apps which
 521                 * are stalled on filesystem I/O.
 522                 *
 523                 * Also, since we are only going for a minimum in the
 524                 * non-timestamp case, we do not smooth things out
 525                 * else with timestamps disabled convergence takes too
 526                 * long.
 527                 */
 528                if (!win_dep) {
 529                        m -= (new_sample >> 3);
 530                        new_sample += m;
 531                } else {
 532                        m <<= 3;
 533                        if (m < new_sample)
 534                                new_sample = m;
 535                }
 536        } else {
 537                /* No previous measure. */
 538                new_sample = m << 3;
 539        }
 540
 541        if (tp->rcv_rtt_est.rtt != new_sample)
 542                tp->rcv_rtt_est.rtt = new_sample;
 543}
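/*
 * Worked example of the two filters (illustrative numbers; the stored
 * rcv_rtt_est.rtt is the RTT left-shifted by 3): say the current value is
 * 800 (i.e. 100 ticks) and a new sample of 60 ticks arrives.
 *   - win_dep (no timestamps): 60 << 3 = 480 < 800, so the estimate drops
 *     straight to 480, a running minimum.
 *   - !win_dep (timestamps):   m = 60 - (800 >> 3) = -40, so the estimate
 *     becomes 760, i.e. the usual 7/8 old + 1/8 new EWMA.
 */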
 544
 545static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 546{
 547        if (tp->rcv_rtt_est.time == 0)
 548                goto new_measure;
 549        if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
 550                return;
 551        tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
 552
 553new_measure:
 554        tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
 555        tp->rcv_rtt_est.time = tcp_time_stamp;
 556}
 557
 558static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 559                                          const struct sk_buff *skb)
 560{
 561        struct tcp_sock *tp = tcp_sk(sk);
 562        if (tp->rx_opt.rcv_tsecr &&
 563            (TCP_SKB_CB(skb)->end_seq -
 564             TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
 565                tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
 566}
 567
 568/*
 569 * This function should be called every time data is copied to user space.
 570 * It calculates the appropriate TCP receive buffer space.
 571 */
 572void tcp_rcv_space_adjust(struct sock *sk)
 573{
 574        struct tcp_sock *tp = tcp_sk(sk);
 575        int time;
 576        int copied;
 577
 578        time = tcp_time_stamp - tp->rcvq_space.time;
 579        if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 580                return;
 581
 582        /* Number of bytes copied to user in last RTT */
 583        copied = tp->copied_seq - tp->rcvq_space.seq;
 584        if (copied <= tp->rcvq_space.space)
 585                goto new_measure;
 586
 587        /* A bit of theory :
 588         * copied = bytes received in previous RTT, our base window
 589         * To cope with packet losses, we need a 2x factor
  590         * To cope with slow start, and the sender growing its cwnd by 100 %
 591         * every RTT, we need a 4x factor, because the ACK we are sending
 592         * now is for the next RTT, not the current one :
 593         * <prev RTT . ><current RTT .. ><next RTT .... >
 594         */
 595
 596        if (sysctl_tcp_moderate_rcvbuf &&
 597            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
 598                int rcvwin, rcvmem, rcvbuf;
 599
 600                /* minimal window to cope with packet losses, assuming
 601                 * steady state. Add some cushion because of small variations.
 602                 */
 603                rcvwin = (copied << 1) + 16 * tp->advmss;
 604
 605                /* If rate increased by 25%,
 606                 *      assume slow start, rcvwin = 3 * copied
 607                 * If rate increased by 50%,
 608                 *      assume sender can use 2x growth, rcvwin = 4 * copied
 609                 */
 610                if (copied >=
 611                    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
 612                        if (copied >=
 613                            tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
 614                                rcvwin <<= 1;
 615                        else
 616                                rcvwin += (rcvwin >> 1);
 617                }
 618
 619                rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 620                while (tcp_win_from_space(rcvmem) < tp->advmss)
 621                        rcvmem += 128;
 622
 623                rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
 624                if (rcvbuf > sk->sk_rcvbuf) {
 625                        sk->sk_rcvbuf = rcvbuf;
 626
 627                        /* Make the window clamp follow along.  */
 628                        tp->window_clamp = rcvwin;
 629                }
 630        }
 631        tp->rcvq_space.space = copied;
 632
 633new_measure:
 634        tp->rcvq_space.seq = tp->copied_seq;
 635        tp->rcvq_space.time = tcp_time_stamp;
 636}
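/*
 * Worked example of the sizing above (illustrative; assumes advmss = 1460
 * and tcp_moderate_rcvbuf enabled): if 100000 bytes were copied to the
 * application in the last RTT, the base window is 2 * 100000 + 16 * 1460,
 * about 223KB. If that is at least 50% more than the previous
 * rcvq_space.space, the window is doubled (roughly 4 * copied) to keep up
 * with slow-start growth. rcvbuf is then that window expressed in
 * truesize units (rcvwin / advmss * rcvmem) and clamped by tcp_rmem[2].
 */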
 637
 638/* There is something which you must keep in mind when you analyze the
 639 * behavior of the tp->ato delayed ack timeout interval.  When a
 640 * connection starts up, we want to ack as quickly as possible.  The
  641 * problem is that "good" TCPs do slow start at the beginning of data
  642 * transmission.  This means that until we send the first few ACKs the
 643 * sender will sit on his end and only queue most of his data, because
 644 * he can only send snd_cwnd unacked packets at any given time.  For
 645 * each ACK we send, he increments snd_cwnd and transmits more of his
 646 * queue.  -DaveM
 647 */
 648static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 649{
 650        struct tcp_sock *tp = tcp_sk(sk);
 651        struct inet_connection_sock *icsk = inet_csk(sk);
 652        u32 now;
 653
 654        inet_csk_schedule_ack(sk);
 655
 656        tcp_measure_rcv_mss(sk, skb);
 657
 658        tcp_rcv_rtt_measure(tp);
 659
 660        now = tcp_time_stamp;
 661
 662        if (!icsk->icsk_ack.ato) {
 663                /* The _first_ data packet received, initialize
 664                 * delayed ACK engine.
 665                 */
 666                tcp_incr_quickack(sk);
 667                icsk->icsk_ack.ato = TCP_ATO_MIN;
 668        } else {
 669                int m = now - icsk->icsk_ack.lrcvtime;
 670
 671                if (m <= TCP_ATO_MIN / 2) {
 672                        /* The fastest case is the first. */
 673                        icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
 674                } else if (m < icsk->icsk_ack.ato) {
 675                        icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
 676                        if (icsk->icsk_ack.ato > icsk->icsk_rto)
 677                                icsk->icsk_ack.ato = icsk->icsk_rto;
 678                } else if (m > icsk->icsk_rto) {
  679                        /* Too long a gap. Apparently the sender failed to
  680                         * restart the window, so send ACKs quickly.
 681                         */
 682                        tcp_incr_quickack(sk);
 683                        sk_mem_reclaim(sk);
 684                }
 685        }
 686        icsk->icsk_ack.lrcvtime = now;
 687
 688        TCP_ECN_check_ce(tp, skb);
 689
 690        if (skb->len >= 128)
 691                tcp_grow_window(sk, skb);
 692}
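/*
 * Illustrative trace of the ato adaptation above (assumed numbers, with
 * TCP_ATO_MIN = HZ/25, i.e. 40ms at HZ=1000): data arriving every 10ms
 * keeps ato pinned near the floor (40/2 + 20 = 40ms); arrivals every 30ms
 * nudge it up (40/2 + 30 = 50ms, bounded by icsk_rto); a silence longer
 * than the RTO re-enters quickack mode so the sender's window restart is
 * ACKed promptly.
 */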
 693
 694/* Called to compute a smoothed rtt estimate. The data fed to this
 695 * routine either comes from timestamps, or from segments that were
 696 * known _not_ to have been retransmitted [see Karn/Partridge
 697 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 698 * piece by Van Jacobson.
 699 * NOTE: the next three routines used to be one big routine.
 700 * To save cycles in the RFC 1323 implementation it was better to break
 701 * it up into three procedures. -- erics
 702 */
 703static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 704{
 705        struct tcp_sock *tp = tcp_sk(sk);
 706        long m = mrtt_us; /* RTT */
 707        u32 srtt = tp->srtt_us;
 708
 709        /*      The following amusing code comes from Jacobson's
 710         *      article in SIGCOMM '88.  Note that rtt and mdev
 711         *      are scaled versions of rtt and mean deviation.
  712         *      This is designed to be as fast as possible.
 713         *      m stands for "measurement".
 714         *
  715         *      In a 1990 paper the RTO value is changed to:
 716         *      RTO = rtt + 4 * mdev
 717         *
 718         * Funny. This algorithm seems to be very broken.
  719         * These formulae increase RTO when it should be decreased, increase
  720         * it too slowly when it should be increased quickly, decrease it too
  721         * quickly, etc. I guess in BSD RTO takes ONE value, so it absolutely
  722         * does not matter how you _calculate_ it. Seems it was a trap
  723         * that VJ failed to avoid. 8)
 724         */
 725        if (srtt != 0) {
 726                m -= (srtt >> 3);       /* m is now error in rtt est */
 727                srtt += m;              /* rtt = 7/8 rtt + 1/8 new */
 728                if (m < 0) {
 729                        m = -m;         /* m is now abs(error) */
 730                        m -= (tp->mdev_us >> 2);   /* similar update on mdev */
 731                        /* This is similar to one of Eifel findings.
 732                         * Eifel blocks mdev updates when rtt decreases.
 733                         * This solution is a bit different: we use finer gain
 734                         * for mdev in this case (alpha*beta).
 735                         * Like Eifel it also prevents growth of rto,
 736                         * but also it limits too fast rto decreases,
 737                         * happening in pure Eifel.
 738                         */
 739                        if (m > 0)
 740                                m >>= 3;
 741                } else {
 742                        m -= (tp->mdev_us >> 2);   /* similar update on mdev */
 743                }
 744                tp->mdev_us += m;               /* mdev = 3/4 mdev + 1/4 new */
 745                if (tp->mdev_us > tp->mdev_max_us) {
 746                        tp->mdev_max_us = tp->mdev_us;
 747                        if (tp->mdev_max_us > tp->rttvar_us)
 748                                tp->rttvar_us = tp->mdev_max_us;
 749                }
 750                if (after(tp->snd_una, tp->rtt_seq)) {
 751                        if (tp->mdev_max_us < tp->rttvar_us)
 752                                tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
 753                        tp->rtt_seq = tp->snd_nxt;
 754                        tp->mdev_max_us = tcp_rto_min_us(sk);
 755                }
 756        } else {
 757                /* no previous measure. */
 758                srtt = m << 3;          /* take the measured time to be rtt */
 759                tp->mdev_us = m << 1;   /* make sure rto = 3*rtt */
 760                tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
 761                tp->mdev_max_us = tp->rttvar_us;
 762                tp->rtt_seq = tp->snd_nxt;
 763        }
 764        tp->srtt_us = max(1U, srtt);
 765}
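/*
 * Worked example of the scaled arithmetic (illustrative samples; srtt_us
 * is stored << 3 and mdev_us << 2):
 *   1st sample m = 100000us: srtt_us = 800000, mdev_us = 200000, so
 *   srtt = 100ms, mdev = 50ms and rto = srtt + 4*mdev = 300ms = 3*rtt.
 *   2nd sample m = 120000us: the error is +20000, srtt_us rises to 820000
 *   (srtt = 102.5ms = 7/8*100 + 1/8*120) and mdev_us moves to
 *   200000 + (20000 - 50000) = 170000 (mdev = 42.5ms = 3/4*50 + 1/4*20).
 */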
 766
 767/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
 768 * Note: TCP stack does not yet implement pacing.
 769 * FQ packet scheduler can be used to implement cheap but effective
 770 * TCP pacing, to smooth the burst on large writes when packets
 771 * in flight is significantly lower than cwnd (or rwin)
 772 */
 773static void tcp_update_pacing_rate(struct sock *sk)
 774{
 775        const struct tcp_sock *tp = tcp_sk(sk);
 776        u64 rate;
 777
 778        /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
 779        rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
 780
 781        rate *= max(tp->snd_cwnd, tp->packets_out);
 782
 783        if (likely(tp->srtt_us))
 784                do_div(rate, tp->srtt_us);
 785
 786        /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
  787         * without any lock. We want to make sure the compiler won't store
 788         * intermediate values in this location.
 789         */
 790        ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
 791                                                sk->sk_max_pacing_rate);
 792}
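/*
 * Worked example (illustrative): since srtt_us is already scaled by 8,
 * the (USEC_PER_SEC << 3) factor cancels the shift and the result is
 * simply 2 * mss * cwnd / srtt. With mss_cache = 1460, snd_cwnd = 10 and
 * an srtt of 100ms, steady-state throughput is 14600 bytes per 100ms,
 * i.e. 146 kB/s, so sk_pacing_rate is set to about 292 kB/s (~2.3 Mbit/s),
 * bounded by sk_max_pacing_rate.
 */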
 793
 794/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 795 * routine referred to above.
 796 */
 797static void tcp_set_rto(struct sock *sk)
 798{
 799        const struct tcp_sock *tp = tcp_sk(sk);
 800        /* Old crap is replaced with new one. 8)
 801         *
 802         * More seriously:
  803         * 1. If the rtt variance happened to be less than 50 msec, it is a hallucination.
  804         *    It cannot be less, due to the utterly erratic ACK generation made
  805         *    at least by Solaris and FreeBSD. "Erratic ACKs" have _nothing_
 806         *    to do with delayed acks, because at cwnd>2 true delack timeout
 807         *    is invisible. Actually, Linux-2.4 also generates erratic
 808         *    ACKs in some circumstances.
 809         */
 810        inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
 811
 812        /* 2. Fixups made earlier cannot be right.
 813         *    If we do not estimate RTO correctly without them,
  814          *    all the algo is pure shit and should be replaced
  815          *    with a correct one. That is exactly what we pretend to do.
 816         */
 817
 818        /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
 819         * guarantees that rto is higher.
 820         */
 821        tcp_bound_rto(sk);
 822}
 823
 824__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 825{
 826        __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 827
 828        if (!cwnd)
 829                cwnd = TCP_INIT_CWND;
 830        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 831}
 832
 833/*
 834 * Packet counting of FACK is based on in-order assumptions, therefore TCP
 835 * disables it when reordering is detected
 836 */
 837void tcp_disable_fack(struct tcp_sock *tp)
 838{
 839        /* RFC3517 uses different metric in lost marker => reset on change */
 840        if (tcp_is_fack(tp))
 841                tp->lost_skb_hint = NULL;
 842        tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
 843}
 844
  845/* Take note that the peer is sending D-SACKs */
 846static void tcp_dsack_seen(struct tcp_sock *tp)
 847{
 848        tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 849}
 850
 851static void tcp_update_reordering(struct sock *sk, const int metric,
 852                                  const int ts)
 853{
 854        struct tcp_sock *tp = tcp_sk(sk);
 855        if (metric > tp->reordering) {
 856                int mib_idx;
 857
 858                tp->reordering = min(TCP_MAX_REORDERING, metric);
 859
  860                /* This exciting event is worth remembering. 8) */
 861                if (ts)
 862                        mib_idx = LINUX_MIB_TCPTSREORDER;
 863                else if (tcp_is_reno(tp))
 864                        mib_idx = LINUX_MIB_TCPRENOREORDER;
 865                else if (tcp_is_fack(tp))
 866                        mib_idx = LINUX_MIB_TCPFACKREORDER;
 867                else
 868                        mib_idx = LINUX_MIB_TCPSACKREORDER;
 869
 870                NET_INC_STATS_BH(sock_net(sk), mib_idx);
 871#if FASTRETRANS_DEBUG > 1
 872                pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
 873                         tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
 874                         tp->reordering,
 875                         tp->fackets_out,
 876                         tp->sacked_out,
 877                         tp->undo_marker ? tp->undo_retrans : 0);
 878#endif
 879                tcp_disable_fack(tp);
 880        }
 881
 882        if (metric > 0)
 883                tcp_disable_early_retrans(tp);
 884}
 885
 886/* This must be called before lost_out is incremented */
 887static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 888{
 889        if ((tp->retransmit_skb_hint == NULL) ||
 890            before(TCP_SKB_CB(skb)->seq,
 891                   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 892                tp->retransmit_skb_hint = skb;
 893
 894        if (!tp->lost_out ||
 895            after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
 896                tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
 897}
 898
 899static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 900{
 901        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 902                tcp_verify_retransmit_hint(tp, skb);
 903
 904                tp->lost_out += tcp_skb_pcount(skb);
 905                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 906        }
 907}
 908
 909static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
 910                                            struct sk_buff *skb)
 911{
 912        tcp_verify_retransmit_hint(tp, skb);
 913
 914        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 915                tp->lost_out += tcp_skb_pcount(skb);
 916                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 917        }
 918}
 919
 920/* This procedure tags the retransmission queue when SACKs arrive.
 921 *
 922 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 923 * Packets in queue with these bits set are counted in variables
 924 * sacked_out, retrans_out and lost_out, correspondingly.
 925 *
 926 * Valid combinations are:
 927 * Tag  InFlight        Description
 928 * 0    1               - orig segment is in flight.
 929 * S    0               - nothing flies, orig reached receiver.
 930 * L    0               - nothing flies, orig lost by net.
 931 * R    2               - both orig and retransmit are in flight.
 932 * L|R  1               - orig is lost, retransmit is in flight.
 933 * S|R  1               - orig reached receiver, retrans is still in flight.
 934 * (L|S|R is logically valid, it could occur when L|R is sacked,
  935 *  but it is equivalent to plain S and code short-circuits it to S.
 936 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 937 *
 938 * These 6 states form finite state machine, controlled by the following events:
 939 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 940 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 941 * 3. Loss detection event of two flavors:
 942 *      A. Scoreboard estimator decided the packet is lost.
 943 *         A'. Reno "three dupacks" marks head of queue lost.
 944 *         A''. Its FACK modification, head until snd.fack is lost.
 945 *      B. SACK arrives sacking SND.NXT at the moment, when the
 946 *         segment was retransmitted.
 947 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 948 *
 949 * It is pleasant to note, that state diagram turns out to be commutative,
 950 * so that we are allowed not to be bothered by order of our actions,
 951 * when multiple events arrive simultaneously. (see the function below).
 952 *
 953 * Reordering detection.
 954 * --------------------
 955 * Reordering metric is maximal distance, which a packet can be displaced
 956 * in packet stream. With SACKs we can estimate it:
 957 *
 958 * 1. SACK fills old hole and the corresponding segment was not
 959 *    ever retransmitted -> reordering. Alas, we cannot use it
 960 *    when segment was retransmitted.
 961 * 2. The last flaw is solved with D-SACK. D-SACK arrives
  962 *    for a retransmitted and already SACKed segment -> reordering.
 963 * Both of these heuristics are not used in Loss state, when we cannot
 964 * account for retransmits accurately.
 965 *
 966 * SACK block validation.
 967 * ----------------------
 968 *
  969 * SACK block range validation checks that the received SACK block fits within
 970 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 971 * Note that SND.UNA is not included to the range though being valid because
 972 * it means that the receiver is rather inconsistent with itself reporting
  973 * SACK reneging when it should advance SND.UNA. Such a SACK block is,
  974 * however, perfectly valid in light of RFC 2018, which explicitly states
  975 * that "SACK block MUST reflect the newest segment.  Even if the newest
  976 * segment is going to be discarded ...", not that it looks very clever
  977 * in case of the head skb. Due to potential receiver-driven attacks, we
  978 * choose to avoid immediate execution of a walk of the write queue due to
  979 * reneging and defer the head skb's loss recovery to the standard loss
  980 * recovery procedure that will eventually trigger (nothing forbids us from doing this).
 981 *
  982 * Also implements protection against start_seq wrap-around. The problem lies in the
 983 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 984 * there's no guarantee that it will be before snd_nxt (n). The problem
 985 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 986 * wrap (s_w):
 987 *
 988 *         <- outs wnd ->                          <- wrapzone ->
 989 *         u     e      n                         u_w   e_w  s n_w
 990 *         |     |      |                          |     |   |  |
 991 * |<------------+------+----- TCP seqno space --------------+---------->|
 992 * ...-- <2^31 ->|                                           |<--------...
 993 * ...---- >2^31 ------>|                                    |<--------...
 994 *
 995 * Current code wouldn't be vulnerable but it's better still to discard such
 996 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 997 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 998 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 999 * equal to the ideal case (infinite seqno space without wrap caused issues).
1000 *
1001 * With D-SACK the lower bound is extended to cover sequence space below
1002 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 1003 * again, the D-SACK block must not go across snd_una (for the same reason as
 1004 * for the normal SACK blocks, explained above). But there all simplicity
 1005 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 1006 * fully below undo_marker they do not affect behavior in any way and can
1007 * therefore be safely ignored. In rare cases (which are more or less
1008 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
1009 * fragmentation and packet reordering past skb's retransmission. To consider
1010 * them correctly, the acceptable range must be extended even more though
1011 * the exact amount is rather hard to quantify. However, tp->max_window can
1012 * be used as an exaggerated estimate.
1013 */
1014static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1015                                   u32 start_seq, u32 end_seq)
1016{
1017        /* Too far in future, or reversed (interpretation is ambiguous) */
1018        if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
1019                return false;
1020
1021        /* Nasty start_seq wrap-around check (see comments above) */
1022        if (!before(start_seq, tp->snd_nxt))
1023                return false;
1024
1025        /* In outstanding window? ...This is valid exit for D-SACKs too.
1026         * start_seq == snd_una is non-sensical (see comments above)
1027         */
1028        if (after(start_seq, tp->snd_una))
1029                return true;
1030
1031        if (!is_dsack || !tp->undo_marker)
1032                return false;
1033
1034        /* ...Then it's D-SACK, and must reside below snd_una completely */
1035        if (after(end_seq, tp->snd_una))
1036                return false;
1037
1038        if (!before(start_seq, tp->undo_marker))
1039                return true;
1040
1041        /* Too old */
1042        if (!after(end_seq, tp->undo_marker))
1043                return false;
1044
1045        /* Undo_marker boundary crossing (overestimates a lot). Known already:
1046         *   start_seq < undo_marker and end_seq >= undo_marker.
1047         */
1048        return !before(start_seq, end_seq - tp->max_window);
1049}
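/*
 * Illustrative walk-through (made-up sequence numbers): with
 * snd_una = 1000, snd_nxt = 5000 and undo_marker = 600:
 *   - a SACK block [2000,3000) starts after snd_una and ends before
 *     snd_nxt, so it is accepted at the "in outstanding window" check;
 *   - a block [6000,7000) ends after snd_nxt and is rejected outright;
 *   - a D-SACK [700,900) lies wholly below snd_una but at or above
 *     undo_marker, so it is accepted via the D-SACK path.
 */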
1050
1051/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
1052 * Event "B". Later note: FACK people cheated me again 8), we have to account
1053 * for reordering! Ugly, but should help.
1054 *
1055 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
1056 * less than what is now known to be received by the other end (derived from
1057 * highest SACK block). Also calculate the lowest snd_nxt among the remaining
1058 * retransmitted skbs to avoid some costly processing per ACKs.
1059 */
1060static void tcp_mark_lost_retrans(struct sock *sk)
1061{
1062        const struct inet_connection_sock *icsk = inet_csk(sk);
1063        struct tcp_sock *tp = tcp_sk(sk);
1064        struct sk_buff *skb;
1065        int cnt = 0;
1066        u32 new_low_seq = tp->snd_nxt;
1067        u32 received_upto = tcp_highest_sack_seq(tp);
1068
1069        if (!tcp_is_fack(tp) || !tp->retrans_out ||
1070            !after(received_upto, tp->lost_retrans_low) ||
1071            icsk->icsk_ca_state != TCP_CA_Recovery)
1072                return;
1073
1074        tcp_for_write_queue(skb, sk) {
1075                u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
1076
1077                if (skb == tcp_send_head(sk))
1078                        break;
1079                if (cnt == tp->retrans_out)
1080                        break;
1081                if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1082                        continue;
1083
1084                if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
1085                        continue;
1086
1087                /* TODO: We would like to get rid of tcp_is_fack(tp) only
1088                 * constraint here (see above) but figuring out that at
1089                 * least tp->reordering SACK blocks reside between ack_seq
 1090                 * and received_upto is not an easy task to do cheaply with
1091                 * the available datastructures.
1092                 *
1093                 * Whether FACK should check here for tp->reordering segs
1094                 * in-between one could argue for either way (it would be
1095                 * rather simple to implement as we could count fack_count
1096                 * during the walk and do tp->fackets_out - fack_count).
1097                 */
1098                if (after(received_upto, ack_seq)) {
1099                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1100                        tp->retrans_out -= tcp_skb_pcount(skb);
1101
1102                        tcp_skb_mark_lost_uncond_verify(tp, skb);
1103                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
1104                } else {
1105                        if (before(ack_seq, new_low_seq))
1106                                new_low_seq = ack_seq;
1107                        cnt += tcp_skb_pcount(skb);
1108                }
1109        }
1110
1111        if (tp->retrans_out)
1112                tp->lost_retrans_low = new_low_seq;
1113}
1114
1115static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1116                            struct tcp_sack_block_wire *sp, int num_sacks,
1117                            u32 prior_snd_una)
1118{
1119        struct tcp_sock *tp = tcp_sk(sk);
1120        u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1121        u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1122        bool dup_sack = false;
1123
1124        if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1125                dup_sack = true;
1126                tcp_dsack_seen(tp);
1127                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1128        } else if (num_sacks > 1) {
1129                u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
1130                u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
1131
1132                if (!after(end_seq_0, end_seq_1) &&
1133                    !before(start_seq_0, start_seq_1)) {
1134                        dup_sack = true;
1135                        tcp_dsack_seen(tp);
1136                        NET_INC_STATS_BH(sock_net(sk),
1137                                        LINUX_MIB_TCPDSACKOFORECV);
1138                }
1139        }
1140
1141        /* D-SACK for already forgotten data... Do dumb counting. */
1142        if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1143            !after(end_seq_0, prior_snd_una) &&
1144            after(end_seq_0, tp->undo_marker))
1145                tp->undo_retrans--;
1146
1147        return dup_sack;
1148}
1149
1150struct tcp_sacktag_state {
1151        int     reord;
1152        int     fack_count;
1153        long    rtt_us; /* RTT measured by SACKing never-retransmitted data */
1154        int     flag;
1155};
1156
 1157/* Check if the skb is fully within the SACK block. In the presence of GSO
 1158 * skbs, the incoming SACK may not match exactly, but we can find a smaller
 1159 * MSS-aligned portion of it that matches. Therefore we might need to fragment,
 1160 * which may fail and create some hassle (the caller must handle the error
 1161 * cases it returns).
1162 *
1163 * FIXME: this could be merged to shift decision code
1164 */
1165static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1166                                  u32 start_seq, u32 end_seq)
1167{
1168        int err;
1169        bool in_sack;
1170        unsigned int pkt_len;
1171        unsigned int mss;
1172
1173        in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1174                  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1175
1176        if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1177            after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1178                mss = tcp_skb_mss(skb);
1179                in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1180
1181                if (!in_sack) {
1182                        pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1183                        if (pkt_len < mss)
1184                                pkt_len = mss;
1185                } else {
1186                        pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1187                        if (pkt_len < mss)
1188                                return -EINVAL;
1189                }
1190
1191                /* Round if necessary so that SACKs cover only full MSSes
1192                 * and/or the remaining small portion (if present)
1193                 */
1194                if (pkt_len > mss) {
1195                        unsigned int new_len = (pkt_len / mss) * mss;
1196                        if (!in_sack && new_len < pkt_len)
1197                                new_len += mss;
1198                        pkt_len = new_len;
1199                }
1200
1201                if (pkt_len >= skb->len && !in_sack)
1202                        return 0;
1203
1204                err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, pkt_len, mss);
1205                if (err < 0)
1206                        return err;
1207        }
1208
1209        return in_sack;
1210}
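/*
 * Example of the rounding above (illustrative, round numbers): a GSO skb
 * covers seq 0..4000 with mss = 1000 and a SACK block arrives for
 * [2500,4000). The skb is not fully inside the block, so pkt_len starts
 * as the un-SACKed head, 2500 bytes; rounding bumps it to 3000 so the
 * split falls on an MSS boundary, and tcp_fragment() leaves a 1000-byte
 * tail that is entirely covered by the SACK block.
 */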
1211
1212/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
1213static u8 tcp_sacktag_one(struct sock *sk,
1214                          struct tcp_sacktag_state *state, u8 sacked,
1215                          u32 start_seq, u32 end_seq,
1216                          int dup_sack, int pcount,
1217                          const struct skb_mstamp *xmit_time)
1218{
1219        struct tcp_sock *tp = tcp_sk(sk);
1220        int fack_count = state->fack_count;
1221
1222        /* Account D-SACK for retransmitted packet. */
1223        if (dup_sack && (sacked & TCPCB_RETRANS)) {
1224                if (tp->undo_marker && tp->undo_retrans > 0 &&
1225                    after(end_seq, tp->undo_marker))
1226                        tp->undo_retrans--;
1227                if (sacked & TCPCB_SACKED_ACKED)
1228                        state->reord = min(fack_count, state->reord);
1229        }
1230
1231        /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1232        if (!after(end_seq, tp->snd_una))
1233                return sacked;
1234
1235        if (!(sacked & TCPCB_SACKED_ACKED)) {
1236                if (sacked & TCPCB_SACKED_RETRANS) {
1237                        /* If the segment is not tagged as lost,
1238                         * we do not clear RETRANS, believing
1239                         * that retransmission is still in flight.
1240                         */
1241                        if (sacked & TCPCB_LOST) {
1242                                sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1243                                tp->lost_out -= pcount;
1244                                tp->retrans_out -= pcount;
1245                        }
1246                } else {
1247                        if (!(sacked & TCPCB_RETRANS)) {
 1248                                /* New SACK for a frame that was not retransmitted
 1249                                 * and was in a hole. This is reordering.
 1250                                 */
1251                                if (before(start_seq,
1252                                           tcp_highest_sack_seq(tp)))
1253                                        state->reord = min(fack_count,
1254                                                           state->reord);
1255                                if (!after(end_seq, tp->high_seq))
1256                                        state->flag |= FLAG_ORIG_SACK_ACKED;
1257                                /* Pick the earliest sequence sacked for RTT */
1258                                if (state->rtt_us < 0) {
1259                                        struct skb_mstamp now;
1260
1261                                        skb_mstamp_get(&now);
1262                                        state->rtt_us = skb_mstamp_us_delta(&now,
1263                                                                xmit_time);
1264                                }
1265                        }
1266
1267                        if (sacked & TCPCB_LOST) {
1268                                sacked &= ~TCPCB_LOST;
1269                                tp->lost_out -= pcount;
1270                        }
1271                }
1272
1273                sacked |= TCPCB_SACKED_ACKED;
1274                state->flag |= FLAG_DATA_SACKED;
1275                tp->sacked_out += pcount;
1276
1277                fack_count += pcount;
1278
1279                /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1280                if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1281                    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1282                        tp->lost_cnt_hint += pcount;
1283
1284                if (fack_count > tp->fackets_out)
1285                        tp->fackets_out = fack_count;
1286        }
1287
1288        /* D-SACK. We can detect redundant retransmission in S|R and plain R
1289         * frames and clear it. undo_retrans is decreased above, L|R frames
1290         * are accounted above as well.
1291         */
1292        if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1293                sacked &= ~TCPCB_SACKED_RETRANS;
1294                tp->retrans_out -= pcount;
1295        }
1296
1297        return sacked;
1298}
1299
1300/* Shift newly-SACKed bytes from this skb to the immediately previous
1301 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1302 */
1303static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
1304                            struct sk_buff *skb,
1305                            struct tcp_sacktag_state *state,
1306                            unsigned int pcount, int shifted, int mss,
1307                            bool dup_sack)
1308{
1309        struct tcp_sock *tp = tcp_sk(sk);
1310        u32 start_seq = TCP_SKB_CB(skb)->seq;   /* start of newly-SACKed */
1311        u32 end_seq = start_seq + shifted;      /* end of newly-SACKed */
1312
1313        BUG_ON(!pcount);
1314
1315        /* Adjust counters and hints for the newly sacked sequence
1316         * range but discard the return value since prev is already
1317         * marked. We must tag the range first because the seq
1318         * advancement below implicitly advances
1319         * tcp_highest_sack_seq() when skb is highest_sack.
1320         */
1321        tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1322                        start_seq, end_seq, dup_sack, pcount,
1323                        &skb->skb_mstamp);
1324
1325        if (skb == tp->lost_skb_hint)
1326                tp->lost_cnt_hint += pcount;
1327
1328        TCP_SKB_CB(prev)->end_seq += shifted;
1329        TCP_SKB_CB(skb)->seq += shifted;
1330
1331        skb_shinfo(prev)->gso_segs += pcount;
1332        WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
1333        skb_shinfo(skb)->gso_segs -= pcount;
1334
1335        /* When we're adding to gso_segs == 1, gso_size will be zero.
1336         * In theory this shouldn't be necessary, but as long as DSACK
1337         * code can come after this skb later on, it's better to keep
1338         * setting gso_size to something.
1339         */
1340        if (!skb_shinfo(prev)->gso_size) {
1341                skb_shinfo(prev)->gso_size = mss;
1342                skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1343        }
1344
1345        /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1346        if (skb_shinfo(skb)->gso_segs <= 1) {
1347                skb_shinfo(skb)->gso_size = 0;
1348                skb_shinfo(skb)->gso_type = 0;
1349        }
1350
1351        /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1352        TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1353
1354        if (skb->len > 0) {
1355                BUG_ON(!tcp_skb_pcount(skb));
1356                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1357                return false;
1358        }
1359
1360        /* Whole SKB was eaten :-) */
1361
1362        if (skb == tp->retransmit_skb_hint)
1363                tp->retransmit_skb_hint = prev;
1364        if (skb == tp->lost_skb_hint) {
1365                tp->lost_skb_hint = prev;
1366                tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1367        }
1368
1369        TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1370        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1371                TCP_SKB_CB(prev)->end_seq++;
1372
1373        if (skb == tcp_highest_sack(sk))
1374                tcp_advance_highest_sack(sk, skb);
1375
1376        tcp_unlink_write_queue(skb, sk);
1377        sk_wmem_free_skb(sk, skb);
1378
1379        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1380
1381        return true;
1382}
1383
1384/* I wish gso_size had a somewhat saner initialization than
1385 * something-or-zero, which complicates things.
1386 */
1387static int tcp_skb_seglen(const struct sk_buff *skb)
1388{
1389        return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1390}
1391
1392/* Shifting pages past head area doesn't work */
1393static int skb_can_shift(const struct sk_buff *skb)
1394{
1395        return !skb_headlen(skb) && skb_is_nonlinear(skb);
1396}
1397
1398int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
1399                  int pcount, int shiftlen)
1400{
1401        /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
1402         * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
1403         * to make sure not storing more than 65535 * 8 bytes per skb,
1404         * even if current MSS is bigger.
1405         */
1406        if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
1407                return 0;
1408        if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
1409                return 0;
1410        return skb_shift(to, from, shiftlen);
1411}
1412
1413/* Try collapsing SACK blocks spanning across multiple skbs to a single
1414 * skb.
1415 */
1416static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1417                                          struct tcp_sacktag_state *state,
1418                                          u32 start_seq, u32 end_seq,
1419                                          bool dup_sack)
1420{
1421        struct tcp_sock *tp = tcp_sk(sk);
1422        struct sk_buff *prev;
1423        int mss;
1424        int pcount = 0;
1425        int len;
1426        int in_sack;
1427
1428        if (!sk_can_gso(sk))
1429                goto fallback;
1430
1431        /* Normally R (retransmitted) without L (lost) won't result in plain S (sacked) */
1432        if (!dup_sack &&
1433            (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1434                goto fallback;
1435        if (!skb_can_shift(skb))
1436                goto fallback;
1437        /* This frame is about to be dropped (was ACKed). */
1438        if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1439                goto fallback;
1440
1441        /* Can only happen with delayed DSACK + discard craziness */
1442        if (unlikely(skb == tcp_write_queue_head(sk)))
1443                goto fallback;
1444        prev = tcp_write_queue_prev(sk, skb);
1445
1446        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1447                goto fallback;
1448
1449        in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1450                  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1451
1452        if (in_sack) {
1453                len = skb->len;
1454                pcount = tcp_skb_pcount(skb);
1455                mss = tcp_skb_seglen(skb);
1456
1457                /* TODO: Fix DSACKs to not fragment already SACKed and we can
1458                 * drop this restriction as unnecessary
1459                 */
1460                if (mss != tcp_skb_seglen(prev))
1461                        goto fallback;
1462        } else {
1463                if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1464                        goto noop;
1465                /* CHECKME: Is this the non-MSS split case only? Note this
1466                 * will cause skipped skbs due to the advancing loop; the
1467                 * original code has that behaviour too.
1468                 */
1469                if (tcp_skb_pcount(skb) <= 1)
1470                        goto noop;
1471
1472                in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1473                if (!in_sack) {
1474                        /* TODO: head merge to next could be attempted here
1475                         * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1476                         * though it might not be worth of the additional hassle
1477                         *
1478                         * ...we can probably just fallback to what was done
1479                         * previously. We could try merging non-SACKed ones
1480                         * as well, but it probably isn't going to buy anything
1481                         * because later SACKs might again split them, and
1482                         * it would make skb timestamp tracking a considerably
1483                         * harder problem.
1484                         */
1485                        goto fallback;
1486                }
1487
1488                len = end_seq - TCP_SKB_CB(skb)->seq;
1489                BUG_ON(len < 0);
1490                BUG_ON(len > skb->len);
1491
1492                /* MSS boundaries should be honoured or else pcount will
1493                 * severely break, even though it makes things a bit trickier.
1494                 * Optimize common case to avoid most of the divides
1495                 */
1496                mss = tcp_skb_mss(skb);
1497
1498                /* TODO: Fix DSACKs to not fragment already SACKed and we can
1499                 * drop this restriction as unnecessary
1500                 */
1501                if (mss != tcp_skb_seglen(prev))
1502                        goto fallback;
1503
1504                if (len == mss) {
1505                        pcount = 1;
1506                } else if (len < mss) {
1507                        goto noop;
1508                } else {
1509                        pcount = len / mss;
1510                        len = pcount * mss;
1511                }
1512        }
1513
1514        /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
1515        if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1516                goto fallback;
1517
1518        if (!tcp_skb_shift(prev, skb, pcount, len))
1519                goto fallback;
1520        if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
1521                goto out;
1522
1523        /* Filling the hole allows collapsing with the next skb as well; this is
1524         * very useful when a hole-on-every-nth-skb pattern happens
1525         */
1526        if (prev == tcp_write_queue_tail(sk))
1527                goto out;
1528        skb = tcp_write_queue_next(sk, prev);
1529
1530        if (!skb_can_shift(skb) ||
1531            (skb == tcp_send_head(sk)) ||
1532            ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1533            (mss != tcp_skb_seglen(skb)))
1534                goto out;
1535
1536        len = skb->len;
1537        if (tcp_skb_shift(prev, skb, tcp_skb_pcount(skb), len)) {
1538                pcount += tcp_skb_pcount(skb);
1539                tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
1540                                len, mss, 0);
1541        }
1542out:
1543        state->fack_count += pcount;
1544        return prev;
1545
1546noop:
1547        return skb;
1548
1549fallback:
1550        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1551        return NULL;
1552}
1553
1554static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1555                                        struct tcp_sack_block *next_dup,
1556                                        struct tcp_sacktag_state *state,
1557                                        u32 start_seq, u32 end_seq,
1558                                        bool dup_sack_in)
1559{
1560        struct tcp_sock *tp = tcp_sk(sk);
1561        struct sk_buff *tmp;
1562
1563        tcp_for_write_queue_from(skb, sk) {
1564                int in_sack = 0;
1565                bool dup_sack = dup_sack_in;
1566
1567                if (skb == tcp_send_head(sk))
1568                        break;
1569
1570                /* queue is in-order => we can short-circuit the walk early */
1571                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1572                        break;
1573
1574                if ((next_dup != NULL) &&
1575                    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
1576                        in_sack = tcp_match_skb_to_sack(sk, skb,
1577                                                        next_dup->start_seq,
1578                                                        next_dup->end_seq);
1579                        if (in_sack > 0)
1580                                dup_sack = true;
1581                }
1582
1583                /* skb reference here is a bit tricky to get right, since
1584                 * shifting can eat and free both this skb and the next,
1585                 * so not even the _safe variant of the loop is enough.
1586                 */
1587                if (in_sack <= 0) {
1588                        tmp = tcp_shift_skb_data(sk, skb, state,
1589                                                 start_seq, end_seq, dup_sack);
1590                        if (tmp != NULL) {
1591                                if (tmp != skb) {
1592                                        skb = tmp;
1593                                        continue;
1594                                }
1595
1596                                in_sack = 0;
1597                        } else {
1598                                in_sack = tcp_match_skb_to_sack(sk, skb,
1599                                                                start_seq,
1600                                                                end_seq);
1601                        }
1602                }
1603
1604                if (unlikely(in_sack < 0))
1605                        break;
1606
1607                if (in_sack) {
1608                        TCP_SKB_CB(skb)->sacked =
1609                                tcp_sacktag_one(sk,
1610                                                state,
1611                                                TCP_SKB_CB(skb)->sacked,
1612                                                TCP_SKB_CB(skb)->seq,
1613                                                TCP_SKB_CB(skb)->end_seq,
1614                                                dup_sack,
1615                                                tcp_skb_pcount(skb),
1616                                                &skb->skb_mstamp);
1617
1618                        if (!before(TCP_SKB_CB(skb)->seq,
1619                                    tcp_highest_sack_seq(tp)))
1620                                tcp_advance_highest_sack(sk, skb);
1621                }
1622
1623                state->fack_count += tcp_skb_pcount(skb);
1624        }
1625        return skb;
1626}
1627
1628/* Avoid all the extra work that sacktag does while walking the queue in
1629 * the normal way
1630 */
1631static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1632                                        struct tcp_sacktag_state *state,
1633                                        u32 skip_to_seq)
1634{
1635        tcp_for_write_queue_from(skb, sk) {
1636                if (skb == tcp_send_head(sk))
1637                        break;
1638
1639                if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1640                        break;
1641
1642                state->fack_count += tcp_skb_pcount(skb);
1643        }
1644        return skb;
1645}
1646
1647static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1648                                                struct sock *sk,
1649                                                struct tcp_sack_block *next_dup,
1650                                                struct tcp_sacktag_state *state,
1651                                                u32 skip_to_seq)
1652{
1653        if (next_dup == NULL)
1654                return skb;
1655
1656        if (before(next_dup->start_seq, skip_to_seq)) {
1657                skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1658                skb = tcp_sacktag_walk(skb, sk, NULL, state,
1659                                       next_dup->start_seq, next_dup->end_seq,
1660                                       1);
1661        }
1662
1663        return skb;
1664}
1665
1666static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
1667{
1668        return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1669}
1670
1671static int
1672tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1673                        u32 prior_snd_una, long *sack_rtt_us)
1674{
1675        struct tcp_sock *tp = tcp_sk(sk);
1676        const unsigned char *ptr = (skb_transport_header(ack_skb) +
1677                                    TCP_SKB_CB(ack_skb)->sacked);
1678        struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1679        struct tcp_sack_block sp[TCP_NUM_SACKS];
1680        struct tcp_sack_block *cache;
1681        struct tcp_sacktag_state state;
1682        struct sk_buff *skb;
1683        int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1684        int used_sacks;
1685        bool found_dup_sack = false;
1686        int i, j;
1687        int first_sack_index;
1688
1689        state.flag = 0;
1690        state.reord = tp->packets_out;
1691        state.rtt_us = -1L;
1692
1693        if (!tp->sacked_out) {
1694                if (WARN_ON(tp->fackets_out))
1695                        tp->fackets_out = 0;
1696                tcp_highest_sack_reset(sk);
1697        }
1698
1699        found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1700                                         num_sacks, prior_snd_una);
1701        if (found_dup_sack)
1702                state.flag |= FLAG_DSACKING_ACK;
1703
1704        /* Eliminate too old ACKs, but take into
1705         * account more or less fresh ones, they can
1706         * contain valid SACK info.
1707         */
1708        if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
1709                return 0;
1710
1711        if (!tp->packets_out)
1712                goto out;
1713
1714        used_sacks = 0;
1715        first_sack_index = 0;
1716        for (i = 0; i < num_sacks; i++) {
1717                bool dup_sack = !i && found_dup_sack;
1718
1719                sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1720                sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
1721
1722                if (!tcp_is_sackblock_valid(tp, dup_sack,
1723                                            sp[used_sacks].start_seq,
1724                                            sp[used_sacks].end_seq)) {
1725                        int mib_idx;
1726
1727                        if (dup_sack) {
1728                                if (!tp->undo_marker)
1729                                        mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1730                                else
1731                                        mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1732                        } else {
1733                                /* Don't count olds caused by ACK reordering */
1734                                if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1735                                    !after(sp[used_sacks].end_seq, tp->snd_una))
1736                                        continue;
1737                                mib_idx = LINUX_MIB_TCPSACKDISCARD;
1738                        }
1739
1740                        NET_INC_STATS_BH(sock_net(sk), mib_idx);
1741                        if (i == 0)
1742                                first_sack_index = -1;
1743                        continue;
1744                }
1745
1746                /* Ignore very old stuff early */
1747                if (!after(sp[used_sacks].end_seq, prior_snd_una))
1748                        continue;
1749
1750                used_sacks++;
1751        }
1752
1753        /* order SACK blocks to allow in order walk of the retrans queue */
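            /* A simple O(n^2) sort suffices here: at most TCP_NUM_SACKS (4)
             * blocks can be present.
             */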
1754        for (i = used_sacks - 1; i > 0; i--) {
1755                for (j = 0; j < i; j++) {
1756                        if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1757                                swap(sp[j], sp[j + 1]);
1758
1759                                /* Track where the first SACK block goes to */
1760                                if (j == first_sack_index)
1761                                        first_sack_index = j + 1;
1762                        }
1763                }
1764        }
1765
1766        skb = tcp_write_queue_head(sk);
1767        state.fack_count = 0;
1768        i = 0;
1769
1770        if (!tp->sacked_out) {
1771                /* It's already past, so skip checking against it */
1772                cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1773        } else {
1774                cache = tp->recv_sack_cache;
1775                /* Skip empty blocks at the head of the cache */
1776                while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1777                       !cache->end_seq)
1778                        cache++;
1779        }
1780
1781        while (i < used_sacks) {
1782                u32 start_seq = sp[i].start_seq;
1783                u32 end_seq = sp[i].end_seq;
1784                bool dup_sack = (found_dup_sack && (i == first_sack_index));
1785                struct tcp_sack_block *next_dup = NULL;
1786
1787                if (found_dup_sack && ((i + 1) == first_sack_index))
1788                        next_dup = &sp[i + 1];
1789
1790                /* Skip too early cached blocks */
1791                while (tcp_sack_cache_ok(tp, cache) &&
1792                       !before(start_seq, cache->end_seq))
1793                        cache++;
1794
1795                /* Can we skip some work by looking at recv_sack_cache? */
1796                if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1797                    after(end_seq, cache->start_seq)) {
1798
1799                        /* Head todo? */
1800                        if (before(start_seq, cache->start_seq)) {
1801                                skb = tcp_sacktag_skip(skb, sk, &state,
1802                                                       start_seq);
1803                                skb = tcp_sacktag_walk(skb, sk, next_dup,
1804                                                       &state,
1805                                                       start_seq,
1806                                                       cache->start_seq,
1807                                                       dup_sack);
1808                        }
1809
1810                        /* Rest of the block already fully processed? */
1811                        if (!after(end_seq, cache->end_seq))
1812                                goto advance_sp;
1813
1814                        skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1815                                                       &state,
1816                                                       cache->end_seq);
1817
1818                        /* ...tail remains todo... */
1819                        if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1820                                /* ...but a better entrypoint exists! */
1821                                skb = tcp_highest_sack(sk);
1822                                if (skb == NULL)
1823                                        break;
1824                                state.fack_count = tp->fackets_out;
1825                                cache++;
1826                                goto walk;
1827                        }
1828
1829                        skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
1830                        /* Check overlap against next cached too (past this one already) */
1831                        cache++;
1832                        continue;
1833                }
1834
1835                if (!before(start_seq, tcp_highest_sack_seq(tp))) {
1836                        skb = tcp_highest_sack(sk);
1837                        if (skb == NULL)
1838                                break;
1839                        state.fack_count = tp->fackets_out;
1840                }
1841                skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
1842
1843walk:
1844                skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
1845                                       start_seq, end_seq, dup_sack);
1846
1847advance_sp:
1848                i++;
1849        }
1850
1851        /* Clear the head of the cache sack blocks so we can skip it next time */
1852        for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
1853                tp->recv_sack_cache[i].start_seq = 0;
1854                tp->recv_sack_cache[i].end_seq = 0;
1855        }
1856        for (j = 0; j < used_sacks; j++)
1857                tp->recv_sack_cache[i++] = sp[j];
1858
1859        tcp_mark_lost_retrans(sk);
1860
1861        tcp_verify_left_out(tp);
1862
1863        if ((state.reord < tp->fackets_out) &&
1864            ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1865                tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1866
1867out:
1868
1869#if FASTRETRANS_DEBUG > 0
1870        WARN_ON((int)tp->sacked_out < 0);
1871        WARN_ON((int)tp->lost_out < 0);
1872        WARN_ON((int)tp->retrans_out < 0);
1873        WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1874#endif
1875        *sack_rtt_us = state.rtt_us;
1876        return state.flag;
1877}
1878
1879/* Limits sacked_out so that sum with lost_out isn't ever larger than
1880 * packets_out. Returns false if sacked_out adjustment wasn't necessary.
1881 */
1882static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1883{
1884        u32 holes;
1885
1886        holes = max(tp->lost_out, 1U);
1887        holes = min(holes, tp->packets_out);
1888
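            /* e.g. packets_out = 10, lost_out = 2, sacked_out = 9:
             * holes = 2, so sacked_out is clamped to 8 below.
             */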
1889        if ((tp->sacked_out + holes) > tp->packets_out) {
1890                tp->sacked_out = tp->packets_out - holes;
1891                return true;
1892        }
1893        return false;
1894}
1895
1896/* If we receive more dupacks than we expected when counting segments
1897 * under the assumption of no reordering, interpret this as reordering.
1898 * The only other reason could be a bug in the receiver's TCP.
1899 */
1900static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1901{
1902        struct tcp_sock *tp = tcp_sk(sk);
1903        if (tcp_limit_reno_sacked(tp))
1904                tcp_update_reordering(sk, tp->packets_out + addend, 0);
1905}
1906
1907/* Emulate SACKs for SACKless connection: account for a new dupack. */
1908
1909static void tcp_add_reno_sack(struct sock *sk)
1910{
1911        struct tcp_sock *tp = tcp_sk(sk);
1912        tp->sacked_out++;
1913        tcp_check_reno_reordering(sk, 0);
1914        tcp_verify_left_out(tp);
1915}
1916
1917/* Account for ACK, ACKing some data in Reno Recovery phase. */
1918
1919static void tcp_remove_reno_sacks(struct sock *sk, int acked)
1920{
1921        struct tcp_sock *tp = tcp_sk(sk);
1922
1923        if (acked > 0) {
1924                /* One ACK acked hole. The rest eat duplicate ACKs. */
1925                if (acked - 1 >= tp->sacked_out)
1926                        tp->sacked_out = 0;
1927                else
1928                        tp->sacked_out -= acked - 1;
1929        }
1930        tcp_check_reno_reordering(sk, acked);
1931        tcp_verify_left_out(tp);
1932}
1933
1934static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1935{
1936        tp->sacked_out = 0;
1937}
1938
1939void tcp_clear_retrans(struct tcp_sock *tp)
1940{
1941        tp->retrans_out = 0;
1942        tp->lost_out = 0;
1943        tp->undo_marker = 0;
1944        tp->undo_retrans = -1;
1945        tp->fackets_out = 0;
1946        tp->sacked_out = 0;
1947}
1948
1949static inline void tcp_init_undo(struct tcp_sock *tp)
1950{
1951        tp->undo_marker = tp->snd_una;
1952        /* Retransmission still in flight may cause DSACKs later. */
1953        tp->undo_retrans = tp->retrans_out ? : -1;
1954}
1955
1956/* Enter Loss state. If we detect SACK reneging, forget all SACK information
1957 * and reset tags completely; otherwise preserve SACKs. If the receiver
1958 * dropped its ofo queue, we will know this due to reneging detection.
1959 */
1960void tcp_enter_loss(struct sock *sk)
1961{
1962        const struct inet_connection_sock *icsk = inet_csk(sk);
1963        struct tcp_sock *tp = tcp_sk(sk);
1964        struct sk_buff *skb;
1965        bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
1966        bool is_reneg;                  /* is receiver reneging on SACKs? */
1967
1968        /* Reduce ssthresh if it has not yet been made inside this window. */
1969        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1970            !after(tp->high_seq, tp->snd_una) ||
1971            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1972                tp->prior_ssthresh = tcp_current_ssthresh(sk);
1973                tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1974                tcp_ca_event(sk, CA_EVENT_LOSS);
1975                tcp_init_undo(tp);
1976        }
1977        tp->snd_cwnd       = 1;
1978        tp->snd_cwnd_cnt   = 0;
1979        tp->snd_cwnd_stamp = tcp_time_stamp;
1980
1981        tp->retrans_out = 0;
1982        tp->lost_out = 0;
1983
1984        if (tcp_is_reno(tp))
1985                tcp_reset_reno_sack(tp);
1986
1987        skb = tcp_write_queue_head(sk);
1988        is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1989        if (is_reneg) {
1990                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1991                tp->sacked_out = 0;
1992                tp->fackets_out = 0;
1993        }
1994        tcp_clear_all_retrans_hints(tp);
1995
1996        tcp_for_write_queue(skb, sk) {
1997                if (skb == tcp_send_head(sk))
1998                        break;
1999
2000                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
2001                if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
2002                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
2003                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
2004                        tp->lost_out += tcp_skb_pcount(skb);
2005                        tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
2006                }
2007        }
2008        tcp_verify_left_out(tp);
2009
2010        /* Timeout in disordered state after receiving substantial DUPACKs
2011         * suggests that the degree of reordering is over-estimated.
2012         */
2013        if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2014            tp->sacked_out >= sysctl_tcp_reordering)
2015                tp->reordering = min_t(unsigned int, tp->reordering,
2016                                       sysctl_tcp_reordering);
2017        tcp_set_ca_state(sk, TCP_CA_Loss);
2018        tp->high_seq = tp->snd_nxt;
2019        TCP_ECN_queue_cwr(tp);
2020
2021        /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
2022         * loss recovery is underway except recurring timeout(s) on
2023         * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2024         */
2025        tp->frto = sysctl_tcp_frto &&
2026                   (new_recovery || icsk->icsk_retransmits) &&
2027                   !inet_csk(sk)->icsk_mtup.probe_size;
2028}
2029
2030/* If ACK arrived pointing to a remembered SACK, it means that our
2031 * remembered SACKs do not reflect the real state of the receiver, i.e.
2032 * the receiver _host_ is heavily congested (or buggy).
2033 *
2034 * To avoid big spurious retransmission bursts due to transient SACK
2035 * scoreboard oddities that look like reneging, we give the receiver a
2036 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
2037 * restore sanity to the SACK scoreboard. If the apparent reneging
2038 * persists until this RTO then we'll clear the SACK scoreboard.
2039 */
2040static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2041{
2042        if (flag & FLAG_SACK_RENEGING) {
2043                struct tcp_sock *tp = tcp_sk(sk);
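                    /* srtt_us holds 8 * smoothed RTT in usecs, so >> 4 is RTT/2 */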
2044                unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
2045                                          msecs_to_jiffies(10));
2046
2047                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2048                                          delay, TCP_RTO_MAX);
2049                return true;
2050        }
2051        return false;
2052}
2053
2054static inline int tcp_fackets_out(const struct tcp_sock *tp)
2055{
2056        return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
2057}
2058
2059/* Heuristics to calculate the number of duplicate ACKs. There's no dupACK
2060 * counter when SACK is enabled (without SACK, sacked_out is used for
2061 * that purpose).
2062 *
2063 * Instead, with FACK TCP uses fackets_out that includes both SACKed
2064 * segments up to the highest received SACK block so far and holes in
2065 * between them.
2066 *
2067 * With reordering, holes may still be in flight, so RFC3517 recovery
2068 * uses pure sacked_out (total number of SACKed segments) even though
2069 * it violates the RFC, which uses duplicate ACKs. Often these are equal,
2070 * but when e.g. out-of-window ACKs or packet duplication occurs,
2071 * they differ. Since neither occurs due to loss, TCP should really
2072 * ignore them.
2073 */
2074static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2075{
2076        return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2077}
2078
2079static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2080{
2081        struct tcp_sock *tp = tcp_sk(sk);
2082        unsigned long delay;
2083
2084        /* Delay early retransmit and entering fast recovery for
2085         * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2086         * available, or RTO is scheduled to fire first.
2087         */
2088        if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2089            (flag & FLAG_ECE) || !tp->srtt_us)
2090                return false;
2091
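            /* srtt_us holds 8 * smoothed RTT in usecs, so >> 5 is RTT/4 */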
2092        delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2093                    msecs_to_jiffies(2));
2094
2095        if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2096                return false;
2097
2098        inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2099                                  TCP_RTO_MAX);
2100        return true;
2101}
2102
2103/* Linux NewReno/SACK/FACK/ECN state machine.
2104 * --------------------------------------
2105 *
2106 * "Open"       Normal state, no dubious events, fast path.
2107 * "Disorder"   In all the respects it is "Open",
2108 *              but requires a bit more attention. It is entered when
2109 *              we see some SACKs or dupacks. It is split off from "Open"
2110 *              mainly to move some processing from fast path to slow one.
2111 * "CWR"        CWND was reduced due to some Congestion Notification event.
2112 *              It can be ECN, ICMP source quench, local device congestion.
2113 * "Recovery"   CWND was reduced, we are fast-retransmitting.
2114 * "Loss"       CWND was reduced due to RTO timeout or SACK reneging.
2115 *
2116 * tcp_fastretrans_alert() is entered:
2117 * - each incoming ACK, if state is not "Open"
2118 * - when arrived ACK is unusual, namely:
2119 *      * SACK
2120 *      * Duplicate ACK.
2121 *      * ECN ECE.
2122 *
2123 * Counting packets in flight is pretty simple.
2124 *
2125 *      in_flight = packets_out - left_out + retrans_out
2126 *
2127 *      packets_out is SND.NXT-SND.UNA counted in packets.
2128 *
2129 *      retrans_out is number of retransmitted segments.
2130 *
2131 *      left_out is the number of segments that left the network but are not ACKed yet.
2132 *
2133 *              left_out = sacked_out + lost_out
2134 *
2135 *     sacked_out: Packets, which arrived to receiver out of order
2136 *                 and hence not ACKed. With SACKs this number is simply
2137 *                 amount of SACKed data. Even without SACKs
2138 *                 it is easy to give a pretty reliable estimate of this number,
2139 *                 counting duplicate ACKs.
2140 *
2141 *       lost_out: Packets lost by network. TCP has no explicit
2142 *                 "loss notification" feedback from network (for now).
2143 *                 It means that this number can be only _guessed_.
2144 *                 Actually, it is the heuristic used to predict loss that
2145 *                 distinguishes the different algorithms.
2146 *
2147 *      F.e. after RTO, when all the queue is considered as lost,
2148 *      lost_out = packets_out and in_flight = retrans_out.
2149 *
2150 *              Essentially, we have now two algorithms counting
2151 *              lost packets.
2152 *
2153 *              FACK: It is the simplest heuristics. As soon as we decided
2154 *              that something is lost, we decide that _all_ not SACKed
2155 *              packets until the most forward SACK are lost. I.e.
2156 *              lost_out = fackets_out - sacked_out and left_out = fackets_out.
2157 *              It is an absolutely correct estimate if the network does not reorder
2158 *              packets. And it loses any connection to reality when reordering
2159 *              takes place. We use FACK by default until reordering
2160 *              is suspected on the path to this destination.
2161 *
2162 *              NewReno: when Recovery is entered, we assume that one segment
2163 *              is lost (classic Reno). While we are in Recovery and
2164 *              a partial ACK arrives, we assume that one more packet
2165 *              is lost (NewReno). These heuristics are the same in NewReno
2166 *              and SACK.
2167 *
2168 *  Imagine, that's all! Forget about all this shamanism about CWND inflation
2169 *  deflation etc. CWND is real congestion window, never inflated, changes
2170 *  only according to classic VJ rules.
2171 *
2172 * Really tricky (and requiring careful tuning) part of algorithm
2173 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
2174 * The first determines the moment _when_ we should reduce CWND and,
2175 * hence, slow down forward transmission. In fact, it determines the moment
2176 * when we decide that a hole is caused by loss rather than by reordering.
2177 *
2178 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
2179 * holes, caused by lost packets.
2180 *
2181 * And the most logically complicated part of algorithm is undo
2182 * heuristics. We detect false retransmits due to both too early
2183 * fast retransmit (reordering) and underestimated RTO, analyzing
2184 * timestamps and D-SACKs. When we detect that some segments were
2185 * retransmitted by mistake and CWND reduction was wrong, we undo
2186 * window reduction and abort recovery phase. This logic is hidden
2187 * inside several functions named tcp_try_undo_<something>.
2188 */
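    /* Worked example of the accounting above (hypothetical numbers):
     * with packets_out = 10, sacked_out = 3, lost_out = 2, retrans_out = 1:
     *
     *      left_out  = sacked_out + lost_out                = 5
     *      in_flight = packets_out - left_out + retrans_out = 6
     *
     * i.e. six segments are estimated to still be in the network.
     */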
2189
2190/* This function decides when we should leave the Disorder state
2191 * and enter Recovery phase, reducing congestion window.
2192 *
2193 * Main question: may we further continue forward transmission
2194 * with the same cwnd?
2195 */
2196static bool tcp_time_to_recover(struct sock *sk, int flag)
2197{
2198        struct tcp_sock *tp = tcp_sk(sk);
2199        __u32 packets_out;
2200
2201        /* Trick#1: The loss is proven. */
2202        if (tp->lost_out)
2203                return true;
2204
2205        /* Not-A-Trick#2 : Classic rule... */
2206        if (tcp_dupack_heuristics(tp) > tp->reordering)
2207                return true;
2208
2209        /* Trick#4: It is still not OK... But will it be useful to delay
2210         * recovery more?
2211         */
2212        packets_out = tp->packets_out;
2213        if (packets_out <= tp->reordering &&
2214            tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
2215            !tcp_may_send_now(sk)) {
2216                /* We have nothing to send. This connection is limited
2217                 * either by receiver window or by application.
2218                 */
2219                return true;
2220        }
2221
2222        /* If a thin stream is detected, retransmit after the first
2223         * received dupack. Employ only if SACK is supported, in order
2224         * to avoid a possible corner-case series of spurious retransmissions.
2225         * Use only if there is no unsent data.
2226         */
2227        if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2228            tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2229            tcp_is_sack(tp) && !tcp_send_head(sk))
2230                return true;
2231
2232        /* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
2233         * retransmissions due to small network reorderings, we implement
2234         * Mitigation A.3 in the RFC and delay the retransmission for a short
2235         * interval if appropriate.
2236         */
2237        if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2238            (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2239            !tcp_may_send_now(sk))
2240                return !tcp_pause_early_retransmit(sk, flag);
2241
2242        return false;
2243}
2244
2245/* Detect loss in event "A" above by marking head of queue up as lost.
2246 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
2247 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2248 * has at least tp->reordering SACKed segments above it; "packets" refers to
2249 * the maximum SACKed segments to pass before reaching this limit.
2250 */
2251static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2252{
2253        struct tcp_sock *tp = tcp_sk(sk);
2254        struct sk_buff *skb;
2255        int cnt, oldcnt, lost;
2256        unsigned int mss;
2257        /* Use SACK to deduce losses of new sequences sent during recovery */
2258        const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
2259
2260        WARN_ON(packets > tp->packets_out);
2261        if (tp->lost_skb_hint) {
2262                skb = tp->lost_skb_hint;
2263                cnt = tp->lost_cnt_hint;
2264                /* Head already handled? */
2265                if (mark_head && skb != tcp_write_queue_head(sk))
2266                        return;
2267        } else {
2268                skb = tcp_write_queue_head(sk);
2269                cnt = 0;
2270        }
2271
2272        tcp_for_write_queue_from(skb, sk) {
2273                if (skb == tcp_send_head(sk))
2274                        break;
2275                /* TODO: do this better */
2276                /* this is not the most efficient way to do this... */
2277                tp->lost_skb_hint = skb;
2278                tp->lost_cnt_hint = cnt;
2279
2280                if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2281                        break;
2282
2283                oldcnt = cnt;
2284                if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
2285                    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2286                        cnt += tcp_skb_pcount(skb);
2287
2288                if (cnt > packets) {
2289                        if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2290                            (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2291                            (oldcnt >= packets))
2292                                break;
2293
2294                        mss = skb_shinfo(skb)->gso_size;
2295                        /* If needed, chop off the prefix to mark as lost. */
2296                        lost = (packets - oldcnt) * mss;
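                            /* e.g. packets = 3, oldcnt = 2, mss = 1448:
                             * only the first 1448 bytes of this skb are
                             * split off and marked lost.
                             */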
2297                        if (lost < skb->len &&
2298                            tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, lost, mss) < 0)
2299                                break;
2300                        cnt = packets;
2301                }
2302
2303                tcp_skb_mark_lost(tp, skb);
2304
2305                if (mark_head)
2306                        break;
2307        }
2308        tcp_verify_left_out(tp);
2309}
2310
2311/* Account newly detected lost packet(s) */
2312
2313static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2314{
2315        struct tcp_sock *tp = tcp_sk(sk);
2316
2317        if (tcp_is_reno(tp)) {
2318                tcp_mark_head_lost(sk, 1, 1);
2319        } else if (tcp_is_fack(tp)) {
2320                int lost = tp->fackets_out - tp->reordering;
2321                if (lost <= 0)
2322                        lost = 1;
2323                tcp_mark_head_lost(sk, lost, 0);
2324        } else {
2325                int sacked_upto = tp->sacked_out - tp->reordering;
2326                if (sacked_upto >= 0)
2327                        tcp_mark_head_lost(sk, sacked_upto, 0);
2328                else if (fast_rexmit)
2329                        tcp_mark_head_lost(sk, 1, 1);
2330        }
2331}
2332
2333/* CWND moderation, preventing bursts due to too big ACKs
2334 * in dubious situations.
2335 */
2336static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2337{
2338        tp->snd_cwnd = min(tp->snd_cwnd,
2339                           tcp_packets_in_flight(tp) + tcp_max_burst(tp));
2340        tp->snd_cwnd_stamp = tcp_time_stamp;
2341}
2342
2343/* Nothing was retransmitted, or the returned timestamp is less
2344 * than the timestamp of the first retransmission.
2345 */
2346static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2347{
2348        return !tp->retrans_stamp ||
2349                (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2350                 before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
2351}
2352
2353/* Undo procedures. */
2354
2355/* We can clear retrans_stamp when there are no retransmissions in the
2356 * window. It would seem that it is trivially available for us in
2357 * tp->retrans_out; however, that kind of assumption doesn't consider
2358 * what will happen if errors occur when sending a retransmission for the
2359 * second time. ...It could be that such a segment has only
2360 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
2361 * the head skb is enough except for some reneging corner cases that
2362 * are not worth the effort.
2363 *
2364 * Main reason for all this complexity is the fact that connection dying
2365 * time now depends on the validity of the retrans_stamp, in particular,
2366 * that successive retransmissions of a segment must not advance
2367 * retrans_stamp under any conditions.
2368 */
2369static bool tcp_any_retrans_done(const struct sock *sk)
2370{
2371        const struct tcp_sock *tp = tcp_sk(sk);
2372        struct sk_buff *skb;
2373
2374        if (tp->retrans_out)
2375                return true;
2376
2377        skb = tcp_write_queue_head(sk);
2378        if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2379                return true;
2380
2381        return false;
2382}
2383
2384#if FASTRETRANS_DEBUG > 1
2385static void DBGUNDO(struct sock *sk, const char *msg)
2386{
2387        struct tcp_sock *tp = tcp_sk(sk);
2388        struct inet_sock *inet = inet_sk(sk);
2389
2390        if (sk->sk_family == AF_INET) {
2391                pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2392                         msg,
2393                         &inet->inet_daddr, ntohs(inet->inet_dport),
2394                         tp->snd_cwnd, tcp_left_out(tp),
2395                         tp->snd_ssthresh, tp->prior_ssthresh,
2396                         tp->packets_out);
2397        }
2398#if IS_ENABLED(CONFIG_IPV6)
2399        else if (sk->sk_family == AF_INET6) {
2400                struct ipv6_pinfo *np = inet6_sk(sk);
2401                pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2402                         msg,
2403                         &np->daddr, ntohs(inet->inet_dport),
2404                         tp->snd_cwnd, tcp_left_out(tp),
2405                         tp->snd_ssthresh, tp->prior_ssthresh,
2406                         tp->packets_out);
2407        }
2408#endif
2409}
2410#else
2411#define DBGUNDO(x...) do { } while (0)
2412#endif
2413
2414static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2415{
2416        struct tcp_sock *tp = tcp_sk(sk);
2417
2418        if (unmark_loss) {
2419                struct sk_buff *skb;
2420
2421                tcp_for_write_queue(skb, sk) {
2422                        if (skb == tcp_send_head(sk))
2423                                break;
2424                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2425                }
2426                tp->lost_out = 0;
2427                tcp_clear_all_retrans_hints(tp);
2428        }
2429
2430        if (tp->prior_ssthresh) {
2431                const struct inet_connection_sock *icsk = inet_csk(sk);
2432
2433                if (icsk->icsk_ca_ops->undo_cwnd)
2434                        tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
2435                else
2436                        tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2437
2438                if (tp->prior_ssthresh > tp->snd_ssthresh) {
2439                        tp->snd_ssthresh = tp->prior_ssthresh;
2440                        TCP_ECN_withdraw_cwr(tp);
2441                }
2442        } else {
2443                tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
2444        }
2445        tp->snd_cwnd_stamp = tcp_time_stamp;
2446        tp->undo_marker = 0;
2447}
2448
2449static inline bool tcp_may_undo(const struct tcp_sock *tp)
2450{
2451        return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2452}
2453
2454/* People celebrate: "We love our President!" */
2455static bool tcp_try_undo_recovery(struct sock *sk)
2456{
2457        struct tcp_sock *tp = tcp_sk(sk);
2458
2459        if (tcp_may_undo(tp)) {
2460                int mib_idx;
2461
2462                /* Happy end! We did not retransmit anything
2463                 * or our original transmission succeeded.
2464                 */
2465                DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2466                tcp_undo_cwnd_reduction(sk, false);
2467                if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2468                        mib_idx = LINUX_MIB_TCPLOSSUNDO;
2469                else
2470                        mib_idx = LINUX_MIB_TCPFULLUNDO;
2471
2472                NET_INC_STATS_BH(sock_net(sk), mib_idx);
2473        }
2474        if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2475                /* Hold old state until something *above* high_seq
2476                 * is ACKed. For Reno it is a MUST to prevent false
2477                 * fast retransmits (RFC2582). SACK TCP is safe. */
2478                tcp_moderate_cwnd(tp);
2479                if (!tcp_any_retrans_done(sk))
2480                        tp->retrans_stamp = 0;
2481                return true;
2482        }
2483        tcp_set_ca_state(sk, TCP_CA_Open);
2484        return false;
2485}
2486
2487/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
2488static bool tcp_try_undo_dsack(struct sock *sk)
2489{
2490        struct tcp_sock *tp = tcp_sk(sk);
2491
2492        if (tp->undo_marker && !tp->undo_retrans) {
2493                DBGUNDO(sk, "D-SACK");
2494                tcp_undo_cwnd_reduction(sk, false);
2495                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2496                return true;
2497        }
2498        return false;
2499}
2500
2501/* Undo during loss recovery after partial ACK or using F-RTO. */
2502static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2503{
2504        struct tcp_sock *tp = tcp_sk(sk);
2505
2506        if (frto_undo || tcp_may_undo(tp)) {
2507                tcp_undo_cwnd_reduction(sk, true);
2508
2509                DBGUNDO(sk, "partial loss");
2510                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2511                if (frto_undo)
2512                        NET_INC_STATS_BH(sock_net(sk),
2513                                         LINUX_MIB_TCPSPURIOUSRTOS);
2514                inet_csk(sk)->icsk_retransmits = 0;
2515                if (frto_undo || tcp_is_sack(tp))
2516                        tcp_set_ca_state(sk, TCP_CA_Open);
2517                return true;
2518        }
2519        return false;
2520}
2521
2522/* The cwnd reduction in CWR and Recovery use the PRR algorithm
2523 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2524 * It computes the number of packets to send (sndcnt) based on packets newly
2525 * delivered:
2526 *   1) If the packets in flight is larger than ssthresh, PRR spreads the
2527 *      cwnd reductions across a full RTT.
2528 *   2) If packets in flight is lower than ssthresh (such as due to excess
2529 *      losses and/or application stalls), do not perform any further cwnd
2530 *      reductions, but instead slow start up to ssthresh.
2531 */
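    /* Worked example (hypothetical numbers): with prior_cwnd = 10,
     * snd_ssthresh = 5, prr_delivered = 4, prr_out = 1 and packets in
     * flight still above ssthresh:
     *
     *      sndcnt = DIV_ROUND_UP(5 * 4, 10) - 1 = 2 - 1 = 1
     *
     * so cwnd allows one new packet out, keeping the reduction
     * proportional across the RTT.
     */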
2532static void tcp_init_cwnd_reduction(struct sock *sk)
2533{
2534        struct tcp_sock *tp = tcp_sk(sk);
2535
2536        tp->high_seq = tp->snd_nxt;
2537        tp->tlp_high_seq = 0;
2538        tp->snd_cwnd_cnt = 0;
2539        tp->prior_cwnd = tp->snd_cwnd;
2540        tp->prr_delivered = 0;
2541        tp->prr_out = 0;
2542        tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2543        TCP_ECN_queue_cwr(tp);
2544}
2545
2546static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
2547                               int fast_rexmit)
2548{
2549        struct tcp_sock *tp = tcp_sk(sk);
2550        int sndcnt = 0;
2551        int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2552        int newly_acked_sacked = prior_unsacked -
2553                                 (tp->packets_out - tp->sacked_out);
2554
2555        tp->prr_delivered += newly_acked_sacked;
2556        if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2557                u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2558                               tp->prior_cwnd - 1;
2559                sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2560        } else {
2561                sndcnt = min_t(int, delta,
2562                               max_t(int, tp->prr_delivered - tp->prr_out,
2563                                     newly_acked_sacked) + 1);
2564        }
2565
2566        sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2567        tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2568}
2569
2570static inline void tcp_end_cwnd_reduction(struct sock *sk)
2571{
2572        struct tcp_sock *tp = tcp_sk(sk);
2573
2574        /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
2575        if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2576            (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2577                tp->snd_cwnd = tp->snd_ssthresh;
2578                tp->snd_cwnd_stamp = tcp_time_stamp;
2579        }
2580        tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2581}
2582
2583/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2584void tcp_enter_cwr(struct sock *sk)
2585{
2586        struct tcp_sock *tp = tcp_sk(sk);
2587
2588        tp->prior_ssthresh = 0;
2589        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2590                tp->undo_marker = 0;
2591                tcp_init_cwnd_reduction(sk);
2592                tcp_set_ca_state(sk, TCP_CA_CWR);
2593        }
2594}
2595
2596static void tcp_try_keep_open(struct sock *sk)
2597{
2598        struct tcp_sock *tp = tcp_sk(sk);
2599        int state = TCP_CA_Open;
2600
2601        if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
2602                state = TCP_CA_Disorder;
2603
2604        if (inet_csk(sk)->icsk_ca_state != state) {
2605                tcp_set_ca_state(sk, state);
2606                tp->high_seq = tp->snd_nxt;
2607        }
2608}
2609
2610static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
2611{
2612        struct tcp_sock *tp = tcp_sk(sk);
2613
2614        tcp_verify_left_out(tp);
2615
2616        if (!tcp_any_retrans_done(sk))
2617                tp->retrans_stamp = 0;
2618
2619        if (flag & FLAG_ECE)
2620                tcp_enter_cwr(sk);
2621
2622        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2623                tcp_try_keep_open(sk);
2624        } else {
2625                tcp_cwnd_reduction(sk, prior_unsacked, 0);
2626        }
2627}
2628
2629static void tcp_mtup_probe_failed(struct sock *sk)
2630{
2631        struct inet_connection_sock *icsk = inet_csk(sk);
2632
2633        icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2634        icsk->icsk_mtup.probe_size = 0;
2635}
2636
2637static void tcp_mtup_probe_success(struct sock *sk)
2638{
2639        struct tcp_sock *tp = tcp_sk(sk);
2640        struct inet_connection_sock *icsk = inet_csk(sk);
2641
2642        /* FIXME: breaks with very large cwnd */
2643        tp->prior_ssthresh = tcp_current_ssthresh(sk);
2644        tp->snd_cwnd = tp->snd_cwnd *
2645                       tcp_mss_to_mtu(sk, tp->mss_cache) /
2646                       icsk->icsk_mtup.probe_size;
2647        tp->snd_cwnd_cnt = 0;
2648        tp->snd_cwnd_stamp = tcp_time_stamp;
2649        tp->snd_ssthresh = tcp_current_ssthresh(sk);
2650
2651        icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2652        icsk->icsk_mtup.probe_size = 0;
2653        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2654}
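
/* Illustrative example of the cwnd rescaling above: if the current MSS
 * corresponds to an MTU of 1500 bytes and the successful probe used an MTU
 * of 3000 bytes, a cwnd of 40 packets becomes 40 * 1500 / 3000 = 20
 * packets, keeping the congestion window roughly constant in bytes while
 * each packet doubles in size.
 */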
2655
2656/* Do a simple retransmit without using the backoff mechanisms in
2657 * tcp_timer. This is used for path mtu discovery.
2658 * The socket is already locked here.
2659 */
2660void tcp_simple_retransmit(struct sock *sk)
2661{
2662        const struct inet_connection_sock *icsk = inet_csk(sk);
2663        struct tcp_sock *tp = tcp_sk(sk);
2664        struct sk_buff *skb;
2665        unsigned int mss = tcp_current_mss(sk);
2666        u32 prior_lost = tp->lost_out;
2667
2668        tcp_for_write_queue(skb, sk) {
2669                if (skb == tcp_send_head(sk))
2670                        break;
2671                if (tcp_skb_seglen(skb) > mss &&
2672                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2673                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2674                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2675                                tp->retrans_out -= tcp_skb_pcount(skb);
2676                        }
2677                        tcp_skb_mark_lost_uncond_verify(tp, skb);
2678                }
2679        }
2680
2681        tcp_clear_retrans_hints_partial(tp);
2682
2683        if (prior_lost == tp->lost_out)
2684                return;
2685
2686        if (tcp_is_reno(tp))
2687                tcp_limit_reno_sacked(tp);
2688
2689        tcp_verify_left_out(tp);
2690
2691        /* Don't muck with the congestion window here.
2692         * The reason is that we do not increase the amount of _data_
2693         * in the network; only the units changed, so the effective
2694         * cwnd/ssthresh really are reduced now.
2695         */
2696        if (icsk->icsk_ca_state != TCP_CA_Loss) {
2697                tp->high_seq = tp->snd_nxt;
2698                tp->snd_ssthresh = tcp_current_ssthresh(sk);
2699                tp->prior_ssthresh = 0;
2700                tp->undo_marker = 0;
2701                tcp_set_ca_state(sk, TCP_CA_Loss);
2702        }
2703        tcp_xmit_retransmit_queue(sk);
2704}
2705EXPORT_SYMBOL(tcp_simple_retransmit);
2706
2707static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2708{
2709        struct tcp_sock *tp = tcp_sk(sk);
2710        int mib_idx;
2711
2712        if (tcp_is_reno(tp))
2713                mib_idx = LINUX_MIB_TCPRENORECOVERY;
2714        else
2715                mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2716
2717        NET_INC_STATS_BH(sock_net(sk), mib_idx);
2718
2719        tp->prior_ssthresh = 0;
2720        tcp_init_undo(tp);
2721
2722        if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2723                if (!ece_ack)
2724                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
2725                tcp_init_cwnd_reduction(sk);
2726        }
2727        tcp_set_ca_state(sk, TCP_CA_Recovery);
2728}
2729
2730/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
2731 * recovered or spurious. Otherwise retransmits more on partial ACKs.
2732 */
2733static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2734{
2735        struct tcp_sock *tp = tcp_sk(sk);
2736        bool recovered = !before(tp->snd_una, tp->high_seq);
2737
2738        if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2739                /* Step 3.b. A timeout is spurious if not all data are
2740                 * lost, i.e., never-retransmitted data are (s)acked.
2741                 */
2742                if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
2743                        return;
2744
2745                if (after(tp->snd_nxt, tp->high_seq) &&
2746                    (flag & FLAG_DATA_SACKED || is_dupack)) {
2747                        tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2748                } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2749                        tp->high_seq = tp->snd_nxt;
2750                        __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2751                                                  TCP_NAGLE_OFF);
2752                        if (after(tp->snd_nxt, tp->high_seq))
2753                                return; /* Step 2.b */
2754                        tp->frto = 0;
2755                }
2756        }
2757
2758        if (recovered) {
2759                /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2760                tcp_try_undo_recovery(sk);
2761                return;
2762        }
2763        if (tcp_is_reno(tp)) {
2764                /* A Reno DUPACK means new data in F-RTO step 2.b above are
2765                 * delivered. Lower inflight to clock out (re)transmissions.
2766                 */
2767                if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2768                        tcp_add_reno_sack(sk);
2769                else if (flag & FLAG_SND_UNA_ADVANCED)
2770                        tcp_reset_reno_sack(tp);
2771        }
2772        if (tcp_try_undo_loss(sk, false))
2773                return;
2774        tcp_xmit_retransmit_queue(sk);
2775}
2776
2777/* Undo during fast recovery after partial ACK. */
2778static bool tcp_try_undo_partial(struct sock *sk, const int acked,
2779                                 const int prior_unsacked)
2780{
2781        struct tcp_sock *tp = tcp_sk(sk);
2782
2783        if (tp->undo_marker && tcp_packet_delayed(tp)) {
2784                /* Plain luck! The hole is filled with a delayed
2785                 * packet, rather than with a retransmit.
2786                 */
2787                tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2788
2789                /* We are getting evidence that the reordering degree is higher
2790                 * than we realized. If there are no retransmits out then we
2791                 * can undo. Otherwise we clock out new packets but do not
2792                 * mark more packets lost or retransmit more.
2793                 */
2794                if (tp->retrans_out) {
2795                        tcp_cwnd_reduction(sk, prior_unsacked, 0);
2796                        return true;
2797                }
2798
2799                if (!tcp_any_retrans_done(sk))
2800                        tp->retrans_stamp = 0;
2801
2802                DBGUNDO(sk, "partial recovery");
2803                tcp_undo_cwnd_reduction(sk, true);
2804                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2805                tcp_try_keep_open(sk);
2806                return true;
2807        }
2808        return false;
2809}
2810
2811/* Process an event which can update packets-in-flight non-trivially.
2812 * The main goal of this function is to calculate a new estimate for
2813 * left_out, taking into account both packets sitting in the receiver's
2814 * buffer and packets lost by the network.
2815 *
2816 * Besides that, it does CWND reduction when packet loss is detected
2817 * and changes the state of the machine.
2818 *
2819 * It does _not_ decide what to send; that is done in
2820 * tcp_xmit_retransmit_queue().
2821 */
2822static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2823                                  const int prior_unsacked,
2824                                  bool is_dupack, int flag)
2825{
2826        struct inet_connection_sock *icsk = inet_csk(sk);
2827        struct tcp_sock *tp = tcp_sk(sk);
2828        bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2829                                    (tcp_fackets_out(tp) > tp->reordering));
2830        int fast_rexmit = 0;
2831
2832        if (WARN_ON(!tp->packets_out && tp->sacked_out))
2833                tp->sacked_out = 0;
2834        if (WARN_ON(!tp->sacked_out && tp->fackets_out))
2835                tp->fackets_out = 0;
2836
2837        /* Now state machine starts.
2838         * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2839        if (flag & FLAG_ECE)
2840                tp->prior_ssthresh = 0;
2841
2842        /* B. In all the states check for reneging SACKs. */
2843        if (tcp_check_sack_reneging(sk, flag))
2844                return;
2845
2846        /* C. Check consistency of the current state. */
2847        tcp_verify_left_out(tp);
2848
2849        /* D. Check state exit conditions. State can be terminated
2850         *    when high_seq is ACKed. */
2851        if (icsk->icsk_ca_state == TCP_CA_Open) {
2852                WARN_ON(tp->retrans_out != 0);
2853                tp->retrans_stamp = 0;
2854        } else if (!before(tp->snd_una, tp->high_seq)) {
2855                switch (icsk->icsk_ca_state) {
2856                case TCP_CA_CWR:
2857                        /* CWR must be held until something *above* high_seq
2858                         * is ACKed, for the CWR bit to reach the receiver. */
2859                        if (tp->snd_una != tp->high_seq) {
2860                                tcp_end_cwnd_reduction(sk);
2861                                tcp_set_ca_state(sk, TCP_CA_Open);
2862                        }
2863                        break;
2864
2865                case TCP_CA_Recovery:
2866                        if (tcp_is_reno(tp))
2867                                tcp_reset_reno_sack(tp);
2868                        if (tcp_try_undo_recovery(sk))
2869                                return;
2870                        tcp_end_cwnd_reduction(sk);
2871                        break;
2872                }
2873        }
2874
2875        /* E. Process state. */
2876        switch (icsk->icsk_ca_state) {
2877        case TCP_CA_Recovery:
2878                if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2879                        if (tcp_is_reno(tp) && is_dupack)
2880                                tcp_add_reno_sack(sk);
2881                } else {
2882                        if (tcp_try_undo_partial(sk, acked, prior_unsacked))
2883                                return;
2884                        /* Partial ACK arrived. Force fast retransmit. */
2885                        do_lost = tcp_is_reno(tp) ||
2886                                  tcp_fackets_out(tp) > tp->reordering;
2887                }
2888                if (tcp_try_undo_dsack(sk)) {
2889                        tcp_try_keep_open(sk);
2890                        return;
2891                }
2892                break;
2893        case TCP_CA_Loss:
2894                tcp_process_loss(sk, flag, is_dupack);
2895                if (icsk->icsk_ca_state != TCP_CA_Open)
2896                        return;
2897                /* Fall through to processing in Open state. */
2898        default:
2899                if (tcp_is_reno(tp)) {
2900                        if (flag & FLAG_SND_UNA_ADVANCED)
2901                                tcp_reset_reno_sack(tp);
2902                        if (is_dupack)
2903                                tcp_add_reno_sack(sk);
2904                }
2905
2906                if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2907                        tcp_try_undo_dsack(sk);
2908
2909                if (!tcp_time_to_recover(sk, flag)) {
2910                        tcp_try_to_open(sk, flag, prior_unsacked);
2911                        return;
2912                }
2913
2914                /* MTU probe failure: don't reduce cwnd */
2915                if (icsk->icsk_ca_state < TCP_CA_CWR &&
2916                    icsk->icsk_mtup.probe_size &&
2917                    tp->snd_una == tp->mtu_probe.probe_seq_start) {
2918                        tcp_mtup_probe_failed(sk);
2919                        /* Restores the reduction we did in tcp_mtup_probe() */
2920                        tp->snd_cwnd++;
2921                        tcp_simple_retransmit(sk);
2922                        return;
2923                }
2924
2925                /* Otherwise enter Recovery state */
2926                tcp_enter_recovery(sk, (flag & FLAG_ECE));
2927                fast_rexmit = 1;
2928        }
2929
2930        if (do_lost)
2931                tcp_update_scoreboard(sk, fast_rexmit);
2932        tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
2933        tcp_xmit_retransmit_queue(sk);
2934}
2935
2936static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2937                                      long seq_rtt_us, long sack_rtt_us)
2938{
2939        const struct tcp_sock *tp = tcp_sk(sk);
2940
2941        /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2942         * broken middle-boxes or peers may corrupt TS-ECR fields. But
2943         * Karn's algorithm forbids taking RTT if some retransmitted data
2944         * is acked (RFC6298).
2945         */
2946        if (seq_rtt_us < 0)
2947                seq_rtt_us = sack_rtt_us;
2948
2949        /* RTTM Rule: A TSecr value received in a segment is used to
2950         * update the averaged RTT measurement only if the segment
2951         * acknowledges some new data, i.e., only if it advances the
2952         * left edge of the send window.
2953         * See draft-ietf-tcplw-high-performance-00, section 3.3.
2954         */
2955        if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2956            flag & FLAG_ACKED)
2957                seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2958
2959        if (seq_rtt_us < 0)
2960                return false;
2961
2962        tcp_rtt_estimator(sk, seq_rtt_us);
2963        tcp_set_rto(sk);
2964
2965        /* RFC6298: only reset backoff on valid RTT measurement. */
2966        inet_csk(sk)->icsk_backoff = 0;
2967        return true;
2968}
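
/* Illustrative example of the timestamp fallback above: with HZ = 1000, if
 * tcp_time_stamp - rcv_tsecr is 40 jiffies when the ACK arrives, the RTT
 * sample becomes jiffies_to_usecs(40) = 40000 us, i.e. 40 ms, taken at the
 * granularity of the timestamp clock rather than from skb timestamps.
 */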
2969
2970/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
2971static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2972{
2973        struct tcp_sock *tp = tcp_sk(sk);
2974        long seq_rtt_us = -1L;
2975
2976        if (synack_stamp && !tp->total_retrans)
2977                seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
2978
2979        /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
2980         * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
2981         */
2982        if (!tp->srtt_us)
2983                tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2984}
2985
2986static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
2987{
2988        const struct inet_connection_sock *icsk = inet_csk(sk);
2989
2990        icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
2991        tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
2992}
2993
2994/* Restart timer after forward progress on connection.
2995 * RFC2988 recommends to restart timer to now+rto.
2996 */
2997void tcp_rearm_rto(struct sock *sk)
2998{
2999        const struct inet_connection_sock *icsk = inet_csk(sk);
3000        struct tcp_sock *tp = tcp_sk(sk);
3001
3002        /* If the retrans timer is currently being used by Fast Open
3003         * for SYN-ACK retrans purpose, stay put.
3004         */
3005        if (tp->fastopen_rsk)
3006                return;
3007
3008        if (!tp->packets_out) {
3009                inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3010        } else {
3011                u32 rto = inet_csk(sk)->icsk_rto;
3012                /* Offset the time elapsed after installing regular RTO */
3013                if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3014                    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3015                        struct sk_buff *skb = tcp_write_queue_head(sk);
3016                        const u32 rto_time_stamp =
3017                                tcp_skb_timestamp(skb) + rto;
3018                        s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3019                        /* delta may not be positive if the socket is locked
3020                         * when the retrans timer fires and is rescheduled.
3021                         */
3022                        rto = max_t(int, delta, 1);
3023                }
3024                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3025                                          TCP_RTO_MAX);
3026        }
3027}
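
/* Illustrative example of the offset above: if the head skb was last
 * (re)transmitted 60 ms worth of jiffies ago and icsk_rto is 200 ms, the
 * timer is re-armed to fire in 200 - 60 = 140 ms; if delta is already zero
 * or negative (the timer fired while the socket was locked), it is clamped
 * to a minimum of one jiffy.
 */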
3028
3029/* This function is called when the delayed ER timer fires. TCP enters
3030 * fast recovery and performs fast-retransmit.
3031 */
3032void tcp_resume_early_retransmit(struct sock *sk)
3033{
3034        struct tcp_sock *tp = tcp_sk(sk);
3035
3036        tcp_rearm_rto(sk);
3037
3038        /* Stop if ER is disabled after the delayed ER timer is scheduled */
3039        if (!tp->do_early_retrans)
3040                return;
3041
3042        tcp_enter_recovery(sk, false);
3043        tcp_update_scoreboard(sk, 1);
3044        tcp_xmit_retransmit_queue(sk);
3045}
3046
3047/* If we get here, the whole TSO packet has not been acked. */
3048static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3049{
3050        struct tcp_sock *tp = tcp_sk(sk);
3051        u32 packets_acked;
3052
3053        BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3054
3055        packets_acked = tcp_skb_pcount(skb);
3056        if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3057                return 0;
3058        packets_acked -= tcp_skb_pcount(skb);
3059
3060        if (packets_acked) {
3061                BUG_ON(tcp_skb_pcount(skb) == 0);
3062                BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3063        }
3064
3065        return packets_acked;
3066}
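
/* Illustrative example of the partial-TSO accounting above: for a TSO skb
 * covering seq 1000..5000 as four 1000-byte segments, an ACK that advances
 * snd_una to 3000 trims 2000 bytes off the head, the pcount drops from
 * 4 to 2, and the function reports 4 - 2 = 2 packets acked.
 */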
3067
3068/* Remove acknowledged frames from the retransmission queue. If our packet
3069 * is before the ack sequence we can discard it as it's confirmed to have
3070 * arrived at the other end.
3071 */
3072static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3073                               u32 prior_snd_una, long sack_rtt_us)
3074{
3075        const struct inet_connection_sock *icsk = inet_csk(sk);
3076        struct skb_mstamp first_ackt, last_ackt, now;
3077        struct tcp_sock *tp = tcp_sk(sk);
3078        u32 prior_sacked = tp->sacked_out;
3079        u32 reord = tp->packets_out;
3080        bool fully_acked = true;
3081        bool rtt_update;
3082        long ca_seq_rtt_us = -1L;
3083        long seq_rtt_us = -1L;
3084        struct sk_buff *skb;
3085        u32 pkts_acked = 0;
3086        int flag = 0;
3087
3088        first_ackt.v64 = 0;
3089
3090        while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3091                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3092                u8 sacked = scb->sacked;
3093                u32 acked_pcount;
3094
3095                /* Determine how many packets and what bytes were acked, TSO or otherwise */
3096                if (after(scb->end_seq, tp->snd_una)) {
3097                        if (tcp_skb_pcount(skb) == 1 ||
3098                            !after(tp->snd_una, scb->seq))
3099                                break;
3100
3101                        acked_pcount = tcp_tso_acked(sk, skb);
3102                        if (!acked_pcount)
3103                                break;
3104
3105                        fully_acked = false;
3106                } else {
3107                        acked_pcount = tcp_skb_pcount(skb);
3108                }
3109
3110                if (sacked & TCPCB_RETRANS) {
3111                        if (sacked & TCPCB_SACKED_RETRANS)
3112                                tp->retrans_out -= acked_pcount;
3113                        flag |= FLAG_RETRANS_DATA_ACKED;
3114                } else {
3115                        last_ackt = skb->skb_mstamp;
3116                        WARN_ON_ONCE(last_ackt.v64 == 0);
3117                        if (!first_ackt.v64)
3118                                first_ackt = last_ackt;
3119
3120                        if (!(sacked & TCPCB_SACKED_ACKED)) {
3121                                reord = min(pkts_acked, reord);
3122                                if (!after(scb->end_seq, tp->high_seq))
3123                                        flag |= FLAG_ORIG_SACK_ACKED;
3124                        }
3125                }
3126
3127                if (sacked & TCPCB_SACKED_ACKED)
3128                        tp->sacked_out -= acked_pcount;
3129                if (sacked & TCPCB_LOST)
3130                        tp->lost_out -= acked_pcount;
3131
3132                tp->packets_out -= acked_pcount;
3133                pkts_acked += acked_pcount;
3134
3135                /* Initial outgoing SYNs get put onto the write_queue
3136                 * just like anything else we transmit.  It is not
3137                 * true data, and if we misinform our callers that
3138                 * this ACK acks real data, we will erroneously exit
3139                 * connection startup slow start one packet too
3140                 * quickly.  This is severely frowned upon behavior.
3141                 */
3142                if (!(scb->tcp_flags & TCPHDR_SYN)) {
3143                        flag |= FLAG_DATA_ACKED;
3144                } else {
3145                        flag |= FLAG_SYN_ACKED;
3146                        tp->retrans_stamp = 0;
3147                }
3148
3149                if (!fully_acked)
3150                        break;
3151
3152                tcp_unlink_write_queue(skb, sk);
3153                sk_wmem_free_skb(sk, skb);
3154                if (skb == tp->retransmit_skb_hint)
3155                        tp->retransmit_skb_hint = NULL;
3156                if (skb == tp->lost_skb_hint)
3157                        tp->lost_skb_hint = NULL;
3158        }
3159
3160        if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3161                tp->snd_up = tp->snd_una;
3162
3163        if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3164                flag |= FLAG_SACK_RENEGING;
3165
3166        skb_mstamp_get(&now);
3167        if (first_ackt.v64 && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3168                seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3169                ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3170        }
3171
3172        rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
3173
3174        if (flag & FLAG_ACKED) {
3175                const struct tcp_congestion_ops *ca_ops
3176                        = inet_csk(sk)->icsk_ca_ops;
3177
3178                tcp_rearm_rto(sk);
3179                if (unlikely(icsk->icsk_mtup.probe_size &&
3180                             !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3181                        tcp_mtup_probe_success(sk);
3182                }
3183
3184                if (tcp_is_reno(tp)) {
3185                        tcp_remove_reno_sacks(sk, pkts_acked);
3186
3187                        /* If any of the cumulatively ACKed segments was
3188                         * retransmitted, the non-SACK case cannot confirm
3189                         * that progress was due to an original transmission,
3190                         * for lack of TCPCB_SACKED_ACKED bits, even if some
3191                         * of the packets may never have been retransmitted.
3192                         */
3193                        if (flag & FLAG_RETRANS_DATA_ACKED)
3194                                flag &= ~FLAG_ORIG_SACK_ACKED;
3195                } else {
3196                        int delta;
3197
3198                        /* Non-retransmitted hole got filled? That's reordering */
3199                        if (reord < prior_fackets && reord <= tp->fackets_out)
3200                                tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3201
3202                        delta = tcp_is_fack(tp) ? pkts_acked :
3203                                                  prior_sacked - tp->sacked_out;
3204                        tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3205                }
3206
3207                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3208
3209                if (ca_ops->pkts_acked)
3210                        ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
3211
3212        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3213                   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3214                /* Do not re-arm the RTO if the sack RTT is measured from
3215                 * data sent after the head was last (re)transmitted. Otherwise the
3216                 * timeout may continue to extend in loss recovery.
3217                 */
3218                tcp_rearm_rto(sk);
3219        }
3220
3221#if FASTRETRANS_DEBUG > 0
3222        WARN_ON((int)tp->sacked_out < 0);
3223        WARN_ON((int)tp->lost_out < 0);
3224        WARN_ON((int)tp->retrans_out < 0);
3225        if (!tp->packets_out && tcp_is_sack(tp)) {
3226                icsk = inet_csk(sk);
3227                if (tp->lost_out) {
3228                        pr_debug("Leak l=%u %d\n",
3229                                 tp->lost_out, icsk->icsk_ca_state);
3230                        tp->lost_out = 0;
3231                }
3232                if (tp->sacked_out) {
3233                        pr_debug("Leak s=%u %d\n",
3234                                 tp->sacked_out, icsk->icsk_ca_state);
3235                        tp->sacked_out = 0;
3236                }
3237                if (tp->retrans_out) {
3238                        pr_debug("Leak r=%u %d\n",
3239                                 tp->retrans_out, icsk->icsk_ca_state);
3240                        tp->retrans_out = 0;
3241                }
3242        }
3243#endif
3244        return flag;
3245}
3246
3247static void tcp_ack_probe(struct sock *sk)
3248{
3249        const struct tcp_sock *tp = tcp_sk(sk);
3250        struct inet_connection_sock *icsk = inet_csk(sk);
3251
3252        /* Did the ACK open up a usable window? */
3253
3254        if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
3255                icsk->icsk_backoff = 0;
3256                inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3257                /* The socket must be woken up by a subsequent tcp_data_snd_check().
3258                 * This function is not meant for arbitrary use!
3259                 */
3260        } else {
3261                unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
3262
3263                inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3264                                          when, TCP_RTO_MAX);
3265        }
3266}
3267
3268static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3269{
3270        return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3271                inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3272}
3273
3274/* Decide whether to run the increase function of congestion control. */
3275static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3276{
3277        if (tcp_in_cwnd_reduction(sk))
3278                return false;
3279
3280        /* If reordering is high then always grow cwnd whenever data is
3281         * delivered regardless of its ordering. Otherwise stay conservative
3282         * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK with
3283         * a new SACK or ECE mark may first advance cwnd here and later reduce
3284         * cwnd in tcp_fastretrans_alert() based on additional state.
3285         */
3286        if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
3287                return flag & FLAG_FORWARD_PROGRESS;
3288
3289        return flag & FLAG_DATA_ACKED;
3290}
3291
3292/* Check that window update is acceptable.
3293 * The function assumes that snd_una<=ack<=snd_next.
3294 */
3295static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3296                                        const u32 ack, const u32 ack_seq,
3297                                        const u32 nwin)
3298{
3299        return  after(ack, tp->snd_una) ||
3300                after(ack_seq, tp->snd_wl1) ||
3301                (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3302}
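
/* Illustrative example of the acceptability test above: with
 * snd_una = 1000, snd_wl1 = 500 and snd_wnd = 8192, an ACK carrying
 * ack = 1000, ack_seq = 500 and nwin = 16384 fails the first two tests but
 * passes the third (same ack_seq, larger window), so the advertised window
 * update is applied; a stale duplicate failing all three is ignored.
 */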
3303
3304/* If we update tp->snd_una, also update tp->bytes_acked */
3305static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3306{
3307        u32 delta = ack - tp->snd_una;
3308
3309        u64_stats_update_begin(&tp->syncp);
3310        tp->bytes_acked += delta;
3311        u64_stats_update_end(&tp->syncp);
3312        tp->snd_una = ack;
3313}
3314
3315/* If we update tp->rcv_nxt, also update tp->bytes_received */
3316static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3317{
3318        u32 delta = seq - tp->rcv_nxt;
3319
3320        u64_stats_update_begin(&tp->syncp);
3321        tp->bytes_received += delta;
3322        u64_stats_update_end(&tp->syncp);
3323        tp->rcv_nxt = seq;
3324}
3325
3326/* Update our send window.
3327 *
3328 * The window update algorithm described in RFC793/RFC1122 (used in linux-2.2
3329 * and in FreeBSD; NetBSD's is even worse) is wrong.
3330 */
3331static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
3332                                 u32 ack_seq)
3333{
3334        struct tcp_sock *tp = tcp_sk(sk);
3335        int flag = 0;
3336        u32 nwin = ntohs(tcp_hdr(skb)->window);
3337
3338        if (likely(!tcp_hdr(skb)->syn))
3339                nwin <<= tp->rx_opt.snd_wscale;
3340
3341        if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3342                flag |= FLAG_WIN_UPDATE;
3343                tcp_update_wl(tp, ack_seq);
3344
3345                if (tp->snd_wnd != nwin) {
3346                        tp->snd_wnd = nwin;
3347
3348                        /* Note: this is the only place where the
3349                         * fast path is re-enabled for sending TCP.
3350                         */
3351                        tp->pred_flags = 0;
3352                        tcp_fast_path_check(sk);
3353
3354                        if (nwin > tp->max_window) {
3355                                tp->max_window = nwin;
3356                                tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3357                        }
3358                }
3359        }
3360
3361        tcp_snd_una_update(tp, ack);
3362
3363        return flag;
3364}
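
/* Illustrative example of the scaling above: a raw 16-bit window field of
 * 1000 with a negotiated snd_wscale of 7 yields an effective send window
 * of 1000 << 7 = 128000 bytes; on a SYN the shift is skipped because the
 * scale factor is not yet in effect.
 */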
3365
3366static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
3367                                   u32 *last_oow_ack_time)
3368{
3369        if (*last_oow_ack_time) {
3370                s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
3371
3372                if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
3373                        NET_INC_STATS(net, mib_idx);
3374                        return true;    /* rate-limited: don't send yet! */
3375                }
3376        }
3377
3378        *last_oow_ack_time = tcp_time_stamp;
3379
3380        return false;   /* not rate-limited: go ahead, send dupack now! */
3381}
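
/* Illustrative example of the rate limit above: assuming
 * sysctl_tcp_invalid_ratelimit corresponds to 500 ms, a second
 * out-of-window ACK requested within 500 ms of the previous one is only
 * counted in the MIB and suppressed; once the interval has elapsed, the
 * next dupack is sent and the stored timestamp is refreshed.
 */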
3382
3383/* Return true if we're currently rate-limiting out-of-window ACKs and
3384 * thus shouldn't send a dupack right now. We rate-limit dupacks in
3385 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
3386 * attacks that send repeated SYNs or ACKs for the same connection. To
3387 * do this, we do not send a duplicate SYNACK or ACK if the remote
3388 * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
3389 */
3390bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3391                          int mib_idx, u32 *last_oow_ack_time)
3392{
3393        /* Data packets without SYNs are not likely part of an ACK loop. */
3394        if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3395            !tcp_hdr(skb)->syn)
3396                return false;
3397
3398        return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3399}
3400
3401/* RFC 5961 7 [ACK Throttling] */
3402static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3403{
3404        /* unprotected vars; we don't care about overwrites */
3405        static u32 challenge_timestamp;
3406        static unsigned int challenge_count;
3407        struct tcp_sock *tp = tcp_sk(sk);
3408        u32 count, now;
3409
3410        /* First check our per-socket dupack rate limit. */
3411        if (__tcp_oow_rate_limited(sock_net(sk),
3412                                   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3413                                   &tp->last_oow_ack_time))
3414                return;
3415
3416        /* Then check host-wide RFC 5961 rate limit. */
3417        now = jiffies / HZ;
3418        if (now != challenge_timestamp) {
3419                u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
3420
3421                challenge_timestamp = now;
3422                WRITE_ONCE(challenge_count, half +
3423                           prandom_u32_max(sysctl_tcp_challenge_ack_limit));
3424        }
3425        count = READ_ONCE(challenge_count);
3426        if (count > 0) {
3427                WRITE_ONCE(challenge_count, count - 1);
3428                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3429                tcp_send_ack(sk);
3430        }
3431}
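
/* Illustrative example of the global budget above: with
 * sysctl_tcp_challenge_ack_limit set to 1000, each new second the budget is
 * reset to 500 + prandom_u32_max(1000), i.e. somewhere between 500 and 1499
 * challenge ACKs, and each challenge ACK sent decrements it; the
 * randomization keeps an off-path attacker from inferring the exact count.
 */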
3432
3433static void tcp_store_ts_recent(struct tcp_sock *tp)
3434{
3435        tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3436        tp->rx_opt.ts_recent_stamp = get_seconds();
3437}
3438
3439static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3440{
3441        if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3442                /* PAWS bug workaround wrt. ACK frames: the extra PAWS-discard
3443                 * check below makes sure this can only happen
3444                 * for pure ACK frames.  -DaveM
3445                 *
3446                 * Not only that: it also occurs for expired timestamps.
3447                 */
3448
3449                if (tcp_paws_check(&tp->rx_opt, 0))
3450                        tcp_store_ts_recent(tp);
3451        }
3452}
3453
3454/* This routine deals with acks during a TLP episode.
3455 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
3456 */
3457static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3458{
3459        struct tcp_sock *tp = tcp_sk(sk);
3460        bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
3461                             !(flag & (FLAG_SND_UNA_ADVANCED |
3462                                       FLAG_NOT_DUP | FLAG_DATA_SACKED));
3463
3464        /* Mark the end of TLP episode on receiving TLP dupack or when
3465         * ack is after tlp_high_seq.
3466         */
3467        if (is_tlp_dupack) {
3468                tp->tlp_high_seq = 0;
3469                return;
3470        }
3471
3472        if (after(ack, tp->tlp_high_seq)) {
3473                tp->tlp_high_seq = 0;
3474                /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3475                if (!(flag & FLAG_DSACKING_ACK)) {
3476                        tcp_init_cwnd_reduction(sk);
3477                        tcp_set_ca_state(sk, TCP_CA_CWR);
3478                        tcp_end_cwnd_reduction(sk);
3479                        tcp_try_keep_open(sk);
3480                        NET_INC_STATS_BH(sock_net(sk),
3481                                         LINUX_MIB_TCPLOSSPROBERECOVERY);
3482                }
3483        }
3484}
3485
3486static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3487{
3488        const struct inet_connection_sock *icsk = inet_csk(sk);
3489
3490        if (icsk->icsk_ca_ops->in_ack_event)
3491                icsk->icsk_ca_ops->in_ack_event(sk, flags);
3492}
3493
3494/* This routine deals with incoming acks, but not outgoing ones. */
3495static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3496{
3497        struct inet_connection_sock *icsk = inet_csk(sk);
3498        struct tcp_sock *tp = tcp_sk(sk);
3499        u32 prior_snd_una = tp->snd_una;
3500        u32 ack_seq = TCP_SKB_CB(skb)->seq;
3501        u32 ack = TCP_SKB_CB(skb)->ack_seq;
3502        bool is_dupack = false;
3503        u32 prior_fackets;
3504        int prior_packets = tp->packets_out;
3505        const int prior_unsacked = tp->packets_out - tp->sacked_out;
3506        int acked = 0; /* Number of packets newly acked */
3507        long sack_rtt_us = -1L;
3508
3509        /* If the ack is older than previous acks
3510         * then we can probably ignore it.
3511         */
3512        if (before(ack, prior_snd_una)) {
3513                /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3514                if (before(ack, prior_snd_una - tp->max_window)) {
3515                        tcp_send_challenge_ack(sk, skb);
3516                        return -1;
3517                }
3518                goto old_ack;
3519        }
3520
3521        /* If the ack includes data we haven't sent yet, discard
3522         * this segment (RFC793 Section 3.9).
3523         */
3524        if (after(ack, tp->snd_nxt))
3525                goto invalid_ack;
3526
3527        if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3528            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3529                tcp_rearm_rto(sk);
3530
3531        if (after(ack, prior_snd_una)) {
3532                flag |= FLAG_SND_UNA_ADVANCED;
3533                icsk->icsk_retransmits = 0;
3534        }
3535
3536        prior_fackets = tp->fackets_out;
3537
3538        /* ts_recent update must be made after we are sure that the packet
3539         * is in window.
3540         */
3541        if (flag & FLAG_UPDATE_TS_RECENT)
3542                tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3543
3544        if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3545                /* Window is constant, pure forward advance.
3546                 * No more checks are required.
3547                 * Note, we use the fact that SND.UNA>=SND.WL2.
3548                 */
3549                tcp_update_wl(tp, ack_seq);
3550                tcp_snd_una_update(tp, ack);
3551                flag |= FLAG_WIN_UPDATE;
3552
3553                tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3554
3555                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3556        } else {
3557                u32 ack_ev_flags = CA_ACK_SLOWPATH;
3558
3559                if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3560                        flag |= FLAG_DATA;
3561                else
3562                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3563
3564                flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3565
3566                if (TCP_SKB_CB(skb)->sacked)
3567                        flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3568                                                        &sack_rtt_us);
3569
3570                if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3571                        flag |= FLAG_ECE;
3572                        ack_ev_flags |= CA_ACK_ECE;
3573                }
3574
3575                if (flag & FLAG_WIN_UPDATE)
3576                        ack_ev_flags |= CA_ACK_WIN_UPDATE;
3577
3578                tcp_in_ack_event(sk, ack_ev_flags);
3579        }
3580
3581        /* We passed data and got it acked, remove any soft error
3582         * log. Something worked...
3583         */
3584        sk->sk_err_soft = 0;
3585        icsk->icsk_probes_out = 0;
3586        tp->rcv_tstamp = tcp_time_stamp;
3587        if (!prior_packets)
3588                goto no_queue;
3589
3590        /* See if we can take anything off of the retransmit queue. */
3591        acked = tp->packets_out;
3592        flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
3593                                    sack_rtt_us);
3594        acked -= tp->packets_out;
3595
3596        /* Advance cwnd if state allows */
3597        if (tcp_may_raise_cwnd(sk, flag))
3598                tcp_cong_avoid(sk, ack, acked);
3599
3600        if (tcp_ack_is_dubious(sk, flag)) {
3601                is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3602                tcp_fastretrans_alert(sk, acked, prior_unsacked,
3603                                      is_dupack, flag);
3604        }
3605        if (tp->tlp_high_seq)
3606                tcp_process_tlp_ack(sk, ack, flag);
3607
3608        if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3609                sk_dst_confirm(sk);
3610
3611        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3612                tcp_schedule_loss_probe(sk);
3613        tcp_update_pacing_rate(sk);
3614        return 1;
3615
3616no_queue:
3617        /* If data was DSACKed, see if we can undo a cwnd reduction. */
3618        if (flag & FLAG_DSACKING_ACK)
3619                tcp_fastretrans_alert(sk, acked, prior_unsacked,
3620                                      is_dupack, flag);
3621        /* If this ack opens up a zero window, clear backoff.  It was
3622         * being used to time the probes, and is probably far higher than
3623         * it needs to be for normal retransmission.
3624         */
3625        if (tcp_send_head(sk))
3626                tcp_ack_probe(sk);
3627
3628        if (tp->tlp_high_seq)
3629                tcp_process_tlp_ack(sk, ack, flag);
3630        return 1;
3631
3632invalid_ack:
3633        SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3634        return -1;
3635
3636old_ack:
3637        /* If data was SACKed, tag it and see if we should send more data.
3638         * If data was DSACKed, see if we can undo a cwnd reduction.
3639         */
3640        if (TCP_SKB_CB(skb)->sacked) {
3641                flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3642                                                &sack_rtt_us);
3643                tcp_fastretrans_alert(sk, acked, prior_unsacked,
3644                                      is_dupack, flag);
3645        }
3646
3647        SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3648        return 0;
3649}
3650
3651static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3652                                      bool syn, struct tcp_fastopen_cookie *foc,
3653                                      bool exp_opt)
3654{
3655        /* Valid only in SYN or SYN-ACK with an even length.  */
3656        if (!foc || !syn || len < 0 || (len & 1))
3657                return;
3658
3659        if (len >= TCP_FASTOPEN_COOKIE_MIN &&
3660            len <= TCP_FASTOPEN_COOKIE_MAX)
3661                memcpy(foc->val, cookie, len);
3662        else if (len != 0)
3663                len = -1;
3664        foc->len = len;
3665        foc->exp = exp_opt;
3666}
3667
3668/* Look for tcp options. Normally only called on SYN and SYNACK packets.
3669 * But, this can also be called on packets in the established flow when
3670 * the fast version below fails.
3671 */
3672void tcp_parse_options(const struct sk_buff *skb,
3673                       struct tcp_options_received *opt_rx, int estab,
3674                       struct tcp_fastopen_cookie *foc)
3675{
3676        const unsigned char *ptr;
3677        const struct tcphdr *th = tcp_hdr(skb);
3678        int length = (th->doff * 4) - sizeof(struct tcphdr);
3679
3680        ptr = (const unsigned char *)(th + 1);
3681        opt_rx->saw_tstamp = 0;
3682
3683        while (length > 0) {
3684                int opcode = *ptr++;
3685                int opsize;
3686
3687                switch (opcode) {
3688                case TCPOPT_EOL:
3689                        return;
3690                case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
3691                        length--;
3692                        continue;
3693                default:
3694                        opsize = *ptr++;
3695                        if (opsize < 2) /* "silly options" */
3696                                return;
3697                        if (opsize > length)
3698                                return; /* don't parse partial options */
3699                        switch (opcode) {
3700                        case TCPOPT_MSS:
3701                                if (opsize == TCPOLEN_MSS && th->syn && !estab) {
3702                                        u16 in_mss = get_unaligned_be16(ptr);
3703                                        if (in_mss) {
3704                                                if (opt_rx->user_mss &&
3705                                                    opt_rx->user_mss < in_mss)
3706                                                        in_mss = opt_rx->user_mss;
3707                                                opt_rx->mss_clamp = in_mss;
3708                                        }
3709                                }
3710                                break;
3711                        case TCPOPT_WINDOW:
3712                                if (opsize == TCPOLEN_WINDOW && th->syn &&
3713                                    !estab && sysctl_tcp_window_scaling) {
3714                                        __u8 snd_wscale = *(__u8 *)ptr;
3715                                        opt_rx->wscale_ok = 1;
3716                                        if (snd_wscale > 14) {
3717                                                net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
3718                                                                     __func__,
3719                                                                     snd_wscale);
3720                                                snd_wscale = 14;
3721                                        }
3722                                        opt_rx->snd_wscale = snd_wscale;
3723                                }
3724                                break;
3725                        case TCPOPT_TIMESTAMP:
3726                                if ((opsize == TCPOLEN_TIMESTAMP) &&
3727                                    ((estab && opt_rx->tstamp_ok) ||
3728                                     (!estab && sysctl_tcp_timestamps))) {
3729                                        opt_rx->saw_tstamp = 1;
3730                                        opt_rx->rcv_tsval = get_unaligned_be32(ptr);
3731                                        opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
3732                                }
3733                                break;
3734                        case TCPOPT_SACK_PERM:
3735                                if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3736                                    !estab && sysctl_tcp_sack) {
3737                                        opt_rx->sack_ok = TCP_SACK_SEEN;
3738                                        tcp_sack_reset(opt_rx);
3739                                }
3740                                break;
3741
3742                        case TCPOPT_SACK:
3743                                if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3744                                   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3745                                   opt_rx->sack_ok) {
3746                                        TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3747                                }
3748                                break;
3749#ifdef CONFIG_TCP_MD5SIG
3750                        case TCPOPT_MD5SIG:
3751                                /*
3752                                 * The MD5 Hash has already been
3753                                 * checked (see tcp_v{4,6}_do_rcv()).
3754                                 */
3755                                break;
3756#endif
3757                        case TCPOPT_FASTOPEN:
3758                                tcp_parse_fastopen_option(
3759                                        opsize - TCPOLEN_FASTOPEN_BASE,
3760                                        ptr, th->syn, foc, false);
3761                                break;
3762
3763                        case TCPOPT_EXP:
3764                                /* The Fast Open option shares code 254, using a
3765                                 * 16-bit magic number.
3766                                 */
3767                                if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
3768                                    get_unaligned_be16(ptr) ==
3769                                    TCPOPT_FASTOPEN_MAGIC)
3770                                        tcp_parse_fastopen_option(opsize -
3771                                                TCPOLEN_EXP_FASTOPEN_BASE,
3772                                                ptr + 2, th->syn, foc, true);
3773                                break;
3774
3775                        }
3776                        ptr += opsize-2;
3777                        length -= opsize;
3778                }
3779        }
3780}
3781EXPORT_SYMBOL(tcp_parse_options);
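
/* Illustrative example of the option stream the parser above walks: a
 * typical SYN might carry, after the fixed header,
 *
 *   02 04 05 b4                MSS (kind 2, len 4, MSS 1460)
 *   04 02                      SACK permitted (kind 4, len 2)
 *   08 0a <tsval> <tsecr>      Timestamps (kind 8, len 10)
 *   01                         NOP padding (kind 1)
 *   03 03 07                   Window scale (kind 3, len 3, shift 7)
 *
 * Each iteration reads the kind byte, then the length byte for anything
 * other than EOL/NOP, and finally skips opsize - 2 bytes of option data.
 */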
3782
3783static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
3784{
3785        const __be32 *ptr = (const __be32 *)(th + 1);
3786
3787        if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3788                          | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3789                tp->rx_opt.saw_tstamp = 1;
3790                ++ptr;
3791                tp->rx_opt.rcv_tsval = ntohl(*ptr);
3792                ++ptr;
3793                if (*ptr)
3794                        tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3795                else
3796                        tp->rx_opt.rcv_tsecr = 0;
3797                return true;
3798        }
3799        return false;
3800}
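
/* Illustrative example of the predicted layout above: the expected option
 * block on an established connection with timestamps is
 *
 *   01 01 08 0a  =  NOP, NOP, TIMESTAMP, length 10
 *
 * which as one 32-bit word equals htonl(0x0101080a); the following two
 * words are then read directly as TSval and TSecr.
 */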
3801
3802/* Fast parse options. This hopes to only see timestamps.
3803 * If it is wrong it falls back on tcp_parse_options().
3804 */
3805static bool tcp_fast_parse_options(const struct sk_buff *skb,
3806                                   const struct tcphdr *th, struct tcp_sock *tp)
3807{
3808        /* In the spirit of fast parsing, compare doff directly to constant
3809         * values.  Because equality is used, short doff can be ignored here.
3810         */
3811        if (th->doff == (sizeof(*th) / 4)) {
3812                tp->rx_opt.saw_tstamp = 0;
3813                return false;
3814        } else if (tp->rx_opt.tstamp_ok &&
3815                   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3816                if (tcp_parse_aligned_timestamp(tp, th))
3817                        return true;
3818        }
3819
3820        tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3821        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3822                tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3823
3824        return true;
3825}
3826
3827#ifdef CONFIG_TCP_MD5SIG
3828/*
3829 * Parse MD5 Signature option
3830 */
3831const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
3832{
3833        int length = (th->doff << 2) - sizeof(*th);
3834        const u8 *ptr = (const u8 *)(th + 1);
3835
3836        /* If not enough data remaining, we can short cut */
3837        while (length >= TCPOLEN_MD5SIG) {
3838                int opcode = *ptr++;
3839                int opsize;
3840
3841                switch(opcode) {
3842                case TCPOPT_EOL:
3843                        return NULL;
3844                case TCPOPT_NOP:
3845                        length--;
3846                        continue;
3847                default:
3848                        opsize = *ptr++;
3849                        if (opsize < 2 || opsize > length)
3850                                return NULL;
3851                        if (opcode == TCPOPT_MD5SIG)
3852                                return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3853                }
3854                ptr += opsize - 2;
3855                length -= opsize;
3856        }
3857        return NULL;
3858}
3859EXPORT_SYMBOL(tcp_parse_md5sig_option);
3860#endif
3861
3862/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
3863 *
3864 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
3865 * it can pass through stack. So, the following predicate verifies that
3866 * this segment is not used for anything but congestion avoidance or
3867 * fast retransmit. Moreover, we even are able to eliminate most of such
3868 * second order effects, if we apply some small "replay" window (~RTO)
3869 * to timestamp space.
3870 *
3871 * All these measures still do not guarantee that we reject wrapped ACKs
3872 * on networks with high bandwidth, when sequence space is recycled quickly,
3873 * but it guarantees that such events will be very rare and do not affect
3874 * connection seriously. This doesn't look nice, but alas, PAWS is really
3875 * buggy extension.
3876 *
3877 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
3878 * states that events where a retransmit arrives after the original data
3879 * are rare. That is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
3880 * the biggest problem on large power networks even with minor reordering.
3881 * OK, let's give it a small replay window. If the peer clock ticks even at
3882 * 1 Hz, it is safe up to a bandwidth of 18 Gigabit/sec. 8) ]
3883 */
3884
3885static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3886{
3887        const struct tcp_sock *tp = tcp_sk(sk);
3888        const struct tcphdr *th = tcp_hdr(skb);
3889        u32 seq = TCP_SKB_CB(skb)->seq;
3890        u32 ack = TCP_SKB_CB(skb)->ack_seq;
3891
3892        return (/* 1. Pure ACK with correct sequence number. */
3893                (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
3894
3895                /* 2. ... and duplicate ACK. */
3896                ack == tp->snd_una &&
3897
3898                /* 3. ... and does not update window. */
3899                !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
3900
3901                /* 4. ... and sits in replay window. */
3902                (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
3903}
3904
3905static inline bool tcp_paws_discard(const struct sock *sk,
3906                                   const struct sk_buff *skb)
3907{
3908        const struct tcp_sock *tp = tcp_sk(sk);
3909
3910        return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3911               !tcp_disordered_ack(sk, skb);
3912}
3913
3914/* Check segment sequence number for validity.
3915 *
3916 * Segment controls are considered valid, if the segment
3917 * fits to the window after truncation to the window. Acceptability
3918 * of data (and SYN, FIN, of course) is checked separately.
3919 * See tcp_data_queue(), for example.
3920 *
3921 * Also, controls (RST is the main one) are accepted using RCV.WUP instead
3922 * of RCV.NXT. The peer still did not advance his SND.UNA when we
3923 * delayed the ACK, so that his SND.UNA <= our RCV.WUP.
3924 * (borrowed from freebsd)
3925 */
3926
3927static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
3928{
3929        return  !before(end_seq, tp->rcv_wup) &&
3930                !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
3931}
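
/* A hedged example with invented numbers: if rcv_wup == 1000, rcv_nxt == 1500
 * and tcp_receive_window(tp) == 500, tcp_sequence() accepts any segment with
 * end_seq >= 1000 and seq <= 2000.  A retransmit covering [900, 1100) is
 * therefore still considered in-window (so e.g. its RST would be examined),
 * while a segment starting at 2100 lies entirely beyond the advertised
 * window and is rejected.
 */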
3932
3933/* When we get a reset we do this. */
3934void tcp_reset(struct sock *sk)
3935{
3936        /* We want the right error as BSD sees it (and indeed as we do). */
3937        switch (sk->sk_state) {
3938        case TCP_SYN_SENT:
3939                sk->sk_err = ECONNREFUSED;
3940                break;
3941        case TCP_CLOSE_WAIT:
3942                sk->sk_err = EPIPE;
3943                break;
3944        case TCP_CLOSE:
3945                return;
3946        default:
3947                sk->sk_err = ECONNRESET;
3948        }
3949        /* This barrier is coupled with smp_rmb() in tcp_poll() */
3950        smp_wmb();
3951
3952        tcp_done(sk);
3953
3954        if (!sock_flag(sk, SOCK_DEAD))
3955                sk->sk_error_report(sk);
3956}
3957
3958/*
3959 *      Process the FIN bit. This now behaves as it is supposed to:
3960 *      the FIN takes effect only once it is validly part of the sequence
3961 *      space, not earlier while there are still holes before it.
3962 *
3963 *      If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
3964 *      (and thence onto LAST-ACK and finally CLOSE; we never enter
3965 *      TIME-WAIT).
3966 *
3967 *      If we are in FIN-WAIT-1, a received FIN indicates a simultaneous
3968 *      close and we go into CLOSING (and later onto TIME-WAIT).
3969 *
3970 *      If we are in FIN-WAIT-2, a received FIN moves us to TIME-WAIT.
3971 */
3972static void tcp_fin(struct sock *sk)
3973{
3974        struct tcp_sock *tp = tcp_sk(sk);
3975
3976        inet_csk_schedule_ack(sk);
3977
3978        sk->sk_shutdown |= RCV_SHUTDOWN;
3979        sock_set_flag(sk, SOCK_DONE);
3980
3981        switch (sk->sk_state) {
3982        case TCP_SYN_RECV:
3983        case TCP_ESTABLISHED:
3984                /* Move to CLOSE_WAIT */
3985                tcp_set_state(sk, TCP_CLOSE_WAIT);
3986                inet_csk(sk)->icsk_ack.pingpong = 1;
3987                break;
3988
3989        case TCP_CLOSE_WAIT:
3990        case TCP_CLOSING:
3991                /* Received a retransmission of the FIN, do
3992                 * nothing.
3993                 */
3994                break;
3995        case TCP_LAST_ACK:
3996                /* RFC793: Remain in the LAST-ACK state. */
3997                break;
3998
3999        case TCP_FIN_WAIT1:
4000                /* This case occurs when a simultaneous close
4001                 * happens: we must ACK the received FIN and
4002                 * enter the CLOSING state.
4003                 */
4004                tcp_send_ack(sk);
4005                tcp_set_state(sk, TCP_CLOSING);
4006                break;
4007        case TCP_FIN_WAIT2:
4008                /* Received a FIN -- send ACK and enter TIME_WAIT. */
4009                tcp_send_ack(sk);
4010                tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4011                break;
4012        default:
4013                /* Only TCP_LISTEN and TCP_CLOSE are left; in these
4014                 * cases we should never reach this piece of code.
4015                 */
4016                pr_err("%s: Impossible, sk->sk_state=%d\n",
4017                       __func__, sk->sk_state);
4018                break;
4019        }
4020
4021        /* It _is_ possible that we have something out-of-order _after_ the FIN.
4022         * Probably we should reset in this case; for now, drop them.
4023         */
4024        skb_rbtree_purge(&tp->out_of_order_queue);
4025        if (tcp_is_sack(tp))
4026                tcp_sack_reset(&tp->rx_opt);
4027        sk_mem_reclaim(sk);
4028
4029        if (!sock_flag(sk, SOCK_DEAD)) {
4030                sk->sk_state_change(sk);
4031
4032                /* Do not send POLL_HUP for half duplex close. */
4033                if (sk->sk_shutdown == SHUTDOWN_MASK ||
4034                    sk->sk_state == TCP_CLOSE)
4035                        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
4036                else
4037                        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4038        }
4039}
4040
4041static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4042                                  u32 end_seq)
4043{
4044        if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4045                if (before(seq, sp->start_seq))
4046                        sp->start_seq = seq;
4047                if (after(end_seq, sp->end_seq))
4048                        sp->end_seq = end_seq;
4049                return true;
4050        }
4051        return false;
4052}
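
/* A hedged example with made-up ranges: if *sp covers [100, 200), then
 * tcp_sack_extend(sp, 150, 300) overlaps the block, grows it to [100, 300)
 * and returns true; an exactly adjacent range such as [200, 300) is merged
 * too, because the test uses !after() rather than before().  A disjoint
 * range such as [300, 400) against the original [100, 200) leaves the block
 * untouched and returns false.
 */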
4053
4054static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4055{
4056        struct tcp_sock *tp = tcp_sk(sk);
4057
4058        if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4059                int mib_idx;
4060
4061                if (before(seq, tp->rcv_nxt))
4062                        mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4063                else
4064                        mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4065
4066                NET_INC_STATS_BH(sock_net(sk), mib_idx);
4067
4068                tp->rx_opt.dsack = 1;
4069                tp->duplicate_sack[0].start_seq = seq;
4070                tp->duplicate_sack[0].end_seq = end_seq;
4071        }
4072}
4073
4074static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4075{
4076        struct tcp_sock *tp = tcp_sk(sk);
4077
4078        if (!tp->rx_opt.dsack)
4079                tcp_dsack_set(sk, seq, end_seq);
4080        else
4081                tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4082}
4083
4084static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4085{
4086        struct tcp_sock *tp = tcp_sk(sk);
4087
4088        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4089            before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4090                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4091                tcp_enter_quickack_mode(sk);
4092
4093                if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4094                        u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4095
4096                        if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4097                                end_seq = tp->rcv_nxt;
4098                        tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4099                }
4100        }
4101
4102        tcp_send_ack(sk);
4103}
4104
4105/* These routines update the SACK block as out-of-order packets arrive or
4106 * in-order packets close up the sequence space.
4107 */
4108static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4109{
4110        int this_sack;
4111        struct tcp_sack_block *sp = &tp->selective_acks[0];
4112        struct tcp_sack_block *swalk = sp + 1;
4113
4114        /* See if the recent change to the first SACK eats into
4115         * or hits the sequence space of other SACK blocks; if so, coalesce.
4116         */
4117        for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
4118                if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
4119                        int i;
4120
4121                        /* Zap SWALK, by moving every further SACK up by one slot.
4122                         * Decrease num_sacks.
4123                         */
4124                        tp->rx_opt.num_sacks--;
4125                        for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4126                                sp[i] = sp[i + 1];
4127                        continue;
4128                }
4129                this_sack++, swalk++;
4130        }
4131}
4132
4133static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4134{
4135        struct tcp_sock *tp = tcp_sk(sk);
4136        struct tcp_sack_block *sp = &tp->selective_acks[0];
4137        int cur_sacks = tp->rx_opt.num_sacks;
4138        int this_sack;
4139
4140        if (!cur_sacks)
4141                goto new_sack;
4142
4143        for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
4144                if (tcp_sack_extend(sp, seq, end_seq)) {
4145                        /* Rotate this_sack to the first one. */
4146                        for (; this_sack > 0; this_sack--, sp--)
4147                                swap(*sp, *(sp - 1));
4148                        if (cur_sacks > 1)
4149                                tcp_sack_maybe_coalesce(tp);
4150                        return;
4151                }
4152        }
4153
4154        /* Could not find an adjacent existing SACK, build a new one,
4155         * put it at the front, and shift everyone else down.  We
4156         * always know there is at least one SACK present already here.
4157         *
4158         * If the sack array is full, forget about the last one.
4159         */
4160        if (this_sack >= TCP_NUM_SACKS) {
4161                this_sack--;
4162                tp->rx_opt.num_sacks--;
4163                sp--;
4164        }
4165        for (; this_sack > 0; this_sack--, sp--)
4166                *sp = *(sp - 1);
4167
4168new_sack:
4169        /* Build the new head SACK, and we're done. */
4170        sp->start_seq = seq;
4171        sp->end_seq = end_seq;
4172        tp->rx_opt.num_sacks++;
4173}
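
/* A hedged walk-through with invented blocks: if selective_acks[] holds
 * { [700, 800), [100, 200) } and an out-of-order skb covering [800, 900)
 * arrives, the loop above extends the head block to [700, 900) and the
 * coalesce pass leaves [100, 200) alone.  If instead a brand-new range
 * arrives while all TCP_NUM_SACKS (4) slots are in use, the oldest (last)
 * block is dropped and the new range becomes the head block, so the most
 * recent SACK information is always reported first, as RFC 2018 suggests.
 */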
4174
4175/* RCV.NXT advances, some SACKs should be eaten. */
4176
4177static void tcp_sack_remove(struct tcp_sock *tp)
4178{
4179        struct tcp_sack_block *sp = &tp->selective_acks[0];
4180        int num_sacks = tp->rx_opt.num_sacks;
4181        int this_sack;
4182
4183        /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
4184        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4185                tp->rx_opt.num_sacks = 0;
4186                return;
4187        }
4188
4189        for (this_sack = 0; this_sack < num_sacks;) {
4190                /* Check if the start of the sack is covered by RCV.NXT. */
4191                if (!before(tp->rcv_nxt, sp->start_seq)) {
4192                        int i;
4193
4194                        /* RCV.NXT must cover the whole block! */
4195                        WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4196
4197                        /* Zap this SACK by moving any later SACKs forward. */
4198                        for (i = this_sack + 1; i < num_sacks; i++)
4199                                tp->selective_acks[i - 1] = tp->selective_acks[i];
4200                        num_sacks--;
4201                        continue;
4202                }
4203                this_sack++;
4204                sp++;
4205        }
4206        tp->rx_opt.num_sacks = num_sacks;
4207}
4208
4209/**
4210 * tcp_try_coalesce - try to merge an skb with the prior one
4211 * @sk: socket
4212 * @to: prior buffer
4213 * @from: buffer to add to the queue
4214 * @fragstolen: pointer to boolean
4215 *
4216 * Before queueing skb @from after @to, try to merge them
4217 * to reduce overall memory use and queue lengths, if the cost is small.
4218 * Packets in the ofo or receive queues can stay there a long time,
4219 * so it is better to coalesce them right away to avoid future collapses.
4220 * Returns true if the caller should free @from instead of queueing it.
4221 */
4222static bool tcp_try_coalesce(struct sock *sk,
4223                             struct sk_buff *to,
4224                             struct sk_buff *from,
4225                             bool *fragstolen)
4226{
4227        int delta;
4228
4229        *fragstolen = false;
4230
4231        /* It's possible this segment overlaps with a prior segment in the queue */
4232        if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4233                return false;
4234
4235        if (!skb_try_coalesce(to, from, fragstolen, &delta))
4236                return false;
4237
4238        atomic_add(delta, &sk->sk_rmem_alloc);
4239        sk_mem_charge(sk, delta);
4240        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4241        TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4242        TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4243        TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4244        return true;
4245}
4246
4247static bool tcp_ooo_try_coalesce(struct sock *sk,
4248                             struct sk_buff *to,
4249                             struct sk_buff *from,
4250                             bool *fragstolen)
4251{
4252        bool res = tcp_try_coalesce(sk, to, from, fragstolen);
4253
4254        /* In case tcp_drop() is called later, update to->gso_segs */
4255        if (res) {
4256                u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
4257                               max_t(u16, 1, skb_shinfo(from)->gso_segs);
4258
4259                skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
4260        }
4261        return res;
4262}
4263
4264static void tcp_drop(struct sock *sk, struct sk_buff *skb)
4265{
4266        sk_drops_add(sk, skb);
4267        __kfree_skb(skb);
4268}
4269
4270/* This one checks to see if we can put data from the
4271 * out_of_order queue into the receive_queue.
4272 */
4273static void tcp_ofo_queue(struct sock *sk)
4274{
4275        struct tcp_sock *tp = tcp_sk(sk);
4276        __u32 dsack_high = tp->rcv_nxt;
4277        bool fin, fragstolen, eaten;
4278        struct sk_buff *skb, *tail;
4279        struct rb_node *p;
4280
4281        p = rb_first(&tp->out_of_order_queue);
4282        while (p) {
4283                skb = rb_to_skb(p);
4284                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4285                        break;
4286
4287                if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4288                        __u32 dsack = dsack_high;
4289                        if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4290                                dsack_high = TCP_SKB_CB(skb)->end_seq;
4291                        tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4292                }
4293                p = rb_next(p);
4294                rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4295
4296                if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4297                        SOCK_DEBUG(sk, "ofo packet was already received\n");
4298                        tcp_drop(sk, skb);
4299                        continue;
4300                }
4301                SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4302                           tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4303                           TCP_SKB_CB(skb)->end_seq);
4304
4305                tail = skb_peek_tail(&sk->sk_receive_queue);
4306                eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4307                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4308                fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4309                if (!eaten)
4310                        __skb_queue_tail(&sk->sk_receive_queue, skb);
4311                else
4312                        kfree_skb_partial(skb, fragstolen);
4313
4314                if (unlikely(fin)) {
4315                        tcp_fin(sk);
4316                        /* tcp_fin() purges tp->out_of_order_queue,
4317                         * so we must end this loop right now.
4318                         */
4319                        break;
4320                }
4321        }
4322}
4323
4324static bool tcp_prune_ofo_queue(struct sock *sk);
4325static int tcp_prune_queue(struct sock *sk);
4326
4327static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4328                                 unsigned int size)
4329{
4330        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4331            !sk_rmem_schedule(sk, skb, size)) {
4332
4333                if (tcp_prune_queue(sk) < 0)
4334                        return -1;
4335
4336                while (!sk_rmem_schedule(sk, skb, size)) {
4337                        if (!tcp_prune_ofo_queue(sk))
4338                                return -1;
4339                }
4340        }
4341        return 0;
4342}
4343
4344static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4345{
4346        struct tcp_sock *tp = tcp_sk(sk);
4347        struct rb_node **p, *parent;
4348        struct sk_buff *skb1;
4349        u32 seq, end_seq;
4350        bool fragstolen;
4351
4352        TCP_ECN_check_ce(tp, skb);
4353
4354        if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4355                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4356                tcp_drop(sk, skb);
4357                return;
4358        }
4359
4360        /* Disable header prediction. */
4361        tp->pred_flags = 0;
4362        inet_csk_schedule_ack(sk);
4363
4364        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4365        seq = TCP_SKB_CB(skb)->seq;
4366        end_seq = TCP_SKB_CB(skb)->end_seq;
4367        SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4368                   tp->rcv_nxt, seq, end_seq);
4369
4370        p = &tp->out_of_order_queue.rb_node;
4371        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4372                /* Initial out of order segment, build 1 SACK. */
4373                if (tcp_is_sack(tp)) {
4374                        tp->rx_opt.num_sacks = 1;
4375                        tp->selective_acks[0].start_seq = seq;
4376                        tp->selective_acks[0].end_seq = end_seq;
4377                }
4378                rb_link_node(&skb->rbnode, NULL, p);
4379                rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4380                tp->ooo_last_skb = skb;
4381                goto end;
4382        }
4383
4384        /* In the typical case, we are adding an skb to the end of the list.
4385         * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4386         */
4387        if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
4388coalesce_done:
4389                tcp_grow_window(sk, skb);
4390                kfree_skb_partial(skb, fragstolen);
4391                skb = NULL;
4392                goto add_sack;
4393        }
4394
4395        /* Find place to insert this segment. Handle overlaps on the way. */
4396        parent = NULL;
4397        while (*p) {
4398                parent = *p;
4399                skb1 = rb_to_skb(parent);
4400                if (before(seq, TCP_SKB_CB(skb1)->seq)) {
4401                        p = &parent->rb_left;
4402                        continue;
4403                }
4404                if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4405                        if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4406                                /* All the bits are present. Drop. */
4407                                NET_INC_STATS_BH(sock_net(sk),
4408                                              LINUX_MIB_TCPOFOMERGE);
4409                                tcp_drop(sk, skb);
4410                                skb = NULL;
4411                                tcp_dsack_set(sk, seq, end_seq);
4412                                goto add_sack;
4413                        }
4414                        if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4415                                /* Partial overlap. */
4416                                tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
4417                        } else {
4418                                /* skb's seq == skb1's seq and skb covers skb1.
4419                                 * Replace skb1 with skb.
4420                                 */
4421                                rb_replace_node(&skb1->rbnode, &skb->rbnode,
4422                                                &tp->out_of_order_queue);
4423                                tcp_dsack_extend(sk,
4424                                                 TCP_SKB_CB(skb1)->seq,
4425                                                 TCP_SKB_CB(skb1)->end_seq);
4426                                NET_INC_STATS_BH(sock_net(sk),
4427                                              LINUX_MIB_TCPOFOMERGE);
4428                                tcp_drop(sk, skb1);
4429                                goto merge_right;
4430                        }
4431                } else if (tcp_ooo_try_coalesce(sk, skb1, skb, &fragstolen)) {
4432                        goto coalesce_done;
4433                }
4434                p = &parent->rb_right;
4435        }
4436
4437        /* Insert segment into RB tree. */
4438        rb_link_node(&skb->rbnode, parent, p);
4439        rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4440
4441merge_right:
4442        /* Remove other segments covered by skb. */
4443        while ((skb1 = skb_rb_next(skb)) != NULL) {
4444                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4445                        break;
4446                if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4447                        tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4448                                         end_seq);
4449                        break;
4450                }
4451                rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
4452                tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4453                                 TCP_SKB_CB(skb1)->end_seq);
4454                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4455                tcp_drop(sk, skb1);
4456        }
4457        /* If there is no skb after us, we are the last_skb ! */
4458        if (!skb1)
4459                tp->ooo_last_skb = skb;
4460
4461add_sack:
4462        if (tcp_is_sack(tp))
4463                tcp_sack_new_ofo_skb(sk, seq, end_seq);
4464end:
4465        if (skb) {
4466                tcp_grow_window(sk, skb);
4467                skb_set_owner_r(skb, sk);
4468        }
4469}
4470
4471static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4472                  bool *fragstolen)
4473{
4474        int eaten;
4475        struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4476
4477        __skb_pull(skb, hdrlen);
4478        eaten = (tail &&
4479                 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4480        tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4481        if (!eaten) {
4482                __skb_queue_tail(&sk->sk_receive_queue, skb);
4483                skb_set_owner_r(skb, sk);
4484        }
4485        return eaten;
4486}
4487
4488int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4489{
4490        struct sk_buff *skb;
4491        bool fragstolen;
4492
4493        if (size == 0)
4494                return 0;
4495
4496        skb = alloc_skb(size, sk->sk_allocation);
4497        if (!skb)
4498                goto err;
4499
4500        if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4501                goto err_free;
4502
4503        if (memcpy_from_msg(skb_put(skb, size), msg, size))
4504                goto err_free;
4505
4506        TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4507        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4508        TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4509
4510        if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4511                WARN_ON_ONCE(fragstolen); /* should not happen */
4512                __kfree_skb(skb);
4513        }
4514        return size;
4515
4516err_free:
4517        kfree_skb(skb);
4518err:
4519        return -ENOMEM;
4520}
4521
4522static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4523{
4524        struct tcp_sock *tp = tcp_sk(sk);
4525        bool fragstolen = false;
4526        int eaten = -1;
4527
4528        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
4529                __kfree_skb(skb);
4530                return;
4531        }
4532        skb_dst_drop(skb);
4533        __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4534
4535        TCP_ECN_accept_cwr(tp, skb);
4536
4537        tp->rx_opt.dsack = 0;
4538
4539        /*  Queue data for delivery to the user.
4540         *  Packets in sequence go to the receive queue.
4541         *  Out of sequence packets to the out_of_order_queue.
4542         */
4543        if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4544                if (tcp_receive_window(tp) == 0)
4545                        goto out_of_window;
4546
4547                /* Ok. In sequence. In window. */
4548                if (tp->ucopy.task == current &&
4549                    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
4550                    sock_owned_by_user(sk) && !tp->urg_data) {
4551                        int chunk = min_t(unsigned int, skb->len,
4552                                          tp->ucopy.len);
4553
4554                        __set_current_state(TASK_RUNNING);
4555
4556                        local_bh_enable();
4557                        if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
4558                                tp->ucopy.len -= chunk;
4559                                tp->copied_seq += chunk;
4560                                eaten = (chunk == skb->len);
4561                                tcp_rcv_space_adjust(sk);
4562                        }
4563                        local_bh_disable();
4564                }
4565
4566                if (eaten <= 0) {
4567queue_and_out:
4568                        if (eaten < 0) {
4569                                if (skb_queue_len(&sk->sk_receive_queue) == 0)
4570                                        sk_forced_mem_schedule(sk, skb->truesize);
4571                                else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4572                                        goto drop;
4573                        }
4574                        eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4575                }
4576                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4577                if (skb->len)
4578                        tcp_event_data_recv(sk, skb);
4579                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4580                        tcp_fin(sk);
4581
4582                if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4583                        tcp_ofo_queue(sk);
4584
4585                        /* RFC2581. 4.2. SHOULD send immediate ACK, when
4586                         * gap in queue is filled.
4587                         */
4588                        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4589                                inet_csk(sk)->icsk_ack.pingpong = 0;
4590                }
4591
4592                if (tp->rx_opt.num_sacks)
4593                        tcp_sack_remove(tp);
4594
4595                tcp_fast_path_check(sk);
4596
4597                if (eaten > 0)
4598                        kfree_skb_partial(skb, fragstolen);
4599                if (!sock_flag(sk, SOCK_DEAD))
4600                        sk->sk_data_ready(sk, 0);
4601                return;
4602        }
4603
4604        if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4605                /* A retransmit, 2nd most common case.  Force an immediate ack. */
4606                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4607                tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4608
4609out_of_window:
4610                tcp_enter_quickack_mode(sk);
4611                inet_csk_schedule_ack(sk);
4612drop:
4613                tcp_drop(sk, skb);
4614                return;
4615        }
4616
4617        /* Out of window. F.e. zero window probe. */
4618        if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
4619                goto out_of_window;
4620
4621        tcp_enter_quickack_mode(sk);
4622
4623        if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4624                /* Partial packet: seq < rcv_nxt < end_seq */
4625                SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4626                           tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4627                           TCP_SKB_CB(skb)->end_seq);
4628
4629                tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
4630
4631                /* If the window is closed, drop the tail of the packet, but
4632                 * only after the D-SACK for its head was recorded just above.
4633                 */
4634                if (!tcp_receive_window(tp))
4635                        goto out_of_window;
4636                goto queue_and_out;
4637        }
4638
4639        tcp_data_queue_ofo(sk, skb);
4640}
4641
4642static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
4643{
4644        if (list)
4645                return !skb_queue_is_last(list, skb) ? skb->next : NULL;
4646
4647        return skb_rb_next(skb);
4648}
4649
4650static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4651                                        struct sk_buff_head *list,
4652                                        struct rb_root *root)
4653{
4654        struct sk_buff *next = tcp_skb_next(skb, list);
4655
4656        if (list)
4657                __skb_unlink(skb, list);
4658        else
4659                rb_erase(&skb->rbnode, root);
4660
4661        __kfree_skb(skb);
4662        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4663
4664        return next;
4665}
4666
4667/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
4668static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4669{
4670        struct rb_node **p = &root->rb_node;
4671        struct rb_node *parent = NULL;
4672        struct sk_buff *skb1;
4673
4674        while (*p) {
4675                parent = *p;
4676                skb1 = rb_to_skb(parent);
4677                if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
4678                        p = &parent->rb_left;
4679                else
4680                        p = &parent->rb_right;
4681        }
4682        rb_link_node(&skb->rbnode, parent, p);
4683        rb_insert_color(&skb->rbnode, root);
4684}
4685
4686/* Collapse contiguous sequence of skbs head..tail with
4687 * sequence numbers start..end.
4688 *
4689 * If tail is NULL, collapse until the end of the queue.
4690 *
4691 * Segments with FIN/SYN are not collapsed (only because this
4692 * simplifies the code).
4693 */
4694static void
4695tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
4696             struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
4697{
4698        struct sk_buff *skb = head, *n;
4699        struct sk_buff_head tmp;
4700        bool end_of_skbs;
4701
4702        /* First, check that queue is collapsible and find
4703         * the point where collapsing can be useful.
4704         */
4705restart:
4706        for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
4707                n = tcp_skb_next(skb, list);
4708
4709                /* No new bits? It is possible on ofo queue. */
4710                if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4711                        skb = tcp_collapse_one(sk, skb, list, root);
4712                        if (!skb)
4713                                break;
4714                        goto restart;
4715                }
4716
4717                /* The first skb to collapse is:
4718                 * - not SYN/FIN and
4719                 * - bloated or contains data before "start" or
4720                 *   overlaps to the next one.
4721                 */
4722                if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4723                    (tcp_win_from_space(skb->truesize) > skb->len ||
4724                     before(TCP_SKB_CB(skb)->seq, start))) {
4725                        end_of_skbs = false;
4726                        break;
4727                }
4728
4729                if (n && n != tail &&
4730                    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
4731                        end_of_skbs = false;
4732                        break;
4733                }
4734
4735                /* Decided to skip this, advance start seq. */
4736                start = TCP_SKB_CB(skb)->end_seq;
4737        }
4738        if (end_of_skbs ||
4739            (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4740                return;
4741
4742        __skb_queue_head_init(&tmp);
4743
4744        while (before(start, end)) {
4745                int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4746                struct sk_buff *nskb;
4747
4748                nskb = alloc_skb(copy, GFP_ATOMIC);
4749                if (!nskb)
4750                        break;
4751
4752                memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4753                TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4754                if (list)
4755                        __skb_queue_before(list, skb, nskb);
4756                else
4757                        __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
4758                skb_set_owner_r(nskb, sk);
4759
4760                /* Copy data, releasing collapsed skbs. */
4761                while (copy > 0) {
4762                        int offset = start - TCP_SKB_CB(skb)->seq;
4763                        int size = TCP_SKB_CB(skb)->end_seq - start;
4764
4765                        BUG_ON(offset < 0);
4766                        if (size > 0) {
4767                                size = min(copy, size);
4768                                if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
4769                                        BUG();
4770                                TCP_SKB_CB(nskb)->end_seq += size;
4771                                copy -= size;
4772                                start += size;
4773                        }
4774                        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4775                                skb = tcp_collapse_one(sk, skb, list, root);
4776                                if (!skb ||
4777                                    skb == tail ||
4778                                    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4779                                        goto end;
4780                        }
4781                }
4782        }
4783end:
4784        skb_queue_walk_safe(&tmp, skb, n)
4785                tcp_rbtree_insert(root, skb);
4786}
4787
4788/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
4789 * and tcp_collapse() them until the whole queue is collapsed.
4790 */
4791static void tcp_collapse_ofo_queue(struct sock *sk)
4792{
4793        struct tcp_sock *tp = tcp_sk(sk);
4794        u32 range_truesize, sum_tiny = 0;
4795        struct sk_buff *skb, *head;
4796        u32 start, end;
4797
4798        skb = skb_rb_first(&tp->out_of_order_queue);
4799new_range:
4800        if (!skb) {
4801                tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
4802                return;
4803        }
4804        start = TCP_SKB_CB(skb)->seq;
4805        end = TCP_SKB_CB(skb)->end_seq;
4806        range_truesize = skb->truesize;
4807
4808        for (head = skb;;) {
4809                skb = skb_rb_next(skb);
4810
4811                /* Range is terminated when we see a gap or when
4812                 * we are at the queue end.
4813                 */
4814                if (!skb ||
4815                    after(TCP_SKB_CB(skb)->seq, end) ||
4816                    before(TCP_SKB_CB(skb)->end_seq, start)) {
4817                        /* Do not attempt collapsing tiny skbs */
4818                        if (range_truesize != head->truesize ||
4819                            end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
4820                                tcp_collapse(sk, NULL, &tp->out_of_order_queue,
4821                                             head, skb, start, end);
4822                        } else {
4823                                sum_tiny += range_truesize;
4824                                if (sum_tiny > sk->sk_rcvbuf >> 3)
4825                                        return;
4826                        }
4827                        goto new_range;
4828                }
4829
4830                range_truesize += skb->truesize;
4831                if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
4832                        start = TCP_SKB_CB(skb)->seq;
4833                if (after(TCP_SKB_CB(skb)->end_seq, end))
4834                        end = TCP_SKB_CB(skb)->end_seq;
4835        }
4836}
4837
4838/*
4839 * Clean the out-of-order queue to make room.
4840 * We drop packets with high sequence numbers in order to:
4841 * 1) leave a chance for the holes to be filled;
4842 * 2) not add too much latency if thousands of packets sit there
4843 *    (but if the application shrinks SO_RCVBUF, we could still end up
4844 *     freeing the whole queue here);
4845 * 3) drop at least 12.5 % of sk_rcvbuf to defeat malicious attacks.
4846 *
4847 * Return true if the queue has shrunk.
4848 */
4849static bool tcp_prune_ofo_queue(struct sock *sk)
4850{
4851        struct tcp_sock *tp = tcp_sk(sk);
4852        struct rb_node *node, *prev;
4853        int goal;
4854
4855        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4856                return false;
4857
4858        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4859        goal = sk->sk_rcvbuf >> 3;
4860        node = &tp->ooo_last_skb->rbnode;
4861        do {
4862                prev = rb_prev(node);
4863                rb_erase(node, &tp->out_of_order_queue);
4864                goal -= rb_to_skb(node)->truesize;
4865                tcp_drop(sk, rb_to_skb(node));
4866                if (!prev || goal <= 0) {
4867                        sk_mem_reclaim(sk);
4868                        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4869                            !tcp_under_memory_pressure(sk))
4870                                break;
4871                        goal = sk->sk_rcvbuf >> 3;
4872                }
4873                node = prev;
4874        } while (node);
4875        tp->ooo_last_skb = rb_to_skb(prev);
4876
4877        /* Reset SACK state.  A conforming SACK implementation will
4878         * do the same at a timeout based retransmit.  When a connection
4879         * is in a sad state like this, we care only about integrity
4880         * of the connection not performance.
4881         */
4882        if (tp->rx_opt.sack_ok)
4883                tcp_sack_reset(&tp->rx_opt);
4884        return true;
4885}
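
/* Hedged arithmetic for the pruning goal above (the sk_rcvbuf value is
 * invented for illustration): with sk_rcvbuf == 131072, goal starts at
 * 131072 >> 3 == 16384 bytes of truesize, which is the "at least 12.5 %"
 * promised in the comment.  Skbs are erased right-to-left starting from
 * ooo_last_skb until that much truesize is gone or the tree is empty;
 * memory is then reclaimed, and the loop only continues (with a fresh
 * 12.5 % goal, and while skbs remain) if sk_rmem_alloc is still above
 * sk_rcvbuf or the socket remains under memory pressure.
 */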
4886
4887/* Reduce allocated memory if we can, trying to get
4888 * the socket within its memory limits again.
4889 *
4890 * Return less than zero if we should start dropping frames
4891 * until the socket owning process reads some of the data
4892 * to stabilize the situation.
4893 */
4894static int tcp_prune_queue(struct sock *sk)
4895{
4896        struct tcp_sock *tp = tcp_sk(sk);
4897
4898        SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4899
4900        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
4901
4902        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4903                tcp_clamp_window(sk);
4904        else if (tcp_under_memory_pressure(sk))
4905                tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4906
4907        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4908                return 0;
4909
4910        tcp_collapse_ofo_queue(sk);
4911        if (!skb_queue_empty(&sk->sk_receive_queue))
4912                tcp_collapse(sk, &sk->sk_receive_queue, NULL,
4913                             skb_peek(&sk->sk_receive_queue),
4914                             NULL,
4915                             tp->copied_seq, tp->rcv_nxt);
4916        sk_mem_reclaim(sk);
4917
4918        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4919                return 0;
4920
4921        /* Collapsing did not help; destructive actions follow.
4922         * This must never occur. */
4923
4924        tcp_prune_ofo_queue(sk);
4925
4926        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4927                return 0;
4928
4929        /* If we are really being abused, tell the caller to silently
4930         * drop receive data on the floor.  It will get retransmitted
4931         * and hopefully then we'll have sufficient space.
4932         */
4933        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
4934
4935        /* Massive buffer overcommit. */
4936        tp->pred_flags = 0;
4937        return -1;
4938}
4939
4940static bool tcp_should_expand_sndbuf(const struct sock *sk)
4941{
4942        const struct tcp_sock *tp = tcp_sk(sk);
4943
4944        /* If the user specified a specific send buffer setting, do
4945         * not modify it.
4946         */
4947        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4948                return false;
4949
4950        /* If we are under global TCP memory pressure, do not expand.  */
4951        if (tcp_under_memory_pressure(sk))
4952                return false;
4953
4954        /* If we are under soft global TCP memory pressure, do not expand.  */
4955        if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
4956                return false;
4957
4958        /* If we filled the congestion window, do not expand.  */
4959        if (tp->packets_out >= tp->snd_cwnd)
4960                return false;
4961
4962        return true;
4963}
4964
4965/* When an incoming ACK allowed us to free some skb from the write_queue,
4966 * we remember this event in the SOCK_QUEUE_SHRUNK flag and wake up the
4967 * socket on exit from the TCP input handler.
4968 *
4969 * PROBLEM: sndbuf expansion does not work well with largesend.
4970 */
4971static void tcp_new_space(struct sock *sk)
4972{
4973        struct tcp_sock *tp = tcp_sk(sk);
4974
4975        if (tcp_should_expand_sndbuf(sk)) {
4976                tcp_sndbuf_expand(sk);
4977                tp->snd_cwnd_stamp = tcp_time_stamp;
4978        }
4979
4980        sk->sk_write_space(sk);
4981}
4982
4983static void tcp_check_space(struct sock *sk)
4984{
4985        if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4986                sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4987                if (sk->sk_socket &&
4988                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
4989                        tcp_new_space(sk);
4990        }
4991}
4992
4993static inline void tcp_data_snd_check(struct sock *sk)
4994{
4995        tcp_push_pending_frames(sk);
4996        tcp_check_space(sk);
4997}
4998
4999/*
5000 * Check if sending an ack is needed.
5001 */
5002static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5003{
5004        struct tcp_sock *tp = tcp_sk(sk);
5005
5006            /* More than one full frame received... */
5007        if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5008             /* ... and right edge of window advances far enough.
5009              * (tcp_recvmsg() will send ACK otherwise). Or...
5010              */
5011             __tcp_select_window(sk) >= tp->rcv_wnd) ||
5012            /* We ACK each frame or... */
5013            tcp_in_quickack_mode(sk) ||
5014            /* We have out of order data. */
5015            (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
5016                /* Then ack it now */
5017                tcp_send_ack(sk);
5018        } else {
5019                /* Else, send delayed ack. */
5020                tcp_send_delayed_ack(sk);
5021        }
5022}
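
/* A hedged example of the "ack it now" branch (numbers invented): with
 * icsk_ack.rcv_mss == 1460, rcv_wup == 10000 and rcv_nxt == 13000, there
 * are 3000 unacknowledged bytes, i.e. more than one full frame; if
 * __tcp_select_window() also returns at least the currently advertised
 * rcv_wnd, the ACK is sent immediately instead of being delayed, which is
 * what keeps a bulk receiver ACKing roughly every other full-sized segment.
 */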
5023
5024static inline void tcp_ack_snd_check(struct sock *sk)
5025{
5026        if (!inet_csk_ack_scheduled(sk)) {
5027                /* We sent a data segment already. */
5028                return;
5029        }
5030        __tcp_ack_snd_check(sk, 1);
5031}
5032
5033/*
5034 *      This routine is only called when we have urgent data
5035 *      signaled. It's the 'slow' part of tcp_urg. It could be
5036 *      moved inline now, as tcp_urg is only called from one
5037 *      place. We handle URGent data wrong; we have to, as
5038 *      BSD still doesn't use the correction from RFC 961.
5039 *      For 1003.1g we should support a new option TCP_STDURG to permit
5040 *      either form (or just set the sysctl tcp_stdurg).
5041 */
5042
5043static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5044{
5045        struct tcp_sock *tp = tcp_sk(sk);
5046        u32 ptr = ntohs(th->urg_ptr);
5047
5048        if (ptr && !sysctl_tcp_stdurg)
5049                ptr--;
5050        ptr += ntohl(th->seq);
5051
5052        /* Ignore urgent data that we've already seen and read. */
5053        if (after(tp->copied_seq, ptr))
5054                return;
5055
5056        /* Do not replay urg ptr.
5057         *
5058         * NOTE: an interesting situation not covered by the specs.
5059         * A misbehaving sender may send an urg ptr pointing to a segment
5060         * which we already have in the ofo queue. We are not able to fetch
5061         * such data and will stay in TCP_URG_NOTYET until it is eaten
5062         * by recvmsg(). It seems we are not obliged to handle such wicked
5063         * situations, but it is worth thinking about the possibility of
5064         * DoS attacks using some hypothetical application-level deadlock.
5065         */
5066        if (before(ptr, tp->rcv_nxt))
5067                return;
5068
5069        /* Do we already have a newer (or duplicate) urgent pointer? */
5070        if (tp->urg_data && !after(ptr, tp->urg_seq))
5071                return;
5072
5073        /* Tell the world about our new urgent pointer. */
5074        sk_send_sigurg(sk);
5075
5076        /* We may be adding urgent data when the last byte read was
5077         * urgent. To do this requires some care. We cannot just ignore
5078         * tp->copied_seq since we would read the last urgent byte again
5079         * as data, nor can we alter copied_seq until this data arrives
5080         * or we break the semantics of SIOCATMARK (and thus sockatmark())
5081         *
5082         * NOTE. Double Dutch. Rendered in plain English: the author of the
5083         * comment above did something like  send("A", MSG_OOB); send("B", MSG_OOB);
5084         * and expected both A and B to disappear from the stream. That is _wrong_.
5085         * Though this happens in BSD with high probability, it is only occasional.
5086         * Any application relying on it is buggy. Note also that the fix "works"
5087         * only in this artificial test: insert some normal data between A and B
5088         * and we diverge from BSD again. Verdict: it is better to remove it,
5089         * so as to trap buggy users.
5090         */
5091        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5092            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5093                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
5094                tp->copied_seq++;
5095                if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5096                        __skb_unlink(skb, &sk->sk_receive_queue);
5097                        __kfree_skb(skb);
5098                }
5099        }
5100
5101        tp->urg_data = TCP_URG_NOTYET;
5102        tp->urg_seq = ptr;
5103
5104        /* Disable header prediction. */
5105        tp->pred_flags = 0;
5106}
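
/* Hedged pointer arithmetic with made-up numbers: a segment with
 * seq == 1000 and th->urg_ptr == 5 announces urgent data.  With the default
 * sysctl_tcp_stdurg == 0 the BSD-compatible interpretation applies, the
 * pointer is decremented first and ptr becomes 1000 + 4 == 1004; with
 * tcp_stdurg set, the value is used as-is and ptr would be 1005.  urg_seq
 * is then set to that value, urg_data becomes TCP_URG_NOTYET, and tcp_urg()
 * later fetches the byte at that sequence number as out-of-band data.
 */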
5107
5108/* This is the 'fast' part of urgent handling. */
5109static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
5110{
5111        struct tcp_sock *tp = tcp_sk(sk);
5112
5113        /* Check if we get a new urgent pointer - normally not. */
5114        if (th->urg)
5115                tcp_check_urg(sk, th);
5116
5117        /* Do we wait for any urgent data? - normally not... */
5118        if (tp->urg_data == TCP_URG_NOTYET) {
5119                u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
5120                          th->syn;
5121
5122                /* Is the urgent pointer pointing into this packet? */
5123                if (ptr < skb->len) {
5124                        u8 tmp;
5125                        if (skb_copy_bits(skb, ptr, &tmp, 1))
5126                                BUG();
5127                        tp->urg_data = TCP_URG_VALID | tmp;
5128                        if (!sock_flag(sk, SOCK_DEAD))
5129                                sk->sk_data_ready(sk, 0);
5130                }
5131        }
5132}
5133
5134static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
5135{
5136        struct tcp_sock *tp = tcp_sk(sk);
5137        int chunk = skb->len - hlen;
5138        int err;
5139
5140        local_bh_enable();
5141        if (skb_csum_unnecessary(skb))
5142                err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
5143        else
5144                err = skb_copy_and_csum_datagram_iovec(skb, hlen,
5145                                                       tp->ucopy.iov, chunk);
5146
5147        if (!err) {
5148                tp->ucopy.len -= chunk;
5149                tp->copied_seq += chunk;
5150                tcp_rcv_space_adjust(sk);
5151        }
5152
5153        local_bh_disable();
5154        return err;
5155}
5156
5157static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5158                                            struct sk_buff *skb)
5159{
5160        __sum16 result;
5161
5162        if (sock_owned_by_user(sk)) {
5163                local_bh_enable();
5164                result = __tcp_checksum_complete(skb);
5165                local_bh_disable();
5166        } else {
5167                result = __tcp_checksum_complete(skb);
5168        }
5169        return result;
5170}
5171
5172static inline bool tcp_checksum_complete_user(struct sock *sk,
5173                                             struct sk_buff *skb)
5174{
5175        return !skb_csum_unnecessary(skb) &&
5176               __tcp_checksum_complete_user(sk, skb);
5177}
5178
5179/* Does PAWS and seqno based validation of an incoming segment; the flags will
5180 * play a significant role here.
5181 */
5182static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5183                                  const struct tcphdr *th, int syn_inerr)
5184{
5185        struct tcp_sock *tp = tcp_sk(sk);
5186
5187        /* RFC1323: H1. Apply PAWS check first. */
5188        if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
5189            tcp_paws_discard(sk, skb)) {
5190                if (!th->rst) {
5191                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5192                        if (!tcp_oow_rate_limited(sock_net(sk), skb,
5193                                                  LINUX_MIB_TCPACKSKIPPEDPAWS,
5194                                                  &tp->last_oow_ack_time))
5195                                tcp_send_dupack(sk, skb);
5196                        goto discard;
5197                }
5198                /* Reset is accepted even if it did not pass PAWS. */
5199        }
5200
5201        /* Step 1: check sequence number */
5202        if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5203                /* RFC793, page 37: "In all states except SYN-SENT, all reset
5204                 * (RST) segments are validated by checking their SEQ-fields."
5205                 * And page 69: "If an incoming segment is not acceptable,
5206                 * an acknowledgment should be sent in reply (unless the RST
5207                 * bit is set, if so drop the segment and return)".
5208                 */
5209                if (!th->rst) {
5210                        if (th->syn)
5211                                goto syn_challenge;
5212                        if (!tcp_oow_rate_limited(sock_net(sk), skb,
5213                                                  LINUX_MIB_TCPACKSKIPPEDSEQ,
5214                                                  &tp->last_oow_ack_time))
5215                                tcp_send_dupack(sk, skb);
5216                }
5217                goto discard;
5218        }
5219
5220        /* Step 2: check RST bit */
5221        if (th->rst) {
5222                /* RFC 5961 3.2 :
5223                 * If sequence number exactly matches RCV.NXT, then
5224                 *     RESET the connection
5225                 * else
5226                 *     Send a challenge ACK
5227                 */
5228                if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5229                        tcp_reset(sk);
5230                else
5231                        tcp_send_challenge_ack(sk, skb);
5232                goto discard;
5233        }
5234
5235        /* step 3: check security and precedence [ignored] */
5236
5237        /* step 4: Check for a SYN
5238         * RFC 5961 4.2: Send a challenge ACK
5239         */
5240        if (th->syn) {
5241syn_challenge:
5242                if (syn_inerr)
5243                        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5244                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5245                tcp_send_challenge_ack(sk, skb);
5246                goto discard;
5247        }
5248
5249        return true;
5250
5251discard:
5252        tcp_drop(sk, skb);
5253        return false;
5254}
5255
5256/*
5257 *      TCP receive function for the ESTABLISHED state.
5258 *
5259 *      It is split into a fast path and a slow path. The fast path is
5260 *      disabled when:
5261 *      - A zero window was announced from us - zero window probing
5262 *        is only handled properly in the slow path.
5263 *      - Out of order segments arrived.
5264 *      - Urgent data is expected.
5265 *      - There is no buffer space left
5266 *      - Unexpected TCP flags/window values/header lengths are received
5267 *        (detected by checking the TCP header against pred_flags)
5268 *      - Data is sent in both directions. Fast path only supports pure senders
5269 *        or pure receivers (this means either the sequence number or the ack
5270 *        value must stay constant)
5271 *      - Unexpected TCP option.
5272 *
5273 *      When these conditions are not satisfied it drops into a standard
5274 *      receive procedure patterned after RFC793 to handle all cases.
5275 *      The first three cases are guaranteed by proper pred_flags setting,
5276 *      the rest is checked inline. Fast processing is turned on in
5277 *      tcp_data_queue when everything is OK.
5278 */
5279void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5280                         const struct tcphdr *th, unsigned int len)
5281{
5282        struct tcp_sock *tp = tcp_sk(sk);
5283
5284        if (unlikely(sk->sk_rx_dst == NULL))
5285                inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5286        /*
5287         *      Header prediction.
5288         *      The code loosely follows the one in the famous
5289         *      "30 instruction TCP receive" Van Jacobson mail.
5290         *
5291         *      Van's trick is to deposit buffers into the socket queue
5292         *      from the device interrupt, then to call the tcp_recv function
5293         *      in receive-process context to checksum and copy
5294         *      the buffer to user space. Smart...
5295         *
5296         *      Our current scheme is not silly either, but we take the
5297         *      extra cost of the net_bh soft interrupt processing...
5298         *      We also do checksum-and-copy, but from device to kernel.
5299         */
5300
5301        tp->rx_opt.saw_tstamp = 0;
5302
5303        /*      pred_flags is 0xS?10 << 16 + snd_wnd
5304         *      if header prediction is to be made:
5305         *      'S' will always be tp->tcp_header_len >> 2,
5306         *      '?' will be 0 for the fast path; otherwise pred_flags is 0,
5307         *      to turn header prediction off (when there are holes in the
5308         *      receive space, for instance).
5309         *      The PSH flag is ignored.
5310         */
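
        /* A hedged example (assuming the common case of the timestamp option
         * being in use): tcp_header_len is then 20 + 12 == 32, so 'S' is
         * 32 >> 2 == 8 and, with only the ACK bit expected, pred_flags is
         * 0x8010 << 16 plus the expected (scaled) window.  Any header whose
         * doff/flags/window word differs (extra flags, another data offset,
         * a changed window) fails the comparison below and takes the slow
         * path instead.
         */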
5311
5312        if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5313            TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5314            !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5315                int tcp_header_len = tp->tcp_header_len;
5316
5317                /* Timestamp header prediction: tcp_header_len
5318                 * is automatically equal to th->doff*4 due to pred_flags
5319                 * match.
5320                 */
5321
5322                /* Check timestamp */
5323                if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
5324                        /* No? Slow path! */
5325                        if (!tcp_parse_aligned_timestamp(tp, th))
5326                                goto slow_path;
5327
5328                        /* If PAWS failed, check it more carefully in slow path */
5329                        if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
5330                                goto slow_path;
5331
5332                        /* DO NOT update ts_recent here: if the checksum
5333                         * fails and the timestamp was the corrupted part,
5334                         * we would end up with a hung connection, since
5335                         * all future packets would fail the PAWS test.
5336                         */
5337                }
5338
5339                if (len <= tcp_header_len) {
5340                        /* Bulk data transfer: sender */
5341                        if (len == tcp_header_len) {
5342                                /* Predicted packet is in window by definition.
5343                                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5344                                 * Hence, check seq<=rcv_wup reduces to:
5345                                 */
5346                                if (tcp_header_len ==
5347                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5348                                    tp->rcv_nxt == tp->rcv_wup)
5349                                        tcp_store_ts_recent(tp);
5350
5351                                /* We know that such packets are checksummed
5352                                 * on entry.
5353                                 */
5354                                tcp_ack(sk, skb, 0);
5355                                __kfree_skb(skb);
5356                                tcp_data_snd_check(sk);
5357                                return;
5358                        } else { /* Header too small */
5359                                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5360                                goto discard;
5361                        }
5362                } else {
5363                        int eaten = 0;
5364                        bool fragstolen = false;
5365
5366                        if (tp->ucopy.task == current &&
5367                            tp->copied_seq == tp->rcv_nxt &&
5368                            len - tcp_header_len <= tp->ucopy.len &&
5369                            sock_owned_by_user(sk)) {
5370                                __set_current_state(TASK_RUNNING);
5371
5372                                if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
5373                                        /* Predicted packet is in window by definition.
5374                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5375                                         * Hence, check seq<=rcv_wup reduces to:
5376                                         */
5377                                        if (tcp_header_len ==
5378                                            (sizeof(struct tcphdr) +
5379                                             TCPOLEN_TSTAMP_ALIGNED) &&
5380                                            tp->rcv_nxt == tp->rcv_wup)
5381                                                tcp_store_ts_recent(tp);
5382
5383                                        tcp_rcv_rtt_measure_ts(sk, skb);
5384
5385                                        __skb_pull(skb, tcp_header_len);
5386                                        tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
5387                                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
5388                                        eaten = 1;
5389                                }
5390                        }
5391                        if (!eaten) {
5392                                if (tcp_checksum_complete_user(sk, skb))
5393                                        goto csum_error;
5394
5395                                if ((int)skb->truesize > sk->sk_forward_alloc)
5396                                        goto step5;
5397
5398                                /* Predicted packet is in window by definition.
5399                                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5400                                 * Hence, check seq<=rcv_wup reduces to:
5401                                 */
5402                                if (tcp_header_len ==
5403                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5404                                    tp->rcv_nxt == tp->rcv_wup)
5405                                        tcp_store_ts_recent(tp);
5406
5407                                tcp_rcv_rtt_measure_ts(sk, skb);
5408
5409                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
5410
5411                                /* Bulk data transfer: receiver */
5412                                eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5413                                                      &fragstolen);
5414                        }
5415
5416                        tcp_event_data_recv(sk, skb);
5417
5418                        if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5419                                /* Well, only one small jumplet in fast path... */
5420                                tcp_ack(sk, skb, FLAG_DATA);
5421                                tcp_data_snd_check(sk);
5422                                if (!inet_csk_ack_scheduled(sk))
5423                                        goto no_ack;
5424                        }
5425
5426                        __tcp_ack_snd_check(sk, 0);
5427no_ack:
5428                        if (eaten)
5429                                kfree_skb_partial(skb, fragstolen);
5430                        sk->sk_data_ready(sk, 0);
5431                        return;
5432                }
5433        }
5434
5435slow_path:
5436        if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5437                goto csum_error;
5438
5439        if (!th->ack && !th->rst && !th->syn)
5440                goto discard;
5441
5442        /*
5443         *      Standard slow path.
5444         */
5445
5446        if (!tcp_validate_incoming(sk, skb, th, 1))
5447                return;
5448
5449step5:
5450        if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
5451                goto discard;
5452
5453        tcp_rcv_rtt_measure_ts(sk, skb);
5454
5455        /* Process urgent data. */
5456        tcp_urg(sk, skb, th);
5457
5458        /* step 7: process the segment text */
5459        tcp_data_queue(sk, skb);
5460
5461        tcp_data_snd_check(sk);
5462        tcp_ack_snd_check(sk);
5463        return;
5464
5465csum_error:
5466        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
5467        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5468
5469discard:
5470        tcp_drop(sk, skb);
5471}
5472EXPORT_SYMBOL(tcp_rcv_established);
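
/* Illustrative sketch (hedged, not used by the code above): the fast path in
 * tcp_rcv_established() hinges on the prediction word described in the
 * pred_flags comment.  The helper names below are hypothetical; they only
 * restate, in isolation, how such a word can be built and matched.  The real
 * code builds pred_flags in __tcp_fast_path_on() and compares it as shown in
 * tcp_rcv_established().
 */
static inline __be32 hp_expected_word(unsigned int header_len, u32 snd_wnd)
{
        /* Fourth 32-bit word of the TCP header, in network byte order:
         * doff(4 bits) | reserved(4) | flags(8) | window(16).
         * header_len is in bytes, so (header_len << 26) places
         * header_len >> 2 (the data offset) in the top four bits.
         * Only ACK is expected to be set; for a timestamped connection
         * header_len is sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED = 32.
         */
        return htonl((header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd);
}

static inline bool hp_word_matches(const struct tcphdr *th, __be32 pred_flags)
{
        /* Reserved bits and PSH are masked out by TCP_HP_BITS; everything
         * else (header length, ACK-and-only-ACK, unchanged window) must
         * match for the fast path to be taken.
         */
        return (tcp_flag_word(th) & TCP_HP_BITS) == pred_flags;
}

static inline bool hp_tsval_in_order(u32 rcv_tsval, u32 ts_recent)
{
        /* Timestamps wrap, so the fast path compares them with signed
         * 32-bit arithmetic, exactly like the PAWS check above.
         */
        return (s32)(rcv_tsval - ts_recent) >= 0;
}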
5473
5474void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5475{
5476        struct tcp_sock *tp = tcp_sk(sk);
5477        struct inet_connection_sock *icsk = inet_csk(sk);
5478
5479        tcp_set_state(sk, TCP_ESTABLISHED);
5480
5481        if (skb != NULL) {
5482                icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5483                security_inet_conn_established(sk, skb);
5484        }
5485
5486        /* Make sure socket is routed, for correct metrics.  */
5487        icsk->icsk_af_ops->rebuild_header(sk);
5488
5489        tcp_init_metrics(sk);
5490
5491        tcp_init_congestion_control(sk);
5492
5493        /* Prevent spurious tcp_cwnd_restart() on first data
5494         * packet.
5495         */
5496        tp->lsndtime = tcp_time_stamp;
5497
5498        tcp_init_buffer_space(sk);
5499
5500        if (sock_flag(sk, SOCK_KEEPOPEN))
5501                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5502
5503        if (!tp->rx_opt.snd_wscale)
5504                __tcp_fast_path_on(tp, tp->snd_wnd);
5505        else
5506                tp->pred_flags = 0;
5507
5508        if (!sock_flag(sk, SOCK_DEAD)) {
5509                sk->sk_state_change(sk);
5510                sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5511        }
5512}
5513
5514static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5515                                    struct tcp_fastopen_cookie *cookie)
5516{
5517        struct tcp_sock *tp = tcp_sk(sk);
5518        struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5519        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5520        bool syn_drop = false;
5521
5522        if (mss == tp->rx_opt.user_mss) {
5523                struct tcp_options_received opt;
5524
5525                /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5526                tcp_clear_options(&opt);
5527                opt.user_mss = opt.mss_clamp = 0;
5528                tcp_parse_options(synack, &opt, 0, NULL);
5529                mss = opt.mss_clamp;
5530        }
5531
5532        if (!tp->syn_fastopen) {
5533                /* Ignore an unsolicited cookie */
5534                cookie->len = -1;
5535        } else if (tp->total_retrans) {
5536                /* SYN timed out and the SYN-ACK neither has a cookie nor
5537                 * acknowledges data. Presumably the remote received only
5538                 * the retransmitted (regular) SYNs: either the original
5539                 * SYN-data or the corresponding SYN-ACK was dropped.
5540                 */
5541                syn_drop = (cookie->len < 0 && data);
5542        } else if (cookie->len < 0 && !tp->syn_data) {
5543                /* We requested a cookie but didn't get it. If we did not use
5544                 * the (old) exp opt format, then try it next time (try_exp=1).
5545                 * Otherwise we go back to using the RFC7413 opt (try_exp=2).
5546                 */
5547                try_exp = tp->syn_fastopen_exp ? 2 : 1;
5548        }
5549
5550        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5551
5552        if (data) { /* Retransmit unacked data in SYN */
5553                tcp_for_write_queue_from(data, sk) {
5554                        if (data == tcp_send_head(sk) ||
5555                            __tcp_retransmit_skb(sk, data))
5556                                break;
5557                }
5558                tcp_rearm_rto(sk);
5559                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
5560                return true;
5561        }
5562        tp->syn_data_acked = tp->syn_data;
5563        if (tp->syn_data_acked)
5564                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
5565        return false;
5566}
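
/* Illustrative sketch (hedged): the Fast Open cookie bookkeeping above,
 * condensed into one classification helper.  The enum and helper name are
 * hypothetical and exist only to make the branch structure of
 * tcp_rcv_fastopen_synack() easier to read; that function remains the
 * authoritative logic.
 */
enum tfo_synack_outcome {
        TFO_UNSOLICITED_COOKIE,  /* we never asked for a cookie: ignore it     */
        TFO_SYN_DATA_DROPPED,    /* SYN retransmitted, no cookie, data unacked */
        TFO_RETRY_OTHER_FORMAT,  /* cookie requested but missing: flip format  */
        TFO_NOTHING_TO_RECORD,
};

static inline enum tfo_synack_outcome
tfo_classify_synack(bool syn_fastopen, bool syn_retransmitted,
                    bool got_cookie, bool sent_syn_data, bool data_unacked)
{
        if (!syn_fastopen)
                return TFO_UNSOLICITED_COOKIE;
        if (syn_retransmitted)
                return (!got_cookie && data_unacked) ?
                       TFO_SYN_DATA_DROPPED : TFO_NOTHING_TO_RECORD;
        if (!got_cookie && !sent_syn_data)
                return TFO_RETRY_OTHER_FORMAT;
        return TFO_NOTHING_TO_RECORD;
}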
5567
5568static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5569                                         const struct tcphdr *th, unsigned int len)
5570{
5571        struct inet_connection_sock *icsk = inet_csk(sk);
5572        struct tcp_sock *tp = tcp_sk(sk);
5573        struct tcp_fastopen_cookie foc = { .len = -1 };
5574        int saved_clamp = tp->rx_opt.mss_clamp;
5575
5576        tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
5577        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5578                tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5579
5580        if (th->ack) {
5581                /* rfc793:
5582                 * "If the state is SYN-SENT then
5583                 *    first check the ACK bit
5584                 *      If the ACK bit is set
5585                 *        If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
5586                 *        a reset (unless the RST bit is set, if so drop
5587                 *        the segment and return)"
5588                 */
5589                if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5590                    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5591                        goto reset_and_undo;
5592
5593                if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
5594                    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
5595                             tcp_time_stamp)) {
5596                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
5597                        goto reset_and_undo;
5598                }
5599
5600                /* Now ACK is acceptable.
5601                 *
5602                 * "If the RST bit is set
5603                 *    If the ACK was acceptable then signal the user "error:
5604                 *    connection reset", drop the segment, enter CLOSED state,
5605                 *    delete TCB, and return."
5606                 */
5607
5608                if (th->rst) {
5609                        tcp_reset(sk);
5610                        goto discard;
5611                }
5612
5613                /* rfc793:
5614                 *   "fifth, if neither of the SYN or RST bits is set then
5615                 *    drop the segment and return."
5616                 *
5617                 *    See note below!
5618                 *                                        --ANK(990513)
5619                 */
5620                if (!th->syn)
5621                        goto discard_and_undo;
5622
5623                /* rfc793:
5624                 *   "If the SYN bit is on ...
5625                 *    are acceptable then ...
5626                 *    (our SYN has been ACKed), change the connection
5627                 *    state to ESTABLISHED..."
5628                 */
5629
5630                TCP_ECN_rcv_synack(tp, th);
5631
5632                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5633                tcp_ack(sk, skb, FLAG_SLOWPATH);
5634
5635                /* Ok.. it's good. Set up sequence numbers and
5636                 * move to established.
5637                 */
5638                tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5639                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5640
5641                /* RFC1323: The window in SYN & SYN/ACK segments is
5642                 * never scaled.
5643                 */
5644                tp->snd_wnd = ntohs(th->window);
5645
5646                if (!tp->rx_opt.wscale_ok) {
5647                        tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
5648                        tp->window_clamp = min(tp->window_clamp, 65535U);
5649                }
5650
5651                if (tp->rx_opt.saw_tstamp) {
5652                        tp->rx_opt.tstamp_ok       = 1;
5653                        tp->tcp_header_len =
5654                                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
5655                        tp->advmss          -= TCPOLEN_TSTAMP_ALIGNED;
5656                        tcp_store_ts_recent(tp);
5657                } else {
5658                        tp->tcp_header_len = sizeof(struct tcphdr);
5659                }
5660
5661                if (tcp_is_sack(tp) && sysctl_tcp_fack)
5662                        tcp_enable_fack(tp);
5663
5664                tcp_mtup_init(sk);
5665                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5666                tcp_initialize_rcv_mss(sk);
5667
5668                /* Remember, tcp_poll() does not lock socket!
5669                 * Change state from SYN-SENT only after copied_seq
5670                 * is initialized. */
5671                tp->copied_seq = tp->rcv_nxt;
5672
5673                smp_mb();
5674
5675                tcp_finish_connect(sk, skb);
5676
5677                if ((tp->syn_fastopen || tp->syn_data) &&
5678                    tcp_rcv_fastopen_synack(sk, skb, &foc))
5679                        return -1;
5680
5681                if (sk->sk_write_pending ||
5682                    icsk->icsk_accept_queue.rskq_defer_accept ||
5683                    icsk->icsk_ack.pingpong) {
5684                        /* Save one ACK. Data will be ready after
5685                         * several ticks, if write_pending is set.
5686                         *
5687                         * It may be deleted, but with this feature tcpdumps
5688                         * look so _wonderfully_ clever, that I was not able
5689                         * to stand against the temptation 8)     --ANK
5690                         */
5691                        inet_csk_schedule_ack(sk);
5692                        icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5693                        tcp_enter_quickack_mode(sk);
5694                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5695                                                  TCP_DELACK_MAX, TCP_RTO_MAX);
5696
5697discard:
5698                        tcp_drop(sk, skb);
5699                        return 0;
5700                } else {
5701                        tcp_send_ack(sk);
5702                }
5703                return -1;
5704        }
5705
5706        /* No ACK in the segment */
5707
5708        if (th->rst) {
5709                /* rfc793:
5710                 * "If the RST bit is set
5711                 *
5712                 *      Otherwise (no ACK) drop the segment and return."
5713                 */
5714
5715                goto discard_and_undo;
5716        }
5717
5718        /* PAWS check. */
5719        if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
5720            tcp_paws_reject(&tp->rx_opt, 0))
5721                goto discard_and_undo;
5722
5723        if (th->syn) {
5724                /* We see a SYN without an ACK. This is an attempt at a
5725                 * simultaneous open with crossed SYNs. In particular,
5726                 * it can be a connection to self.
5727                 */
5728                tcp_set_state(sk, TCP_SYN_RECV);
5729
5730                if (tp->rx_opt.saw_tstamp) {
5731                        tp->rx_opt.tstamp_ok = 1;
5732                        tcp_store_ts_recent(tp);
5733                        tp->tcp_header_len =
5734                                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
5735                } else {
5736                        tp->tcp_header_len = sizeof(struct tcphdr);
5737                }
5738
5739                tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5740                tp->copied_seq = tp->rcv_nxt;
5741                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5742
5743                /* RFC1323: The window in SYN & SYN/ACK segments is
5744                 * never scaled.
5745                 */
5746                tp->snd_wnd    = ntohs(th->window);
5747                tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
5748                tp->max_window = tp->snd_wnd;
5749
5750                TCP_ECN_rcv_syn(tp, th);
5751
5752                tcp_mtup_init(sk);
5753                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5754                tcp_initialize_rcv_mss(sk);
5755
5756                tcp_send_synack(sk);
5757#if 0
5758                /* Note: we could accept data and URG from this segment.
5759                 * There is nothing preventing it (except that we would have
5760                 * to either change tcp_recvmsg() to stop it from returning data
5761                 * before the 3WHS completes per RFC793, or employ TCP Fast Open).
5762                 *
5763                 * However, if we sometimes ignore data in ACKless segments,
5764                 * there is no reason to accept it at other times.
5765                 * Also, the code doing this in step6 of tcp_rcv_state_process
5766                 * does not appear flawless. So, discard the packet for sanity.
5767                 * Uncomment this return to process the data instead.
5768                 */
5769                return -1;
5770#else
5771                goto discard;
5772#endif
5773        }
5774        /* "fifth, if neither of the SYN or RST bits is set then
5775         * drop the segment and return."
5776         */
5777
5778discard_and_undo:
5779        tcp_clear_options(&tp->rx_opt);
5780        tp->rx_opt.mss_clamp = saved_clamp;
5781        goto discard;
5782
5783reset_and_undo:
5784        tcp_clear_options(&tp->rx_opt);
5785        tp->rx_opt.mss_clamp = saved_clamp;
5786        return 1;
5787}
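
/* Illustrative sketch (hedged): the RFC 793 ACK acceptability test used in
 * SYN-SENT above, restated with the kernel's wraparound-safe after()
 * comparison.  The helper name is hypothetical.  In SYN-SENT, snd_una still
 * sits at our ISS and snd_nxt is ISS + 1 (plus any Fast Open payload), so a
 * plain SYN-ACK is acceptable only if it acknowledges exactly ISS + 1.
 */
static inline bool synsent_ack_acceptable(u32 ack_seq, u32 snd_una, u32 snd_nxt)
{
        /* RFC 793: reject if SEG.ACK =< ISS or SEG.ACK > SND.NXT. */
        return after(ack_seq, snd_una) && !after(ack_seq, snd_nxt);
}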
5788
5789/*
5790 *      This function implements the receiving procedure of RFC 793 for
5791 *      all states except ESTABLISHED and TIME_WAIT.
5792 *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
5793 *      address independent.
5794 */
5795
5796int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5797                          const struct tcphdr *th, unsigned int len)
5798{
5799        struct tcp_sock *tp = tcp_sk(sk);
5800        struct inet_connection_sock *icsk = inet_csk(sk);
5801        struct request_sock *req;
5802        int queued = 0;
5803        bool acceptable;
5804        u32 synack_stamp;
5805
5806        tp->rx_opt.saw_tstamp = 0;
5807
5808        switch (sk->sk_state) {
5809        case TCP_CLOSE:
5810                goto discard;
5811
5812        case TCP_LISTEN:
5813                if (th->ack)
5814                        return 1;
5815
5816                if (th->rst)
5817                        goto discard;
5818
5819                if (th->syn) {
5820                        if (th->fin)
5821                                goto discard;
5822                        if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5823                                return 1;
5824
5825                        /* Now we have several options: In theory there is
5826                         * nothing else in the frame. KA9Q has an option to
5827                         * send data with the syn, BSD accepts data with the
5828                         * syn up to the [to be] advertised window and
5829                         * Solaris 2.1 gives you a protocol error. For now
5830                         * we just ignore it, that fits the spec precisely
5831                         * and avoids incompatibilities. It would be nice in
5832                         * future to drop through and process the data.
5833                         *
5834                         * Now that TTCP is starting to be used we ought to
5835                         * queue this data.
5836                         * But, this leaves one open to an easy denial of
5837                         * service attack, and SYN cookies can't defend
5838                         * against this problem. So, we drop the data
5839                         * in the interest of security over speed unless
5840                         * it's still in use.
5841                         */
5842                        kfree_skb(skb);
5843                        return 0;
5844                }
5845                goto discard;
5846
5847        case TCP_SYN_SENT:
5848                queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
5849                if (queued >= 0)
5850                        return queued;
5851
5852                /* Do step6 onward by hand. */
5853                tcp_urg(sk, skb, th);
5854                __kfree_skb(skb);
5855                tcp_data_snd_check(sk);
5856                return 0;
5857        }
5858
5859        req = tp->fastopen_rsk;
5860        if (req != NULL) {
5861                WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
5862                    sk->sk_state != TCP_FIN_WAIT1);
5863
5864                if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5865                        goto discard;
5866        }
5867
5868        if (!th->ack && !th->rst && !th->syn)
5869                goto discard;
5870
5871        if (!tcp_validate_incoming(sk, skb, th, 0))
5872                return 0;
5873
5874        /* step 5: check the ACK field */
5875        acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5876                                      FLAG_UPDATE_TS_RECENT) > 0;
5877
5878        switch (sk->sk_state) {
5879        case TCP_SYN_RECV:
5880                if (!acceptable)
5881                        return 1;
5882
5883                /* Once we leave TCP_SYN_RECV, we no longer need req
5884                 * so release it.
5885                 */
5886                if (req) {
5887                        synack_stamp = tcp_rsk(req)->snt_synack;
5888                        tp->total_retrans = req->num_retrans;
5889                        reqsk_fastopen_remove(sk, req, false);
5890                } else {
5891                        synack_stamp = tp->lsndtime;
5892                        /* Make sure socket is routed, for correct metrics. */
5893                        icsk->icsk_af_ops->rebuild_header(sk);
5894                        tcp_init_congestion_control(sk);
5895
5896                        tcp_mtup_init(sk);
5897                        tp->copied_seq = tp->rcv_nxt;
5898                        tcp_init_buffer_space(sk);
5899                }
5900                smp_mb();
5901                tcp_set_state(sk, TCP_ESTABLISHED);
5902                sk->sk_state_change(sk);
5903
5904                /* Note that this wakeup is only for the marginal crossed-SYN case.
5905                 * Passively opened sockets are not woken up, because
5906                 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
5907                 */
5908                if (sk->sk_socket)
5909                        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5910
5911                tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5912                tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
5913                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5914                tcp_synack_rtt_meas(sk, synack_stamp);
5915
5916                if (tp->rx_opt.tstamp_ok)
5917                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5918
5919                if (req) {
5920                        /* Re-arm the timer because data may have been sent out.
5921                         * This is similar to the regular data transmission case
5922                         * when new data has just been ack'ed.
5923                         *
5924                         * (TFO) - we could try to be more aggressive and
5925                         * retransmit any data sooner, based on when it
5926                         * was sent out.
5927                         */
5928                        tcp_rearm_rto(sk);
5929                } else
5930                        tcp_init_metrics(sk);
5931
5932                tcp_update_pacing_rate(sk);
5933
5934                /* Prevent spurious tcp_cwnd_restart() on first data packet */
5935                tp->lsndtime = tcp_time_stamp;
5936
5937                tcp_initialize_rcv_mss(sk);
5938                tcp_fast_path_on(tp);
5939                break;
5940
5941        case TCP_FIN_WAIT1: {
5942                int tmo;
5943
5944                /* If we enter the TCP_FIN_WAIT1 state and we are a
5945                 * Fast Open socket and this is the first acceptable
5946                 * ACK we have received, this would have acknowledged
5947                 * our SYNACK so stop the SYNACK timer.
5948                 */
5949                if (req != NULL) {
5950                        /* Return RST if ack_seq is invalid.
5951                         * Note that RFC793 only says to generate a
5952                         * DUPACK for it but for TCP Fast Open it seems
5953                         * better to treat this case like TCP_SYN_RECV
5954                         * above.
5955                         */
5956                        if (!acceptable)
5957                                return 1;
5958                        /* We no longer need the request sock. */
5959                        reqsk_fastopen_remove(sk, req, false);
5960                        tcp_rearm_rto(sk);
5961                }
5962                if (tp->snd_una != tp->write_seq)
5963                        break;
5964
5965                tcp_set_state(sk, TCP_FIN_WAIT2);
5966                sk->sk_shutdown |= SEND_SHUTDOWN;
5967
5968                sk_dst_confirm(sk);
5969
5970                if (!sock_flag(sk, SOCK_DEAD)) {
5971                        /* Wake up lingering close() */
5972                        sk->sk_state_change(sk);
5973                        break;
5974                }
5975
5976                if (tp->linger2 < 0 ||
5977                    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5978                     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5979                        tcp_done(sk);
5980                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5981                        return 1;
5982                }
5983
5984                tmo = tcp_fin_time(sk);
5985                if (tmo > TCP_TIMEWAIT_LEN) {
5986                        inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5987                } else if (th->fin || sock_owned_by_user(sk)) {
5988                        /* Bad case. We could lose such a FIN otherwise.
5989                         * It is not a big problem, but it looks confusing
5990                         * and is not such a rare event. We can still lose it
5991                         * now, if it spins in bh_lock_sock(), but that is a
5992                         * really marginal case.
5993                         */
5994                        inet_csk_reset_keepalive_timer(sk, tmo);
5995                } else {
5996                        tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5997                        goto discard;
5998                }
5999                break;
6000        }
6001
6002        case TCP_CLOSING:
6003                if (tp->snd_una == tp->write_seq) {
6004                        tcp_time_wait(sk, TCP_TIME_WAIT, 0);
6005                        goto discard;
6006                }
6007                break;
6008
6009        case TCP_LAST_ACK:
6010                if (tp->snd_una == tp->write_seq) {
6011                        tcp_update_metrics(sk);
6012                        tcp_done(sk);
6013                        goto discard;
6014                }
6015                break;
6016        }
6017
6018        /* step 6: check the URG bit */
6019        tcp_urg(sk, skb, th);
6020
6021        /* step 7: process the segment text */
6022        switch (sk->sk_state) {
6023        case TCP_CLOSE_WAIT:
6024        case TCP_CLOSING:
6025        case TCP_LAST_ACK:
6026                if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6027                        break;
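                /* Fall through */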
6028        case TCP_FIN_WAIT1:
6029        case TCP_FIN_WAIT2:
6030                /* RFC 793 says to queue data in these states,
6031                 * RFC 1122 says we MUST send a reset.
6032                 * BSD 4.4 also sends a reset.
6033                 */
6034                if (sk->sk_shutdown & RCV_SHUTDOWN) {
6035                        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6036                            after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6037                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6038                                tcp_reset(sk);
6039                                return 1;
6040                        }
6041                }
6042                /* Fall through */
6043        case TCP_ESTABLISHED:
6044                tcp_data_queue(sk, skb);
6045                queued = 1;
6046                break;
6047        }
6048
6049        /* tcp_data could move socket to TIME-WAIT */
6050        if (sk->sk_state != TCP_CLOSE) {
6051                tcp_data_snd_check(sk);
6052                tcp_ack_snd_check(sk);
6053        }
6054
6055        if (!queued) {
6056discard:
6057                tcp_drop(sk, skb);
6058        }
6059        return 0;
6060}
6061EXPORT_SYMBOL(tcp_rcv_state_process);
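
/* Illustrative sketch (hedged): a condensed view of the ACK-driven state
 * transitions handled in tcp_rcv_state_process() above.  "fin_acked" stands
 * for snd_una == write_seq, i.e. the peer has acknowledged everything we
 * sent, including our FIN.  The helper name is hypothetical; it deliberately
 * ignores the Fast Open request handling, the FIN_WAIT2/TIME_WAIT timer
 * choices and the fact that TIME_WAIT is really entered by handing the
 * socket off to a timewait minisock via tcp_time_wait().
 */
static inline int rcv_state_next(int state, bool acceptable_ack, bool fin_acked)
{
        switch (state) {
        case TCP_SYN_RECV:
                return acceptable_ack ? TCP_ESTABLISHED : state;
        case TCP_FIN_WAIT1:
                return fin_acked ? TCP_FIN_WAIT2 : state;
        case TCP_CLOSING:
                return fin_acked ? TCP_TIME_WAIT : state;
        case TCP_LAST_ACK:
                return fin_acked ? TCP_CLOSE : state;
        default:
                return state;
        }
}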
6062
6063static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
6064{
6065        struct inet_request_sock *ireq = inet_rsk(req);
6066
6067        if (family == AF_INET)
6068                LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
6069                               &ireq->ir_rmt_addr, port);
6070#if IS_ENABLED(CONFIG_IPV6)
6071        else if (family == AF_INET6)
6072                LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
6073                               &ireq->ir_v6_rmt_addr, port);
6074#endif
6075}
6076
6077/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
6078 *
6079 * If we receive a SYN packet with these bits set, it means a
6080 * network is playing bad games with TOS bits. In order to
6081 * avoid possible false congestion notifications, we disable
6082 * TCP ECN negotiation.
6083 *
6084 * Exception: tcp_ca wants ECN. This is required for DCTCP
6085 * congestion control: Linux DCTCP asserts ECT on all packets,
6086 * including SYN, which is the optimal solution; however, others,
6087 * such as FreeBSD, do not.
6088 */
6089static void tcp_ecn_create_request(struct request_sock *req,
6090                                   const struct sk_buff *skb,
6091                                   const struct sock *listen_sk,
6092                                   const struct dst_entry *dst)
6093{
6094        const struct tcphdr *th = tcp_hdr(skb);
6095        const struct net *net = sock_net(listen_sk);
6096        bool th_ecn = th->ece && th->cwr;
6097        bool ect, ecn_ok;
6098        u32 ecn_ok_dst;
6099
6100        if (!th_ecn)
6101                return;
6102
6103        ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
6104        ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
6105        ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
6106
6107        if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6108            (ecn_ok_dst & DST_FEATURE_ECN_CA))
6109                inet_rsk(req)->ecn_ok = 1;
6110}
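
/* Illustrative sketch (hedged): the property tcp_ecn_create_request() is
 * checking.  Per RFC 3168, an "ECN-setup SYN" carries both ECE and CWR in
 * the TCP header, while the SYN packet itself must be sent Not-ECT in the
 * IP header.  The helper name is hypothetical.
 */
static inline bool is_ecn_setup_syn(const struct tcphdr *th, u8 ip_dsfield)
{
        return th->syn && th->ece && th->cwr &&
               INET_ECN_is_not_ect(ip_dsfield);
}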
6111
6112int tcp_conn_request(struct request_sock_ops *rsk_ops,
6113                     const struct tcp_request_sock_ops *af_ops,
6114                     struct sock *sk, struct sk_buff *skb)
6115{
6116        struct tcp_options_received tmp_opt;
6117        struct request_sock *req;
6118        struct tcp_sock *tp = tcp_sk(sk);
6119        struct dst_entry *dst = NULL;
6120        __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
6121        bool want_cookie = false, fastopen;
6122        struct flowi fl;
6123        struct tcp_fastopen_cookie foc = { .len = -1 };
6124        int err;
6125
6126
6127        /* TW buckets are converted to open requests without
6128         * limitation: they conserve resources, and the peer is
6129         * evidently a real one.
6130         */
6131        if ((sysctl_tcp_syncookies == 2 ||
6132             inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6133                want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6134                if (!want_cookie)
6135                        goto drop;
6136        }
6137
6138
6139        /* Accept backlog is full. If we have already queued enough
6140         * warm entries in the syn queue, drop this request. That is better
6141         * than clogging the syn queue with openreqs whose timeouts
6142         * increase exponentially.
6143         */
6144        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6145                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6146                goto drop;
6147        }
6148
6149        req = inet_reqsk_alloc(rsk_ops);
6150        if (!req)
6151                goto drop;
6152
6153        inet_rsk(req)->ireq_family = sk->sk_family;
6154
6155        tcp_rsk(req)->af_specific = af_ops;
6156
6157        tcp_clear_options(&tmp_opt);
6158        tmp_opt.mss_clamp = af_ops->mss_clamp;
6159        tmp_opt.user_mss  = tp->rx_opt.user_mss;
6160        tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
6161
6162        if (want_cookie && !tmp_opt.saw_tstamp)
6163                tcp_clear_options(&tmp_opt);
6164
6165        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6166        tcp_openreq_init(req, &tmp_opt, skb);
6167
6168        af_ops->init_req(req, sk, skb);
6169
6170        if (security_inet_conn_request(sk, skb, req))
6171                goto drop_and_free;
6172
6173        if (!want_cookie && !isn) {
6174                /* VJ's idea. We save the last timestamp seen
6175                 * from the destination in the peer table when entering
6176                 * TIME-WAIT state, and check against it before
6177                 * accepting a new connection request.
6178                 *
6179                 * If "isn" is not zero, this request hit a live
6180                 * timewait bucket, so all the necessary checks have
6181                 * already been made by the code processing the timewait state.
6182                 */
6183                if (tcp_death_row.sysctl_tw_recycle) {
6184                        bool strict;
6185
6186                        dst = af_ops->route_req(sk, &fl, req, &strict);
6187
6188                        if (dst && strict &&
6189                            !tcp_peer_is_proven(req, dst, true,
6190                                                tmp_opt.saw_tstamp)) {
6191                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6192                                goto drop_and_release;
6193                        }
6194                }
6195                /* Kill the following clause if you dislike this approach. */
6196                else if (!sysctl_tcp_syncookies &&
6197                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6198                          (sysctl_max_syn_backlog >> 2)) &&
6199                         !tcp_peer_is_proven(req, dst, false,
6200                                             tmp_opt.saw_tstamp)) {
6201                        /* Without syncookies, the last quarter of the
6202                         * backlog is reserved for destinations proven
6203                         * to be alive. It means that we keep
6204                         * communicating only with destinations already
6205                         * remembered from before the synflood (see the
6206                         * illustrative sketch after this function).
6207                         */
6208                        pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6209                                    rsk_ops->family);
6210                        goto drop_and_release;
6211                }
6212
6213                isn = af_ops->init_seq(skb);
6214        }
6215        if (!dst) {
6216                dst = af_ops->route_req(sk, &fl, req, NULL);
6217                if (!dst)
6218                        goto drop_and_free;
6219        }
6220
6221        tcp_ecn_create_request(req, skb, sk, dst);
6222
6223        if (want_cookie) {
6224                isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6225                req->cookie_ts = tmp_opt.tstamp_ok;
6226                if (!tmp_opt.tstamp_ok)
6227                        inet_rsk(req)->ecn_ok = 0;
6228        }
6229
6230        tcp_rsk(req)->snt_isn = isn;
6231        tcp_openreq_init_rwin(req, sk, dst);
6232        fastopen = !want_cookie &&
6233                   tcp_try_fastopen(sk, skb, req, &foc, dst);
6234        err = af_ops->send_synack(sk, dst, &fl, req,
6235                                  skb_get_queue_mapping(skb), &foc);
6236        if (!fastopen) {
6237                if (err || want_cookie)
6238                        goto drop_and_free;
6239
6240                tcp_rsk(req)->listener = NULL;
6241                af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6242        }
6243
6244        return 0;
6245
6246drop_and_release:
6247        dst_release(dst);
6248drop_and_free:
6249        reqsk_free(req);
6250drop:
6251        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6252        return 0;
6253}
6254EXPORT_SYMBOL(tcp_conn_request);
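
/* Illustrative sketch (hedged): the "last quarter of the backlog" heuristic
 * used in tcp_conn_request() when syncookies are disabled.  With a
 * hypothetical max_syn_backlog of 256, requests from unproven peers start
 * being dropped once fewer than 64 slots remain free, i.e. once more than
 * 192 request sockets are already queued.
 */
static inline bool syn_backlog_nearly_full(int max_syn_backlog, int queue_len)
{
        /* True when less than a quarter of the SYN backlog remains free. */
        return max_syn_backlog - queue_len < (max_syn_backlog >> 2);
}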
6255