linux/net/ipv4/tcp_output.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  11 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  12 *              Florian La Roche, <flla@stud.uni-sb.de>
  13 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  14 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  15 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  16 *              Matthew Dillon, <dillon@apollo.west.oic.com>
  17 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  18 *              Jorge Cwik, <jorge@laser.satlink.net>
  19 */
  20
  21/*
  22 * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
  23 *                              :       Fragmentation on mtu decrease
  24 *                              :       Segment collapse on retransmit
  25 *                              :       AF independence
  26 *
  27 *              Linus Torvalds  :       send_delayed_ack
  28 *              David S. Miller :       Charge memory using the right skb
  29 *                                      during syn/ack processing.
  30 *              David S. Miller :       Output engine completely rewritten.
  31 *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
  32 *              Cacophonix Gaul :       draft-minshall-nagle-01
  33 *              J Hadi Salim    :       ECN support
  34 *
  35 */
  36
  37#define pr_fmt(fmt) "TCP: " fmt
  38
  39#include <net/tcp.h>
  40
  41#include <linux/compiler.h>
  42#include <linux/gfp.h>
  43#include <linux/module.h>
  44#include <linux/static_key.h>
  45
  46#include <trace/events/tcp.h>
  47
  48/* Refresh clocks of a TCP socket,
   49 * ensuring monotonically increasing values.
  50 */
  51void tcp_mstamp_refresh(struct tcp_sock *tp)
  52{
  53        u64 val = tcp_clock_ns();
  54
  55        if (val > tp->tcp_clock_cache)
  56                tp->tcp_clock_cache = val;
  57
  58        val = div_u64(val, NSEC_PER_USEC);
  59        if (val > tp->tcp_mstamp)
  60                tp->tcp_mstamp = val;
  61}
  62
  63static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  64                           int push_one, gfp_t gfp);
  65
  66/* Account for new data that has been sent to the network. */
  67static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
  68{
  69        struct inet_connection_sock *icsk = inet_csk(sk);
  70        struct tcp_sock *tp = tcp_sk(sk);
  71        unsigned int prior_packets = tp->packets_out;
  72
  73        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
  74
  75        __skb_unlink(skb, &sk->sk_write_queue);
  76        tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
  77
  78        tp->packets_out += tcp_skb_pcount(skb);
  79        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
  80                tcp_rearm_rto(sk);
  81
  82        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
  83                      tcp_skb_pcount(skb));
  84}
  85
   86/* Return SND.NXT if the window was not shrunk, or if the amount it shrank
   87 * by is less than one window scaling factor (lost to scaling precision).
   88 * If the window has been shrunk, it is not clear what to use:
   89 * using SND.UNA we will fail to open the window, SND.NXT is out of window, :-(
   90 * and anything in between SND.UNA...SND.UNA+SND.WND can already be
   91 * invalid. OK, let's settle for this for now:
   92 */
  93static inline __u32 tcp_acceptable_seq(const struct sock *sk)
  94{
  95        const struct tcp_sock *tp = tcp_sk(sk);
  96
  97        if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
  98            (tp->rx_opt.wscale_ok &&
  99             ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
 100                return tp->snd_nxt;
 101        else
 102                return tcp_wnd_end(tp);
 103}
 104
 105/* Calculate mss to advertise in SYN segment.
 106 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 107 *
 108 * 1. It is independent of path mtu.
  109 * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
 110 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 111 *    attached devices, because some buggy hosts are confused by
 112 *    large MSS.
  113 * 4. We do not do 3; we advertise an MSS calculated from the first
  114 *    hop device mtu, but allow it to be raised to ip_rt_min_advmss.
 115 *    This may be overridden via information stored in routing table.
 116 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 117 *    probably even Jumbo".
 118 */
 119static __u16 tcp_advertise_mss(struct sock *sk)
 120{
 121        struct tcp_sock *tp = tcp_sk(sk);
 122        const struct dst_entry *dst = __sk_dst_get(sk);
 123        int mss = tp->advmss;
 124
 125        if (dst) {
 126                unsigned int metric = dst_metric_advmss(dst);
 127
 128                if (metric < mss) {
 129                        mss = metric;
 130                        tp->advmss = mss;
 131                }
 132        }
 133
 134        return (__u16)mss;
 135}
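
     /* Illustrative example: on a plain 1500-byte Ethernet path the route's
      * IPv4 advmss metric is typically 1500 - 40 = 1460 bytes.  Note that the
      * check above only ever lowers tp->advmss toward the metric; it never
      * raises it.
      */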
 136
  137/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
 138 * This is the first part of cwnd validation mechanism.
 139 */
 140void tcp_cwnd_restart(struct sock *sk, s32 delta)
 141{
 142        struct tcp_sock *tp = tcp_sk(sk);
 143        u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
 144        u32 cwnd = tp->snd_cwnd;
 145
 146        tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 147
 148        tp->snd_ssthresh = tcp_current_ssthresh(sk);
 149        restart_cwnd = min(restart_cwnd, cwnd);
 150
 151        while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
 152                cwnd >>= 1;
 153        tp->snd_cwnd = max(cwnd, restart_cwnd);
 154        tp->snd_cwnd_stamp = tcp_jiffies32;
 155        tp->snd_cwnd_used = 0;
 156}
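
     /* Worked example (illustrative): with snd_cwnd = 40, a restart window of
      * 10 and an idle time slightly above two RTOs, the loop above halves cwnd
      * twice (40 -> 20 -> 10) and the result is max(10, 10) = 10, i.e. cwnd is
      * brought back toward the restart window after a long idle period.
      */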
 157
 158/* Congestion state accounting after a packet has been sent. */
 159static void tcp_event_data_sent(struct tcp_sock *tp,
 160                                struct sock *sk)
 161{
 162        struct inet_connection_sock *icsk = inet_csk(sk);
 163        const u32 now = tcp_jiffies32;
 164
 165        if (tcp_packets_in_flight(tp) == 0)
 166                tcp_ca_event(sk, CA_EVENT_TX_START);
 167
 168        /* If this is the first data packet sent in response to the
  169         * previously received data, and it is sent within the
  170         * delayed-ACK timeout (ato) of the last received packet,
  171         * increase the pingpong count.
 172         */
 173        if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
 174            (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
 175                inet_csk_inc_pingpong_cnt(sk);
 176
 177        tp->lsndtime = now;
 178}
 179
 180/* Account for an ACK we sent. */
 181static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
 182                                      u32 rcv_nxt)
 183{
 184        struct tcp_sock *tp = tcp_sk(sk);
 185
 186        if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
 187                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
 188                              tp->compressed_ack - TCP_FASTRETRANS_THRESH);
 189                tp->compressed_ack = TCP_FASTRETRANS_THRESH;
 190                if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 191                        __sock_put(sk);
 192        }
 193
 194        if (unlikely(rcv_nxt != tp->rcv_nxt))
 195                return;  /* Special ACK sent by DCTCP to reflect ECN */
 196        tcp_dec_quickack_mode(sk, pkts);
 197        inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 198}
 199
 200/* Determine a window scaling and initial window to offer.
 201 * Based on the assumption that the given amount of space
 202 * will be offered. Store the results in the tp structure.
 203 * NOTE: for smooth operation initial space offering should
 204 * be a multiple of mss if possible. We assume here that mss >= 1.
 205 * This MUST be enforced by all callers.
 206 */
 207void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 208                               __u32 *rcv_wnd, __u32 *window_clamp,
 209                               int wscale_ok, __u8 *rcv_wscale,
 210                               __u32 init_rcv_wnd)
 211{
 212        unsigned int space = (__space < 0 ? 0 : __space);
 213
  214        /* If no clamp is set, set the clamp to the max possible scaled window */
 215        if (*window_clamp == 0)
 216                (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
 217        space = min(*window_clamp, space);
 218
 219        /* Quantize space offering to a multiple of mss if possible. */
 220        if (space > mss)
 221                space = rounddown(space, mss);
 222
 223        /* NOTE: offering an initial window larger than 32767
 224         * will break some buggy TCP stacks. If the admin tells us
 225         * it is likely we could be speaking with such a buggy stack
 226         * we will truncate our initial window offering to 32K-1
 227         * unless the remote has sent us a window scaling option,
 228         * which we interpret as a sign the remote TCP is not
 229         * misinterpreting the window field as a signed quantity.
 230         */
 231        if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
 232                (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
 233        else
 234                (*rcv_wnd) = min_t(u32, space, U16_MAX);
 235
 236        if (init_rcv_wnd)
 237                *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
 238
 239        *rcv_wscale = 0;
 240        if (wscale_ok) {
 241                /* Set window scaling on max possible window */
 242                space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 243                space = max_t(u32, space, sysctl_rmem_max);
 244                space = min_t(u32, space, *window_clamp);
 245                *rcv_wscale = clamp_t(int, ilog2(space) - 15,
 246                                      0, TCP_MAX_WSCALE);
 247        }
 248        /* Set the clamp no higher than max representable value */
 249        (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
 250}
 251EXPORT_SYMBOL(tcp_select_initial_window);
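
     /* Worked example (illustrative), assuming roughly a 4 MB receive buffer
      * limit: space = 4 MB = 2^22, so ilog2(space) - 15 = 22 - 15 = 7 and
      * rcv_wscale becomes 7, which lets the 16-bit window field advertise up
      * to 65535 << 7 (about 8 MB).  The initial rcv_wnd itself is still capped
      * at 64 KB (32 KB with the signed-window workaround), since scaling only
      * applies after the handshake.
      */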
 252
  253/* Choose a new window to advertise, update state in tcp_sock for the
 254 * socket, and return result with RFC1323 scaling applied.  The return
 255 * value can be stuffed directly into th->window for an outgoing
 256 * frame.
 257 */
 258static u16 tcp_select_window(struct sock *sk)
 259{
 260        struct tcp_sock *tp = tcp_sk(sk);
 261        u32 old_win = tp->rcv_wnd;
 262        u32 cur_win = tcp_receive_window(tp);
 263        u32 new_win = __tcp_select_window(sk);
 264
 265        /* Never shrink the offered window */
 266        if (new_win < cur_win) {
 267                /* Danger Will Robinson!
 268                 * Don't update rcv_wup/rcv_wnd here or else
 269                 * we will not be able to advertise a zero
 270                 * window in time.  --DaveM
 271                 *
 272                 * Relax Will Robinson.
 273                 */
 274                if (new_win == 0)
 275                        NET_INC_STATS(sock_net(sk),
 276                                      LINUX_MIB_TCPWANTZEROWINDOWADV);
 277                new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
 278        }
 279        tp->rcv_wnd = new_win;
 280        tp->rcv_wup = tp->rcv_nxt;
 281
 282        /* Make sure we do not exceed the maximum possible
 283         * scaled window.
 284         */
 285        if (!tp->rx_opt.rcv_wscale &&
 286            sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
 287                new_win = min(new_win, MAX_TCP_WINDOW);
 288        else
 289                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
 290
 291        /* RFC1323 scaling applied */
 292        new_win >>= tp->rx_opt.rcv_wscale;
 293
 294        /* If we advertise zero window, disable fast path. */
 295        if (new_win == 0) {
 296                tp->pred_flags = 0;
 297                if (old_win)
 298                        NET_INC_STATS(sock_net(sk),
 299                                      LINUX_MIB_TCPTOZEROWINDOWADV);
 300        } else if (old_win == 0) {
 301                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
 302        }
 303
 304        return new_win;
 305}
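
     /* Illustrative example: if the chosen window is 262144 bytes and
      * rcv_wscale is 7, th->window carries 262144 >> 7 = 2048 and the peer
      * multiplies it back by 2^7 to recover the byte count.
      */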
 306
 307/* Packet ECN state for a SYN-ACK */
 308static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 309{
 310        const struct tcp_sock *tp = tcp_sk(sk);
 311
 312        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 313        if (!(tp->ecn_flags & TCP_ECN_OK))
 314                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
 315        else if (tcp_ca_needs_ecn(sk) ||
 316                 tcp_bpf_ca_needs_ecn(sk))
 317                INET_ECN_xmit(sk);
 318}
 319
 320/* Packet ECN state for a SYN.  */
 321static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 322{
 323        struct tcp_sock *tp = tcp_sk(sk);
 324        bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 325        bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
 326                tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
 327
 328        if (!use_ecn) {
 329                const struct dst_entry *dst = __sk_dst_get(sk);
 330
 331                if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
 332                        use_ecn = true;
 333        }
 334
 335        tp->ecn_flags = 0;
 336
 337        if (use_ecn) {
 338                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 339                tp->ecn_flags = TCP_ECN_OK;
 340                if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 341                        INET_ECN_xmit(sk);
 342        }
 343}
 344
 345static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
 346{
 347        if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
 348                /* tp->ecn_flags are cleared at a later point in time when
  349                 * the SYN-ACK is ultimately received.
 350                 */
 351                TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
 352}
 353
 354static void
 355tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
 356{
 357        if (inet_rsk(req)->ecn_ok)
 358                th->ece = 1;
 359}
 360
  361/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 362 * be sent.
 363 */
 364static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 365                         struct tcphdr *th, int tcp_header_len)
 366{
 367        struct tcp_sock *tp = tcp_sk(sk);
 368
 369        if (tp->ecn_flags & TCP_ECN_OK) {
 370                /* Not-retransmitted data segment: set ECT and inject CWR. */
 371                if (skb->len != tcp_header_len &&
 372                    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
 373                        INET_ECN_xmit(sk);
 374                        if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
 375                                tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
 376                                th->cwr = 1;
 377                                skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 378                        }
 379                } else if (!tcp_ca_needs_ecn(sk)) {
 380                        /* ACK or retransmitted segment: clear ECT|CE */
 381                        INET_ECN_dontxmit(sk);
 382                }
 383                if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
 384                        th->ece = 1;
 385        }
 386}
 387
  388/* Construct the common control bits of a non-data skb. If SYN/FIN is present,
  389 * auto-increment the end seqno.
 390 */
 391static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 392{
 393        skb->ip_summed = CHECKSUM_PARTIAL;
 394
 395        TCP_SKB_CB(skb)->tcp_flags = flags;
 396        TCP_SKB_CB(skb)->sacked = 0;
 397
 398        tcp_skb_pcount_set(skb, 1);
 399
 400        TCP_SKB_CB(skb)->seq = seq;
 401        if (flags & (TCPHDR_SYN | TCPHDR_FIN))
 402                seq++;
 403        TCP_SKB_CB(skb)->end_seq = seq;
 404}
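
     /* Example: SYN and FIN each consume one sequence number, so a SYN built
      * with seq == ISN gets end_seq == ISN + 1, while a pure ACK keeps
      * end_seq == seq.
      */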
 405
 406static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 407{
 408        return tp->snd_una != tp->snd_up;
 409}
 410
 411#define OPTION_SACK_ADVERTISE   (1 << 0)
 412#define OPTION_TS               (1 << 1)
 413#define OPTION_MD5              (1 << 2)
 414#define OPTION_WSCALE           (1 << 3)
 415#define OPTION_FAST_OPEN_COOKIE (1 << 8)
 416#define OPTION_SMC              (1 << 9)
 417
 418static void smc_options_write(__be32 *ptr, u16 *options)
 419{
 420#if IS_ENABLED(CONFIG_SMC)
 421        if (static_branch_unlikely(&tcp_have_smc)) {
 422                if (unlikely(OPTION_SMC & *options)) {
 423                        *ptr++ = htonl((TCPOPT_NOP  << 24) |
 424                                       (TCPOPT_NOP  << 16) |
 425                                       (TCPOPT_EXP <<  8) |
 426                                       (TCPOLEN_EXP_SMC_BASE));
 427                        *ptr++ = htonl(TCPOPT_SMC_MAGIC);
 428                }
 429        }
 430#endif
 431}
 432
 433struct tcp_out_options {
 434        u16 options;            /* bit field of OPTION_* */
 435        u16 mss;                /* 0 to disable */
 436        u8 ws;                  /* window scale, 0 to disable */
 437        u8 num_sack_blocks;     /* number of SACK blocks to include */
 438        u8 hash_size;           /* bytes in hash_location */
 439        __u8 *hash_location;    /* temporary pointer, overloaded */
 440        __u32 tsval, tsecr;     /* need to include OPTION_TS */
 441        struct tcp_fastopen_cookie *fastopen_cookie;    /* Fast open cookie */
 442};
 443
 444/* Write previously computed TCP options to the packet.
 445 *
 446 * Beware: Something in the Internet is very sensitive to the ordering of
  447 * TCP options, we learned this the hard way, so be careful here.
  448 * Luckily we can at least blame others for their non-compliance, but from
  449 * an interoperability perspective it seems that we're somewhat stuck with
  450 * the ordering we have been using if we want to keep working with
  451 * those broken things (not that it currently hurts anybody, as there is no
  452 * particular reason why the ordering would need to be changed).
 453 *
 454 * At least SACK_PERM as the first option is known to lead to a disaster
 455 * (but it may well be that other scenarios fail similarly).
 456 */
 457static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 458                              struct tcp_out_options *opts)
 459{
 460        u16 options = opts->options;    /* mungable copy */
 461
 462        if (unlikely(OPTION_MD5 & options)) {
 463                *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 464                               (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
 465                /* overload cookie hash location */
 466                opts->hash_location = (__u8 *)ptr;
 467                ptr += 4;
 468        }
 469
 470        if (unlikely(opts->mss)) {
 471                *ptr++ = htonl((TCPOPT_MSS << 24) |
 472                               (TCPOLEN_MSS << 16) |
 473                               opts->mss);
 474        }
 475
 476        if (likely(OPTION_TS & options)) {
 477                if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 478                        *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
 479                                       (TCPOLEN_SACK_PERM << 16) |
 480                                       (TCPOPT_TIMESTAMP << 8) |
 481                                       TCPOLEN_TIMESTAMP);
 482                        options &= ~OPTION_SACK_ADVERTISE;
 483                } else {
 484                        *ptr++ = htonl((TCPOPT_NOP << 24) |
 485                                       (TCPOPT_NOP << 16) |
 486                                       (TCPOPT_TIMESTAMP << 8) |
 487                                       TCPOLEN_TIMESTAMP);
 488                }
 489                *ptr++ = htonl(opts->tsval);
 490                *ptr++ = htonl(opts->tsecr);
 491        }
 492
 493        if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 494                *ptr++ = htonl((TCPOPT_NOP << 24) |
 495                               (TCPOPT_NOP << 16) |
 496                               (TCPOPT_SACK_PERM << 8) |
 497                               TCPOLEN_SACK_PERM);
 498        }
 499
 500        if (unlikely(OPTION_WSCALE & options)) {
 501                *ptr++ = htonl((TCPOPT_NOP << 24) |
 502                               (TCPOPT_WINDOW << 16) |
 503                               (TCPOLEN_WINDOW << 8) |
 504                               opts->ws);
 505        }
 506
 507        if (unlikely(opts->num_sack_blocks)) {
 508                struct tcp_sack_block *sp = tp->rx_opt.dsack ?
 509                        tp->duplicate_sack : tp->selective_acks;
 510                int this_sack;
 511
 512                *ptr++ = htonl((TCPOPT_NOP  << 24) |
 513                               (TCPOPT_NOP  << 16) |
 514                               (TCPOPT_SACK <<  8) |
 515                               (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
 516                                                     TCPOLEN_SACK_PERBLOCK)));
 517
 518                for (this_sack = 0; this_sack < opts->num_sack_blocks;
 519                     ++this_sack) {
 520                        *ptr++ = htonl(sp[this_sack].start_seq);
 521                        *ptr++ = htonl(sp[this_sack].end_seq);
 522                }
 523
 524                tp->rx_opt.dsack = 0;
 525        }
 526
 527        if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
 528                struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
 529                u8 *p = (u8 *)ptr;
 530                u32 len; /* Fast Open option length */
 531
 532                if (foc->exp) {
 533                        len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 534                        *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
 535                                     TCPOPT_FASTOPEN_MAGIC);
 536                        p += TCPOLEN_EXP_FASTOPEN_BASE;
 537                } else {
 538                        len = TCPOLEN_FASTOPEN_BASE + foc->len;
 539                        *p++ = TCPOPT_FASTOPEN;
 540                        *p++ = len;
 541                }
 542
 543                memcpy(p, foc->val, foc->len);
 544                if ((len & 3) == 2) {
 545                        p[foc->len] = TCPOPT_NOP;
 546                        p[foc->len + 1] = TCPOPT_NOP;
 547                }
 548                ptr += (len + 3) >> 2;
 549        }
 550
 551        smc_options_write(ptr, &options);
 552}
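
     /* Illustrative wire layout: when both timestamps and SACK_PERM are sent
      * (as on a SYN), the first 32-bit word above packs both options as the
      * bytes 0x04 0x02 0x08 0x0a (SACK_PERM kind/len, TIMESTAMP kind/len),
      * followed by the TSval and TSecr words.  Without SACK_PERM the word
      * starts with two NOPs (0x01 0x01) instead.
      */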
 553
 554static void smc_set_option(const struct tcp_sock *tp,
 555                           struct tcp_out_options *opts,
 556                           unsigned int *remaining)
 557{
 558#if IS_ENABLED(CONFIG_SMC)
 559        if (static_branch_unlikely(&tcp_have_smc)) {
 560                if (tp->syn_smc) {
 561                        if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
 562                                opts->options |= OPTION_SMC;
 563                                *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 564                        }
 565                }
 566        }
 567#endif
 568}
 569
 570static void smc_set_option_cond(const struct tcp_sock *tp,
 571                                const struct inet_request_sock *ireq,
 572                                struct tcp_out_options *opts,
 573                                unsigned int *remaining)
 574{
 575#if IS_ENABLED(CONFIG_SMC)
 576        if (static_branch_unlikely(&tcp_have_smc)) {
 577                if (tp->syn_smc && ireq->smc_ok) {
 578                        if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
 579                                opts->options |= OPTION_SMC;
 580                                *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 581                        }
 582                }
 583        }
 584#endif
 585}
 586
 587/* Compute TCP options for SYN packets. This is not the final
 588 * network wire format yet.
 589 */
 590static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 591                                struct tcp_out_options *opts,
 592                                struct tcp_md5sig_key **md5)
 593{
 594        struct tcp_sock *tp = tcp_sk(sk);
 595        unsigned int remaining = MAX_TCP_OPTION_SPACE;
 596        struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 597
 598        *md5 = NULL;
 599#ifdef CONFIG_TCP_MD5SIG
 600        if (static_branch_unlikely(&tcp_md5_needed) &&
 601            rcu_access_pointer(tp->md5sig_info)) {
 602                *md5 = tp->af_specific->md5_lookup(sk, sk);
 603                if (*md5) {
 604                        opts->options |= OPTION_MD5;
 605                        remaining -= TCPOLEN_MD5SIG_ALIGNED;
 606                }
 607        }
 608#endif
 609
  610        /* We always get an MSS option.  If timestamps are used, the option
  611         * bytes that will be seen in normal data packets must be included in
  612         * the advertised MSS.  But we subtract them from tp->mss_cache so that
 613         * calculations in tcp_sendmsg are simpler etc.  So account for this
 614         * fact here if necessary.  If we don't do this correctly, as a
 615         * receiver we won't recognize data packets as being full sized when we
 616         * should, and thus we won't abide by the delayed ACK rules correctly.
 617         * SACKs don't matter, we never delay an ACK when we have any of those
 618         * going out.  */
 619        opts->mss = tcp_advertise_mss(sk);
 620        remaining -= TCPOLEN_MSS_ALIGNED;
 621
 622        if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
 623                opts->options |= OPTION_TS;
 624                opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
 625                opts->tsecr = tp->rx_opt.ts_recent;
 626                remaining -= TCPOLEN_TSTAMP_ALIGNED;
 627        }
 628        if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
 629                opts->ws = tp->rx_opt.rcv_wscale;
 630                opts->options |= OPTION_WSCALE;
 631                remaining -= TCPOLEN_WSCALE_ALIGNED;
 632        }
 633        if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
 634                opts->options |= OPTION_SACK_ADVERTISE;
 635                if (unlikely(!(OPTION_TS & opts->options)))
 636                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
 637        }
 638
 639        if (fastopen && fastopen->cookie.len >= 0) {
 640                u32 need = fastopen->cookie.len;
 641
 642                need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
 643                                               TCPOLEN_FASTOPEN_BASE;
 644                need = (need + 3) & ~3U;  /* Align to 32 bits */
 645                if (remaining >= need) {
 646                        opts->options |= OPTION_FAST_OPEN_COOKIE;
 647                        opts->fastopen_cookie = &fastopen->cookie;
 648                        remaining -= need;
 649                        tp->syn_fastopen = 1;
 650                        tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
 651                }
 652        }
 653
 654        smc_set_option(tp, opts, &remaining);
 655
 656        return MAX_TCP_OPTION_SPACE - remaining;
 657}
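
     /* Rough space accounting for a typical SYN (no MD5): 40 bytes of option
      * space minus 4 (MSS), minus 12 (timestamps, with SACK_PERM sharing that
      * word), minus 4 (window scale) leaves 20 bytes, enough for e.g. a
      * standard 8-byte Fast Open cookie (2 + 8 bytes, padded to 12).
      */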
 658
 659/* Set up TCP options for SYN-ACKs. */
 660static unsigned int tcp_synack_options(const struct sock *sk,
 661                                       struct request_sock *req,
 662                                       unsigned int mss, struct sk_buff *skb,
 663                                       struct tcp_out_options *opts,
 664                                       const struct tcp_md5sig_key *md5,
 665                                       struct tcp_fastopen_cookie *foc)
 666{
 667        struct inet_request_sock *ireq = inet_rsk(req);
 668        unsigned int remaining = MAX_TCP_OPTION_SPACE;
 669
 670#ifdef CONFIG_TCP_MD5SIG
 671        if (md5) {
 672                opts->options |= OPTION_MD5;
 673                remaining -= TCPOLEN_MD5SIG_ALIGNED;
 674
 675                /* We can't fit any SACK blocks in a packet with MD5 + TS
 676                 * options. There was discussion about disabling SACK
 677                 * rather than TS in order to fit in better with old,
 678                 * buggy kernels, but that was deemed to be unnecessary.
 679                 */
 680                ireq->tstamp_ok &= !ireq->sack_ok;
 681        }
 682#endif
 683
 684        /* We always send an MSS option. */
 685        opts->mss = mss;
 686        remaining -= TCPOLEN_MSS_ALIGNED;
 687
 688        if (likely(ireq->wscale_ok)) {
 689                opts->ws = ireq->rcv_wscale;
 690                opts->options |= OPTION_WSCALE;
 691                remaining -= TCPOLEN_WSCALE_ALIGNED;
 692        }
 693        if (likely(ireq->tstamp_ok)) {
 694                opts->options |= OPTION_TS;
 695                opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
 696                opts->tsecr = req->ts_recent;
 697                remaining -= TCPOLEN_TSTAMP_ALIGNED;
 698        }
 699        if (likely(ireq->sack_ok)) {
 700                opts->options |= OPTION_SACK_ADVERTISE;
 701                if (unlikely(!ireq->tstamp_ok))
 702                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
 703        }
 704        if (foc != NULL && foc->len >= 0) {
 705                u32 need = foc->len;
 706
 707                need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
 708                                   TCPOLEN_FASTOPEN_BASE;
 709                need = (need + 3) & ~3U;  /* Align to 32 bits */
 710                if (remaining >= need) {
 711                        opts->options |= OPTION_FAST_OPEN_COOKIE;
 712                        opts->fastopen_cookie = foc;
 713                        remaining -= need;
 714                }
 715        }
 716
 717        smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 718
 719        return MAX_TCP_OPTION_SPACE - remaining;
 720}
 721
 722/* Compute TCP options for ESTABLISHED sockets. This is not the
 723 * final wire format yet.
 724 */
 725static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
 726                                        struct tcp_out_options *opts,
 727                                        struct tcp_md5sig_key **md5)
 728{
 729        struct tcp_sock *tp = tcp_sk(sk);
 730        unsigned int size = 0;
 731        unsigned int eff_sacks;
 732
 733        opts->options = 0;
 734
 735        *md5 = NULL;
 736#ifdef CONFIG_TCP_MD5SIG
 737        if (static_branch_unlikely(&tcp_md5_needed) &&
 738            rcu_access_pointer(tp->md5sig_info)) {
 739                *md5 = tp->af_specific->md5_lookup(sk, sk);
 740                if (*md5) {
 741                        opts->options |= OPTION_MD5;
 742                        size += TCPOLEN_MD5SIG_ALIGNED;
 743                }
 744        }
 745#endif
 746
 747        if (likely(tp->rx_opt.tstamp_ok)) {
 748                opts->options |= OPTION_TS;
 749                opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
 750                opts->tsecr = tp->rx_opt.ts_recent;
 751                size += TCPOLEN_TSTAMP_ALIGNED;
 752        }
 753
 754        eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
 755        if (unlikely(eff_sacks)) {
 756                const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
 757                opts->num_sack_blocks =
 758                        min_t(unsigned int, eff_sacks,
 759                              (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
 760                              TCPOLEN_SACK_PERBLOCK);
 761                size += TCPOLEN_SACK_BASE_ALIGNED +
 762                        opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
 763        }
 764
 765        return size;
 766}
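
     /* Illustrative sizing: with timestamps in use, size starts at 12, so
      * remaining = 40 - 12 = 28 and at most (28 - 4) / 8 = 3 SACK blocks fit
      * in one segment alongside the timestamp option.
      */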
 767
 768
 769/* TCP SMALL QUEUES (TSQ)
 770 *
  771 * TSQ's goal is to keep a small amount of skbs per tcp flow in tx queues (qdisc+dev)
 772 * to reduce RTT and bufferbloat.
 773 * We do this using a special skb destructor (tcp_wfree).
 774 *
  775 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
  776 * the skb needs to be reallocated in a driver.
  777 * The invariant is that skb->truesize gets subtracted from sk->sk_wmem_alloc.
 778 *
 779 * Since transmit from skb destructor is forbidden, we use a tasklet
 780 * to process all sockets that eventually need to send more skbs.
 781 * We use one tasklet per cpu, with its own queue of sockets.
 782 */
 783struct tsq_tasklet {
 784        struct tasklet_struct   tasklet;
 785        struct list_head        head; /* queue of tcp sockets */
 786};
 787static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 788
 789static void tcp_tsq_write(struct sock *sk)
 790{
 791        if ((1 << sk->sk_state) &
 792            (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
 793             TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
 794                struct tcp_sock *tp = tcp_sk(sk);
 795
 796                if (tp->lost_out > tp->retrans_out &&
 797                    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
 798                        tcp_mstamp_refresh(tp);
 799                        tcp_xmit_retransmit_queue(sk);
 800                }
 801
 802                tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
 803                               0, GFP_ATOMIC);
 804        }
 805}
 806
 807static void tcp_tsq_handler(struct sock *sk)
 808{
 809        bh_lock_sock(sk);
 810        if (!sock_owned_by_user(sk))
 811                tcp_tsq_write(sk);
 812        else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
 813                sock_hold(sk);
 814        bh_unlock_sock(sk);
 815}
 816/*
 817 * One tasklet per cpu tries to send more skbs.
 818 * We run in tasklet context but need to disable irqs when
 819 * transferring tsq->head because tcp_wfree() might
 820 * interrupt us (non NAPI drivers)
 821 */
 822static void tcp_tasklet_func(unsigned long data)
 823{
 824        struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
 825        LIST_HEAD(list);
 826        unsigned long flags;
 827        struct list_head *q, *n;
 828        struct tcp_sock *tp;
 829        struct sock *sk;
 830
 831        local_irq_save(flags);
 832        list_splice_init(&tsq->head, &list);
 833        local_irq_restore(flags);
 834
 835        list_for_each_safe(q, n, &list) {
 836                tp = list_entry(q, struct tcp_sock, tsq_node);
 837                list_del(&tp->tsq_node);
 838
 839                sk = (struct sock *)tp;
 840                smp_mb__before_atomic();
 841                clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 842
 843                tcp_tsq_handler(sk);
 844                sk_free(sk);
 845        }
 846}
 847
 848#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |           \
 849                          TCPF_WRITE_TIMER_DEFERRED |   \
 850                          TCPF_DELACK_TIMER_DEFERRED |  \
 851                          TCPF_MTU_REDUCED_DEFERRED)
 852/**
 853 * tcp_release_cb - tcp release_sock() callback
 854 * @sk: socket
 855 *
 856 * called from release_sock() to perform protocol dependent
 857 * actions before socket release.
 858 */
 859void tcp_release_cb(struct sock *sk)
 860{
 861        unsigned long flags, nflags;
 862
 863        /* perform an atomic operation only if at least one flag is set */
 864        do {
 865                flags = sk->sk_tsq_flags;
 866                if (!(flags & TCP_DEFERRED_ALL))
 867                        return;
 868                nflags = flags & ~TCP_DEFERRED_ALL;
 869        } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 870
 871        if (flags & TCPF_TSQ_DEFERRED) {
 872                tcp_tsq_write(sk);
 873                __sock_put(sk);
 874        }
 875        /* Here begins the tricky part :
 876         * We are called from release_sock() with :
 877         * 1) BH disabled
 878         * 2) sk_lock.slock spinlock held
 879         * 3) socket owned by us (sk->sk_lock.owned == 1)
 880         *
  881         * But the following code is meant to be called from BH handlers,
  882         * so we should keep BH disabled, but release socket ownership early.
 883         */
 884        sock_release_ownership(sk);
 885
 886        if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 887                tcp_write_timer_handler(sk);
 888                __sock_put(sk);
 889        }
 890        if (flags & TCPF_DELACK_TIMER_DEFERRED) {
 891                tcp_delack_timer_handler(sk);
 892                __sock_put(sk);
 893        }
 894        if (flags & TCPF_MTU_REDUCED_DEFERRED) {
 895                inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 896                __sock_put(sk);
 897        }
 898}
 899EXPORT_SYMBOL(tcp_release_cb);
 900
 901void __init tcp_tasklet_init(void)
 902{
 903        int i;
 904
 905        for_each_possible_cpu(i) {
 906                struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
 907
 908                INIT_LIST_HEAD(&tsq->head);
 909                tasklet_init(&tsq->tasklet,
 910                             tcp_tasklet_func,
 911                             (unsigned long)tsq);
 912        }
 913}
 914
 915/*
 916 * Write buffer destructor automatically called from kfree_skb.
 917 * We can't xmit new skbs from this context, as we might already
 918 * hold qdisc lock.
 919 */
 920void tcp_wfree(struct sk_buff *skb)
 921{
 922        struct sock *sk = skb->sk;
 923        struct tcp_sock *tp = tcp_sk(sk);
 924        unsigned long flags, nval, oval;
 925
 926        /* Keep one reference on sk_wmem_alloc.
 927         * Will be released by sk_free() from here or tcp_tasklet_func()
 928         */
 929        WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 930
 931        /* If this softirq is serviced by ksoftirqd, we are likely under stress.
 932         * Wait until our queues (qdisc + devices) are drained.
 933         * This gives :
  934         * - fewer callbacks to tcp_write_xmit(), reducing stress (batches)
 935         * - chance for incoming ACK (processed by another cpu maybe)
 936         *   to migrate this flow (skb->ooo_okay will be eventually set)
 937         */
 938        if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 939                goto out;
 940
 941        for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
 942                struct tsq_tasklet *tsq;
 943                bool empty;
 944
 945                if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 946                        goto out;
 947
 948                nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
 949                nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 950                if (nval != oval)
 951                        continue;
 952
 953                /* queue this socket to tasklet queue */
 954                local_irq_save(flags);
 955                tsq = this_cpu_ptr(&tsq_tasklet);
 956                empty = list_empty(&tsq->head);
 957                list_add(&tp->tsq_node, &tsq->head);
 958                if (empty)
 959                        tasklet_schedule(&tsq->tasklet);
 960                local_irq_restore(flags);
 961                return;
 962        }
 963out:
 964        sk_free(sk);
 965}
 966
 967/* Note: Called under soft irq.
 968 * We can call TCP stack right away, unless socket is owned by user.
 969 */
 970enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 971{
 972        struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
 973        struct sock *sk = (struct sock *)tp;
 974
 975        tcp_tsq_handler(sk);
 976        sock_put(sk);
 977
 978        return HRTIMER_NORESTART;
 979}
 980
 981static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
 982                                      u64 prior_wstamp)
 983{
 984        struct tcp_sock *tp = tcp_sk(sk);
 985
 986        if (sk->sk_pacing_status != SK_PACING_NONE) {
 987                unsigned long rate = sk->sk_pacing_rate;
 988
 989                /* Original sch_fq does not pace first 10 MSS
 990                 * Note that tp->data_segs_out overflows after 2^32 packets,
 991                 * this is a minor annoyance.
 992                 */
 993                if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
 994                        u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
 995                        u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
 996
 997                        /* take into account OS jitter */
 998                        len_ns -= min_t(u64, len_ns / 2, credit);
 999                        tp->tcp_wstamp_ns += len_ns;
1000                }
1001        }
1002        list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1003}
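
     /* Illustrative pacing math: at sk_pacing_rate = 125000000 bytes/s
      * (~1 Gbit/s) a 1500-byte skb accounts for 1500 * 10^9 / 125000000 =
      * 12000 ns, so tcp_wstamp_ns advances by up to 12 us, minus at most half
      * of that as credit for time already elapsed since the last departure.
      */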
1004
 1005/* This routine actually transmits TCP packets queued up by
 1006 * tcp_sendmsg().  This is used by both the initial
1007 * transmission and possible later retransmissions.
1008 * All SKB's seen here are completely headerless.  It is our
1009 * job to build the TCP header, and pass the packet down to
1010 * IP so it can do the same plus pass the packet off to the
1011 * device.
1012 *
1013 * We are working here with either a clone of the original
1014 * SKB, or a fresh unique copy made by the retransmit engine.
1015 */
1016static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1017                              int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1018{
1019        const struct inet_connection_sock *icsk = inet_csk(sk);
1020        struct inet_sock *inet;
1021        struct tcp_sock *tp;
1022        struct tcp_skb_cb *tcb;
1023        struct tcp_out_options opts;
1024        unsigned int tcp_options_size, tcp_header_size;
1025        struct sk_buff *oskb = NULL;
1026        struct tcp_md5sig_key *md5;
1027        struct tcphdr *th;
1028        u64 prior_wstamp;
1029        int err;
1030
1031        BUG_ON(!skb || !tcp_skb_pcount(skb));
1032        tp = tcp_sk(sk);
1033        prior_wstamp = tp->tcp_wstamp_ns;
1034        tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1035        skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1036        if (clone_it) {
1037                TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
1038                        - tp->snd_una;
1039                oskb = skb;
1040
1041                tcp_skb_tsorted_save(oskb) {
1042                        if (unlikely(skb_cloned(oskb)))
1043                                skb = pskb_copy(oskb, gfp_mask);
1044                        else
1045                                skb = skb_clone(oskb, gfp_mask);
1046                } tcp_skb_tsorted_restore(oskb);
1047
1048                if (unlikely(!skb))
1049                        return -ENOBUFS;
1050        }
1051
1052        inet = inet_sk(sk);
1053        tcb = TCP_SKB_CB(skb);
1054        memset(&opts, 0, sizeof(opts));
1055
1056        if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
1057                tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1058        else
1059                tcp_options_size = tcp_established_options(sk, skb, &opts,
1060                                                           &md5);
1061        tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1062
1063        /* if no packet is in qdisc/device queue, then allow XPS to select
1064         * another queue. We can be called from tcp_tsq_handler()
1065         * which holds one reference to sk.
1066         *
1067         * TODO: Ideally, in-flight pure ACK packets should not matter here.
1068         * One way to get this would be to set skb->truesize = 2 on them.
1069         */
1070        skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
1071
1072        /* If we had to use memory reserve to allocate this skb,
1073         * this might cause drops if packet is looped back :
1074         * Other socket might not have SOCK_MEMALLOC.
1075         * Packets not looped back do not care about pfmemalloc.
1076         */
1077        skb->pfmemalloc = 0;
1078
1079        skb_push(skb, tcp_header_size);
1080        skb_reset_transport_header(skb);
1081
1082        skb_orphan(skb);
1083        skb->sk = sk;
1084        skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1085        skb_set_hash_from_sk(skb, sk);
1086        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1087
1088        skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
1089
1090        /* Build TCP header and checksum it. */
1091        th = (struct tcphdr *)skb->data;
1092        th->source              = inet->inet_sport;
1093        th->dest                = inet->inet_dport;
1094        th->seq                 = htonl(tcb->seq);
1095        th->ack_seq             = htonl(rcv_nxt);
1096        *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
1097                                        tcb->tcp_flags);
1098
1099        th->check               = 0;
1100        th->urg_ptr             = 0;
1101
1102        /* The urg_mode check is necessary during a below snd_una win probe */
1103        if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1104                if (before(tp->snd_up, tcb->seq + 0x10000)) {
1105                        th->urg_ptr = htons(tp->snd_up - tcb->seq);
1106                        th->urg = 1;
1107                } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
1108                        th->urg_ptr = htons(0xFFFF);
1109                        th->urg = 1;
1110                }
1111        }
1112
1113        tcp_options_write((__be32 *)(th + 1), tp, &opts);
1114        skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1115        if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1116                th->window      = htons(tcp_select_window(sk));
1117                tcp_ecn_send(sk, skb, th, tcp_header_size);
1118        } else {
1119                /* RFC1323: The window in SYN & SYN/ACK segments
1120                 * is never scaled.
1121                 */
1122                th->window      = htons(min(tp->rcv_wnd, 65535U));
1123        }
1124#ifdef CONFIG_TCP_MD5SIG
1125        /* Calculate the MD5 hash, as we have all we need now */
1126        if (md5) {
1127                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1128                tp->af_specific->calc_md5_hash(opts.hash_location,
1129                                               md5, sk, skb);
1130        }
1131#endif
1132
1133        icsk->icsk_af_ops->send_check(sk, skb);
1134
1135        if (likely(tcb->tcp_flags & TCPHDR_ACK))
1136                tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1137
1138        if (skb->len != tcp_header_size) {
1139                tcp_event_data_sent(tp, sk);
1140                tp->data_segs_out += tcp_skb_pcount(skb);
1141                tp->bytes_sent += skb->len - tcp_header_size;
1142        }
1143
1144        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1145                TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1146                              tcp_skb_pcount(skb));
1147
1148        tp->segs_out += tcp_skb_pcount(skb);
 1149        /* OK, it's time to fill skb_shinfo(skb)->gso_{segs|size} */
1150        skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1151        skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1152
1153        /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
1154
1155        /* Cleanup our debris for IP stacks */
1156        memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1157                               sizeof(struct inet6_skb_parm)));
1158
1159        err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1160
1161        if (unlikely(err > 0)) {
1162                tcp_enter_cwr(sk);
1163                err = net_xmit_eval(err);
1164        }
1165        if (!err && oskb) {
1166                tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1167                tcp_rate_skb_sent(sk, oskb);
1168        }
1169        return err;
1170}
1171
1172static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1173                            gfp_t gfp_mask)
1174{
1175        return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1176                                  tcp_sk(sk)->rcv_nxt);
1177}
1178
1179/* This routine just queues the buffer for sending.
1180 *
 1181 * NOTE: the probe0 timer is not checked, do not forget tcp_push_pending_frames(),
 1182 * otherwise the socket can stall.
1183 */
1184static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1185{
1186        struct tcp_sock *tp = tcp_sk(sk);
1187
1188        /* Advance write_seq and place onto the write_queue. */
1189        tp->write_seq = TCP_SKB_CB(skb)->end_seq;
1190        __skb_header_release(skb);
1191        tcp_add_write_queue_tail(sk, skb);
1192        sk->sk_wmem_queued += skb->truesize;
1193        sk_mem_charge(sk, skb->truesize);
1194}
1195
1196/* Initialize TSO segments for a packet. */
1197static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1198{
1199        if (skb->len <= mss_now) {
1200                /* Avoid the costly divide in the normal
1201                 * non-TSO case.
1202                 */
1203                tcp_skb_pcount_set(skb, 1);
1204                TCP_SKB_CB(skb)->tcp_gso_size = 0;
1205        } else {
1206                tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1207                TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1208        }
1209}
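
     /* Example: a 4380-byte skb with mss_now = 1460 gets pcount
      * DIV_ROUND_UP(4380, 1460) = 3 and gso_size = 1460, while anything that
      * fits within one MSS keeps pcount = 1 and gso_size = 0.
      */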
1210
 1211/* The pcount of an skb in the middle of the write queue got changed; we need
 1212 * various tweaks to fix the counters.
1213 */
1214static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1215{
1216        struct tcp_sock *tp = tcp_sk(sk);
1217
1218        tp->packets_out -= decr;
1219
1220        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1221                tp->sacked_out -= decr;
1222        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1223                tp->retrans_out -= decr;
1224        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1225                tp->lost_out -= decr;
1226
1227        /* Reno case is special. Sigh... */
1228        if (tcp_is_reno(tp) && decr > 0)
1229                tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1230
1231        if (tp->lost_skb_hint &&
1232            before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1233            (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1234                tp->lost_cnt_hint -= decr;
1235
1236        tcp_verify_left_out(tp);
1237}
1238
1239static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1240{
1241        return TCP_SKB_CB(skb)->txstamp_ack ||
1242                (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1243}
1244
1245static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1246{
1247        struct skb_shared_info *shinfo = skb_shinfo(skb);
1248
1249        if (unlikely(tcp_has_tx_tstamp(skb)) &&
1250            !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1251                struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1252                u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1253
1254                shinfo->tx_flags &= ~tsflags;
1255                shinfo2->tx_flags |= tsflags;
1256                swap(shinfo->tskey, shinfo2->tskey);
1257                TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1258                TCP_SKB_CB(skb)->txstamp_ack = 0;
1259        }
1260}
1261
1262static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1263{
1264        TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1265        TCP_SKB_CB(skb)->eor = 0;
1266}
1267
1268/* Insert buff after skb on the write or rtx queue of sk.  */
1269static void tcp_insert_write_queue_after(struct sk_buff *skb,
1270                                         struct sk_buff *buff,
1271                                         struct sock *sk,
1272                                         enum tcp_queue tcp_queue)
1273{
1274        if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1275                __skb_queue_after(&sk->sk_write_queue, skb, buff);
1276        else
1277                tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1278}
1279
1280/* Function to create two new TCP segments.  Shrinks the given segment
1281 * to the specified size and appends a new segment with the rest of the
1282 * packet to the list.  This won't be called frequently, I hope.
1283 * Remember, these are still headerless SKBs at this point.
1284 */
1285int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1286                 struct sk_buff *skb, u32 len,
1287                 unsigned int mss_now, gfp_t gfp)
1288{
1289        struct tcp_sock *tp = tcp_sk(sk);
1290        struct sk_buff *buff;
1291        int nsize, old_factor;
1292        int nlen;
1293        u8 flags;
1294
1295        if (WARN_ON(len > skb->len))
1296                return -EINVAL;
1297
1298        nsize = skb_headlen(skb) - len;
1299        if (nsize < 0)
1300                nsize = 0;
1301
1302        if (skb_unclone(skb, gfp))
1303                return -ENOMEM;
1304
1305        /* Get a new skb... force flag on. */
1306        buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
1307        if (!buff)
1308                return -ENOMEM; /* We'll just try again later. */
1309
1310        sk->sk_wmem_queued += buff->truesize;
1311        sk_mem_charge(sk, buff->truesize);
1312        nlen = skb->len - len - nsize;
1313        buff->truesize += nlen;
1314        skb->truesize -= nlen;
1315
1316        /* Correct the sequence numbers. */
1317        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1318        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1319        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1320
1321        /* PSH and FIN should only be set in the second packet. */
1322        flags = TCP_SKB_CB(skb)->tcp_flags;
1323        TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1324        TCP_SKB_CB(buff)->tcp_flags = flags;
1325        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1326        tcp_skb_fragment_eor(skb, buff);
1327
1328        skb_split(skb, buff, len);
1329
1330        buff->ip_summed = CHECKSUM_PARTIAL;
1331
1332        buff->tstamp = skb->tstamp;
1333        tcp_fragment_tstamp(skb, buff);
1334
1335        old_factor = tcp_skb_pcount(skb);
1336
1337        /* Fix up tso_factor for both original and new SKB.  */
1338        tcp_set_skb_tso_segs(skb, mss_now);
1339        tcp_set_skb_tso_segs(buff, mss_now);
1340
1341        /* Update delivered info for the new segment */
1342        TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1343
1344        /* If this packet has been sent out already, we must
1345         * adjust the various packet counters.
1346         */
1347        if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1348                int diff = old_factor - tcp_skb_pcount(skb) -
1349                        tcp_skb_pcount(buff);
1350
1351                if (diff)
1352                        tcp_adjust_pcount(sk, skb, diff);
1353        }
1354
1355        /* Link BUFF into the send queue. */
1356        __skb_header_release(buff);
1357        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1358        if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1359                list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1360
1361        return 0;
1362}
1363
1364/* This is similar to __pskb_pull_tail(). The difference is that pulled
1365 * data is not copied, but immediately discarded.
1366 */
1367static int __pskb_trim_head(struct sk_buff *skb, int len)
1368{
1369        struct skb_shared_info *shinfo;
1370        int i, k, eat;
1371
1372        eat = min_t(int, len, skb_headlen(skb));
1373        if (eat) {
1374                __skb_pull(skb, eat);
1375                len -= eat;
1376                if (!len)
1377                        return 0;
1378        }
1379        eat = len;
1380        k = 0;
1381        shinfo = skb_shinfo(skb);
1382        for (i = 0; i < shinfo->nr_frags; i++) {
1383                int size = skb_frag_size(&shinfo->frags[i]);
1384
1385                if (size <= eat) {
1386                        skb_frag_unref(skb, i);
1387                        eat -= size;
1388                } else {
1389                        shinfo->frags[k] = shinfo->frags[i];
1390                        if (eat) {
1391                                shinfo->frags[k].page_offset += eat;
1392                                skb_frag_size_sub(&shinfo->frags[k], eat);
1393                                eat = 0;
1394                        }
1395                        k++;
1396                }
1397        }
1398        shinfo->nr_frags = k;
1399
1400        skb->data_len -= len;
1401        skb->len = skb->data_len;
1402        return len;
1403}
1404
1405/* Remove acked data from a packet in the transmit queue. */
1406int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1407{
1408        u32 delta_truesize;
1409
1410        if (skb_unclone(skb, GFP_ATOMIC))
1411                return -ENOMEM;
1412
1413        delta_truesize = __pskb_trim_head(skb, len);
1414
1415        TCP_SKB_CB(skb)->seq += len;
1416        skb->ip_summed = CHECKSUM_PARTIAL;
1417
1418        if (delta_truesize) {
1419                skb->truesize      -= delta_truesize;
1420                sk->sk_wmem_queued -= delta_truesize;
1421                sk_mem_uncharge(sk, delta_truesize);
1422                sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1423        }
1424
1425        /* Any change of skb->len requires recalculation of tso factor. */
1426        if (tcp_skb_pcount(skb) > 1)
1427                tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
1428
1429        return 0;
1430}
1431
1432/* Calculate MSS not accounting any TCP options.  */
1433static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1434{
1435        const struct tcp_sock *tp = tcp_sk(sk);
1436        const struct inet_connection_sock *icsk = inet_csk(sk);
1437        int mss_now;
1438
1439        /* Calculate base mss without TCP options:
1440           It is MMS_S - sizeof(tcphdr) of rfc1122
1441         */
1442        mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1443
1444        /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1445        if (icsk->icsk_af_ops->net_frag_header_len) {
1446                const struct dst_entry *dst = __sk_dst_get(sk);
1447
1448                if (dst && dst_allfrag(dst))
1449                        mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1450        }
1451
1452        /* Clamp it (mss_clamp does not include tcp options) */
1453        if (mss_now > tp->rx_opt.mss_clamp)
1454                mss_now = tp->rx_opt.mss_clamp;
1455
1456        /* Now subtract optional transport overhead */
1457        mss_now -= icsk->icsk_ext_hdr_len;
1458
1459        /* Then reserve room for full set of TCP options and 8 bytes of data */
1460        if (mss_now < 48)
1461                mss_now = 48;
1462        return mss_now;
1463}
1464
1465/* Calculate MSS. Not accounting for SACKs here.  */
1466int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1467{
1468        /* Subtract TCP options size, not including SACKs */
1469        return __tcp_mtu_to_mss(sk, pmtu) -
1470               (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1471}
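    /* Worked example, assuming a plain IPv4 path (20-byte network header,
     * no IP options or extension headers) and a path MTU of 1500:
     * __tcp_mtu_to_mss() yields 1500 - 20 - 20 = 1460, and tcp_mtu_to_mss()
     * then removes the fixed option space counted in tcp_header_len,
     * e.g. 12 bytes when timestamps are in use, leaving 1448.
     */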
1472
1473/* Inverse of above */
1474int tcp_mss_to_mtu(struct sock *sk, int mss)
1475{
1476        const struct tcp_sock *tp = tcp_sk(sk);
1477        const struct inet_connection_sock *icsk = inet_csk(sk);
1478        int mtu;
1479
1480        mtu = mss +
1481              tp->tcp_header_len +
1482              icsk->icsk_ext_hdr_len +
1483              icsk->icsk_af_ops->net_header_len;
1484
1485        /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
1486        if (icsk->icsk_af_ops->net_frag_header_len) {
1487                const struct dst_entry *dst = __sk_dst_get(sk);
1488
1489                if (dst && dst_allfrag(dst))
1490                        mtu += icsk->icsk_af_ops->net_frag_header_len;
1491        }
1492        return mtu;
1493}
1494EXPORT_SYMBOL(tcp_mss_to_mtu);
1495
1496/* MTU probing init per socket */
1497void tcp_mtup_init(struct sock *sk)
1498{
1499        struct tcp_sock *tp = tcp_sk(sk);
1500        struct inet_connection_sock *icsk = inet_csk(sk);
1501        struct net *net = sock_net(sk);
1502
1503        icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1504        icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1505                               icsk->icsk_af_ops->net_header_len;
1506        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1507        icsk->icsk_mtup.probe_size = 0;
1508        if (icsk->icsk_mtup.enabled)
1509                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1510}
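    /* Note on the initial search range: probing stays disabled unless
     * sysctl_tcp_mtu_probing is set above 1; search_high is the MTU implied
     * by the negotiated mss_clamp, and search_low is the MTU corresponding
     * to sysctl_tcp_base_mss (1024 bytes by default in this era, so roughly
     * 1064-1076 on IPv4 depending on option overhead).
     */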
1511EXPORT_SYMBOL(tcp_mtup_init);
1512
1513/* This function synchronizes snd mss to current pmtu/exthdr set.
1514
1515   tp->rx_opt.user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
1516   count for TCP options, but includes only the bare TCP header.
1517
1518   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1519   It is minimum of user_mss and mss received with SYN.
1520   It also does not include TCP options.
1521
1522   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1523
1524   tp->mss_cache is current effective sending mss, including
1525   all tcp options except for SACKs. It is evaluated,
1526   taking into account current pmtu, but never exceeds
1527   tp->rx_opt.mss_clamp.
1528
1529   NOTE1. rfc1122 clearly states that advertised MSS
1530   DOES NOT include either tcp or ip options.
1531
1532   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1533   are READ ONLY outside this function.         --ANK (980731)
1534 */
1535unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1536{
1537        struct tcp_sock *tp = tcp_sk(sk);
1538        struct inet_connection_sock *icsk = inet_csk(sk);
1539        int mss_now;
1540
1541        if (icsk->icsk_mtup.search_high > pmtu)
1542                icsk->icsk_mtup.search_high = pmtu;
1543
1544        mss_now = tcp_mtu_to_mss(sk, pmtu);
1545        mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1546
1547        /* And store cached results */
1548        icsk->icsk_pmtu_cookie = pmtu;
1549        if (icsk->icsk_mtup.enabled)
1550                mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1551        tp->mss_cache = mss_now;
1552
1553        return mss_now;
1554}
1555EXPORT_SYMBOL(tcp_sync_mss);
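    /* Example: a peer advertises MSS 1460, so mss_clamp stays 1460 for the
     * life of the connection; if the path MTU later drops to 1400,
     * tcp_sync_mss(sk, 1400) recomputes mss_cache from the new pmtu (and
     * from search_low while MTU probing is enabled) without touching
     * mss_clamp.
     */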
1556
1557/* Compute the current effective MSS, taking SACKs and IP options,
1558 * and even PMTU discovery events into account.
1559 */
1560unsigned int tcp_current_mss(struct sock *sk)
1561{
1562        const struct tcp_sock *tp = tcp_sk(sk);
1563        const struct dst_entry *dst = __sk_dst_get(sk);
1564        u32 mss_now;
1565        unsigned int header_len;
1566        struct tcp_out_options opts;
1567        struct tcp_md5sig_key *md5;
1568
1569        mss_now = tp->mss_cache;
1570
1571        if (dst) {
1572                u32 mtu = dst_mtu(dst);
1573                if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1574                        mss_now = tcp_sync_mss(sk, mtu);
1575        }
1576
1577        header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1578                     sizeof(struct tcphdr);
1579        /* The mss_cache is sized based on tp->tcp_header_len, which assumes
1580         * some common options. If this is an odd packet (because we have SACK
1581         * blocks etc) then our calculated header_len will be different, and
1582         * we have to adjust mss_now correspondingly */
1583        if (header_len != tp->tcp_header_len) {
1584                int delta = (int) header_len - tp->tcp_header_len;
1585                mss_now -= delta;
1586        }
1587
1588        return mss_now;
1589}
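    /* e.g. while SACK blocks are being echoed, the established options grow
     * by 4 bytes of aligned SACK header plus 8 bytes per block, and mss_now
     * shrinks by the same amount for that transmission.
     */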
1590
1591/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
1592 * As additional protection, we do not touch cwnd in retransmission phases,
1593 * nor if the application has hit its sndbuf limit recently.
1594 */
1595static void tcp_cwnd_application_limited(struct sock *sk)
1596{
1597        struct tcp_sock *tp = tcp_sk(sk);
1598
1599        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1600            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1601                /* Limited by application or receiver window. */
1602                u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1603                u32 win_used = max(tp->snd_cwnd_used, init_win);
1604                if (win_used < tp->snd_cwnd) {
1605                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
1606                        tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1607                }
1608                tp->snd_cwnd_used = 0;
1609        }
1610        tp->snd_cwnd_stamp = tcp_jiffies32;
1611}
1612
1613static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1614{
1615        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1616        struct tcp_sock *tp = tcp_sk(sk);
1617
1618        /* Track the maximum number of outstanding packets in each
1619         * window, and remember whether we were cwnd-limited then.
1620         */
1621        if (!before(tp->snd_una, tp->max_packets_seq) ||
1622            tp->packets_out > tp->max_packets_out) {
1623                tp->max_packets_out = tp->packets_out;
1624                tp->max_packets_seq = tp->snd_nxt;
1625                tp->is_cwnd_limited = is_cwnd_limited;
1626        }
1627
1628        if (tcp_is_cwnd_limited(sk)) {
1629                /* Network is fed fully. */
1630                tp->snd_cwnd_used = 0;
1631                tp->snd_cwnd_stamp = tcp_jiffies32;
1632        } else {
1633                /* Network starves. */
1634                if (tp->packets_out > tp->snd_cwnd_used)
1635                        tp->snd_cwnd_used = tp->packets_out;
1636
1637                if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1638                    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1639                    !ca_ops->cong_control)
1640                        tcp_cwnd_application_limited(sk);
1641
1642                /* The following conditions together indicate the starvation
1643                 * is caused by insufficient sender buffer:
1644                 * 1) just sent some data (see tcp_write_xmit)
1645                 * 2) not cwnd limited (this else condition)
1646                 * 3) no more data to send (tcp_write_queue_empty())
1647                 * 4) application is hitting buffer limit (SOCK_NOSPACE)
1648                 */
1649                if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1650                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1651                    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1652                        tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1653        }
1654}
1655
1656/* Minshall's variant of the Nagle send check. */
1657static bool tcp_minshall_check(const struct tcp_sock *tp)
1658{
1659        return after(tp->snd_sml, tp->snd_una) &&
1660                !after(tp->snd_sml, tp->snd_nxt);
1661}
1662
1663/* Update snd_sml if this skb is under mss
1664 * Note that a TSO packet might end with a sub-mss segment
1665 * The test is really :
1666 * if ((skb->len % mss) != 0)
1667 *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1668 * But we can avoid doing the divide again given we already have
1669 *  skb_pcount = skb->len / mss_now
1670 */
1671static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1672                                const struct sk_buff *skb)
1673{
1674        if (skb->len < tcp_skb_pcount(skb) * mss_now)
1675                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1676}
1677
1678/* Return false if the packet can be sent now without violating Nagle's rules:
1679 * 1. It is full sized. (provided by caller in %partial bool)
1680 * 2. Or it contains FIN. (already checked by caller)
1681 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1682 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1683 *    With Minshall's modification: all sent small packets are ACKed.
1684 */
1685static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1686                            int nonagle)
1687{
1688        return partial &&
1689                ((nonagle & TCP_NAGLE_CORK) ||
1690                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1691}
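    /* Concretely: a sub-mss tail is held back either while the socket is
     * corked, or when Nagle is fully enabled (nonagle == 0) and a
     * previously sent sub-mss segment is still unacknowledged
     * (Minshall's variant).
     */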
1692
1693/* Return how many segs we'd like on a TSO packet,
1694 * to send one TSO packet per ms
1695 */
1696static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1697                            int min_tso_segs)
1698{
1699        u32 bytes, segs;
1700
1701        bytes = min_t(unsigned long,
1702                      sk->sk_pacing_rate >> sk->sk_pacing_shift,
1703                      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1704
1705        /* Goal is to send at least one packet per ms,
1706         * not one big TSO packet every 100 ms.
1707         * This preserves ACK clocking and is consistent
1708         * with tcp_tso_should_defer() heuristic.
1709         */
1710        segs = max_t(u32, bytes / mss_now, min_tso_segs);
1711
1712        return segs;
1713}
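    /* Rough numbers, assuming the default pacing shift of 10 (~1 ms of data
     * at the pacing rate) and a 1448-byte MSS: at ~1 Gbit/s the budget is
     * ~122 KB, clamped by the GSO size limit to a few dozen full segments;
     * at 10 Mbit/s it falls below one MSS and min_tso_segs takes over.
     */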
1714
1715/* Return the number of segments we want in the skb we are transmitting.
1716 * See if congestion control module wants to decide; otherwise, autosize.
1717 */
1718static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1719{
1720        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1721        u32 min_tso, tso_segs;
1722
1723        min_tso = ca_ops->min_tso_segs ?
1724                        ca_ops->min_tso_segs(sk) :
1725                        sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1726
1727        tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
1728        return min_t(u32, tso_segs, sk->sk_gso_max_segs);
1729}
1730
1731/* Returns the portion of skb which can be sent right away */
1732static unsigned int tcp_mss_split_point(const struct sock *sk,
1733                                        const struct sk_buff *skb,
1734                                        unsigned int mss_now,
1735                                        unsigned int max_segs,
1736                                        int nonagle)
1737{
1738        const struct tcp_sock *tp = tcp_sk(sk);
1739        u32 partial, needed, window, max_len;
1740
1741        window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1742        max_len = mss_now * max_segs;
1743
1744        if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1745                return max_len;
1746
1747        needed = min(skb->len, window);
1748
1749        if (max_len <= needed)
1750                return max_len;
1751
1752        partial = needed % mss_now;
1753        /* If last segment is not a full MSS, check if Nagle rules allow us
1754         * to include this last segment in this skb.
1755         * Otherwise, we'll split the skb at last MSS boundary
1756         */
1757        if (tcp_nagle_check(partial != 0, tp, nonagle))
1758                return needed - partial;
1759
1760        return needed;
1761}
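    /* Example: mss_now = 1000, max_segs = 4, 3500 bytes of send window and
     * a 5000-byte tail skb: max_len = 4000 exceeds the window, so
     * needed = 3500 and partial = 500; if Nagle forbids the trailing
     * partial segment we return 3000, otherwise 3500.
     */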
1762
1763/* Can at least one segment of SKB be sent right now, according to the
1764 * congestion window rules?  If so, return how many segments are allowed.
1765 */
1766static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1767                                         const struct sk_buff *skb)
1768{
1769        u32 in_flight, cwnd, halfcwnd;
1770
1771        /* Don't be strict about the congestion window for the final FIN.  */
1772        if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1773            tcp_skb_pcount(skb) == 1)
1774                return 1;
1775
1776        in_flight = tcp_packets_in_flight(tp);
1777        cwnd = tp->snd_cwnd;
1778        if (in_flight >= cwnd)
1779                return 0;
1780
1781        /* For better scheduling, ensure we have at least
1782         * 2 GSO packets in flight.
1783         */
1784        halfcwnd = max(cwnd >> 1, 1U);
1785        return min(halfcwnd, cwnd - in_flight);
1786}
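    /* e.g. snd_cwnd = 10 with 3 packets in flight allows
     * min(10 / 2, 10 - 3) = 5 more segments from this skb.
     */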
1787
1788/* Initialize TSO state of a skb.
1789 * This must be invoked the first time we consider transmitting
1790 * SKB onto the wire.
1791 */
1792static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1793{
1794        int tso_segs = tcp_skb_pcount(skb);
1795
1796        if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1797                tcp_set_skb_tso_segs(skb, mss_now);
1798                tso_segs = tcp_skb_pcount(skb);
1799        }
1800        return tso_segs;
1801}
1802
1803
1804/* Return true if the Nagle test allows this packet to be
1805 * sent now.
1806 */
1807static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1808                                  unsigned int cur_mss, int nonagle)
1809{
1810        /* The Nagle rule does not apply to frames which sit in the middle of the
1811         * write_queue (they have no chance to get new data).
1812         *
1813         * This is implemented in the callers, where they modify the 'nonagle'
1814         * argument based upon the location of SKB in the send queue.
1815         */
1816        if (nonagle & TCP_NAGLE_PUSH)
1817                return true;
1818
1819        /* Don't use the nagle rule for urgent data (or for the final FIN). */
1820        if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1821                return true;
1822
1823        if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1824                return true;
1825
1826        return false;
1827}
1828
1829/* Does at least the first segment of SKB fit into the send window? */
1830static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1831                             const struct sk_buff *skb,
1832                             unsigned int cur_mss)
1833{
1834        u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1835
1836        if (skb->len > cur_mss)
1837                end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1838
1839        return !after(end_seq, tcp_wnd_end(tp));
1840}
1841
1842/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
1843 * which is put after SKB on the list.  It is very much like
1844 * tcp_fragment() except that it may make several kinds of assumptions
1845 * in order to speed up the splitting operation.  In particular, we
1846 * know that all the data is in scatter-gather pages, and that the
1847 * packet has never been sent out before (and thus is not cloned).
1848 */
1849static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1850                        unsigned int mss_now, gfp_t gfp)
1851{
1852        int nlen = skb->len - len;
1853        struct sk_buff *buff;
1854        u8 flags;
1855
1856        /* All of a TSO frame must be composed of paged data.  */
1857        if (skb->len != skb->data_len)
1858                return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
1859                                    skb, len, mss_now, gfp);
1860
1861        buff = sk_stream_alloc_skb(sk, 0, gfp, true);
1862        if (unlikely(!buff))
1863                return -ENOMEM;
1864
1865        sk->sk_wmem_queued += buff->truesize;
1866        sk_mem_charge(sk, buff->truesize);
1867        buff->truesize += nlen;
1868        skb->truesize -= nlen;
1869
1870        /* Correct the sequence numbers. */
1871        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1872        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1873        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1874
1875        /* PSH and FIN should only be set in the second packet. */
1876        flags = TCP_SKB_CB(skb)->tcp_flags;
1877        TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1878        TCP_SKB_CB(buff)->tcp_flags = flags;
1879
1880        /* This packet was never sent out yet, so no SACK bits. */
1881        TCP_SKB_CB(buff)->sacked = 0;
1882
1883        tcp_skb_fragment_eor(skb, buff);
1884
1885        buff->ip_summed = CHECKSUM_PARTIAL;
1886        skb_split(skb, buff, len);
1887        tcp_fragment_tstamp(skb, buff);
1888
1889        /* Fix up tso_factor for both original and new SKB.  */
1890        tcp_set_skb_tso_segs(skb, mss_now);
1891        tcp_set_skb_tso_segs(buff, mss_now);
1892
1893        /* Link BUFF into the send queue. */
1894        __skb_header_release(buff);
1895        tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
1896
1897        return 0;
1898}
1899
1900/* Try to defer sending, if possible, in order to minimize the amount
1901 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
1902 *
1903 * This algorithm is from John Heffner.
1904 */
1905static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
1906                                 bool *is_cwnd_limited,
1907                                 bool *is_rwnd_limited,
1908                                 u32 max_segs)
1909{
1910        const struct inet_connection_sock *icsk = inet_csk(sk);
1911        u32 send_win, cong_win, limit, in_flight;
1912        struct tcp_sock *tp = tcp_sk(sk);
1913        struct sk_buff *head;
1914        int win_divisor;
1915        s64 delta;
1916
1917        if (icsk->icsk_ca_state >= TCP_CA_Recovery)
1918                goto send_now;
1919
1920        /* Avoid bursty behavior by allowing defer
1921         * only if the last write was recent (1 ms).
1922         * Note that tp->tcp_wstamp_ns can be in the future if we have
1923         * packets waiting in a qdisc or device for EDT delivery.
1924         */
1925        delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
1926        if (delta > 0)
1927                goto send_now;
1928
1929        in_flight = tcp_packets_in_flight(tp);
1930
1931        BUG_ON(tcp_skb_pcount(skb) <= 1);
1932        BUG_ON(tp->snd_cwnd <= in_flight);
1933
1934        send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1935
1936        /* From in_flight test above, we know that cwnd > in_flight.  */
1937        cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
1938
1939        limit = min(send_win, cong_win);
1940
1941        /* If a full-sized TSO skb can be sent, do it. */
1942        if (limit >= max_segs * tp->mss_cache)
1943                goto send_now;
1944
1945        /* A non-tail skb won't get any more data; is all of it already sendable? */
1946        if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1947                goto send_now;
1948
1949        win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
1950        if (win_divisor) {
1951                u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1952
1953                /* If at least some fraction of a window is available,
1954                 * just use it.
1955                 */
1956                chunk /= win_divisor;
1957                if (limit >= chunk)
1958                        goto send_now;
1959        } else {
1960                /* Different approach, try not to defer past a single
1961                 * ACK.  Receiver should ACK every other full sized
1962                 * frame, so if we have space for more than 3 frames
1963                 * then send now.
1964                 */
1965                if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
1966                        goto send_now;
1967        }
1968
1969        /* TODO : use tsorted_sent_queue ? */
1970        head = tcp_rtx_queue_head(sk);
1971        if (!head)
1972                goto send_now;
1973        delta = tp->tcp_clock_cache - head->tstamp;
1974        /* If next ACK is likely to come too late (half srtt), do not defer */
1975        if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
1976                goto send_now;
1977
1978        /* Ok, it looks like it is advisable to defer.
1979         * Three cases are tracked :
1980         * 1) We are cwnd-limited
1981         * 2) We are rwnd-limited
1982         * 3) We are application limited.
1983         */
1984        if (cong_win < send_win) {
1985                if (cong_win <= skb->len) {
1986                        *is_cwnd_limited = true;
1987                        return true;
1988                }
1989        } else {
1990                if (send_win <= skb->len) {
1991                        *is_rwnd_limited = true;
1992                        return true;
1993                }
1994        }
1995
1996        /* If this packet won't get more data, do not wait. */
1997        if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
1998            TCP_SKB_CB(skb)->eor)
1999                goto send_now;
2000
2001        return true;
2002
2003send_now:
2004        return false;
2005}
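    /* With the default tcp_tso_win_divisor of 3, we stop deferring as soon
     * as the sendable amount covers at least a third of
     * min(snd_wnd, snd_cwnd * mss_cache); with the divisor set to 0 we
     * instead defer only while no more than about three full-sized frames
     * could be sent.
     */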
2006
2007static inline void tcp_mtu_check_reprobe(struct sock *sk)
2008{
2009        struct inet_connection_sock *icsk = inet_csk(sk);
2010        struct tcp_sock *tp = tcp_sk(sk);
2011        struct net *net = sock_net(sk);
2012        u32 interval;
2013        s32 delta;
2014
2015        interval = net->ipv4.sysctl_tcp_probe_interval;
2016        delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2017        if (unlikely(delta >= interval * HZ)) {
2018                int mss = tcp_current_mss(sk);
2019
2020                /* Update current search range */
2021                icsk->icsk_mtup.probe_size = 0;
2022                icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2023                        sizeof(struct tcphdr) +
2024                        icsk->icsk_af_ops->net_header_len;
2025                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2026
2027                /* Update probe time stamp */
2028                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2029        }
2030}
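    /* i.e. once sysctl_tcp_probe_interval (600 seconds by default) has
     * passed since the last probe attempt, the search range is reset so
     * that probing can start over from the current mss.
     */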
2031
2032static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
2033{
2034        struct sk_buff *skb, *next;
2035
2036        skb = tcp_send_head(sk);
2037        tcp_for_write_queue_from_safe(skb, next, sk) {
2038                if (len <= skb->len)
2039                        break;
2040
2041                if (unlikely(TCP_SKB_CB(skb)->eor))
2042                        return false;
2043
2044                len -= skb->len;
2045        }
2046
2047        return true;
2048}
2049
2050/* Create a new MTU probe if we are ready.
2051 * MTU probing regularly attempts to increase the path MTU by
2052 * deliberately sending larger packets.  This discovers routing
2053 * changes resulting in larger path MTUs.
2054 *
2055 * Returns 0 if we should wait to probe (no cwnd available),
2056 *         1 if a probe was sent,
2057 *         -1 otherwise
2058 */
2059static int tcp_mtu_probe(struct sock *sk)
2060{
2061        struct inet_connection_sock *icsk = inet_csk(sk);
2062        struct tcp_sock *tp = tcp_sk(sk);
2063        struct sk_buff *skb, *nskb, *next;
2064        struct net *net = sock_net(sk);
2065        int probe_size;
2066        int size_needed;
2067        int copy, len;
2068        int mss_now;
2069        int interval;
2070
2071        /* Not currently probing/verifying,
2072         * not in recovery,
2073         * have enough cwnd, and
2074         * not SACKing (the variable headers throw things off)
2075         */
2076        if (likely(!icsk->icsk_mtup.enabled ||
2077                   icsk->icsk_mtup.probe_size ||
2078                   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
2079                   tp->snd_cwnd < 11 ||
2080                   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
2081                return -1;
2082
2083        /* Use binary search for probe_size between tcp_base_mss
2084         * and the current mss_clamp. If (search_high - search_low) is
2085         * smaller than a threshold, back off from probing.
2086         */
2087        mss_now = tcp_current_mss(sk);
2088        probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2089                                    icsk->icsk_mtup.search_low) >> 1);
2090        size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
2091        interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2092        /* When misfortune happens, we are actively reprobing and the
2093         * reprobe timer has expired. We stick with the current probing
2094         * process by not resetting the search range to its original value.
2095         */
2096        if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2097                interval < net->ipv4.sysctl_tcp_probe_threshold) {
2098                /* Check whether enough time has elapsed for
2099                 * another round of probing.
2100                 */
2101                tcp_mtu_check_reprobe(sk);
2102                return -1;
2103        }
2104
2105        /* Have enough data in the send queue to probe? */
2106        if (tp->write_seq - tp->snd_nxt < size_needed)
2107                return -1;
2108
2109        if (tp->snd_wnd < size_needed)
2110                return -1;
2111        if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2112                return 0;
2113
2114        /* Do we need to wait to drain cwnd? With none in flight, don't stall */
2115        if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
2116                if (!tcp_packets_in_flight(tp))
2117                        return -1;
2118                else
2119                        return 0;
2120        }
2121
2122        if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
2123                return -1;
2124
2125        /* We're allowed to probe.  Build it now. */
2126        nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
2127        if (!nskb)
2128                return -1;
2129        sk->sk_wmem_queued += nskb->truesize;
2130        sk_mem_charge(sk, nskb->truesize);
2131
2132        skb = tcp_send_head(sk);
2133
2134        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2135        TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2136        TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2137        TCP_SKB_CB(nskb)->sacked = 0;
2138        nskb->csum = 0;
2139        nskb->ip_summed = CHECKSUM_PARTIAL;
2140
2141        tcp_insert_write_queue_before(nskb, skb, sk);
2142        tcp_highest_sack_replace(sk, skb, nskb);
2143
2144        len = 0;
2145        tcp_for_write_queue_from_safe(skb, next, sk) {
2146                copy = min_t(int, skb->len, probe_size - len);
2147                skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
2148
2149                if (skb->len <= copy) {
2150                        /* We've eaten all the data from this skb.
2151                         * Throw it away. */
2152                        TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2153                        /* If this is the last SKB we copy and eor is set
2154                         * we need to propagate it to the new skb.
2155                         */
2156                        TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2157                        tcp_unlink_write_queue(skb, sk);
2158                        sk_wmem_free_skb(sk, skb);
2159                } else {
2160                        TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
2161                                                   ~(TCPHDR_FIN|TCPHDR_PSH);
2162                        if (!skb_shinfo(skb)->nr_frags) {
2163                                skb_pull(skb, copy);
2164                        } else {
2165                                __pskb_trim_head(skb, copy);
2166                                tcp_set_skb_tso_segs(skb, mss_now);
2167                        }
2168                        TCP_SKB_CB(skb)->seq += copy;
2169                }
2170
2171                len += copy;
2172
2173                if (len >= probe_size)
2174                        break;
2175        }
2176        tcp_init_tso_segs(nskb, nskb->len);
2177
2178        /* We're ready to send.  If this fails, the probe will
2179         * be resegmented into mss-sized pieces by tcp_write_xmit().
2180         */
2181        if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
2182                /* Decrement cwnd here because we are sending
2183                 * effectively two packets. */
2184                tp->snd_cwnd--;
2185                tcp_event_new_data_sent(sk, nskb);
2186
2187                icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2188                tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2189                tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2190
2191                return 1;
2192        }
2193
2194        return -1;
2195}
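    /* Sizing sketch: probe_size is the MSS for the midpoint of the current
     * MTU search range, and size_needed adds (reordering + 1) extra
     * mss_cache-sized segments so that loss of the probe can still be
     * detected via SACK/dupacks; with the usual initial reordering of 3
     * that is probe_size plus four normal segments.
     */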
2196
2197static bool tcp_pacing_check(struct sock *sk)
2198{
2199        struct tcp_sock *tp = tcp_sk(sk);
2200
2201        if (!tcp_needs_internal_pacing(sk))
2202                return false;
2203
2204        if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2205                return false;
2206
2207        if (!hrtimer_is_queued(&tp->pacing_timer)) {
2208                hrtimer_start(&tp->pacing_timer,
2209                              ns_to_ktime(tp->tcp_wstamp_ns),
2210                              HRTIMER_MODE_ABS_PINNED_SOFT);
2211                sock_hold(sk);
2212        }
2213        return true;
2214}
2215
2216/* TCP Small Queues :
2217 * Control the number of packets in qdisc/devices: two packets or ~1 ms worth of data.
2218 * (These limits are doubled for retransmits)
2219 * This allows for :
2220 *  - better RTT estimation and ACK scheduling
2221 *  - faster recovery
2222 *  - high rates
2223 * Alas, some drivers / subsystems require a fair amount
2224 * of queued bytes to ensure line rate.
2225 * One example is wifi aggregation (802.11 AMPDU)
2226 */
2227static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2228                                  unsigned int factor)
2229{
2230        unsigned long limit;
2231
2232        limit = max_t(unsigned long,
2233                      2 * skb->truesize,
2234                      sk->sk_pacing_rate >> sk->sk_pacing_shift);
2235        if (sk->sk_pacing_status == SK_PACING_NONE)
2236                limit = min_t(unsigned long, limit,
2237                              sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2238        limit <<= factor;
2239
2240        if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2241                /* Always send skb if rtx queue is empty.
2242                 * No need to wait for TX completion to call us back,
2243                 * after softirq/tasklet schedule.
2244                 * This helps when TX completions are delayed too much.
2245                 */
2246                if (tcp_rtx_queue_empty(sk))
2247                        return false;
2248
2249                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2250                /* It is possible TX completion already happened
2251                 * before we set TSQ_THROTTLED, so we must
2252                 * test again the condition.
2253                 */
2254                smp_mb__after_atomic();
2255                if (refcount_read(&sk->sk_wmem_alloc) > limit)
2256                        return true;
2257        }
2258        return false;
2259}
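    /* Roughly: once more than ~1 ms of data at the pacing rate (and at
     * least two skb truesizes) sits unacknowledged in qdisc/device queues,
     * we set TSQ_THROTTLED and rely on the skb destructor to reschedule
     * transmission when TX completions free that memory.
     */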
2260
2261static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2262{
2263        const u32 now = tcp_jiffies32;
2264        enum tcp_chrono old = tp->chrono_type;
2265
2266        if (old > TCP_CHRONO_UNSPEC)
2267                tp->chrono_stat[old - 1] += now - tp->chrono_start;
2268        tp->chrono_start = now;
2269        tp->chrono_type = new;
2270}
2271
2272void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2273{
2274        struct tcp_sock *tp = tcp_sk(sk);
2275
2276        /* If there are multiple conditions worthy of tracking in a
2277         * chronograph then the highest priority enum takes precedence
2278         * over the other conditions. So if something "more interesting"
2279         * starts happening, stop the previous chrono and start a new one.
2280         */
2281        if (type > tp->chrono_type)
2282                tcp_chrono_set(tp, type);
2283}
2284
2285void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2286{
2287        struct tcp_sock *tp = tcp_sk(sk);
2288
2289
2290        /* There are multiple conditions worthy of tracking in a
2291         * chronograph, so that the highest priority enum takes
2292         * precedence over the other conditions (see tcp_chrono_start).
2293         * If a condition stops, we only stop chrono tracking if it is
2294         * the current ("most interesting") chrono we are tracking, and we
2295         * start the busy chrono if data is still pending.
2296         */
2297        if (tcp_rtx_and_write_queues_empty(sk))
2298                tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2299        else if (type == tp->chrono_type)
2300                tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2301}
2302
2303/* This routine writes packets to the network.  It advances the
2304 * send_head.  This happens as incoming acks open up the remote
2305 * window for us.
2306 *
2307 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
2308 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
2309 * account rare use of URG, this is not a big flaw.
2310 *
2311 * Send at most one packet when push_one > 0. Temporarily ignore
2312 * cwnd limit to force at most one packet out when push_one == 2.
2313 *
2314 * Returns true if no segments are in flight and we have queued segments,
2315 * but cannot send anything now because of SWS or another problem.
2316 */
2317static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2318                           int push_one, gfp_t gfp)
2319{
2320        struct tcp_sock *tp = tcp_sk(sk);
2321        struct sk_buff *skb;
2322        unsigned int tso_segs, sent_pkts;
2323        int cwnd_quota;
2324        int result;
2325        bool is_cwnd_limited = false, is_rwnd_limited = false;
2326        u32 max_segs;
2327
2328        sent_pkts = 0;
2329
2330        tcp_mstamp_refresh(tp);
2331        if (!push_one) {
2332                /* Do MTU probing. */
2333                result = tcp_mtu_probe(sk);
2334                if (!result) {
2335                        return false;
2336                } else if (result > 0) {
2337                        sent_pkts = 1;
2338                }
2339        }
2340
2341        max_segs = tcp_tso_segs(sk, mss_now);
2342        while ((skb = tcp_send_head(sk))) {
2343                unsigned int limit;
2344
2345                if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2346                        /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
2347                        skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2348                        list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2349                        tcp_init_tso_segs(skb, mss_now);
2350                        goto repair; /* Skip network transmission */
2351                }
2352
2353                if (tcp_pacing_check(sk))
2354                        break;
2355
2356                tso_segs = tcp_init_tso_segs(skb, mss_now);
2357                BUG_ON(!tso_segs);
2358
2359                cwnd_quota = tcp_cwnd_test(tp, skb);
2360                if (!cwnd_quota) {
2361                        if (push_one == 2)
2362                                /* Force out a loss probe pkt. */
2363                                cwnd_quota = 1;
2364                        else
2365                                break;
2366                }
2367
2368                if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2369                        is_rwnd_limited = true;
2370                        break;
2371                }
2372
2373                if (tso_segs == 1) {
2374                        if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2375                                                     (tcp_skb_is_last(sk, skb) ?
2376                                                      nonagle : TCP_NAGLE_PUSH))))
2377                                break;
2378                } else {
2379                        if (!push_one &&
2380                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
2381                                                 &is_rwnd_limited, max_segs))
2382                                break;
2383                }
2384
2385                limit = mss_now;
2386                if (tso_segs > 1 && !tcp_urg_mode(tp))
2387                        limit = tcp_mss_split_point(sk, skb, mss_now,
2388                                                    min_t(unsigned int,
2389                                                          cwnd_quota,
2390                                                          max_segs),
2391                                                    nonagle);
2392
2393                if (skb->len > limit &&
2394                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2395                        break;
2396
2397                if (tcp_small_queue_check(sk, skb, 0))
2398                        break;
2399
2400                if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2401                        break;
2402
2403repair:
2404                /* Advance the send_head.  This one is sent out.
2405                 * This call will increment packets_out.
2406                 */
2407                tcp_event_new_data_sent(sk, skb);
2408
2409                tcp_minshall_update(tp, mss_now, skb);
2410                sent_pkts += tcp_skb_pcount(skb);
2411
2412                if (push_one)
2413                        break;
2414        }
2415
2416        if (is_rwnd_limited)
2417                tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2418        else
2419                tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2420
2421        if (likely(sent_pkts)) {
2422                if (tcp_in_cwnd_reduction(sk))
2423                        tp->prr_out += sent_pkts;
2424
2425                /* Send one loss probe per tail loss episode. */
2426                if (push_one != 2)
2427                        tcp_schedule_loss_probe(sk, false);
2428                is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
2429                tcp_cwnd_validate(sk, is_cwnd_limited);
2430                return false;
2431        }
2432        return !tp->packets_out && !tcp_write_queue_empty(sk);
2433}
2434
2435bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2436{
2437        struct inet_connection_sock *icsk = inet_csk(sk);
2438        struct tcp_sock *tp = tcp_sk(sk);
2439        u32 timeout, rto_delta_us;
2440        int early_retrans;
2441
2442        /* Don't do any loss probe on a Fast Open connection before 3WHS
2443         * finishes.
2444         */
2445        if (tp->fastopen_rsk)
2446                return false;
2447
2448        early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2449        /* Schedule a loss probe in 2*RTT for SACK capable connections
2450         * not in loss recovery, that are either limited by cwnd or application.
2451         */
2452        if ((early_retrans != 3 && early_retrans != 4) ||
2453            !tp->packets_out || !tcp_is_sack(tp) ||
2454            (icsk->icsk_ca_state != TCP_CA_Open &&
2455             icsk->icsk_ca_state != TCP_CA_CWR))
2456                return false;
2457
2458        /* Probe timeout is 2*rtt. Add minimum RTO to account
2459         * for delayed ack when there's one outstanding packet. If no RTT
2460         * sample is available then probe after TCP_TIMEOUT_INIT.
2461         */
2462        if (tp->srtt_us) {
2463                timeout = usecs_to_jiffies(tp->srtt_us >> 2);
2464                if (tp->packets_out == 1)
2465                        timeout += TCP_RTO_MIN;
2466                else
2467                        timeout += TCP_TIMEOUT_MIN;
2468        } else {
2469                timeout = TCP_TIMEOUT_INIT;
2470        }
2471
2472        /* If the RTO formula yields an earlier time, then use that time. */
2473        rto_delta_us = advancing_rto ?
2474                        jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
2475                        tcp_rto_delta_us(sk);  /* How far in future is RTO? */
2476        if (rto_delta_us > 0)
2477                timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2478
2479        tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2480                             TCP_RTO_MAX, NULL);
2481        return true;
2482}
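    /* Example: with srtt = 50 ms (srtt_us stores 8 * srtt in usec, so
     * srtt_us >> 2 gives 2 * srtt = 100 ms) and one packet in flight, the
     * probe fires after 100 ms + TCP_RTO_MIN, capped so that it never
     * lands later than the pending RTO.
     */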
2483
2484/* Thanks to skb fast clones, we can detect if a prior transmit of
2485 * a packet is still in a qdisc or driver queue.
2486 * In this case, there is very little point doing a retransmit !
2487 */
2488static bool skb_still_in_host_queue(const struct sock *sk,
2489                                    const struct sk_buff *skb)
2490{
2491        if (unlikely(skb_fclone_busy(sk, skb))) {
2492                NET_INC_STATS(sock_net(sk),
2493                              LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2494                return true;
2495        }
2496        return false;
2497}
2498
2499/* When probe timeout (PTO) fires, try to send a new segment if possible, else
2500 * retransmit the last segment.
2501 */
2502void tcp_send_loss_probe(struct sock *sk)
2503{
2504        struct tcp_sock *tp = tcp_sk(sk);
2505        struct sk_buff *skb;
2506        int pcount;
2507        int mss = tcp_current_mss(sk);
2508
2509        skb = tcp_send_head(sk);
2510        if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2511                pcount = tp->packets_out;
2512                tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2513                if (tp->packets_out > pcount)
2514                        goto probe_sent;
2515                goto rearm_timer;
2516        }
2517        skb = skb_rb_last(&sk->tcp_rtx_queue);
2518        if (unlikely(!skb)) {
2519                WARN_ONCE(tp->packets_out,
2520                          "invalid inflight: %u state %u cwnd %u mss %d\n",
2521                          tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
2522                inet_csk(sk)->icsk_pending = 0;
2523                return;
2524        }
2525
2526        /* At most one outstanding TLP retransmission. */
2527        if (tp->tlp_high_seq)
2528                goto rearm_timer;
2529
2530        if (skb_still_in_host_queue(sk, skb))
2531                goto rearm_timer;
2532
2533        pcount = tcp_skb_pcount(skb);
2534        if (WARN_ON(!pcount))
2535                goto rearm_timer;
2536
2537        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2538                if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2539                                          (pcount - 1) * mss, mss,
2540                                          GFP_ATOMIC)))
2541                        goto rearm_timer;
2542                skb = skb_rb_next(skb);
2543        }
2544
2545        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2546                goto rearm_timer;
2547
2548        if (__tcp_retransmit_skb(sk, skb, 1))
2549                goto rearm_timer;
2550
2551        /* Record snd_nxt for loss detection. */
2552        tp->tlp_high_seq = tp->snd_nxt;
2553
2554probe_sent:
2555        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2556        /* Reset s.t. tcp_rearm_rto will restart timer from now */
2557        inet_csk(sk)->icsk_pending = 0;
2558rearm_timer:
2559        tcp_rearm_rto(sk);
2560}
2561
2562/* Push out any pending frames which were held back due to
2563 * TCP_CORK or attempt at coalescing tiny packets.
2564 * The socket must be locked by the caller.
2565 */
2566void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2567                               int nonagle)
2568{
2569        /* If we are closed, the bytes will have to remain here.
2570         * In time closedown will finish, we empty the write queue and
2571         * all will be happy.
2572         */
2573        if (unlikely(sk->sk_state == TCP_CLOSE))
2574                return;
2575
2576        if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2577                           sk_gfp_mask(sk, GFP_ATOMIC)))
2578                tcp_check_probe_timer(sk);
2579}
2580
2581/* Send _single_ skb sitting at the send head. This function requires
2582 * true push pending frames to set up the probe timer etc.
2583 */
2584void tcp_push_one(struct sock *sk, unsigned int mss_now)
2585{
2586        struct sk_buff *skb = tcp_send_head(sk);
2587
2588        BUG_ON(!skb || skb->len < mss_now);
2589
2590        tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2591}
2592
2593/* This function returns the amount that we can raise the
2594 * usable window based on the following constraints
2595 *
2596 * 1. The window can never be shrunk once it is offered (RFC 793)
2597 * 2. We limit memory per socket
2598 *
2599 * RFC 1122:
2600 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
2601 *  RECV.NEXT + RCV.WIN fixed until:
2602 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
2603 *
2604 * i.e. don't raise the right edge of the window until you can raise
2605 * it at least MSS bytes.
2606 *
2607 * Unfortunately, the recommended algorithm breaks header prediction,
2608 * since header prediction assumes th->window stays fixed.
2609 *
2610 * Strictly speaking, keeping th->window fixed violates the receiver
2611 * side SWS prevention criteria. The problem is that under this rule
2612 * a stream of single byte packets will cause the right side of the
2613 * window to always advance by a single byte.
2614 *
2615 * Of course, if the sender implements sender side SWS prevention
2616 * then this will not be a problem.
2617 *
2618 * BSD seems to make the following compromise:
2619 *
2620 *      If the free space is less than the 1/4 of the maximum
2621 *      space available and the free space is less than 1/2 mss,
2622 *      then set the window to 0.
2623 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
2624 *      Otherwise, just prevent the window from shrinking
2625 *      and from being larger than the largest representable value.
2626 *
2627 * This prevents incremental opening of the window in the regime
2628 * where TCP is limited by the speed of the reader side taking
2629 * data out of the TCP receive queue. It does nothing about
2630 * those cases where the window is constrained on the sender side
2631 * because the pipeline is full.
2632 *
2633 * BSD also seems to "accidentally" limit itself to windows that are a
2634 * multiple of MSS, at least until the free space gets quite small.
2635 * This would appear to be a side effect of the mbuf implementation.
2636 * Combining these two algorithms results in the observed behavior
2637 * of having a fixed window size at almost all times.
2638 *
2639 * Below we obtain similar behavior by forcing the offered window to
2640 * a multiple of the mss when it is feasible to do so.
2641 *
2642 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
2643 * Regular options like TIMESTAMP are taken into account.
2644 */
2645u32 __tcp_select_window(struct sock *sk)
2646{
2647        struct inet_connection_sock *icsk = inet_csk(sk);
2648        struct tcp_sock *tp = tcp_sk(sk);
2649        /* MSS for the peer's data.  Previous versions used mss_clamp
2650         * here.  I don't know if the value based on our guesses
2651         * of peer's MSS is better for the performance.  It's more correct
2652         * but may be worse for the performance because of rcv_mss
2653         * fluctuations.  --SAW  1998/11/1
2654         */
2655        int mss = icsk->icsk_ack.rcv_mss;
2656        int free_space = tcp_space(sk);
2657        int allowed_space = tcp_full_space(sk);
2658        int full_space = min_t(int, tp->window_clamp, allowed_space);
2659        int window;
2660
2661        if (unlikely(mss > full_space)) {
2662                mss = full_space;
2663                if (mss <= 0)
2664                        return 0;
2665        }
2666        if (free_space < (full_space >> 1)) {
2667                icsk->icsk_ack.quick = 0;
2668
2669                if (tcp_under_memory_pressure(sk))
2670                        tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2671                                               4U * tp->advmss);
2672
2673                /* free_space might become our new window, make sure we don't
2674                 * increase it due to wscale.
2675                 */
2676                free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2677
2678                /* if free space is less than mss estimate, or is below 1/16th
2679                 * of the maximum allowed, try to move to zero-window, else
2680                 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
2681                 * new incoming data is dropped due to memory limits.
2682                 * With a large window, the mss test triggers way too late
2683                 * to announce a zero window before the rmem limit kicks in.
2684                 */
2685                if (free_space < (allowed_space >> 4) || free_space < mss)
2686                        return 0;
2687        }
2688
2689        if (free_space > tp->rcv_ssthresh)
2690                free_space = tp->rcv_ssthresh;
2691
2692        /* Don't do rounding if we are using window scaling, since the
2693         * scaled window will not line up with the MSS boundary anyway.
2694         */
2695        if (tp->rx_opt.rcv_wscale) {
2696                window = free_space;
2697
2698                /* Advertise enough space so that it won't get scaled away.
2699                 * Important case: prevent zero window announcement if
2700                 * 1<<rcv_wscale > mss.
2701                 */
2702                window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
2703        } else {
2704                window = tp->rcv_wnd;
2705                /* Get the largest window that is a nice multiple of mss.
2706                 * Window clamp already applied above.
2707                 * If our current window offering is within 1 mss of the
2708                 * free space we just keep it. This prevents the divide
2709                 * and multiply from happening most of the time.
2710                 * We also don't do any window rounding when the free space
2711                 * is too small.
2712                 */
2713                if (window <= free_space - mss || window > free_space)
2714                        window = rounddown(free_space, mss);
2715                else if (mss == full_space &&
2716                         free_space > window + (full_space >> 1))
2717                        window = free_space;
2718        }
2719
2720        return window;
2721}
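    /* On the wscale rounding: the value actually advertised is shifted
     * right by rcv_wscale, so with rcv_wscale = 7 a small but non-zero
     * free space is rounded up to a multiple of 128 bytes, keeping the
     * shift from truncating it to an unintended zero window.
     */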
2722
2723void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2724                             const struct sk_buff *next_skb)
2725{
2726        if (unlikely(tcp_has_tx_tstamp(next_skb))) {
2727                const struct skb_shared_info *next_shinfo =
2728                        skb_shinfo(next_skb);
2729                struct skb_shared_info *shinfo = skb_shinfo(skb);
2730
2731                shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
2732                shinfo->tskey = next_shinfo->tskey;
2733                TCP_SKB_CB(skb)->txstamp_ack |=
2734                        TCP_SKB_CB(next_skb)->txstamp_ack;
2735        }
2736}
2737
2738/* Collapses two adjacent SKB's during retransmission. */
2739static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2740{
2741        struct tcp_sock *tp = tcp_sk(sk);
2742        struct sk_buff *next_skb = skb_rb_next(skb);
2743        int next_skb_size;
2744
2745        next_skb_size = next_skb->len;
2746
2747        BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2748
2749        if (next_skb_size) {
2750                if (next_skb_size <= skb_availroom(skb))
2751                        skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
2752                                      next_skb_size);
2753                else if (!skb_shift(skb, next_skb, next_skb_size))
2754                        return false;
2755        }
2756        tcp_highest_sack_replace(sk, next_skb, skb);
2757
2758        /* Update sequence range on original skb. */
2759        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2760
2761        /* Merge over control information. This moves PSH/FIN etc. over */
2762        TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2763
2764        /* All done, get rid of second SKB and account for it so
2765         * packet counting does not break.
2766         */
2767        TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2768        TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
2769
2770        /* changed transmit queue under us so clear hints */
2771        tcp_clear_retrans_hints_partial(tp);
2772        if (next_skb == tp->retransmit_skb_hint)
2773                tp->retransmit_skb_hint = skb;
2774
2775        tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2776
2777        tcp_skb_collapse_tstamp(skb, next_skb);
2778
2779        tcp_rtx_queue_unlink_and_free(next_skb, sk);
2780        return true;
2781}
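
/* Annotation (not part of the original file): after a successful collapse the
 * surviving skb covers both sequence ranges, e.g.
 *   skb [1000, 1500) + next_skb [1500, 2100)  =>  skb [1000, 2100),
 * with PSH/FIN, EOR and the EVER_RETRANS mark carried over and next_skb freed.
 */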
2782
2783/* Check if coalescing SKBs is legal. */
2784static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2785{
2786        if (tcp_skb_pcount(skb) > 1)
2787                return false;
2788        if (skb_cloned(skb))
2789                return false;
2790        /* Some heuristics for collapsing over SACK'd could be invented */
2791        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2792                return false;
2793
2794        return true;
2795}
2796
2797/* Collapse packets in the retransmit queue to create fewer
2798 * packets on the wire. This is only done on retransmission.
2799 */
2800static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2801                                     int space)
2802{
2803        struct tcp_sock *tp = tcp_sk(sk);
2804        struct sk_buff *skb = to, *tmp;
2805        bool first = true;
2806
2807        if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
2808                return;
2809        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2810                return;
2811
2812        skb_rbtree_walk_from_safe(skb, tmp) {
2813                if (!tcp_can_collapse(sk, skb))
2814                        break;
2815
2816                if (!tcp_skb_can_collapse_to(to))
2817                        break;
2818
2819                space -= skb->len;
2820
2821                if (first) {
2822                        first = false;
2823                        continue;
2824                }
2825
2826                if (space < 0)
2827                        break;
2828
2829                if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2830                        break;
2831
2832                if (!tcp_collapse_retrans(sk, to))
2833                        break;
2834        }
2835}
2836
2837/* This retransmits one SKB.  Policy decisions and retransmit queue
2838 * state updates are done by the caller.  Returns non-zero if an
2839 * error occurred which prevented the send.
2840 */
2841int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2842{
2843        struct inet_connection_sock *icsk = inet_csk(sk);
2844        struct tcp_sock *tp = tcp_sk(sk);
2845        unsigned int cur_mss;
2846        int diff, len, err;
2847
2848
2849        /* Inconclusive MTU probe */
2850        if (icsk->icsk_mtup.probe_size)
2851                icsk->icsk_mtup.probe_size = 0;
2852
2853        /* Do not send more than we queued. 1/4 is reserved for possible
2854         * copying overhead: fragmentation, tunneling, mangling etc.
2855         */
2856        if (refcount_read(&sk->sk_wmem_alloc) >
2857            min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
2858                  sk->sk_sndbuf))
2859                return -EAGAIN;
2860
2861        if (skb_still_in_host_queue(sk, skb))
2862                return -EBUSY;
2863
2864        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2865                if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
2866                        WARN_ON_ONCE(1);
2867                        return -EINVAL;
2868                }
2869                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
2870                        return -ENOMEM;
2871        }
2872
2873        if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
2874                return -EHOSTUNREACH; /* Routing failure or similar. */
2875
2876        cur_mss = tcp_current_mss(sk);
2877
2878        /* If the receiver has shrunk its window, and skb is out of
2879         * the new window, do not retransmit it. The exception is the
2880         * case when the window is shrunk to zero: in that case
2881         * our retransmit serves as a zero window probe.
2882         */
2883        if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2884            TCP_SKB_CB(skb)->seq != tp->snd_una)
2885                return -EAGAIN;
2886
2887        len = cur_mss * segs;
2888        if (skb->len > len) {
2889                if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
2890                                 cur_mss, GFP_ATOMIC))
2891                        return -ENOMEM; /* We'll try again later. */
2892        } else {
2893                if (skb_unclone(skb, GFP_ATOMIC))
2894                        return -ENOMEM;
2895
2896                diff = tcp_skb_pcount(skb);
2897                tcp_set_skb_tso_segs(skb, cur_mss);
2898                diff -= tcp_skb_pcount(skb);
2899                if (diff)
2900                        tcp_adjust_pcount(sk, skb, diff);
2901                if (skb->len < cur_mss)
2902                        tcp_retrans_try_collapse(sk, skb, cur_mss);
2903        }
2904
2905        /* RFC3168, section 6.1.1.1. ECN fallback */
2906        if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
2907                tcp_ecn_clear_syn(sk, skb);
2908
2909        /* Update global and local TCP statistics. */
2910        segs = tcp_skb_pcount(skb);
2911        TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
2912        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2913                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2914        tp->total_retrans += segs;
2915        tp->bytes_retrans += skb->len;
2916
2917        /* make sure skb->data is aligned on arches that require it
2918         * and check if ack-trimming & collapsing extended the headroom
2919         * beyond what csum_start can cover.
2920         */
2921        if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
2922                     skb_headroom(skb) >= 0xFFFF)) {
2923                struct sk_buff *nskb;
2924
2925                tcp_skb_tsorted_save(skb) {
2926                        nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2927                        err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2928                                     -ENOBUFS;
2929                } tcp_skb_tsorted_restore(skb);
2930
2931                if (!err) {
2932                        tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
2933                        tcp_rate_skb_sent(sk, skb);
2934                }
2935        } else {
2936                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2937        }
2938
2939        /* To avoid taking spuriously low RTT samples based on a timestamp
2940         * for a transmit that never happened, always mark EVER_RETRANS
2941         */
2942        TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
2943
2944        if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
2945                tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
2946                                  TCP_SKB_CB(skb)->seq, segs, err);
2947
2948        if (likely(!err)) {
2949                trace_tcp_retransmit_skb(sk, skb);
2950        } else if (err != -EBUSY) {
2951                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
2952        }
2953        return err;
2954}
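
/* Annotation (sketch, not part of the original file): the overhead guard at
 * the top of __tcp_retransmit_skb() compares the memory actually charged to
 * the socket against the queued payload plus a 25% allowance, capped by
 * sk_sndbuf. Hypothetical example_* helper restating that test:
 */
static inline bool example_rtx_wmem_exceeded(u32 wmem_alloc, u32 wmem_queued,
                                             u32 sndbuf)
{
        /* e.g. wmem_queued = 64KB -> allowance 80KB, unless sndbuf is lower */
        return wmem_alloc > min_t(u32, wmem_queued + (wmem_queued >> 2), sndbuf);
}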
2955
2956int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2957{
2958        struct tcp_sock *tp = tcp_sk(sk);
2959        int err = __tcp_retransmit_skb(sk, skb, segs);
2960
2961        if (err == 0) {
2962#if FASTRETRANS_DEBUG > 0
2963                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2964                        net_dbg_ratelimited("retrans_out leaked\n");
2965                }
2966#endif
2967                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
2968                tp->retrans_out += tcp_skb_pcount(skb);
2969        }
2970
2971        /* Save stamp of the first (attempted) retransmit. */
2972        if (!tp->retrans_stamp)
2973                tp->retrans_stamp = tcp_skb_timestamp(skb);
2974
2975        if (tp->undo_retrans < 0)
2976                tp->undo_retrans = 0;
2977        tp->undo_retrans += tcp_skb_pcount(skb);
2978        return err;
2979}
2980
2981/* This gets called after a retransmit timeout, and the initially
2982 * retransmitted data is acknowledged.  It tries to continue
2983 * resending the rest of the retransmit queue, until either
2984 * we've sent it all or the congestion window limit is reached.
2985 */
2986void tcp_xmit_retransmit_queue(struct sock *sk)
2987{
2988        const struct inet_connection_sock *icsk = inet_csk(sk);
2989        struct sk_buff *skb, *rtx_head, *hole = NULL;
2990        struct tcp_sock *tp = tcp_sk(sk);
2991        u32 max_segs;
2992        int mib_idx;
2993
2994        if (!tp->packets_out)
2995                return;
2996
2997        rtx_head = tcp_rtx_queue_head(sk);
2998        skb = tp->retransmit_skb_hint ?: rtx_head;
2999        max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
3000        skb_rbtree_walk_from(skb) {
3001                __u8 sacked;
3002                int segs;
3003
3004                if (tcp_pacing_check(sk))
3005                        break;
3006
3007                /* we could do better than to assign each time */
3008                if (!hole)
3009                        tp->retransmit_skb_hint = skb;
3010
3011                segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
3012                if (segs <= 0)
3013                        return;
3014                sacked = TCP_SKB_CB(skb)->sacked;
3015                /* In case tcp_shift_skb_data() has aggregated large skbs,
3016                 * we need to make sure we are not sending too big TSO packets
3017                 */
3018                segs = min_t(int, segs, max_segs);
3019
3020                if (tp->retrans_out >= tp->lost_out) {
3021                        break;
3022                } else if (!(sacked & TCPCB_LOST)) {
3023                        if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
3024                                hole = skb;
3025                        continue;
3026
3027                } else {
3028                        if (icsk->icsk_ca_state != TCP_CA_Loss)
3029                                mib_idx = LINUX_MIB_TCPFASTRETRANS;
3030                        else
3031                                mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3032                }
3033
3034                if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
3035                        continue;
3036
3037                if (tcp_small_queue_check(sk, skb, 1))
3038                        return;
3039
3040                if (tcp_retransmit_skb(sk, skb, segs))
3041                        return;
3042
3043                NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3044
3045                if (tcp_in_cwnd_reduction(sk))
3046                        tp->prr_out += tcp_skb_pcount(skb);
3047
3048                if (skb == rtx_head &&
3049                    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3050                        tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3051                                             inet_csk(sk)->icsk_rto,
3052                                             TCP_RTO_MAX,
3053                                             skb);
3054        }
3055}
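
/* Annotation (sketch, not part of the original file): the per-skb budget used
 * in the loop above is the free congestion-window space, clamped to the TSO
 * ceiling computed before the walk. Hypothetical example_* helper restating
 * that computation:
 */
static inline int example_rtx_seg_budget(const struct tcp_sock *tp, u32 max_segs)
{
        int segs = tp->snd_cwnd - tcp_packets_in_flight(tp);

        /* no congestion-window room left: stop retransmitting for now */
        return segs > 0 ? min_t(int, segs, max_segs) : 0;
}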
3056
3057/* We allow FIN packets to exceed the memory limits, to expedite
3058 * connection tear down and (memory) recovery.
3059 * Otherwise tcp_send_fin() could be tempted to either delay FIN
3060 * or even be forced to close flow without any FIN.
3061 * In general, we want to allow one skb per socket to avoid hangs
3062 * with edge trigger epoll()
3063 */
3064void sk_forced_mem_schedule(struct sock *sk, int size)
3065{
3066        int amt;
3067
3068        if (size <= sk->sk_forward_alloc)
3069                return;
3070        amt = sk_mem_pages(size);
3071        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
3072        sk_memory_allocated_add(sk, amt);
3073
3074        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3075                mem_cgroup_charge_skmem(sk->sk_memcg, amt);
3076}
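
/* Annotation (not part of the original file): sk_mem_pages() rounds the
 * request up to whole SK_MEM_QUANTUM units, so even a tiny FIN skb is charged
 * at least one quantum of forward allocation, e.g. with a 4KB quantum:
 *   size = 200 bytes -> amt = 1 -> sk_forward_alloc += 4096.
 * (the exact quantum depends on the architecture page size)
 */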
3077
3078/* Send a FIN. The caller locks the socket for us.
3079 * We should try to send a FIN packet really hard, but eventually give up.
3080 */
3081void tcp_send_fin(struct sock *sk)
3082{
3083        struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
3084        struct tcp_sock *tp = tcp_sk(sk);
3085
3086        /* Optimization, tack on the FIN if we have one skb in write queue and
3087         * this skb was not yet sent, or we are under memory pressure.
3088         * Note: in the latter case, FIN packet will be sent after a timeout,
3089         * as TCP stack thinks it has already been transmitted.
3090         */
3091        if (!tskb && tcp_under_memory_pressure(sk))
3092                tskb = skb_rb_last(&sk->tcp_rtx_queue);
3093
3094        if (tskb) {
3095coalesce:
3096                TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3097                TCP_SKB_CB(tskb)->end_seq++;
3098                tp->write_seq++;
3099                if (tcp_write_queue_empty(sk)) {
3100                        /* This means tskb was already sent.
3101                         * Pretend we included the FIN on previous transmit.
3102                         * We need to set tp->snd_nxt to the value it would have
3103                         * if FIN had been sent. This is because retransmit path
3104                         * does not change tp->snd_nxt.
3105                         */
3106                        tp->snd_nxt++;
3107                        return;
3108                }
3109        } else {
3110                skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3111                if (unlikely(!skb)) {
3112                        if (tskb)
3113                                goto coalesce;
3114                        return;
3115                }
3116                INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3117                skb_reserve(skb, MAX_TCP_HEADER);
3118                sk_forced_mem_schedule(sk, skb->truesize);
3119                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
3120                tcp_init_nondata_skb(skb, tp->write_seq,
3121                                     TCPHDR_ACK | TCPHDR_FIN);
3122                tcp_queue_skb(sk, skb);
3123        }
3124        __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3125}
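
/* Annotation (not part of the original file): the two branches above mean a
 * FIN rarely costs a new skb:
 *   - if the write-queue tail (or, under memory pressure, the last skb in the
 *     rtx queue) can carry it, TCPHDR_FIN is simply ORed in and
 *     end_seq/write_seq advance by one, since the FIN consumes one sequence
 *     number;
 *   - otherwise a fresh, forcibly charged skb carrying only ACK|FIN is
 *     allocated and queued like normal data.
 */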
3126
3127/* We get here when a process closes a file descriptor (either due to
3128 * an explicit close() or as a byproduct of exit()'ing) and there
3129 * was unread data in the receive queue.  This behavior is recommended
3130 * by RFC 2525, section 2.17.  -DaveM
3131 */
3132void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3133{
3134        struct sk_buff *skb;
3135
3136        TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3137
3138        /* NOTE: No TCP options attached and we never retransmit this. */
3139        skb = alloc_skb(MAX_TCP_HEADER, priority);
3140        if (!skb) {
3141                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3142                return;
3143        }
3144
3145        /* Reserve space for headers and prepare control bits. */
3146        skb_reserve(skb, MAX_TCP_HEADER);
3147        tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
3148                             TCPHDR_ACK | TCPHDR_RST);
3149        tcp_mstamp_refresh(tcp_sk(sk));
3150        /* Send it off. */
3151        if (tcp_transmit_skb(sk, skb, 0, priority))
3152                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3153
3154        /* The skb argument of trace_tcp_send_reset() carries the skb that
3155         * caused the RST; the skb here is a different one, so pass NULL.
3156         */
3157        trace_tcp_send_reset(sk, NULL);
3158}
3159
3160/* Send a crossed SYN-ACK during socket establishment.
3161 * WARNING: This routine must only be called when we have already sent
3162 * a SYN packet that crossed the incoming SYN that caused this routine
3163 * to get called. If this assumption fails then the initial rcv_wnd
3164 * and rcv_wscale values will not be correct.
3165 */
3166int tcp_send_synack(struct sock *sk)
3167{
3168        struct sk_buff *skb;
3169
3170        skb = tcp_rtx_queue_head(sk);
3171        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3172                pr_err("%s: wrong queue state\n", __func__);
3173                return -EFAULT;
3174        }
3175        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3176                if (skb_cloned(skb)) {
3177                        struct sk_buff *nskb;
3178
3179                        tcp_skb_tsorted_save(skb) {
3180                                nskb = skb_copy(skb, GFP_ATOMIC);
3181                        } tcp_skb_tsorted_restore(skb);
3182                        if (!nskb)
3183                                return -ENOMEM;
3184                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3185                        tcp_rtx_queue_unlink_and_free(skb, sk);
3186                        __skb_header_release(nskb);
3187                        tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3188                        sk->sk_wmem_queued += nskb->truesize;
3189                        sk_mem_charge(sk, nskb->truesize);
3190                        skb = nskb;
3191                }
3192
3193                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
3194                tcp_ecn_send_synack(sk, skb);
3195        }
3196        return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3197}
3198
3199/**
3200 * tcp_make_synack - Prepare a SYN-ACK.
3201 * @sk: listener socket
3202 * @dst: dst entry attached to the SYNACK
3203 * @req: request_sock pointer
3204 *
3205 * Allocate one skb and build a SYNACK packet.
3206 * @dst is consumed: caller should not use it again.
3207 */
3208struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3209                                struct request_sock *req,
3210                                struct tcp_fastopen_cookie *foc,
3211                                enum tcp_synack_type synack_type)
3212{
3213        struct inet_request_sock *ireq = inet_rsk(req);
3214        const struct tcp_sock *tp = tcp_sk(sk);
3215        struct tcp_md5sig_key *md5 = NULL;
3216        struct tcp_out_options opts;
3217        struct sk_buff *skb;
3218        int tcp_header_size;
3219        struct tcphdr *th;
3220        int mss;
3221
3222        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3223        if (unlikely(!skb)) {
3224                dst_release(dst);
3225                return NULL;
3226        }
3227        /* Reserve space for headers. */
3228        skb_reserve(skb, MAX_TCP_HEADER);
3229
3230        switch (synack_type) {
3231        case TCP_SYNACK_NORMAL:
3232                skb_set_owner_w(skb, req_to_sk(req));
3233                break;
3234        case TCP_SYNACK_COOKIE:
3235                /* Under synflood, we do not attach skb to a socket,
3236                 * to avoid false sharing.
3237                 */
3238                break;
3239        case TCP_SYNACK_FASTOPEN:
3240                /* sk is a const pointer, because we want to express that
3241                 * multiple cpus might call us concurrently.
3242                 * sk->sk_wmem_alloc is an atomic, so we can promote sk to rw.
3243                 */
3244                skb_set_owner_w(skb, (struct sock *)sk);
3245                break;
3246        }
3247        skb_dst_set(skb, dst);
3248
3249        mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3250
3251        memset(&opts, 0, sizeof(opts));
3252#ifdef CONFIG_SYN_COOKIES
3253        if (unlikely(req->cookie_ts))
3254                skb->skb_mstamp_ns = cookie_init_timestamp(req);
3255        else
3256#endif
3257                skb->skb_mstamp_ns = tcp_clock_ns();
3258
3259#ifdef CONFIG_TCP_MD5SIG
3260        rcu_read_lock();
3261        md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3262#endif
3263        skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3264        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3265                                             foc) + sizeof(*th);
3266
3267        skb_push(skb, tcp_header_size);
3268        skb_reset_transport_header(skb);
3269
3270        th = (struct tcphdr *)skb->data;
3271        memset(th, 0, sizeof(struct tcphdr));
3272        th->syn = 1;
3273        th->ack = 1;
3274        tcp_ecn_make_synack(req, th);
3275        th->source = htons(ireq->ir_num);
3276        th->dest = ireq->ir_rmt_port;
3277        skb->mark = ireq->ir_mark;
3278        skb->ip_summed = CHECKSUM_PARTIAL;
3279        th->seq = htonl(tcp_rsk(req)->snt_isn);
3280        /* XXX data is queued and acked as is. No buffer/window check */
3281        th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3282
3283        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
3284        th->window = htons(min(req->rsk_rcv_wnd, 65535U));
3285        tcp_options_write((__be32 *)(th + 1), NULL, &opts);
3286        th->doff = (tcp_header_size >> 2);
3287        __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3288
3289#ifdef CONFIG_TCP_MD5SIG
3290        /* Okay, we have all we need - do the md5 hash if needed */
3291        if (md5)
3292                tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3293                                               md5, req_to_sk(req), skb);
3294        rcu_read_unlock();
3295#endif
3296
3297        /* Do not fool tcpdump (if any), clean our debris */
3298        skb->tstamp = 0;
3299        return skb;
3300}
3301EXPORT_SYMBOL(tcp_make_synack);
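
/* Annotation (not part of the original file): per the RFC1323 rule noted
 * above, the SYN-ACK window field is written unscaled and therefore clamped,
 * e.g. rsk_rcv_wnd = 262144 -> th->window = 65535; the peer applies
 * rcv_wscale only to windows advertised after the handshake. The per-AF
 * send_synack() implementations (e.g. tcp_v4_send_synack()) are expected to
 * transmit and release the skb returned here.
 */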
3302
3303static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
3304{
3305        struct inet_connection_sock *icsk = inet_csk(sk);
3306        const struct tcp_congestion_ops *ca;
3307        u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3308
3309        if (ca_key == TCP_CA_UNSPEC)
3310                return;
3311
3312        rcu_read_lock();
3313        ca = tcp_ca_find_key(ca_key);
3314        if (likely(ca && try_module_get(ca->owner))) {
3315                module_put(icsk->icsk_ca_ops->owner);
3316                icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3317                icsk->icsk_ca_ops = ca;
3318        }
3319        rcu_read_unlock();
3320}
3321
3322/* Do all connect socket setups that can be done AF independent. */
3323static void tcp_connect_init(struct sock *sk)
3324{
3325        const struct dst_entry *dst = __sk_dst_get(sk);
3326        struct tcp_sock *tp = tcp_sk(sk);
3327        __u8 rcv_wscale;
3328        u32 rcv_wnd;
3329
3330        /* We'll fix this up when we get a response from the other end.
3331         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
3332         */
3333        tp->tcp_header_len = sizeof(struct tcphdr);
3334        if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3335                tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3336
3337#ifdef CONFIG_TCP_MD5SIG
3338        if (tp->af_specific->md5_lookup(sk, sk))
3339                tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
3340#endif
3341
3342        /* If user gave his TCP_MAXSEG, record it to clamp */
3343        if (tp->rx_opt.user_mss)
3344                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3345        tp->max_window = 0;
3346        tcp_mtup_init(sk);
3347        tcp_sync_mss(sk, dst_mtu(dst));
3348
3349        tcp_ca_dst_init(sk, dst);
3350
3351        if (!tp->window_clamp)
3352                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3353        tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3354
3355        tcp_initialize_rcv_mss(sk);
3356
3357        /* limit the window selection if the user enforces a smaller rx buffer */
3358        if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3359            (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
3360                tp->window_clamp = tcp_full_space(sk);
3361
3362        rcv_wnd = tcp_rwnd_init_bpf(sk);
3363        if (rcv_wnd == 0)
3364                rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3365
3366        tcp_select_initial_window(sk, tcp_full_space(sk),
3367                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3368                                  &tp->rcv_wnd,
3369                                  &tp->window_clamp,
3370                                  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3371                                  &rcv_wscale,
3372                                  rcv_wnd);
3373
3374        tp->rx_opt.rcv_wscale = rcv_wscale;
3375        tp->rcv_ssthresh = tp->rcv_wnd;
3376
3377        sk->sk_err = 0;
3378        sock_reset_flag(sk, SOCK_DONE);
3379        tp->snd_wnd = 0;
3380        tcp_init_wl(tp, 0);
3381        tcp_write_queue_purge(sk);
3382        tp->snd_una = tp->write_seq;
3383        tp->snd_sml = tp->write_seq;
3384        tp->snd_up = tp->write_seq;
3385        tp->snd_nxt = tp->write_seq;
3386
3387        if (likely(!tp->repair))
3388                tp->rcv_nxt = 0;
3389        else
3390                tp->rcv_tstamp = tcp_jiffies32;
3391        tp->rcv_wup = tp->rcv_nxt;
3392        tp->copied_seq = tp->rcv_nxt;
3393
3394        inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3395        inet_csk(sk)->icsk_retransmits = 0;
3396        tcp_clear_retrans(tp);
3397}
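
/* Annotation (not part of the original file): the second argument passed to
 * tcp_select_initial_window() above is the usable per-segment payload: when a
 * cached peer timestamp exists (ts_recent_stamp), the expected option bytes
 * (tp->tcp_header_len - sizeof(struct tcphdr), normally 12 for timestamps)
 * are subtracted from advmss before the initial receive window is sized.
 */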
3398
3399static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3400{
3401        struct tcp_sock *tp = tcp_sk(sk);
3402        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3403
3404        tcb->end_seq += skb->len;
3405        __skb_header_release(skb);
3406        sk->sk_wmem_queued += skb->truesize;
3407        sk_mem_charge(sk, skb->truesize);
3408        tp->write_seq = tcb->end_seq;
3409        tp->packets_out += tcp_skb_pcount(skb);
3410}
3411
3412/* Build and send a SYN with data and (cached) Fast Open cookie. However,
3413 * queue a data-only packet after the regular SYN, such that regular SYNs
3414 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
3415 * only the SYN sequence, the data are retransmitted in the first ACK.
3416 * If the cookie is not cached or another error occurs, fall back to sending
3417 * a regular SYN with a Fast Open cookie request option.
3418 */
3419static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3420{
3421        struct tcp_sock *tp = tcp_sk(sk);
3422        struct tcp_fastopen_request *fo = tp->fastopen_req;
3423        int space, err = 0;
3424        struct sk_buff *syn_data;
3425
3426        tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
3427        if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3428                goto fallback;
3429
3430        /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
3431         * user-MSS. Reserve maximum option space for middleboxes that add
3432         * private TCP options. The cost is reduced data space in SYN :(
3433         */
3434        tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3435
3436        space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3437                MAX_TCP_OPTION_SPACE;
3438
3439        space = min_t(size_t, space, fo->size);
3440
3441        /* limit to order-0 allocations */
3442        space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3443
3444        syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
3445        if (!syn_data)
3446                goto fallback;
3447        syn_data->ip_summed = CHECKSUM_PARTIAL;
3448        memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3449        if (space) {
3450                int copied = copy_from_iter(skb_put(syn_data, space), space,
3451                                            &fo->data->msg_iter);
3452                if (unlikely(!copied)) {
3453                        tcp_skb_tsorted_anchor_cleanup(syn_data);
3454                        kfree_skb(syn_data);
3455                        goto fallback;
3456                }
3457                if (copied != space) {
3458                        skb_trim(syn_data, copied);
3459                        space = copied;
3460                }
3461                skb_zcopy_set(syn_data, fo->uarg, NULL);
3462        }
3463        /* No more data pending in inet_wait_for_connect() */
3464        if (space == fo->size)
3465                fo->data = NULL;
3466        fo->copied = space;
3467
3468        tcp_connect_queue_skb(sk, syn_data);
3469        if (syn_data->len)
3470                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3471
3472        err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3473
3474        syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
3475
3476        /* Now that the full SYN+DATA has been cloned and sent (or not),
3477         * remove the SYN from the original skb (syn_data)
3478         * that we keep in the write queue in case of a retransmit, as we
3479         * also have the SYN packet (with no data) in the same queue.
3480         */
3481        TCP_SKB_CB(syn_data)->seq++;
3482        TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3483        if (!err) {
3484                tp->syn_data = (fo->copied > 0);
3485                tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3486                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3487                goto done;
3488        }
3489
3490        /* data was not sent, put it in write_queue */
3491        __skb_queue_tail(&sk->sk_write_queue, syn_data);
3492        tp->packets_out -= tcp_skb_pcount(syn_data);
3493
3494fallback:
3495        /* Send a regular SYN with Fast Open cookie request option */
3496        if (fo->cookie.len > 0)
3497                fo->cookie.len = 0;
3498        err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3499        if (err)
3500                tp->syn_fastopen = 0;
3501done:
3502        fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
3503        return err;
3504}
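
/* Annotation (sketch, not part of the original file): the SYN-data payload
 * budget above is the path-MTU derived MSS minus the full TCP option space,
 * further bounded by what the caller queued and by an order-0 skb head.
 * Hypothetical example_* helper restating that clamp:
 */
static inline int example_syn_data_space(int mss_now, int fo_size)
{
        int space = mss_now - MAX_TCP_OPTION_SPACE;

        space = min_t(int, space, fo_size);
        return min_t(int, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
}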
3505
3506/* Build a SYN and send it off. */
3507int tcp_connect(struct sock *sk)
3508{
3509        struct tcp_sock *tp = tcp_sk(sk);
3510        struct sk_buff *buff;
3511        int err;
3512
3513        tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
3514
3515        if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3516                return -EHOSTUNREACH; /* Routing failure or similar. */
3517
3518        tcp_connect_init(sk);
3519
3520        if (unlikely(tp->repair)) {
3521                tcp_finish_connect(sk, NULL);
3522                return 0;
3523        }
3524
3525        buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
3526        if (unlikely(!buff))
3527                return -ENOBUFS;
3528
3529        tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3530        tcp_mstamp_refresh(tp);
3531        tp->retrans_stamp = tcp_time_stamp(tp);
3532        tcp_connect_queue_skb(sk, buff);
3533        tcp_ecn_send_syn(sk, buff);
3534        tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3535
3536        /* Send off SYN; include data in Fast Open. */
3537        err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3538              tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3539        if (err == -ECONNREFUSED)
3540                return err;
3541
3542        /* We change tp->snd_nxt after the tcp_transmit_skb() call
3543         * in order to make this packet get counted in tcpOutSegs.
3544         */
3545        tp->snd_nxt = tp->write_seq;
3546        tp->pushed_seq = tp->write_seq;
3547        buff = tcp_send_head(sk);
3548        if (unlikely(buff)) {
3549                tp->snd_nxt     = TCP_SKB_CB(buff)->seq;
3550                tp->pushed_seq  = TCP_SKB_CB(buff)->seq;
3551        }
3552        TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3553
3554        /* Timer for repeating the SYN until we get an answer. */
3555        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3556                                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3557        return 0;
3558}
3559EXPORT_SYMBOL(tcp_connect);
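
/* Annotation (not part of the original file): tcp_connect() is the
 * AF-independent tail of connect(2); tcp_v4_connect()/tcp_v6_connect() call
 * it once the route lookup and source-port selection have succeeded, and any
 * error returned here (e.g. -ENOBUFS) propagates back to the connecting
 * process.
 */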
3560
3561/* Send out a delayed ack, the caller does the policy checking
3562 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
3563 * for details.
3564 */
3565void tcp_send_delayed_ack(struct sock *sk)
3566{
3567        struct inet_connection_sock *icsk = inet_csk(sk);
3568        int ato = icsk->icsk_ack.ato;
3569        unsigned long timeout;
3570
3571        if (ato > TCP_DELACK_MIN) {
3572                const struct tcp_sock *tp = tcp_sk(sk);
3573                int max_ato = HZ / 2;
3574
3575                if (inet_csk_in_pingpong_mode(sk) ||
3576                    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3577                        max_ato = TCP_DELACK_MAX;
3578
3579                /* Slow path, intersegment interval is "high". */
3580
3581                /* If some rtt estimate is known, use it to bound delayed ack.
3582                 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
3583                 * directly.
3584                 */
3585                if (tp->srtt_us) {
3586                        int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3587                                        TCP_DELACK_MIN);
3588
3589                        if (rtt < max_ato)
3590                                max_ato = rtt;
3591                }
3592
3593                ato = min(ato, max_ato);
3594        }
3595
3596        /* Stay within the limit we were given */
3597        timeout = jiffies + ato;
3598
3599        /* Use the new timeout only if there wasn't an older one already. */
3600        if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3601                /* If delack timer was blocked or is about to expire,
3602                 * send ACK now.
3603                 */
3604                if (icsk->icsk_ack.blocked ||
3605                    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3606                        tcp_send_ack(sk);
3607                        return;
3608                }
3609
3610                if (!time_before(timeout, icsk->icsk_ack.timeout))
3611                        timeout = icsk->icsk_ack.timeout;
3612        }
3613        icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3614        icsk->icsk_ack.timeout = timeout;
3615        sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3616}
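
/* Annotation (sketch, not part of the original file): the effective delayed
 * ACK timeout above is the current ato bounded by a maximum that shrinks when
 * an RTT estimate is available. Hypothetical example_* helper restating that
 * bound:
 */
static inline int example_delack_max_ato(const struct tcp_sock *tp,
                                         bool pingpong_or_pushed)
{
        int max_ato = pingpong_or_pushed ? TCP_DELACK_MAX : HZ / 2;

        if (tp->srtt_us) {
                /* srtt_us is scaled by 8; never go below TCP_DELACK_MIN */
                int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
                                TCP_DELACK_MIN);

                if (rtt < max_ato)
                        max_ato = rtt;
        }
        return max_ato;
}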
3617
3618/* This routine sends an ack and also updates the window. */
3619void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3620{
3621        struct sk_buff *buff;
3622
3623        /* If we have been reset, we may not send again. */
3624        if (sk->sk_state == TCP_CLOSE)
3625                return;
3626
3627        /* We are not putting this on the write queue, so
3628         * tcp_transmit_skb() will set the ownership to this
3629         * sock.
3630         */
3631        buff = alloc_skb(MAX_TCP_HEADER,
3632                         sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3633        if (unlikely(!buff)) {
3634                inet_csk_schedule_ack(sk);
3635                inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3636                inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3637                                          TCP_DELACK_MAX, TCP_RTO_MAX);
3638                return;
3639        }
3640
3641        /* Reserve space for headers and prepare control bits. */
3642        skb_reserve(buff, MAX_TCP_HEADER);
3643        tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3644
3645        /* We do not want pure acks influencing TCP Small Queues or fq/pacing
3646         * too much.
3647         * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
3648         */
3649        skb_set_tcp_pure_ack(buff);
3650
3651        /* Send it off, this clears delayed acks for us. */
3652        __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3653}
3654EXPORT_SYMBOL_GPL(__tcp_send_ack);
3655
3656void tcp_send_ack(struct sock *sk)
3657{
3658        __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3659}
3660
3661/* This routine sends a packet with an out of date sequence
3662 * number. It assumes the other end will try to ack it.
3663 *
3664 * Question: what should we do in urgent mode?
3665 * 4.4BSD forces sending a single byte of data. We cannot send
3666 * out of window data, because we have SND.NXT==SND.MAX...
3667 *
3668 * Current solution: to send TWO zero-length segments in urgent mode:
3669 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
3670 * out-of-date with SND.UNA-1 to probe window.
3671 */
3672static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
3673{
3674        struct tcp_sock *tp = tcp_sk(sk);
3675        struct sk_buff *skb;
3676
3677        /* We don't queue it, tcp_transmit_skb() sets ownership. */
3678        skb = alloc_skb(MAX_TCP_HEADER,
3679                        sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3680        if (!skb)
3681                return -1;
3682
3683        /* Reserve space for headers and set control bits. */
3684        skb_reserve(skb, MAX_TCP_HEADER);
3685        /* Use a previous sequence.  This should cause the other
3686         * end to send an ack.  Don't queue or clone SKB, just
3687         * send it.
3688         */
3689        tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3690        NET_INC_STATS(sock_net(sk), mib);
3691        return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
3692}
3693
3694/* Called from setsockopt( ... TCP_REPAIR ) */
3695void tcp_send_window_probe(struct sock *sk)
3696{
3697        if (sk->sk_state == TCP_ESTABLISHED) {
3698                tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3699                tcp_mstamp_refresh(tcp_sk(sk));
3700                tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
3701        }
3702}
3703
3704/* Initiate keepalive or window probe from timer. */
3705int tcp_write_wakeup(struct sock *sk, int mib)
3706{
3707        struct tcp_sock *tp = tcp_sk(sk);
3708        struct sk_buff *skb;
3709
3710        if (sk->sk_state == TCP_CLOSE)
3711                return -1;
3712
3713        skb = tcp_send_head(sk);
3714        if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3715                int err;
3716                unsigned int mss = tcp_current_mss(sk);
3717                unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3718
3719                if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3720                        tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3721
3722                /* We are probing the opening of a window,
3723                 * but the window size is != 0:
3724                 * this must be the result of sender-side SWS avoidance.
3725                 */
3726                if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3727                    skb->len > mss) {
3728                        seg_size = min(seg_size, mss);
3729                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3730                        if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
3731                                         skb, seg_size, mss, GFP_ATOMIC))
3732                                return -1;
3733                } else if (!tcp_skb_pcount(skb))
3734                        tcp_set_skb_tso_segs(skb, mss);
3735
3736                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3737                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3738                if (!err)
3739                        tcp_event_new_data_sent(sk, skb);
3740                return err;
3741        } else {
3742                if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
3743                        tcp_xmit_probe_skb(sk, 1, mib);
3744                return tcp_xmit_probe_skb(sk, 0, mib);
3745        }
3746}
3747
3748/* A window probe timeout has occurred.  If the window is not closed, send
3749 * a partial packet, else a zero-window probe.
3750 */
3751void tcp_send_probe0(struct sock *sk)
3752{
3753        struct inet_connection_sock *icsk = inet_csk(sk);
3754        struct tcp_sock *tp = tcp_sk(sk);
3755        struct net *net = sock_net(sk);
3756        unsigned long timeout;
3757        int err;
3758
3759        err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
3760
3761        if (tp->packets_out || tcp_write_queue_empty(sk)) {
3762                /* Cancel probe timer, if it is not required. */
3763                icsk->icsk_probes_out = 0;
3764                icsk->icsk_backoff = 0;
3765                return;
3766        }
3767
3768        icsk->icsk_probes_out++;
3769        if (err <= 0) {
3770                if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
3771                        icsk->icsk_backoff++;
3772                timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
3773        } else {
3774                /* If the packet was not sent due to local congestion,
3775                 * let senders fight for local resources conservatively.
3776                 */
3777                timeout = TCP_RESOURCE_PROBE_INTERVAL;
3778        }
3779        tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
3780}
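
/* Annotation (not part of the original file): when the probe went out or
 * failed outright (err <= 0), the backoff counter grows (bounded by
 * tcp_retries2) and the next timeout comes from tcp_probe0_when(), roughly
 * the probe base RTO shifted left by icsk_backoff and capped at TCP_RTO_MAX;
 * a positive err (local congestion, e.g. NET_XMIT_CN) keeps the backoff and
 * retries after the short TCP_RESOURCE_PROBE_INTERVAL instead.
 */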
3781
3782int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
3783{
3784        const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3785        struct flowi fl;
3786        int res;
3787
3788        tcp_rsk(req)->txhash = net_tx_rndhash();
3789        res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
3790        if (!res) {
3791                __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3792                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3793                if (unlikely(tcp_passive_fastopen(sk)))
3794                        tcp_sk(sk)->total_retrans++;
3795                trace_tcp_retransmit_synack(sk, req);
3796        }
3797        return res;
3798}
3799EXPORT_SYMBOL(tcp_rtx_synack);
3800