linux/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/netdma.h>
  76#include <net/secure_seq.h>
  77#include <net/tcp_memcontrol.h>
  78
  79#include <linux/inet.h>
  80#include <linux/ipv6.h>
  81#include <linux/stddef.h>
  82#include <linux/proc_fs.h>
  83#include <linux/seq_file.h>
  84
  85#include <linux/crypto.h>
  86#include <linux/scatterlist.h>
  87
  88int sysctl_tcp_tw_reuse __read_mostly;
  89int sysctl_tcp_low_latency __read_mostly;
  90EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93#ifdef CONFIG_TCP_MD5SIG
  94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96#endif
  97
  98struct inet_hashinfo tcp_hashinfo;
  99EXPORT_SYMBOL(tcp_hashinfo);
 100
 101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102{
 103        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                          ip_hdr(skb)->saddr,
 105                                          tcp_hdr(skb)->dest,
 106                                          tcp_hdr(skb)->source);
 107}
 108
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112        struct tcp_sock *tp = tcp_sk(sk);
 113
 114        /* With PAWS, it is safe from the viewpoint
 115           of data integrity. Even without PAWS it is safe provided sequence
 116           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118           Actually, the idea is close to VJ's one, only timestamp cache is
 119           held not per host, but per port pair and TW bucket is used as state
 120           holder.
 121
 122           If TW bucket has been already destroyed we fall back to VJ's scheme
 123           and use initial timestamp retrieved from peer table.
 124         */
 125        if (tcptw->tw_ts_recent_stamp &&
 126            (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                if (tp->write_seq == 0)
 130                        tp->write_seq = 1;
 131                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                sock_hold(sktw);
 134                return 1;
 135        }
 136
 137        return 0;
 138}
 139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141/* This will initiate an outgoing connection. */
 142int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 143{
 144        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 145        struct inet_sock *inet = inet_sk(sk);
 146        struct tcp_sock *tp = tcp_sk(sk);
 147        __be16 orig_sport, orig_dport;
 148        __be32 daddr, nexthop;
 149        struct flowi4 *fl4;
 150        struct rtable *rt;
 151        int err;
 152        struct ip_options_rcu *inet_opt;
 153
 154        if (addr_len < sizeof(struct sockaddr_in))
 155                return -EINVAL;
 156
 157        if (usin->sin_family != AF_INET)
 158                return -EAFNOSUPPORT;
 159
 160        nexthop = daddr = usin->sin_addr.s_addr;
 161        inet_opt = rcu_dereference_protected(inet->inet_opt,
 162                                             sock_owned_by_user(sk));
 163        if (inet_opt && inet_opt->opt.srr) {
 164                if (!daddr)
 165                        return -EINVAL;
 166                nexthop = inet_opt->opt.faddr;
 167        }
 168
 169        orig_sport = inet->inet_sport;
 170        orig_dport = usin->sin_port;
 171        fl4 = &inet->cork.fl.u.ip4;
 172        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 173                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 174                              IPPROTO_TCP,
 175                              orig_sport, orig_dport, sk, true);
 176        if (IS_ERR(rt)) {
 177                err = PTR_ERR(rt);
 178                if (err == -ENETUNREACH)
 179                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 180                return err;
 181        }
 182
 183        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 184                ip_rt_put(rt);
 185                return -ENETUNREACH;
 186        }
 187
 188        if (!inet_opt || !inet_opt->opt.srr)
 189                daddr = fl4->daddr;
 190
 191        if (!inet->inet_saddr)
 192                inet->inet_saddr = fl4->saddr;
 193        inet->inet_rcv_saddr = inet->inet_saddr;
 194
 195        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 196                /* Reset inherited state */
 197                tp->rx_opt.ts_recent       = 0;
 198                tp->rx_opt.ts_recent_stamp = 0;
 199                if (likely(!tp->repair))
 200                        tp->write_seq      = 0;
 201        }
 202
 203        if (tcp_death_row.sysctl_tw_recycle &&
 204            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 205                tcp_fetch_timewait_stamp(sk, &rt->dst);
 206
 207        inet->inet_dport = usin->sin_port;
 208        inet->inet_daddr = daddr;
 209
 210        inet_csk(sk)->icsk_ext_hdr_len = 0;
 211        if (inet_opt)
 212                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 213
 214        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 215
 216        /* Socket identity is still unknown (sport may be zero).
 217         * However we set state to SYN-SENT and not releasing socket
 218         * lock select source port, enter ourselves into the hash tables and
 219         * complete initialization after this.
 220         */
 221        tcp_set_state(sk, TCP_SYN_SENT);
 222        err = inet_hash_connect(&tcp_death_row, sk);
 223        if (err)
 224                goto failure;
 225
 226        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227                               inet->inet_sport, inet->inet_dport, sk);
 228        if (IS_ERR(rt)) {
 229                err = PTR_ERR(rt);
 230                rt = NULL;
 231                goto failure;
 232        }
 233        /* OK, now commit destination to socket.  */
 234        sk->sk_gso_type = SKB_GSO_TCPV4;
 235        sk_setup_caps(sk, &rt->dst);
 236
 237        if (!tp->write_seq && likely(!tp->repair))
 238                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239                                                           inet->inet_daddr,
 240                                                           inet->inet_sport,
 241                                                           usin->sin_port);
 242
 243        inet->inet_id = tp->write_seq ^ jiffies;
 244
 245        err = tcp_connect(sk);
 246
 247        rt = NULL;
 248        if (err)
 249                goto failure;
 250
 251        return 0;
 252
 253failure:
 254        /*
 255         * This unhashes the socket and releases the local port,
 256         * if necessary.
 257         */
 258        tcp_set_state(sk, TCP_CLOSE);
 259        ip_rt_put(rt);
 260        sk->sk_route_caps = 0;
 261        inet->inet_dport = 0;
 262        return err;
 263}
 264EXPORT_SYMBOL(tcp_v4_connect);
 265
 266/*
 267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 268 * It can be called through tcp_release_cb() if socket was owned by user
 269 * at the time tcp_v4_err() was called to handle ICMP message.
 270 */
 271static void tcp_v4_mtu_reduced(struct sock *sk)
 272{
 273        struct dst_entry *dst;
 274        struct inet_sock *inet = inet_sk(sk);
 275        u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277        dst = inet_csk_update_pmtu(sk, mtu);
 278        if (!dst)
 279                return;
 280
 281        /* Something is about to be wrong... Remember soft error
 282         * for the case, if this connection will not able to recover.
 283         */
 284        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285                sk->sk_err_soft = EMSGSIZE;
 286
 287        mtu = dst_mtu(dst);
 288
 289        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 290            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 291                tcp_sync_mss(sk, mtu);
 292
 293                /* Resend the TCP packet because it's
 294                 * clear that the old packet has been
 295                 * dropped. This is the new "fast" path mtu
 296                 * discovery.
 297                 */
 298                tcp_simple_retransmit(sk);
 299        } /* else let the usual retransmit timer handle it */
 300}
 301
 302static void do_redirect(struct sk_buff *skb, struct sock *sk)
 303{
 304        struct dst_entry *dst = __sk_dst_check(sk, 0);
 305
 306        if (dst)
 307                dst->ops->redirect(dst, sk, skb);
 308}
 309
 310/*
 311 * This routine is called by the ICMP module when it gets some
 312 * sort of error condition.  If err < 0 then the socket should
 313 * be closed and the error returned to the user.  If err > 0
 314 * it's just the icmp type << 8 | icmp code.  After adjustment
 315 * header points to the first 8 bytes of the tcp header.  We need
 316 * to find the appropriate port.
 317 *
 318 * The locking strategy used here is very "optimistic". When
 319 * someone else accesses the socket the ICMP is just dropped
 320 * and for some paths there is no check at all.
 321 * A more general error queue to queue errors for later handling
 322 * is probably better.
 323 *
 324 */
 325
 326void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 327{
 328        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 329        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 330        struct inet_connection_sock *icsk;
 331        struct tcp_sock *tp;
 332        struct inet_sock *inet;
 333        const int type = icmp_hdr(icmp_skb)->type;
 334        const int code = icmp_hdr(icmp_skb)->code;
 335        struct sock *sk;
 336        struct sk_buff *skb;
 337        struct request_sock *req;
 338        __u32 seq;
 339        __u32 remaining;
 340        int err;
 341        struct net *net = dev_net(icmp_skb->dev);
 342
 343        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 344                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 345                return;
 346        }
 347
 348        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 349                        iph->saddr, th->source, inet_iif(icmp_skb));
 350        if (!sk) {
 351                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 352                return;
 353        }
 354        if (sk->sk_state == TCP_TIME_WAIT) {
 355                inet_twsk_put(inet_twsk(sk));
 356                return;
 357        }
 358
 359        bh_lock_sock(sk);
 360        /* If too many ICMPs get dropped on busy
 361         * servers this needs to be solved differently.
 362         * We do take care of PMTU discovery (RFC1191) special case :
 363         * we can receive locally generated ICMP messages while socket is held.
 364         */
 365        if (sock_owned_by_user(sk)) {
 366                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 367                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 368        }
 369        if (sk->sk_state == TCP_CLOSE)
 370                goto out;
 371
 372        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 373                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 374                goto out;
 375        }
 376
 377        icsk = inet_csk(sk);
 378        tp = tcp_sk(sk);
 379        req = tp->fastopen_rsk;
 380        seq = ntohl(th->seq);
 381        if (sk->sk_state != TCP_LISTEN &&
 382            !between(seq, tp->snd_una, tp->snd_nxt) &&
 383            (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
 384                /* For a Fast Open socket, allow seq to be snt_isn. */
 385                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 386                goto out;
 387        }
 388
 389        switch (type) {
 390        case ICMP_REDIRECT:
 391                do_redirect(icmp_skb, sk);
 392                goto out;
 393        case ICMP_SOURCE_QUENCH:
 394                /* Just silently ignore these. */
 395                goto out;
 396        case ICMP_PARAMETERPROB:
 397                err = EPROTO;
 398                break;
 399        case ICMP_DEST_UNREACH:
 400                if (code > NR_ICMP_UNREACH)
 401                        goto out;
 402
 403                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404                        /* We are not interested in TCP_LISTEN and open_requests
 405                         * (SYN-ACKs send out by Linux are always <576bytes so
 406                         * they should go through unfragmented).
 407                         */
 408                        if (sk->sk_state == TCP_LISTEN)
 409                                goto out;
 410
 411                        tp->mtu_info = info;
 412                        if (!sock_owned_by_user(sk)) {
 413                                tcp_v4_mtu_reduced(sk);
 414                        } else {
 415                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 416                                        sock_hold(sk);
 417                        }
 418                        goto out;
 419                }
 420
 421                err = icmp_err_convert[code].errno;
 422                /* check if icmp_skb allows revert of backoff
 423                 * (see draft-zimmermann-tcp-lcd) */
 424                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 425                        break;
 426                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 427                    !icsk->icsk_backoff)
 428                        break;
 429
 430                /* XXX (TFO) - revisit the following logic for TFO */
 431
 432                if (sock_owned_by_user(sk))
 433                        break;
 434
 435                icsk->icsk_backoff--;
 436                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 437                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 438                tcp_bound_rto(sk);
 439
 440                skb = tcp_write_queue_head(sk);
 441                BUG_ON(!skb);
 442
 443                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 444                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 445
 446                if (remaining) {
 447                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 448                                                  remaining, TCP_RTO_MAX);
 449                } else {
 450                        /* RTO revert clocked out retransmission.
 451                         * Will retransmit now */
 452                        tcp_retransmit_timer(sk);
 453                }
 454
 455                break;
 456        case ICMP_TIME_EXCEEDED:
 457                err = EHOSTUNREACH;
 458                break;
 459        default:
 460                goto out;
 461        }
 462
 463        /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
 464         * than following the TCP_SYN_RECV case and closing the socket,
 465         * we ignore the ICMP error and keep trying like a fully established
 466         * socket. Is this the right thing to do?
 467         */
 468        if (req && req->sk == NULL)
 469                goto out;
 470
 471        switch (sk->sk_state) {
 472                struct request_sock *req, **prev;
 473        case TCP_LISTEN:
 474                if (sock_owned_by_user(sk))
 475                        goto out;
 476
 477                req = inet_csk_search_req(sk, &prev, th->dest,
 478                                          iph->daddr, iph->saddr);
 479                if (!req)
 480                        goto out;
 481
 482                /* ICMPs are not backlogged, hence we cannot get
 483                   an established socket here.
 484                 */
 485                WARN_ON(req->sk);
 486
 487                if (seq != tcp_rsk(req)->snt_isn) {
 488                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 489                        goto out;
 490                }
 491
 492                /*
 493                 * Still in SYN_RECV, just remove it silently.
 494                 * There is no good way to pass the error to the newly
 495                 * created socket, and POSIX does not want network
 496                 * errors returned from accept().
 497                 */
 498                inet_csk_reqsk_queue_drop(sk, req, prev);
 499                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 500                goto out;
 501
 502        case TCP_SYN_SENT:
 503        case TCP_SYN_RECV:  /* Cannot happen.
 504                               It can f.e. if SYNs crossed,
 505                               or Fast Open.
 506                             */
 507                if (!sock_owned_by_user(sk)) {
 508                        sk->sk_err = err;
 509
 510                        sk->sk_error_report(sk);
 511
 512                        tcp_done(sk);
 513                } else {
 514                        sk->sk_err_soft = err;
 515                }
 516                goto out;
 517        }
 518
 519        /* If we've already connected we will keep trying
 520         * until we time out, or the user gives up.
 521         *
 522         * rfc1122 4.2.3.9 allows to consider as hard errors
 523         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 524         * but it is obsoleted by pmtu discovery).
 525         *
 526         * Note, that in modern internet, where routing is unreliable
 527         * and in each dark corner broken firewalls sit, sending random
 528         * errors ordered by their masters even this two messages finally lose
 529         * their original sense (even Linux sends invalid PORT_UNREACHs)
 530         *
 531         * Now we are in compliance with RFCs.
 532         *                                                      --ANK (980905)
 533         */
 534
 535        inet = inet_sk(sk);
 536        if (!sock_owned_by_user(sk) && inet->recverr) {
 537                sk->sk_err = err;
 538                sk->sk_error_report(sk);
 539        } else  { /* Only an error on timeout */
 540                sk->sk_err_soft = err;
 541        }
 542
 543out:
 544        bh_unlock_sock(sk);
 545        sock_put(sk);
 546}
 547
 548static void __tcp_v4_send_check(struct sk_buff *skb,
 549                                __be32 saddr, __be32 daddr)
 550{
 551        struct tcphdr *th = tcp_hdr(skb);
 552
 553        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 554                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 555                skb->csum_start = skb_transport_header(skb) - skb->head;
 556                skb->csum_offset = offsetof(struct tcphdr, check);
 557        } else {
 558                th->check = tcp_v4_check(skb->len, saddr, daddr,
 559                                         csum_partial(th,
 560                                                      th->doff << 2,
 561                                                      skb->csum));
 562        }
 563}
 564
 565/* This routine computes an IPv4 TCP checksum. */
 566void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 567{
 568        const struct inet_sock *inet = inet_sk(sk);
 569
 570        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 571}
 572EXPORT_SYMBOL(tcp_v4_send_check);
 573
 574int tcp_v4_gso_send_check(struct sk_buff *skb)
 575{
 576        const struct iphdr *iph;
 577        struct tcphdr *th;
 578
 579        if (!pskb_may_pull(skb, sizeof(*th)))
 580                return -EINVAL;
 581
 582        iph = ip_hdr(skb);
 583        th = tcp_hdr(skb);
 584
 585        th->check = 0;
 586        skb->ip_summed = CHECKSUM_PARTIAL;
 587        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 588        return 0;
 589}
 590
 591/*
 592 *      This routine will send an RST to the other tcp.
 593 *
 594 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 595 *                    for reset.
 596 *      Answer: if a packet caused RST, it is not for a socket
 597 *              existing in our system, if it is matched to a socket,
 598 *              it is just duplicate segment or bug in other side's TCP.
 599 *              So that we build reply only basing on parameters
 600 *              arrived with segment.
 601 *      Exception: precedence violation. We do not implement it in any case.
 602 */
 603
 604static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 605{
 606        const struct tcphdr *th = tcp_hdr(skb);
 607        struct {
 608                struct tcphdr th;
 609#ifdef CONFIG_TCP_MD5SIG
 610                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 611#endif
 612        } rep;
 613        struct ip_reply_arg arg;
 614#ifdef CONFIG_TCP_MD5SIG
 615        struct tcp_md5sig_key *key;
 616        const __u8 *hash_location = NULL;
 617        unsigned char newhash[16];
 618        int genhash;
 619        struct sock *sk1 = NULL;
 620#endif
 621        struct net *net;
 622
 623        /* Never send a reset in response to a reset. */
 624        if (th->rst)
 625                return;
 626
 627        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 628                return;
 629
 630        /* Swap the send and the receive. */
 631        memset(&rep, 0, sizeof(rep));
 632        rep.th.dest   = th->source;
 633        rep.th.source = th->dest;
 634        rep.th.doff   = sizeof(struct tcphdr) / 4;
 635        rep.th.rst    = 1;
 636
 637        if (th->ack) {
 638                rep.th.seq = th->ack_seq;
 639        } else {
 640                rep.th.ack = 1;
 641                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 642                                       skb->len - (th->doff << 2));
 643        }
 644
 645        memset(&arg, 0, sizeof(arg));
 646        arg.iov[0].iov_base = (unsigned char *)&rep;
 647        arg.iov[0].iov_len  = sizeof(rep.th);
 648
 649#ifdef CONFIG_TCP_MD5SIG
 650        hash_location = tcp_parse_md5sig_option(th);
 651        if (!sk && hash_location) {
 652                /*
 653                 * active side is lost. Try to find listening socket through
 654                 * source port, and then find md5 key through listening socket.
 655                 * we are not loose security here:
 656                 * Incoming packet is checked with md5 hash with finding key,
 657                 * no RST generated if md5 hash doesn't match.
 658                 */
 659                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 660                                             &tcp_hashinfo, ip_hdr(skb)->saddr,
 661                                             th->source, ip_hdr(skb)->daddr,
 662                                             ntohs(th->source), inet_iif(skb));
 663                /* don't send rst if it can't find key */
 664                if (!sk1)
 665                        return;
 666                rcu_read_lock();
 667                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 668                                        &ip_hdr(skb)->saddr, AF_INET);
 669                if (!key)
 670                        goto release_sk1;
 671
 672                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 673                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 674                        goto release_sk1;
 675        } else {
 676                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 677                                             &ip_hdr(skb)->saddr,
 678                                             AF_INET) : NULL;
 679        }
 680
 681        if (key) {
 682                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 683                                   (TCPOPT_NOP << 16) |
 684                                   (TCPOPT_MD5SIG << 8) |
 685                                   TCPOLEN_MD5SIG);
 686                /* Update length and the length the header thinks exists */
 687                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 688                rep.th.doff = arg.iov[0].iov_len / 4;
 689
 690                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 691                                     key, ip_hdr(skb)->saddr,
 692                                     ip_hdr(skb)->daddr, &rep.th);
 693        }
 694#endif
 695        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 696                                      ip_hdr(skb)->saddr, /* XXX */
 697                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 698        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 699        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 700        /* When socket is gone, all binding information is lost.
 701         * routing might fail in this case. No choice here, if we choose to force
 702         * input interface, we will misroute in case of asymmetric route.
 703         */
 704        if (sk)
 705                arg.bound_dev_if = sk->sk_bound_dev_if;
 706
 707        net = dev_net(skb_dst(skb)->dev);
 708        arg.tos = ip_hdr(skb)->tos;
 709        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 710                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 711
 712        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 713        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 714
 715#ifdef CONFIG_TCP_MD5SIG
 716release_sk1:
 717        if (sk1) {
 718                rcu_read_unlock();
 719                sock_put(sk1);
 720        }
 721#endif
 722}
 723
 724/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 725   outside socket context is ugly, certainly. What can I do?
 726 */
 727
 728static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 729                            u32 win, u32 tsval, u32 tsecr, int oif,
 730                            struct tcp_md5sig_key *key,
 731                            int reply_flags, u8 tos)
 732{
 733        const struct tcphdr *th = tcp_hdr(skb);
 734        struct {
 735                struct tcphdr th;
 736                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 737#ifdef CONFIG_TCP_MD5SIG
 738                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 739#endif
 740                        ];
 741        } rep;
 742        struct ip_reply_arg arg;
 743        struct net *net = dev_net(skb_dst(skb)->dev);
 744
 745        memset(&rep.th, 0, sizeof(struct tcphdr));
 746        memset(&arg, 0, sizeof(arg));
 747
 748        arg.iov[0].iov_base = (unsigned char *)&rep;
 749        arg.iov[0].iov_len  = sizeof(rep.th);
 750        if (tsecr) {
 751                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 752                                   (TCPOPT_TIMESTAMP << 8) |
 753                                   TCPOLEN_TIMESTAMP);
 754                rep.opt[1] = htonl(tsval);
 755                rep.opt[2] = htonl(tsecr);
 756                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 757        }
 758
 759        /* Swap the send and the receive. */
 760        rep.th.dest    = th->source;
 761        rep.th.source  = th->dest;
 762        rep.th.doff    = arg.iov[0].iov_len / 4;
 763        rep.th.seq     = htonl(seq);
 764        rep.th.ack_seq = htonl(ack);
 765        rep.th.ack     = 1;
 766        rep.th.window  = htons(win);
 767
 768#ifdef CONFIG_TCP_MD5SIG
 769        if (key) {
 770                int offset = (tsecr) ? 3 : 0;
 771
 772                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 773                                          (TCPOPT_NOP << 16) |
 774                                          (TCPOPT_MD5SIG << 8) |
 775                                          TCPOLEN_MD5SIG);
 776                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 777                rep.th.doff = arg.iov[0].iov_len/4;
 778
 779                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 780                                    key, ip_hdr(skb)->saddr,
 781                                    ip_hdr(skb)->daddr, &rep.th);
 782        }
 783#endif
 784        arg.flags = reply_flags;
 785        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 786                                      ip_hdr(skb)->saddr, /* XXX */
 787                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 788        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 789        if (oif)
 790                arg.bound_dev_if = oif;
 791        arg.tos = tos;
 792        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 793                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 794
 795        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 796}
 797
 798static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 799{
 800        struct inet_timewait_sock *tw = inet_twsk(sk);
 801        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 802
 803        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 804                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 805                        tcp_time_stamp + tcptw->tw_ts_offset,
 806                        tcptw->tw_ts_recent,
 807                        tw->tw_bound_dev_if,
 808                        tcp_twsk_md5_key(tcptw),
 809                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 810                        tw->tw_tos
 811                        );
 812
 813        inet_twsk_put(tw);
 814}
 815
 816static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 817                                  struct request_sock *req)
 818{
 819        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 820         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 821         */
 822        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 823                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 824                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 825                        tcp_time_stamp,
 826                        req->ts_recent,
 827                        0,
 828                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 829                                          AF_INET),
 830                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 831                        ip_hdr(skb)->tos);
 832}
 833
 834/*
 835 *      Send a SYN-ACK after having received a SYN.
 836 *      This still operates on a request_sock only, not on a big
 837 *      socket.
 838 */
 839static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 840                              struct request_sock *req,
 841                              struct request_values *rvp,
 842                              u16 queue_mapping,
 843                              bool nocache)
 844{
 845        const struct inet_request_sock *ireq = inet_rsk(req);
 846        struct flowi4 fl4;
 847        int err = -1;
 848        struct sk_buff * skb;
 849
 850        /* First, grab a route. */
 851        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 852                return -1;
 853
 854        skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 855
 856        if (skb) {
 857                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 858
 859                skb_set_queue_mapping(skb, queue_mapping);
 860                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 861                                            ireq->rmt_addr,
 862                                            ireq->opt);
 863                err = net_xmit_eval(err);
 864                if (!tcp_rsk(req)->snt_synack && !err)
 865                        tcp_rsk(req)->snt_synack = tcp_time_stamp;
 866        }
 867
 868        return err;
 869}
 870
 871static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
 872                             struct request_values *rvp)
 873{
 874        int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
 875
 876        if (!res)
 877                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 878        return res;
 879}
 880
 881/*
 882 *      IPv4 request_sock destructor.
 883 */
 884static void tcp_v4_reqsk_destructor(struct request_sock *req)
 885{
 886        kfree(inet_rsk(req)->opt);
 887}
 888
 889/*
 890 * Return true if a syncookie should be sent
 891 */
 892bool tcp_syn_flood_action(struct sock *sk,
 893                         const struct sk_buff *skb,
 894                         const char *proto)
 895{
 896        const char *msg = "Dropping request";
 897        bool want_cookie = false;
 898        struct listen_sock *lopt;
 899
 900
 901
 902#ifdef CONFIG_SYN_COOKIES
 903        if (sysctl_tcp_syncookies) {
 904                msg = "Sending cookies";
 905                want_cookie = true;
 906                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 907        } else
 908#endif
 909                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 910
 911        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 912        if (!lopt->synflood_warned) {
 913                lopt->synflood_warned = 1;
 914                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 915                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 916        }
 917        return want_cookie;
 918}
 919EXPORT_SYMBOL(tcp_syn_flood_action);
 920
 921/*
 922 * Save and compile IPv4 options into the request_sock if needed.
 923 */
 924static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 925{
 926        const struct ip_options *opt = &(IPCB(skb)->opt);
 927        struct ip_options_rcu *dopt = NULL;
 928
 929        if (opt && opt->optlen) {
 930                int opt_size = sizeof(*dopt) + opt->optlen;
 931
 932                dopt = kmalloc(opt_size, GFP_ATOMIC);
 933                if (dopt) {
 934                        if (ip_options_echo(&dopt->opt, skb)) {
 935                                kfree(dopt);
 936                                dopt = NULL;
 937                        }
 938                }
 939        }
 940        return dopt;
 941}
 942
 943#ifdef CONFIG_TCP_MD5SIG
 944/*
 945 * RFC2385 MD5 checksumming requires a mapping of
 946 * IP address->MD5 Key.
 947 * We need to maintain these in the sk structure.
 948 */
 949
 950/* Find the Key structure for an address.  */
 951struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 952                                         const union tcp_md5_addr *addr,
 953                                         int family)
 954{
 955        struct tcp_sock *tp = tcp_sk(sk);
 956        struct tcp_md5sig_key *key;
 957        unsigned int size = sizeof(struct in_addr);
 958        struct tcp_md5sig_info *md5sig;
 959
 960        /* caller either holds rcu_read_lock() or socket lock */
 961        md5sig = rcu_dereference_check(tp->md5sig_info,
 962                                       sock_owned_by_user(sk) ||
 963                                       lockdep_is_held(&sk->sk_lock.slock));
 964        if (!md5sig)
 965                return NULL;
 966#if IS_ENABLED(CONFIG_IPV6)
 967        if (family == AF_INET6)
 968                size = sizeof(struct in6_addr);
 969#endif
 970        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 971                if (key->family != family)
 972                        continue;
 973                if (!memcmp(&key->addr, addr, size))
 974                        return key;
 975        }
 976        return NULL;
 977}
 978EXPORT_SYMBOL(tcp_md5_do_lookup);
 979
 980struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 981                                         struct sock *addr_sk)
 982{
 983        union tcp_md5_addr *addr;
 984
 985        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 986        return tcp_md5_do_lookup(sk, addr, AF_INET);
 987}
 988EXPORT_SYMBOL(tcp_v4_md5_lookup);
 989
 990static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 991                                                      struct request_sock *req)
 992{
 993        union tcp_md5_addr *addr;
 994
 995        addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
 996        return tcp_md5_do_lookup(sk, addr, AF_INET);
 997}
 998
 999/* This can be called on a newly created socket, from other files */
1000int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1001                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1002{
1003        /* Add Key to the list */
1004        struct tcp_md5sig_key *key;
1005        struct tcp_sock *tp = tcp_sk(sk);
1006        struct tcp_md5sig_info *md5sig;
1007
1008        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1009        if (key) {
1010                /* Pre-existing entry - just update that one. */
1011                memcpy(key->key, newkey, newkeylen);
1012                key->keylen = newkeylen;
1013                return 0;
1014        }
1015
1016        md5sig = rcu_dereference_protected(tp->md5sig_info,
1017                                           sock_owned_by_user(sk));
1018        if (!md5sig) {
1019                md5sig = kmalloc(sizeof(*md5sig), gfp);
1020                if (!md5sig)
1021                        return -ENOMEM;
1022
1023                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1024                INIT_HLIST_HEAD(&md5sig->head);
1025                rcu_assign_pointer(tp->md5sig_info, md5sig);
1026        }
1027
1028        key = sock_kmalloc(sk, sizeof(*key), gfp);
1029        if (!key)
1030                return -ENOMEM;
1031        if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1032                sock_kfree_s(sk, key, sizeof(*key));
1033                return -ENOMEM;
1034        }
1035
1036        memcpy(key->key, newkey, newkeylen);
1037        key->keylen = newkeylen;
1038        key->family = family;
1039        memcpy(&key->addr, addr,
1040               (family == AF_INET6) ? sizeof(struct in6_addr) :
1041                                      sizeof(struct in_addr));
1042        hlist_add_head_rcu(&key->node, &md5sig->head);
1043        return 0;
1044}
1045EXPORT_SYMBOL(tcp_md5_do_add);
1046
1047int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1048{
1049        struct tcp_sock *tp = tcp_sk(sk);
1050        struct tcp_md5sig_key *key;
1051        struct tcp_md5sig_info *md5sig;
1052
1053        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1054        if (!key)
1055                return -ENOENT;
1056        hlist_del_rcu(&key->node);
1057        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1058        kfree_rcu(key, rcu);
1059        md5sig = rcu_dereference_protected(tp->md5sig_info,
1060                                           sock_owned_by_user(sk));
1061        if (hlist_empty(&md5sig->head))
1062                tcp_free_md5sig_pool();
1063        return 0;
1064}
1065EXPORT_SYMBOL(tcp_md5_do_del);
1066
1067static void tcp_clear_md5_list(struct sock *sk)
1068{
1069        struct tcp_sock *tp = tcp_sk(sk);
1070        struct tcp_md5sig_key *key;
1071        struct hlist_node *n;
1072        struct tcp_md5sig_info *md5sig;
1073
1074        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1075
1076        if (!hlist_empty(&md5sig->head))
1077                tcp_free_md5sig_pool();
1078        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1079                hlist_del_rcu(&key->node);
1080                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1081                kfree_rcu(key, rcu);
1082        }
1083}
1084
1085static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1086                                 int optlen)
1087{
1088        struct tcp_md5sig cmd;
1089        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1090
1091        if (optlen < sizeof(cmd))
1092                return -EINVAL;
1093
1094        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1095                return -EFAULT;
1096
1097        if (sin->sin_family != AF_INET)
1098                return -EINVAL;
1099
1100        if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1101                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1102                                      AF_INET);
1103
1104        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1105                return -EINVAL;
1106
1107        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1109                              GFP_KERNEL);
1110}
1111
1112static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1113                                        __be32 daddr, __be32 saddr, int nbytes)
1114{
1115        struct tcp4_pseudohdr *bp;
1116        struct scatterlist sg;
1117
1118        bp = &hp->md5_blk.ip4;
1119
1120        /*
1121         * 1. the TCP pseudo-header (in the order: source IP address,
1122         * destination IP address, zero-padded protocol number, and
1123         * segment length)
1124         */
1125        bp->saddr = saddr;
1126        bp->daddr = daddr;
1127        bp->pad = 0;
1128        bp->protocol = IPPROTO_TCP;
1129        bp->len = cpu_to_be16(nbytes);
1130
1131        sg_init_one(&sg, bp, sizeof(*bp));
1132        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1133}
1134
1135static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1136                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1137{
1138        struct tcp_md5sig_pool *hp;
1139        struct hash_desc *desc;
1140
1141        hp = tcp_get_md5sig_pool();
1142        if (!hp)
1143                goto clear_hash_noput;
1144        desc = &hp->md5_desc;
1145
1146        if (crypto_hash_init(desc))
1147                goto clear_hash;
1148        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1149                goto clear_hash;
1150        if (tcp_md5_hash_header(hp, th))
1151                goto clear_hash;
1152        if (tcp_md5_hash_key(hp, key))
1153                goto clear_hash;
1154        if (crypto_hash_final(desc, md5_hash))
1155                goto clear_hash;
1156
1157        tcp_put_md5sig_pool();
1158        return 0;
1159
1160clear_hash:
1161        tcp_put_md5sig_pool();
1162clear_hash_noput:
1163        memset(md5_hash, 0, 16);
1164        return 1;
1165}
1166
1167int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1168                        const struct sock *sk, const struct request_sock *req,
1169                        const struct sk_buff *skb)
1170{
1171        struct tcp_md5sig_pool *hp;
1172        struct hash_desc *desc;
1173        const struct tcphdr *th = tcp_hdr(skb);
1174        __be32 saddr, daddr;
1175
1176        if (sk) {
1177                saddr = inet_sk(sk)->inet_saddr;
1178                daddr = inet_sk(sk)->inet_daddr;
1179        } else if (req) {
1180                saddr = inet_rsk(req)->loc_addr;
1181                daddr = inet_rsk(req)->rmt_addr;
1182        } else {
1183                const struct iphdr *iph = ip_hdr(skb);
1184                saddr = iph->saddr;
1185                daddr = iph->daddr;
1186        }
1187
1188        hp = tcp_get_md5sig_pool();
1189        if (!hp)
1190                goto clear_hash_noput;
1191        desc = &hp->md5_desc;
1192
1193        if (crypto_hash_init(desc))
1194                goto clear_hash;
1195
1196        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1197                goto clear_hash;
1198        if (tcp_md5_hash_header(hp, th))
1199                goto clear_hash;
1200        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1201                goto clear_hash;
1202        if (tcp_md5_hash_key(hp, key))
1203                goto clear_hash;
1204        if (crypto_hash_final(desc, md5_hash))
1205                goto clear_hash;
1206
1207        tcp_put_md5sig_pool();
1208        return 0;
1209
1210clear_hash:
1211        tcp_put_md5sig_pool();
1212clear_hash_noput:
1213        memset(md5_hash, 0, 16);
1214        return 1;
1215}
1216EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1217
1218static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1219{
1220        /*
1221         * This gets called for each TCP segment that arrives
1222         * so we want to be efficient.
1223         * We have 3 drop cases:
1224         * o No MD5 hash and one expected.
1225         * o MD5 hash and we're not expecting one.
1226         * o MD5 hash and its wrong.
1227         */
1228        const __u8 *hash_location = NULL;
1229        struct tcp_md5sig_key *hash_expected;
1230        const struct iphdr *iph = ip_hdr(skb);
1231        const struct tcphdr *th = tcp_hdr(skb);
1232        int genhash;
1233        unsigned char newhash[16];
1234
1235        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1236                                          AF_INET);
1237        hash_location = tcp_parse_md5sig_option(th);
1238
1239        /* We've parsed the options - do we have a hash? */
1240        if (!hash_expected && !hash_location)
1241                return false;
1242
1243        if (hash_expected && !hash_location) {
1244                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1245                return true;
1246        }
1247
1248        if (!hash_expected && hash_location) {
1249                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1250                return true;
1251        }
1252
1253        /* Okay, so this is hash_expected and hash_location -
1254         * so we need to calculate the checksum.
1255         */
1256        genhash = tcp_v4_md5_hash_skb(newhash,
1257                                      hash_expected,
1258                                      NULL, NULL, skb);
1259
1260        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1261                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1262                                     &iph->saddr, ntohs(th->source),
1263                                     &iph->daddr, ntohs(th->dest),
1264                                     genhash ? " tcp_v4_calc_md5_hash failed"
1265                                     : "");
1266                return true;
1267        }
1268        return false;
1269}
1270
1271#endif
1272
1273struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1274        .family         =       PF_INET,
1275        .obj_size       =       sizeof(struct tcp_request_sock),
1276        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1277        .send_ack       =       tcp_v4_reqsk_send_ack,
1278        .destructor     =       tcp_v4_reqsk_destructor,
1279        .send_reset     =       tcp_v4_send_reset,
1280        .syn_ack_timeout =      tcp_syn_ack_timeout,
1281};
1282
1283#ifdef CONFIG_TCP_MD5SIG
1284static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1285        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1286        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1287};
1288#endif
1289
1290static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1291                               struct request_sock *req,
1292                               struct tcp_fastopen_cookie *foc,
1293                               struct tcp_fastopen_cookie *valid_foc)
1294{
1295        bool skip_cookie = false;
1296        struct fastopen_queue *fastopenq;
1297
1298        if (likely(!fastopen_cookie_present(foc))) {
1299                /* See include/net/tcp.h for the meaning of these knobs */
1300                if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1301                    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1302                    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1303                        skip_cookie = true; /* no cookie to validate */
1304                else
1305                        return false;
1306        }
1307        fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1308        /* A FO option is present; bump the counter. */
1309        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1310
1311        /* Make sure the listener has enabled fastopen, and we don't
1312         * exceed the max # of pending TFO requests allowed before trying
1313         * to validating the cookie in order to avoid burning CPU cycles
1314         * unnecessarily.
1315         *
1316         * XXX (TFO) - The implication of checking the max_qlen before
1317         * processing a cookie request is that clients can't differentiate
1318         * between qlen overflow causing Fast Open to be disabled
1319         * temporarily vs a server not supporting Fast Open at all.
1320         */
1321        if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1322            fastopenq == NULL || fastopenq->max_qlen == 0)
1323                return false;
1324
1325        if (fastopenq->qlen >= fastopenq->max_qlen) {
1326                struct request_sock *req1;
1327                spin_lock(&fastopenq->lock);
1328                req1 = fastopenq->rskq_rst_head;
1329                if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1330                        spin_unlock(&fastopenq->lock);
1331                        NET_INC_STATS_BH(sock_net(sk),
1332                            LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1333                        /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1334                        foc->len = -1;
1335                        return false;
1336                }
1337                fastopenq->rskq_rst_head = req1->dl_next;
1338                fastopenq->qlen--;
1339                spin_unlock(&fastopenq->lock);
1340                reqsk_free(req1);
1341        }
1342        if (skip_cookie) {
1343                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1344                return true;
1345        }
1346        if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1347                if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1348                        tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1349                        if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1350                            memcmp(&foc->val[0], &valid_foc->val[0],
1351                            TCP_FASTOPEN_COOKIE_SIZE) != 0)
1352                                return false;
1353                        valid_foc->len = -1;
1354                }
1355                /* Acknowledge the data received from the peer. */
1356                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1357                return true;
1358        } else if (foc->len == 0) { /* Client requesting a cookie */
1359                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1360                NET_INC_STATS_BH(sock_net(sk),
1361                    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1362        } else {
1363                /* Client sent a cookie with wrong size. Treat it
1364                 * the same as invalid and return a valid one.
1365                 */
1366                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1367        }
1368        return false;
1369}
1370
1371static int tcp_v4_conn_req_fastopen(struct sock *sk,
1372                                    struct sk_buff *skb,
1373                                    struct sk_buff *skb_synack,
1374                                    struct request_sock *req,
1375                                    struct request_values *rvp)
1376{
1377        struct tcp_sock *tp = tcp_sk(sk);
1378        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1379        const struct inet_request_sock *ireq = inet_rsk(req);
1380        struct sock *child;
1381        int err;
1382
1383        req->num_retrans = 0;
1384        req->num_timeout = 0;
1385        req->sk = NULL;
1386
1387        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1388        if (child == NULL) {
1389                NET_INC_STATS_BH(sock_net(sk),
1390                                 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1391                kfree_skb(skb_synack);
1392                return -1;
1393        }
1394        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1395                                    ireq->rmt_addr, ireq->opt);
1396        err = net_xmit_eval(err);
1397        if (!err)
1398                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1399        /* XXX (TFO) - is it ok to ignore error and continue? */
1400
1401        spin_lock(&queue->fastopenq->lock);
1402        queue->fastopenq->qlen++;
1403        spin_unlock(&queue->fastopenq->lock);
1404
1405        /* Initialize the child socket. Have to fix some values to take
1406         * into account the child is a Fast Open socket and is created
1407         * only out of the bits carried in the SYN packet.
1408         */
1409        tp = tcp_sk(child);
1410
1411        tp->fastopen_rsk = req;
1412        /* Do a hold on the listner sk so that if the listener is being
1413         * closed, the child that has been accepted can live on and still
1414         * access listen_lock.
1415         */
1416        sock_hold(sk);
1417        tcp_rsk(req)->listener = sk;
1418
1419        /* RFC1323: The window in SYN & SYN/ACK segments is never
1420         * scaled. So correct it appropriately.
1421         */
1422        tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1423
1424        /* Activate the retrans timer so that SYNACK can be retransmitted.
1425         * The request socket is not added to the SYN table of the parent
1426         * because it's been added to the accept queue directly.
1427         */
1428        inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1429            TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1430
1431        /* Add the child socket directly into the accept queue */
1432        inet_csk_reqsk_queue_add(sk, req, child);
1433
1434        /* Now finish processing the fastopen child socket. */
1435        inet_csk(child)->icsk_af_ops->rebuild_header(child);
1436        tcp_init_congestion_control(child);
1437        tcp_mtup_init(child);
1438        tcp_init_buffer_space(child);
1439        tcp_init_metrics(child);
1440
1441        /* Queue the data carried in the SYN packet. We need to first
1442         * bump skb's refcnt because the caller will attempt to free it.
1443         *
1444         * XXX (TFO) - we honor a zero-payload TFO request for now.
1445         * (Any reason not to?)
1446         */
1447        if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1448                /* Don't queue the skb if there is no payload in SYN.
1449                 * XXX (TFO) - How about SYN+FIN?
1450                 */
1451                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1452        } else {
1453                skb = skb_get(skb);
1454                skb_dst_drop(skb);
1455                __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1456                skb_set_owner_r(skb, child);
1457                __skb_queue_tail(&child->sk_receive_queue, skb);
1458                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1459                tp->syn_data_acked = 1;
1460        }
1461        sk->sk_data_ready(sk, 0);
1462        bh_unlock_sock(child);
1463        sock_put(child);
1464        WARN_ON(req->sk == NULL);
1465        return 0;
1466}
1467
1468int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1469{
1470        struct tcp_extend_values tmp_ext;
1471        struct tcp_options_received tmp_opt;
1472        const u8 *hash_location;
1473        struct request_sock *req;
1474        struct inet_request_sock *ireq;
1475        struct tcp_sock *tp = tcp_sk(sk);
1476        struct dst_entry *dst = NULL;
1477        __be32 saddr = ip_hdr(skb)->saddr;
1478        __be32 daddr = ip_hdr(skb)->daddr;
1479        __u32 isn = TCP_SKB_CB(skb)->when;
1480        bool want_cookie = false;
1481        struct flowi4 fl4;
1482        struct tcp_fastopen_cookie foc = { .len = -1 };
1483        struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1484        struct sk_buff *skb_synack;
1485        int do_fastopen;
1486
1487        /* Never answer to SYNs send to broadcast or multicast */
1488        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489                goto drop;
1490
1491        /* TW buckets are converted to open requests without
1492         * limitations, they conserve resources and peer is
1493         * evidently real one.
1494         */
1495        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1496                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1497                if (!want_cookie)
1498                        goto drop;
1499        }
1500
1501        /* Accept backlog is full. If we have already queued enough
1502         * of warm entries in syn queue, drop request. It is better than
1503         * clogging syn queue with openreqs with exponentially increasing
1504         * timeout.
1505         */
1506        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1507                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1508                goto drop;
1509        }
1510
1511        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1512        if (!req)
1513                goto drop;
1514
1515#ifdef CONFIG_TCP_MD5SIG
1516        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1517#endif
1518
1519        tcp_clear_options(&tmp_opt);
1520        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1521        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1522        tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1523            want_cookie ? NULL : &foc);
1524
1525        if (tmp_opt.cookie_plus > 0 &&
1526            tmp_opt.saw_tstamp &&
1527            !tp->rx_opt.cookie_out_never &&
1528            (sysctl_tcp_cookie_size > 0 ||
1529             (tp->cookie_values != NULL &&
1530              tp->cookie_values->cookie_desired > 0))) {
1531                u8 *c;
1532                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1533                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1534
1535                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1536                        goto drop_and_release;
1537
1538                /* Secret recipe starts with IP addresses */
1539                *mess++ ^= (__force u32)daddr;
1540                *mess++ ^= (__force u32)saddr;
1541
1542                /* plus variable length Initiator Cookie */
1543                c = (u8 *)mess;
1544                while (l-- > 0)
1545                        *c++ ^= *hash_location++;
1546
1547                want_cookie = false;    /* not our kind of cookie */
1548                tmp_ext.cookie_out_never = 0; /* false */
1549                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1550        } else if (!tp->rx_opt.cookie_in_always) {
1551                /* redundant indications, but ensure initialization. */
1552                tmp_ext.cookie_out_never = 1; /* true */
1553                tmp_ext.cookie_plus = 0;
1554        } else {
1555                goto drop_and_release;
1556        }
1557        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1558
1559        if (want_cookie && !tmp_opt.saw_tstamp)
1560                tcp_clear_options(&tmp_opt);
1561
1562        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1563        tcp_openreq_init(req, &tmp_opt, skb);
1564
1565        ireq = inet_rsk(req);
1566        ireq->loc_addr = daddr;
1567        ireq->rmt_addr = saddr;
1568        ireq->no_srccheck = inet_sk(sk)->transparent;
1569        ireq->opt = tcp_v4_save_options(skb);
1570
1571        if (security_inet_conn_request(sk, skb, req))
1572                goto drop_and_free;
1573
1574        if (!want_cookie || tmp_opt.tstamp_ok)
1575                TCP_ECN_create_request(req, skb, sock_net(sk));
1576
1577        if (want_cookie) {
1578                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1579                req->cookie_ts = tmp_opt.tstamp_ok;
1580        } else if (!isn) {
1581                /* VJ's idea. We save last timestamp seen
1582                 * from the destination in peer table, when entering
1583                 * state TIME-WAIT, and check against it before
1584                 * accepting new connection request.
1585                 *
1586                 * If "isn" is not zero, this request hit alive
1587                 * timewait bucket, so that all the necessary checks
1588                 * are made in the function processing timewait state.
1589                 */
1590                if (tmp_opt.saw_tstamp &&
1591                    tcp_death_row.sysctl_tw_recycle &&
1592                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1593                    fl4.daddr == saddr) {
1594                        if (!tcp_peer_is_proven(req, dst, true)) {
1595                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1596                                goto drop_and_release;
1597                        }
1598                }
1599                /* Kill the following clause, if you dislike this way. */
1600                else if (!sysctl_tcp_syncookies &&
1601                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1602                          (sysctl_max_syn_backlog >> 2)) &&
1603                         !tcp_peer_is_proven(req, dst, false)) {
1604                        /* Without syncookies last quarter of
1605                         * backlog is filled with destinations,
1606                         * proven to be alive.
1607                         * It means that we continue to communicate
1608                         * to destinations, already remembered
1609                         * to the moment of synflood.
1610                         */
1611                        LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1612                                       &saddr, ntohs(tcp_hdr(skb)->source));
1613                        goto drop_and_release;
1614                }
1615
1616                isn = tcp_v4_init_sequence(skb);
1617        }
1618        tcp_rsk(req)->snt_isn = isn;
1619
1620        if (dst == NULL) {
1621                dst = inet_csk_route_req(sk, &fl4, req);
1622                if (dst == NULL)
1623                        goto drop_and_free;
1624        }
1625        do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1626
1627        /* We don't call tcp_v4_send_synack() directly because we need
1628         * to make sure a child socket can be created successfully before
1629         * sending back synack!
1630         *
1631         * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1632         * (or better yet, call tcp_send_synack() in the child context
1633         * directly, but will have to fix bunch of other code first)
1634         * after syn_recv_sock() except one will need to first fix the
1635         * latter to remove its dependency on the current implementation
1636         * of tcp_v4_send_synack()->tcp_select_initial_window().
1637         */
1638        skb_synack = tcp_make_synack(sk, dst, req,
1639            (struct request_values *)&tmp_ext,
1640            fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1641
1642        if (skb_synack) {
1643                __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1644                skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1645        } else
1646                goto drop_and_free;
1647
1648        if (likely(!do_fastopen)) {
1649                int err;
1650                err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1651                     ireq->rmt_addr, ireq->opt);
1652                err = net_xmit_eval(err);
1653                if (err || want_cookie)
1654                        goto drop_and_free;
1655
1656                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1657                tcp_rsk(req)->listener = NULL;
1658                /* Add the request_sock to the SYN table */
1659                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1660                if (fastopen_cookie_present(&foc) && foc.len != 0)
1661                        NET_INC_STATS_BH(sock_net(sk),
1662                            LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1663        } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1664            (struct request_values *)&tmp_ext))
1665                goto drop_and_free;
1666
1667        return 0;
1668
1669drop_and_release:
1670        dst_release(dst);
1671drop_and_free:
1672        reqsk_free(req);
1673drop:
1674        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1675        return 0;
1676}
1677EXPORT_SYMBOL(tcp_v4_conn_request);
1678
1679
1680/*
1681 * The three way handshake has completed - we got a valid synack -
1682 * now create the new socket.
1683 */
1684struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1685                                  struct request_sock *req,
1686                                  struct dst_entry *dst)
1687{
1688        struct inet_request_sock *ireq;
1689        struct inet_sock *newinet;
1690        struct tcp_sock *newtp;
1691        struct sock *newsk;
1692#ifdef CONFIG_TCP_MD5SIG
1693        struct tcp_md5sig_key *key;
1694#endif
1695        struct ip_options_rcu *inet_opt;
1696
1697        if (sk_acceptq_is_full(sk))
1698                goto exit_overflow;
1699
1700        newsk = tcp_create_openreq_child(sk, req, skb);
1701        if (!newsk)
1702                goto exit_nonewsk;
1703
1704        newsk->sk_gso_type = SKB_GSO_TCPV4;
1705        inet_sk_rx_dst_set(newsk, skb);
1706
1707        newtp                 = tcp_sk(newsk);
1708        newinet               = inet_sk(newsk);
1709        ireq                  = inet_rsk(req);
1710        newinet->inet_daddr   = ireq->rmt_addr;
1711        newinet->inet_rcv_saddr = ireq->loc_addr;
1712        newinet->inet_saddr           = ireq->loc_addr;
1713        inet_opt              = ireq->opt;
1714        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1715        ireq->opt             = NULL;
1716        newinet->mc_index     = inet_iif(skb);
1717        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1718        newinet->rcv_tos      = ip_hdr(skb)->tos;
1719        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1720        if (inet_opt)
1721                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1722        newinet->inet_id = newtp->write_seq ^ jiffies;
1723
1724        if (!dst) {
1725                dst = inet_csk_route_child_sock(sk, newsk, req);
1726                if (!dst)
1727                        goto put_and_exit;
1728        } else {
1729                /* syncookie case : see end of cookie_v4_check() */
1730        }
1731        sk_setup_caps(newsk, dst);
1732
1733        tcp_mtup_init(newsk);
1734        tcp_sync_mss(newsk, dst_mtu(dst));
1735        newtp->advmss = dst_metric_advmss(dst);
1736        if (tcp_sk(sk)->rx_opt.user_mss &&
1737            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1738                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1739
1740        tcp_initialize_rcv_mss(newsk);
1741        tcp_synack_rtt_meas(newsk, req);
1742        newtp->total_retrans = req->num_retrans;
1743
1744#ifdef CONFIG_TCP_MD5SIG
1745        /* Copy over the MD5 key from the original socket */
1746        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1747                                AF_INET);
1748        if (key != NULL) {
1749                /*
1750                 * We're using one, so create a matching key
1751                 * on the newsk structure. If we fail to get
1752                 * memory, then we end up not copying the key
1753                 * across. Shucks.
1754                 */
1755                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1756                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1757                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1758        }
1759#endif
1760
1761        if (__inet_inherit_port(sk, newsk) < 0)
1762                goto put_and_exit;
1763        __inet_hash_nolisten(newsk, NULL);
1764
1765        return newsk;
1766
1767exit_overflow:
1768        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1769exit_nonewsk:
1770        dst_release(dst);
1771exit:
1772        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1773        return NULL;
1774put_and_exit:
1775        inet_csk_prepare_forced_close(newsk);
1776        tcp_done(newsk);
1777        goto exit;
1778}
1779EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1780
1781static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1782{
1783        struct tcphdr *th = tcp_hdr(skb);
1784        const struct iphdr *iph = ip_hdr(skb);
1785        struct sock *nsk;
1786        struct request_sock **prev;
1787        /* Find possible connection requests. */
1788        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1789                                                       iph->saddr, iph->daddr);
1790        if (req)
1791                return tcp_check_req(sk, skb, req, prev, false);
1792
1793        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1794                        th->source, iph->daddr, th->dest, inet_iif(skb));
1795
1796        if (nsk) {
1797                if (nsk->sk_state != TCP_TIME_WAIT) {
1798                        bh_lock_sock(nsk);
1799                        return nsk;
1800                }
1801                inet_twsk_put(inet_twsk(nsk));
1802                return NULL;
1803        }
1804
1805#ifdef CONFIG_SYN_COOKIES
1806        if (!th->syn)
1807                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1808#endif
1809        return sk;
1810}
1811
1812static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1813{
1814        const struct iphdr *iph = ip_hdr(skb);
1815
1816        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1817                if (!tcp_v4_check(skb->len, iph->saddr,
1818                                  iph->daddr, skb->csum)) {
1819                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1820                        return 0;
1821                }
1822        }
1823
1824        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1825                                       skb->len, IPPROTO_TCP, 0);
1826
1827        if (skb->len <= 76) {
1828                return __skb_checksum_complete(skb);
1829        }
1830        return 0;
1831}
1832
1833
1834/* The socket must have it's spinlock held when we get
1835 * here.
1836 *
1837 * We have a potential double-lock case here, so even when
1838 * doing backlog processing we use the BH locking scheme.
1839 * This is because we cannot sleep with the original spinlock
1840 * held.
1841 */
1842int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1843{
1844        struct sock *rsk;
1845#ifdef CONFIG_TCP_MD5SIG
1846        /*
1847         * We really want to reject the packet as early as possible
1848         * if:
1849         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1850         *  o There is an MD5 option and we're not expecting one
1851         */
1852        if (tcp_v4_inbound_md5_hash(sk, skb))
1853                goto discard;
1854#endif
1855
1856        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1857                struct dst_entry *dst = sk->sk_rx_dst;
1858
1859                sock_rps_save_rxhash(sk, skb);
1860                if (dst) {
1861                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1862                            dst->ops->check(dst, 0) == NULL) {
1863                                dst_release(dst);
1864                                sk->sk_rx_dst = NULL;
1865                        }
1866                }
1867                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1868                        rsk = sk;
1869                        goto reset;
1870                }
1871                return 0;
1872        }
1873
1874        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1875                goto csum_err;
1876
1877        if (sk->sk_state == TCP_LISTEN) {
1878                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1879                if (!nsk)
1880                        goto discard;
1881
1882                if (nsk != sk) {
1883                        sock_rps_save_rxhash(nsk, skb);
1884                        if (tcp_child_process(sk, nsk, skb)) {
1885                                rsk = nsk;
1886                                goto reset;
1887                        }
1888                        return 0;
1889                }
1890        } else
1891                sock_rps_save_rxhash(sk, skb);
1892
1893        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1894                rsk = sk;
1895                goto reset;
1896        }
1897        return 0;
1898
1899reset:
1900        tcp_v4_send_reset(rsk, skb);
1901discard:
1902        kfree_skb(skb);
1903        /* Be careful here. If this function gets more complicated and
1904         * gcc suffers from register pressure on the x86, sk (in %ebx)
1905         * might be destroyed here. This current version compiles correctly,
1906         * but you have been warned.
1907         */
1908        return 0;
1909
1910csum_err:
1911        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1912        goto discard;
1913}
1914EXPORT_SYMBOL(tcp_v4_do_rcv);
1915
1916void tcp_v4_early_demux(struct sk_buff *skb)
1917{
1918        const struct iphdr *iph;
1919        const struct tcphdr *th;
1920        struct sock *sk;
1921
1922        if (skb->pkt_type != PACKET_HOST)
1923                return;
1924
1925        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1926                return;
1927
1928        iph = ip_hdr(skb);
1929        th = tcp_hdr(skb);
1930
1931        if (th->doff < sizeof(struct tcphdr) / 4)
1932                return;
1933
1934        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1935                                       iph->saddr, th->source,
1936                                       iph->daddr, ntohs(th->dest),
1937                                       skb->skb_iif);
1938        if (sk) {
1939                skb->sk = sk;
1940                skb->destructor = sock_edemux;
1941                if (sk->sk_state != TCP_TIME_WAIT) {
1942                        struct dst_entry *dst = sk->sk_rx_dst;
1943
1944                        if (dst)
1945                                dst = dst_check(dst, 0);
1946                        if (dst &&
1947                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1948                                skb_dst_set_noref(skb, dst);
1949                }
1950        }
1951}
1952
1953/*
1954 *      From tcp_input.c
1955 */
1956
1957int tcp_v4_rcv(struct sk_buff *skb)
1958{
1959        const struct iphdr *iph;
1960        const struct tcphdr *th;
1961        struct sock *sk;
1962        int ret;
1963        struct net *net = dev_net(skb->dev);
1964
1965        if (skb->pkt_type != PACKET_HOST)
1966                goto discard_it;
1967
1968        /* Count it even if it's bad */
1969        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1970
1971        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1972                goto discard_it;
1973
1974        th = tcp_hdr(skb);
1975
1976        if (th->doff < sizeof(struct tcphdr) / 4)
1977                goto bad_packet;
1978        if (!pskb_may_pull(skb, th->doff * 4))
1979                goto discard_it;
1980
1981        /* An explanation is required here, I think.
1982         * Packet length and doff are validated by header prediction,
1983         * provided case of th->doff==0 is eliminated.
1984         * So, we defer the checks. */
1985        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1986                goto bad_packet;
1987
1988        th = tcp_hdr(skb);
1989        iph = ip_hdr(skb);
1990        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1991        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1992                                    skb->len - th->doff * 4);
1993        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1994        TCP_SKB_CB(skb)->when    = 0;
1995        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1996        TCP_SKB_CB(skb)->sacked  = 0;
1997
1998        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1999        if (!sk)
2000                goto no_tcp_socket;
2001
2002process:
2003        if (sk->sk_state == TCP_TIME_WAIT)
2004                goto do_time_wait;
2005
2006        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2007                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2008                goto discard_and_relse;
2009        }
2010
2011        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2012                goto discard_and_relse;
2013        nf_reset(skb);
2014
2015        if (sk_filter(sk, skb))
2016                goto discard_and_relse;
2017
2018        skb->dev = NULL;
2019
2020        bh_lock_sock_nested(sk);
2021        ret = 0;
2022        if (!sock_owned_by_user(sk)) {
2023#ifdef CONFIG_NET_DMA
2024                struct tcp_sock *tp = tcp_sk(sk);
2025                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2026                        tp->ucopy.dma_chan = net_dma_find_channel();
2027                if (tp->ucopy.dma_chan)
2028                        ret = tcp_v4_do_rcv(sk, skb);
2029                else
2030#endif
2031                {
2032                        if (!tcp_prequeue(sk, skb))
2033                                ret = tcp_v4_do_rcv(sk, skb);
2034                }
2035        } else if (unlikely(sk_add_backlog(sk, skb,
2036                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
2037                bh_unlock_sock(sk);
2038                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2039                goto discard_and_relse;
2040        }
2041        bh_unlock_sock(sk);
2042
2043        sock_put(sk);
2044
2045        return ret;
2046
2047no_tcp_socket:
2048        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2049                goto discard_it;
2050
2051        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2052bad_packet:
2053                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2054        } else {
2055                tcp_v4_send_reset(NULL, skb);
2056        }
2057
2058discard_it:
2059        /* Discard frame. */
2060        kfree_skb(skb);
2061        return 0;
2062
2063discard_and_relse:
2064        sock_put(sk);
2065        goto discard_it;
2066
2067do_time_wait:
2068        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2069                inet_twsk_put(inet_twsk(sk));
2070                goto discard_it;
2071        }
2072
2073        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2074                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2075                inet_twsk_put(inet_twsk(sk));
2076                goto discard_it;
2077        }
2078        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2079        case TCP_TW_SYN: {
2080                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2081                                                        &tcp_hashinfo,
2082                                                        iph->saddr, th->source,
2083                                                        iph->daddr, th->dest,
2084                                                        inet_iif(skb));
2085                if (sk2) {
2086                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2087                        inet_twsk_put(inet_twsk(sk));
2088                        sk = sk2;
2089                        goto process;
2090                }
2091                /* Fall through to ACK */
2092        }
2093        case TCP_TW_ACK:
2094                tcp_v4_timewait_ack(sk, skb);
2095                break;
2096        case TCP_TW_RST:
2097                goto no_tcp_socket;
2098        case TCP_TW_SUCCESS:;
2099        }
2100        goto discard_it;
2101}
2102
2103static struct timewait_sock_ops tcp_timewait_sock_ops = {
2104        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2105        .twsk_unique    = tcp_twsk_unique,
2106        .twsk_destructor= tcp_twsk_destructor,
2107};
2108
2109void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2110{
2111        struct dst_entry *dst = skb_dst(skb);
2112
2113        dst_hold(dst);
2114        sk->sk_rx_dst = dst;
2115        inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2116}
2117EXPORT_SYMBOL(inet_sk_rx_dst_set);
2118
2119const struct inet_connection_sock_af_ops ipv4_specific = {
2120        .queue_xmit        = ip_queue_xmit,
2121        .send_check        = tcp_v4_send_check,
2122        .rebuild_header    = inet_sk_rebuild_header,
2123        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2124        .conn_request      = tcp_v4_conn_request,
2125        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2126        .net_header_len    = sizeof(struct iphdr),
2127        .setsockopt        = ip_setsockopt,
2128        .getsockopt        = ip_getsockopt,
2129        .addr2sockaddr     = inet_csk_addr2sockaddr,
2130        .sockaddr_len      = sizeof(struct sockaddr_in),
2131        .bind_conflict     = inet_csk_bind_conflict,
2132#ifdef CONFIG_COMPAT
2133        .compat_setsockopt = compat_ip_setsockopt,
2134        .compat_getsockopt = compat_ip_getsockopt,
2135#endif
2136};
2137EXPORT_SYMBOL(ipv4_specific);
2138
2139#ifdef CONFIG_TCP_MD5SIG
2140static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2141        .md5_lookup             = tcp_v4_md5_lookup,
2142        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2143        .md5_parse              = tcp_v4_parse_md5_keys,
2144};
2145#endif
2146
2147/* NOTE: A lot of things set to zero explicitly by call to
2148 *       sk_alloc() so need not be done here.
2149 */
2150static int tcp_v4_init_sock(struct sock *sk)
2151{
2152        struct inet_connection_sock *icsk = inet_csk(sk);
2153
2154        tcp_init_sock(sk);
2155
2156        icsk->icsk_af_ops = &ipv4_specific;
2157
2158#ifdef CONFIG_TCP_MD5SIG
2159        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2160#endif
2161
2162        return 0;
2163}
2164
2165void tcp_v4_destroy_sock(struct sock *sk)
2166{
2167        struct tcp_sock *tp = tcp_sk(sk);
2168
2169        tcp_clear_xmit_timers(sk);
2170
2171        tcp_cleanup_congestion_control(sk);
2172
2173        /* Cleanup up the write buffer. */
2174        tcp_write_queue_purge(sk);
2175
2176        /* Cleans up our, hopefully empty, out_of_order_queue. */
2177        __skb_queue_purge(&tp->out_of_order_queue);
2178
2179#ifdef CONFIG_TCP_MD5SIG
2180        /* Clean up the MD5 key list, if any */
2181        if (tp->md5sig_info) {
2182                tcp_clear_md5_list(sk);
2183                kfree_rcu(tp->md5sig_info, rcu);
2184                tp->md5sig_info = NULL;
2185        }
2186#endif
2187
2188#ifdef CONFIG_NET_DMA
2189        /* Cleans up our sk_async_wait_queue */
2190        __skb_queue_purge(&sk->sk_async_wait_queue);
2191#endif
2192
2193        /* Clean prequeue, it must be empty really */
2194        __skb_queue_purge(&tp->ucopy.prequeue);
2195
2196        /* Clean up a referenced TCP bind bucket. */
2197        if (inet_csk(sk)->icsk_bind_hash)
2198                inet_put_port(sk);
2199
2200        /* TCP Cookie Transactions */
2201        if (tp->cookie_values != NULL) {
2202                kref_put(&tp->cookie_values->kref,
2203                         tcp_cookie_values_release);
2204                tp->cookie_values = NULL;
2205        }
2206        BUG_ON(tp->fastopen_rsk != NULL);
2207
2208        /* If socket is aborted during connect operation */
2209        tcp_free_fastopen_req(tp);
2210
2211        sk_sockets_allocated_dec(sk);
2212        sock_release_memcg(sk);
2213}
2214EXPORT_SYMBOL(tcp_v4_destroy_sock);
2215
2216#ifdef CONFIG_PROC_FS
2217/* Proc filesystem TCP sock list dumping. */
2218
2219static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2220{
2221        return hlist_nulls_empty(head) ? NULL :
2222                list_entry(head->first, struct inet_timewait_sock, tw_node);
2223}
2224
2225static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2226{
2227        return !is_a_nulls(tw->tw_node.next) ?
2228                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2229}
2230
2231/*
2232 * Get next listener socket follow cur.  If cur is NULL, get first socket
2233 * starting from bucket given in st->bucket; when st->bucket is zero the
2234 * very first socket in the hash table is returned.
2235 */
2236static void *listening_get_next(struct seq_file *seq, void *cur)
2237{
2238        struct inet_connection_sock *icsk;
2239        struct hlist_nulls_node *node;
2240        struct sock *sk = cur;
2241        struct inet_listen_hashbucket *ilb;
2242        struct tcp_iter_state *st = seq->private;
2243        struct net *net = seq_file_net(seq);
2244
2245        if (!sk) {
2246                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2247                spin_lock_bh(&ilb->lock);
2248                sk = sk_nulls_head(&ilb->head);
2249                st->offset = 0;
2250                goto get_sk;
2251        }
2252        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2253        ++st->num;
2254        ++st->offset;
2255
2256        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2257                struct request_sock *req = cur;
2258
2259                icsk = inet_csk(st->syn_wait_sk);
2260                req = req->dl_next;
2261                while (1) {
2262                        while (req) {
2263                                if (req->rsk_ops->family == st->family) {
2264                                        cur = req;
2265                                        goto out;
2266                                }
2267                                req = req->dl_next;
2268                        }
2269                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2270                                break;
2271get_req:
2272                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2273                }
2274                sk        = sk_nulls_next(st->syn_wait_sk);
2275                st->state = TCP_SEQ_STATE_LISTENING;
2276                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2277        } else {
2278                icsk = inet_csk(sk);
2279                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2280                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2281                        goto start_req;
2282                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2283                sk = sk_nulls_next(sk);
2284        }
2285get_sk:
2286        sk_nulls_for_each_from(sk, node) {
2287                if (!net_eq(sock_net(sk), net))
2288                        continue;
2289                if (sk->sk_family == st->family) {
2290                        cur = sk;
2291                        goto out;
2292                }
2293                icsk = inet_csk(sk);
2294                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2295                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2296start_req:
2297                        st->uid         = sock_i_uid(sk);
2298                        st->syn_wait_sk = sk;
2299                        st->state       = TCP_SEQ_STATE_OPENREQ;
2300                        st->sbucket     = 0;
2301                        goto get_req;
2302                }
2303                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2304        }
2305        spin_unlock_bh(&ilb->lock);
2306        st->offset = 0;
2307        if (++st->bucket < INET_LHTABLE_SIZE) {
2308                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2309                spin_lock_bh(&ilb->lock);
2310                sk = sk_nulls_head(&ilb->head);
2311                goto get_sk;
2312        }
2313        cur = NULL;
2314out:
2315        return cur;
2316}
2317
2318static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2319{
2320        struct tcp_iter_state *st = seq->private;
2321        void *rc;
2322
2323        st->bucket = 0;
2324        st->offset = 0;
2325        rc = listening_get_next(seq, NULL);
2326
2327        while (rc && *pos) {
2328                rc = listening_get_next(seq, rc);
2329                --*pos;
2330        }
2331        return rc;
2332}
2333
2334static inline bool empty_bucket(struct tcp_iter_state *st)
2335{
2336        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2337                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2338}
2339
2340/*
2341 * Get first established socket starting from bucket given in st->bucket.
2342 * If st->bucket is zero, the very first socket in the hash is returned.
2343 */
2344static void *established_get_first(struct seq_file *seq)
2345{
2346        struct tcp_iter_state *st = seq->private;
2347        struct net *net = seq_file_net(seq);
2348        void *rc = NULL;
2349
2350        st->offset = 0;
2351        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2352                struct sock *sk;
2353                struct hlist_nulls_node *node;
2354                struct inet_timewait_sock *tw;
2355                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2356
2357                /* Lockless fast path for the common case of empty buckets */
2358                if (empty_bucket(st))
2359                        continue;
2360
2361                spin_lock_bh(lock);
2362                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2363                        if (sk->sk_family != st->family ||
2364                            !net_eq(sock_net(sk), net)) {
2365                                continue;
2366                        }
2367                        rc = sk;
2368                        goto out;
2369                }
2370                st->state = TCP_SEQ_STATE_TIME_WAIT;
2371                inet_twsk_for_each(tw, node,
2372                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2373                        if (tw->tw_family != st->family ||
2374                            !net_eq(twsk_net(tw), net)) {
2375                                continue;
2376                        }
2377                        rc = tw;
2378                        goto out;
2379                }
2380                spin_unlock_bh(lock);
2381                st->state = TCP_SEQ_STATE_ESTABLISHED;
2382        }
2383out:
2384        return rc;
2385}
2386
2387static void *established_get_next(struct seq_file *seq, void *cur)
2388{
2389        struct sock *sk = cur;
2390        struct inet_timewait_sock *tw;
2391        struct hlist_nulls_node *node;
2392        struct tcp_iter_state *st = seq->private;
2393        struct net *net = seq_file_net(seq);
2394
2395        ++st->num;
2396        ++st->offset;
2397
2398        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2399                tw = cur;
2400                tw = tw_next(tw);
2401get_tw:
2402                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2403                        tw = tw_next(tw);
2404                }
2405                if (tw) {
2406                        cur = tw;
2407                        goto out;
2408                }
2409                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2410                st->state = TCP_SEQ_STATE_ESTABLISHED;
2411
2412                /* Look for next non empty bucket */
2413                st->offset = 0;
2414                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2415                                empty_bucket(st))
2416                        ;
2417                if (st->bucket > tcp_hashinfo.ehash_mask)
2418                        return NULL;
2419
2420                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2421                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2422        } else
2423                sk = sk_nulls_next(sk);
2424
2425        sk_nulls_for_each_from(sk, node) {
2426                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2427                        goto found;
2428        }
2429
2430        st->state = TCP_SEQ_STATE_TIME_WAIT;
2431        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2432        goto get_tw;
2433found:
2434        cur = sk;
2435out:
2436        return cur;
2437}
2438
2439static void *established_get_idx(struct seq_file *seq, loff_t pos)
2440{
2441        struct tcp_iter_state *st = seq->private;
2442        void *rc;
2443
2444        st->bucket = 0;
2445        rc = established_get_first(seq);
2446
2447        while (rc && pos) {
2448                rc = established_get_next(seq, rc);
2449                --pos;
2450        }
2451        return rc;
2452}
2453
2454static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2455{
2456        void *rc;
2457        struct tcp_iter_state *st = seq->private;
2458
2459        st->state = TCP_SEQ_STATE_LISTENING;
2460        rc        = listening_get_idx(seq, &pos);
2461
2462        if (!rc) {
2463                st->state = TCP_SEQ_STATE_ESTABLISHED;
2464                rc        = established_get_idx(seq, pos);
2465        }
2466
2467        return rc;
2468}
2469
2470static void *tcp_seek_last_pos(struct seq_file *seq)
2471{
2472        struct tcp_iter_state *st = seq->private;
2473        int offset = st->offset;
2474        int orig_num = st->num;
2475        void *rc = NULL;
2476
2477        switch (st->state) {
2478        case TCP_SEQ_STATE_OPENREQ:
2479        case TCP_SEQ_STATE_LISTENING:
2480                if (st->bucket >= INET_LHTABLE_SIZE)
2481                        break;
2482                st->state = TCP_SEQ_STATE_LISTENING;
2483                rc = listening_get_next(seq, NULL);
2484                while (offset-- && rc)
2485                        rc = listening_get_next(seq, rc);
2486                if (rc)
2487                        break;
2488                st->bucket = 0;
2489                /* Fallthrough */
2490        case TCP_SEQ_STATE_ESTABLISHED:
2491        case TCP_SEQ_STATE_TIME_WAIT:
2492                st->state = TCP_SEQ_STATE_ESTABLISHED;
2493                if (st->bucket > tcp_hashinfo.ehash_mask)
2494                        break;
2495                rc = established_get_first(seq);
2496                while (offset-- && rc)
2497                        rc = established_get_next(seq, rc);
2498        }
2499
2500        st->num = orig_num;
2501
2502        return rc;
2503}
2504
2505static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2506{
2507        struct tcp_iter_state *st = seq->private;
2508        void *rc;
2509
2510        if (*pos && *pos == st->last_pos) {
2511                rc = tcp_seek_last_pos(seq);
2512                if (rc)
2513                        goto out;
2514        }
2515
2516        st->state = TCP_SEQ_STATE_LISTENING;
2517        st->num = 0;
2518        st->bucket = 0;
2519        st->offset = 0;
2520        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2521
2522out:
2523        st->last_pos = *pos;
2524        return rc;
2525}
2526
2527static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2528{
2529        struct tcp_iter_state *st = seq->private;
2530        void *rc = NULL;
2531
2532        if (v == SEQ_START_TOKEN) {
2533                rc = tcp_get_idx(seq, 0);
2534                goto out;
2535        }
2536
2537        switch (st->state) {
2538        case TCP_SEQ_STATE_OPENREQ:
2539        case TCP_SEQ_STATE_LISTENING:
2540                rc = listening_get_next(seq, v);
2541                if (!rc) {
2542                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2543                        st->bucket = 0;
2544                        st->offset = 0;
2545                        rc        = established_get_first(seq);
2546                }
2547                break;
2548        case TCP_SEQ_STATE_ESTABLISHED:
2549        case TCP_SEQ_STATE_TIME_WAIT:
2550                rc = established_get_next(seq, v);
2551                break;
2552        }
2553out:
2554        ++*pos;
2555        st->last_pos = *pos;
2556        return rc;
2557}
2558
2559static void tcp_seq_stop(struct seq_file *seq, void *v)
2560{
2561        struct tcp_iter_state *st = seq->private;
2562
2563        switch (st->state) {
2564        case TCP_SEQ_STATE_OPENREQ:
2565                if (v) {
2566                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2567                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2568                }
2569        case TCP_SEQ_STATE_LISTENING:
2570                if (v != SEQ_START_TOKEN)
2571                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2572                break;
2573        case TCP_SEQ_STATE_TIME_WAIT:
2574        case TCP_SEQ_STATE_ESTABLISHED:
2575                if (v)
2576                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2577                break;
2578        }
2579}
2580
2581int tcp_seq_open(struct inode *inode, struct file *file)
2582{
2583        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2584        struct tcp_iter_state *s;
2585        int err;
2586
2587        err = seq_open_net(inode, file, &afinfo->seq_ops,
2588                          sizeof(struct tcp_iter_state));
2589        if (err < 0)
2590                return err;
2591
2592        s = ((struct seq_file *)file->private_data)->private;
2593        s->family               = afinfo->family;
2594        s->last_pos             = 0;
2595        return 0;
2596}
2597EXPORT_SYMBOL(tcp_seq_open);
2598
2599int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2600{
2601        int rc = 0;
2602        struct proc_dir_entry *p;
2603
2604        afinfo->seq_ops.start           = tcp_seq_start;
2605        afinfo->seq_ops.next            = tcp_seq_next;
2606        afinfo->seq_ops.stop            = tcp_seq_stop;
2607
2608        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2609                             afinfo->seq_fops, afinfo);
2610        if (!p)
2611                rc = -ENOMEM;
2612        return rc;
2613}
2614EXPORT_SYMBOL(tcp_proc_register);
2615
2616void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2617{
2618        remove_proc_entry(afinfo->name, net->proc_net);
2619}
2620EXPORT_SYMBOL(tcp_proc_unregister);
2621
2622static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2623                         struct seq_file *f, int i, kuid_t uid, int *len)
2624{
2625        const struct inet_request_sock *ireq = inet_rsk(req);
2626        long delta = req->expires - jiffies;
2627
2628        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2629                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2630                i,
2631                ireq->loc_addr,
2632                ntohs(inet_sk(sk)->inet_sport),
2633                ireq->rmt_addr,
2634                ntohs(ireq->rmt_port),
2635                TCP_SYN_RECV,
2636                0, 0, /* could print option size, but that is af dependent. */
2637                1,    /* timers active (only the expire timer) */
2638                jiffies_delta_to_clock_t(delta),
2639                req->num_timeout,
2640                from_kuid_munged(seq_user_ns(f), uid),
2641                0,  /* non standard timer */
2642                0, /* open_requests have no inode */
2643                atomic_read(&sk->sk_refcnt),
2644                req,
2645                len);
2646}
2647
2648static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2649{
2650        int timer_active;
2651        unsigned long timer_expires;
2652        const struct tcp_sock *tp = tcp_sk(sk);
2653        const struct inet_connection_sock *icsk = inet_csk(sk);
2654        const struct inet_sock *inet = inet_sk(sk);
2655        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2656        __be32 dest = inet->inet_daddr;
2657        __be32 src = inet->inet_rcv_saddr;
2658        __u16 destp = ntohs(inet->inet_dport);
2659        __u16 srcp = ntohs(inet->inet_sport);
2660        int rx_queue;
2661
2662        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2663                timer_active    = 1;
2664                timer_expires   = icsk->icsk_timeout;
2665        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2666                timer_active    = 4;
2667                timer_expires   = icsk->icsk_timeout;
2668        } else if (timer_pending(&sk->sk_timer)) {
2669                timer_active    = 2;
2670                timer_expires   = sk->sk_timer.expires;
2671        } else {
2672                timer_active    = 0;
2673                timer_expires = jiffies;
2674        }
2675
2676        if (sk->sk_state == TCP_LISTEN)
2677                rx_queue = sk->sk_ack_backlog;
2678        else
2679                /*
2680                 * because we dont lock socket, we might find a transient negative value
2681                 */
2682                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2683
2684        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2685                        "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2686                i, src, srcp, dest, destp, sk->sk_state,
2687                tp->write_seq - tp->snd_una,
2688                rx_queue,
2689                timer_active,
2690                jiffies_delta_to_clock_t(timer_expires - jiffies),
2691                icsk->icsk_retransmits,
2692                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2693                icsk->icsk_probes_out,
2694                sock_i_ino(sk),
2695                atomic_read(&sk->sk_refcnt), sk,
2696                jiffies_to_clock_t(icsk->icsk_rto),
2697                jiffies_to_clock_t(icsk->icsk_ack.ato),
2698                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2699                tp->snd_cwnd,
2700                sk->sk_state == TCP_LISTEN ?
2701                    (fastopenq ? fastopenq->max_qlen : 0) :
2702                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2703                len);
2704}
2705
2706static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2707                               struct seq_file *f, int i, int *len)
2708{
2709        __be32 dest, src;
2710        __u16 destp, srcp;
2711        long delta = tw->tw_ttd - jiffies;
2712
2713        dest  = tw->tw_daddr;
2714        src   = tw->tw_rcv_saddr;
2715        destp = ntohs(tw->tw_dport);
2716        srcp  = ntohs(tw->tw_sport);
2717
2718        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2719                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2720                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2721                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2722                atomic_read(&tw->tw_refcnt), tw, len);
2723}
2724
2725#define TMPSZ 150
2726
2727static int tcp4_seq_show(struct seq_file *seq, void *v)
2728{
2729        struct tcp_iter_state *st;
2730        int len;
2731
2732        if (v == SEQ_START_TOKEN) {
2733                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2734                           "  sl  local_address rem_address   st tx_queue "
2735                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2736                           "inode");
2737                goto out;
2738        }
2739        st = seq->private;
2740
2741        switch (st->state) {
2742        case TCP_SEQ_STATE_LISTENING:
2743        case TCP_SEQ_STATE_ESTABLISHED:
2744                get_tcp4_sock(v, seq, st->num, &len);
2745                break;
2746        case TCP_SEQ_STATE_OPENREQ:
2747                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2748                break;
2749        case TCP_SEQ_STATE_TIME_WAIT:
2750                get_timewait4_sock(v, seq, st->num, &len);
2751                break;
2752        }
2753        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2754out:
2755        return 0;
2756}
2757
2758static const struct file_operations tcp_afinfo_seq_fops = {
2759        .owner   = THIS_MODULE,
2760        .open    = tcp_seq_open,
2761        .read    = seq_read,
2762        .llseek  = seq_lseek,
2763        .release = seq_release_net
2764};
2765
2766static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2767        .name           = "tcp",
2768        .family         = AF_INET,
2769        .seq_fops       = &tcp_afinfo_seq_fops,
2770        .seq_ops        = {
2771                .show           = tcp4_seq_show,
2772        },
2773};
2774
2775static int __net_init tcp4_proc_init_net(struct net *net)
2776{
2777        return tcp_proc_register(net, &tcp4_seq_afinfo);
2778}
2779
2780static void __net_exit tcp4_proc_exit_net(struct net *net)
2781{
2782        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2783}
2784
2785static struct pernet_operations tcp4_net_ops = {
2786        .init = tcp4_proc_init_net,
2787        .exit = tcp4_proc_exit_net,
2788};
2789
2790int __init tcp4_proc_init(void)
2791{
2792        return register_pernet_subsys(&tcp4_net_ops);
2793}
2794
2795void tcp4_proc_exit(void)
2796{
2797        unregister_pernet_subsys(&tcp4_net_ops);
2798}
2799#endif /* CONFIG_PROC_FS */
2800
2801struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2802{
2803        const struct iphdr *iph = skb_gro_network_header(skb);
2804        __wsum wsum;
2805        __sum16 sum;
2806
2807        switch (skb->ip_summed) {
2808        case CHECKSUM_COMPLETE:
2809                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2810                                  skb->csum)) {
2811                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2812                        break;
2813                }
2814flush:
2815                NAPI_GRO_CB(skb)->flush = 1;
2816                return NULL;
2817
2818        case CHECKSUM_NONE:
2819                wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2820                                          skb_gro_len(skb), IPPROTO_TCP, 0);
2821                sum = csum_fold(skb_checksum(skb,
2822                                             skb_gro_offset(skb),
2823                                             skb_gro_len(skb),
2824                                             wsum));
2825                if (sum)
2826                        goto flush;
2827
2828                skb->ip_summed = CHECKSUM_UNNECESSARY;
2829                break;
2830        }
2831
2832        return tcp_gro_receive(head, skb);
2833}
2834
2835int tcp4_gro_complete(struct sk_buff *skb)
2836{
2837        const struct iphdr *iph = ip_hdr(skb);
2838        struct tcphdr *th = tcp_hdr(skb);
2839
2840        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2841                                  iph->saddr, iph->daddr, 0);
2842        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2843
2844        return tcp_gro_complete(skb);
2845}
2846
2847struct proto tcp_prot = {
2848        .name                   = "TCP",
2849        .owner                  = THIS_MODULE,
2850        .close                  = tcp_close,
2851        .connect                = tcp_v4_connect,
2852        .disconnect             = tcp_disconnect,
2853        .accept                 = inet_csk_accept,
2854        .ioctl                  = tcp_ioctl,
2855        .init                   = tcp_v4_init_sock,
2856        .destroy                = tcp_v4_destroy_sock,
2857        .shutdown               = tcp_shutdown,
2858        .setsockopt             = tcp_setsockopt,
2859        .getsockopt             = tcp_getsockopt,
2860        .recvmsg                = tcp_recvmsg,
2861        .sendmsg                = tcp_sendmsg,
2862        .sendpage               = tcp_sendpage,
2863        .backlog_rcv            = tcp_v4_do_rcv,
2864        .release_cb             = tcp_release_cb,
2865        .mtu_reduced            = tcp_v4_mtu_reduced,
2866        .hash                   = inet_hash,
2867        .unhash                 = inet_unhash,
2868        .get_port               = inet_csk_get_port,
2869        .enter_memory_pressure  = tcp_enter_memory_pressure,
2870        .sockets_allocated      = &tcp_sockets_allocated,
2871        .orphan_count           = &tcp_orphan_count,
2872        .memory_allocated       = &tcp_memory_allocated,
2873        .memory_pressure        = &tcp_memory_pressure,
2874        .sysctl_wmem            = sysctl_tcp_wmem,
2875        .sysctl_rmem            = sysctl_tcp_rmem,
2876        .max_header             = MAX_TCP_HEADER,
2877        .obj_size               = sizeof(struct tcp_sock),
2878        .slab_flags             = SLAB_DESTROY_BY_RCU,
2879        .twsk_prot              = &tcp_timewait_sock_ops,
2880        .rsk_prot               = &tcp_request_sock_ops,
2881        .h.hashinfo             = &tcp_hashinfo,
2882        .no_autobind            = true,
2883#ifdef CONFIG_COMPAT
2884        .compat_setsockopt      = compat_tcp_setsockopt,
2885        .compat_getsockopt      = compat_tcp_getsockopt,
2886#endif
2887#ifdef CONFIG_MEMCG_KMEM
2888        .init_cgroup            = tcp_init_cgroup,
2889        .destroy_cgroup         = tcp_destroy_cgroup,
2890        .proto_cgroup           = tcp_proto_cgroup,
2891#endif
2892};
2893EXPORT_SYMBOL(tcp_prot);
2894
2895static int __net_init tcp_sk_init(struct net *net)
2896{
2897        net->ipv4.sysctl_tcp_ecn = 2;
2898        return 0;
2899}
2900
2901static void __net_exit tcp_sk_exit(struct net *net)
2902{
2903}
2904
2905static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2906{
2907        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2908}
2909
2910static struct pernet_operations __net_initdata tcp_sk_ops = {
2911       .init       = tcp_sk_init,
2912       .exit       = tcp_sk_exit,
2913       .exit_batch = tcp_sk_exit_batch,
2914};
2915
2916void __init tcp_v4_init(void)
2917{
2918        inet_hashinfo_init(&tcp_hashinfo);
2919        if (register_pernet_subsys(&tcp_sk_ops))
2920                panic("Failed to create the TCP control socket.\n");
2921}
2922