linux/net/ipv4/tcp_ipv4.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/netdma.h>
  76#include <net/secure_seq.h>
  77#include <net/tcp_memcontrol.h>
  78
  79#include <linux/inet.h>
  80#include <linux/ipv6.h>
  81#include <linux/stddef.h>
  82#include <linux/proc_fs.h>
  83#include <linux/seq_file.h>
  84
  85#include <linux/crypto.h>
  86#include <linux/scatterlist.h>
  87
  88int sysctl_tcp_tw_reuse __read_mostly;
  89int sysctl_tcp_low_latency __read_mostly;
  90EXPORT_SYMBOL(sysctl_tcp_low_latency);
  91
  92
  93#ifdef CONFIG_TCP_MD5SIG
  94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  95                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  96#endif
  97
  98struct inet_hashinfo tcp_hashinfo;
  99EXPORT_SYMBOL(tcp_hashinfo);
 100
 101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 102{
 103        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 104                                          ip_hdr(skb)->saddr,
 105                                          tcp_hdr(skb)->dest,
 106                                          tcp_hdr(skb)->source);
 107}
 108
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 112        struct tcp_sock *tp = tcp_sk(sk);
 113
 114        /* With PAWS, it is safe from the viewpoint
 115           of data integrity. Even without PAWS it is safe provided sequence
 116           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118           Actually, the idea is close to VJ's one, only timestamp cache is
 119           held not per host, but per port pair and TW bucket is used as state
 120           holder.
 121
 122           If TW bucket has been already destroyed we fall back to VJ's scheme
 123           and use initial timestamp retrieved from peer table.
 124         */
 125        if (tcptw->tw_ts_recent_stamp &&
 126            (twp == NULL || (sysctl_tcp_tw_reuse &&
 127                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 128                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 129                if (tp->write_seq == 0)
 130                        tp->write_seq = 1;
 131                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 132                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 133                sock_hold(sktw);
 134                return 1;
 135        }
 136
 137        return 0;
 138}
 139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 140
 141/* This will initiate an outgoing connection. */
 142int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 143{
 144        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 145        struct inet_sock *inet = inet_sk(sk);
 146        struct tcp_sock *tp = tcp_sk(sk);
 147        __be16 orig_sport, orig_dport;
 148        __be32 daddr, nexthop;
 149        struct flowi4 *fl4;
 150        struct rtable *rt;
 151        int err;
 152        struct ip_options_rcu *inet_opt;
 153
 154        if (addr_len < sizeof(struct sockaddr_in))
 155                return -EINVAL;
 156
 157        if (usin->sin_family != AF_INET)
 158                return -EAFNOSUPPORT;
 159
 160        nexthop = daddr = usin->sin_addr.s_addr;
 161        inet_opt = rcu_dereference_protected(inet->inet_opt,
 162                                             sock_owned_by_user(sk));
 163        if (inet_opt && inet_opt->opt.srr) {
 164                if (!daddr)
 165                        return -EINVAL;
 166                nexthop = inet_opt->opt.faddr;
 167        }
 168
 169        orig_sport = inet->inet_sport;
 170        orig_dport = usin->sin_port;
 171        fl4 = &inet->cork.fl.u.ip4;
 172        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 173                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 174                              IPPROTO_TCP,
 175                              orig_sport, orig_dport, sk, true);
 176        if (IS_ERR(rt)) {
 177                err = PTR_ERR(rt);
 178                if (err == -ENETUNREACH)
 179                        IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 180                return err;
 181        }
 182
 183        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 184                ip_rt_put(rt);
 185                return -ENETUNREACH;
 186        }
 187
 188        if (!inet_opt || !inet_opt->opt.srr)
 189                daddr = fl4->daddr;
 190
 191        if (!inet->inet_saddr)
 192                inet->inet_saddr = fl4->saddr;
 193        inet->inet_rcv_saddr = inet->inet_saddr;
 194
 195        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 196                /* Reset inherited state */
 197                tp->rx_opt.ts_recent       = 0;
 198                tp->rx_opt.ts_recent_stamp = 0;
 199                if (likely(!tp->repair))
 200                        tp->write_seq      = 0;
 201        }
 202
 203        if (tcp_death_row.sysctl_tw_recycle &&
 204            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 205                tcp_fetch_timewait_stamp(sk, &rt->dst);
 206
 207        inet->inet_dport = usin->sin_port;
 208        inet->inet_daddr = daddr;
 209
 210        inet_csk(sk)->icsk_ext_hdr_len = 0;
 211        if (inet_opt)
 212                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 213
 214        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 215
 216        /* Socket identity is still unknown (sport may be zero).
 217         * However we set state to SYN-SENT and not releasing socket
 218         * lock select source port, enter ourselves into the hash tables and
 219         * complete initialization after this.
 220         */
 221        tcp_set_state(sk, TCP_SYN_SENT);
 222        err = inet_hash_connect(&tcp_death_row, sk);
 223        if (err)
 224                goto failure;
 225
 226        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227                               inet->inet_sport, inet->inet_dport, sk);
 228        if (IS_ERR(rt)) {
 229                err = PTR_ERR(rt);
 230                rt = NULL;
 231                goto failure;
 232        }
 233        /* OK, now commit destination to socket.  */
 234        sk->sk_gso_type = SKB_GSO_TCPV4;
 235        sk_setup_caps(sk, &rt->dst);
 236
 237        if (!tp->write_seq && likely(!tp->repair))
 238                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239                                                           inet->inet_daddr,
 240                                                           inet->inet_sport,
 241                                                           usin->sin_port);
 242
 243        inet->inet_id = tp->write_seq ^ jiffies;
 244
 245        err = tcp_connect(sk);
 246
 247        rt = NULL;
 248        if (err)
 249                goto failure;
 250
 251        return 0;
 252
 253failure:
 254        /*
 255         * This unhashes the socket and releases the local port,
 256         * if necessary.
 257         */
 258        tcp_set_state(sk, TCP_CLOSE);
 259        ip_rt_put(rt);
 260        sk->sk_route_caps = 0;
 261        inet->inet_dport = 0;
 262        return err;
 263}
 264EXPORT_SYMBOL(tcp_v4_connect);
 265
 266/*
 267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 268 * It can be called through tcp_release_cb() if socket was owned by user
 269 * at the time tcp_v4_err() was called to handle ICMP message.
 270 */
 271static void tcp_v4_mtu_reduced(struct sock *sk)
 272{
 273        struct dst_entry *dst;
 274        struct inet_sock *inet = inet_sk(sk);
 275        u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277        dst = inet_csk_update_pmtu(sk, mtu);
 278        if (!dst)
 279                return;
 280
 281        /* Something is about to be wrong... Remember soft error
 282         * for the case, if this connection will not able to recover.
 283         */
 284        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285                sk->sk_err_soft = EMSGSIZE;
 286
 287        mtu = dst_mtu(dst);
 288
 289        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 290            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 291                tcp_sync_mss(sk, mtu);
 292
 293                /* Resend the TCP packet because it's
 294                 * clear that the old packet has been
 295                 * dropped. This is the new "fast" path mtu
 296                 * discovery.
 297                 */
 298                tcp_simple_retransmit(sk);
 299        } /* else let the usual retransmit timer handle it */
 300}
 301
 302static void do_redirect(struct sk_buff *skb, struct sock *sk)
 303{
 304        struct dst_entry *dst = __sk_dst_check(sk, 0);
 305
 306        if (dst)
 307                dst->ops->redirect(dst, sk, skb);
 308}
 309
 310/*
 311 * This routine is called by the ICMP module when it gets some
 312 * sort of error condition.  If err < 0 then the socket should
 313 * be closed and the error returned to the user.  If err > 0
 314 * it's just the icmp type << 8 | icmp code.  After adjustment
 315 * header points to the first 8 bytes of the tcp header.  We need
 316 * to find the appropriate port.
 317 *
 318 * The locking strategy used here is very "optimistic". When
 319 * someone else accesses the socket the ICMP is just dropped
 320 * and for some paths there is no check at all.
 321 * A more general error queue to queue errors for later handling
 322 * is probably better.
 323 *
 324 */
 325
 326void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 327{
 328        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 329        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 330        struct inet_connection_sock *icsk;
 331        struct tcp_sock *tp;
 332        struct inet_sock *inet;
 333        const int type = icmp_hdr(icmp_skb)->type;
 334        const int code = icmp_hdr(icmp_skb)->code;
 335        struct sock *sk;
 336        struct sk_buff *skb;
 337        struct request_sock *req;
 338        __u32 seq;
 339        __u32 remaining;
 340        int err;
 341        struct net *net = dev_net(icmp_skb->dev);
 342
 343        if (icmp_skb->len < (iph->ihl << 2) + 8) {
 344                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 345                return;
 346        }
 347
 348        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 349                        iph->saddr, th->source, inet_iif(icmp_skb));
 350        if (!sk) {
 351                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 352                return;
 353        }
 354        if (sk->sk_state == TCP_TIME_WAIT) {
 355                inet_twsk_put(inet_twsk(sk));
 356                return;
 357        }
 358
 359        bh_lock_sock(sk);
 360        /* If too many ICMPs get dropped on busy
 361         * servers this needs to be solved differently.
 362         * We do take care of PMTU discovery (RFC1191) special case :
 363         * we can receive locally generated ICMP messages while socket is held.
 364         */
 365        if (sock_owned_by_user(sk)) {
 366                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 367                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 368        }
 369        if (sk->sk_state == TCP_CLOSE)
 370                goto out;
 371
 372        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 373                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 374                goto out;
 375        }
 376
 377        icsk = inet_csk(sk);
 378        tp = tcp_sk(sk);
 379        req = tp->fastopen_rsk;
 380        seq = ntohl(th->seq);
 381        if (sk->sk_state != TCP_LISTEN &&
 382            !between(seq, tp->snd_una, tp->snd_nxt) &&
 383            (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
 384                /* For a Fast Open socket, allow seq to be snt_isn. */
 385                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 386                goto out;
 387        }
 388
 389        switch (type) {
 390        case ICMP_REDIRECT:
 391                do_redirect(icmp_skb, sk);
 392                goto out;
 393        case ICMP_SOURCE_QUENCH:
 394                /* Just silently ignore these. */
 395                goto out;
 396        case ICMP_PARAMETERPROB:
 397                err = EPROTO;
 398                break;
 399        case ICMP_DEST_UNREACH:
 400                if (code > NR_ICMP_UNREACH)
 401                        goto out;
 402
 403                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404                        /* We are not interested in TCP_LISTEN and open_requests
 405                         * (SYN-ACKs send out by Linux are always <576bytes so
 406                         * they should go through unfragmented).
 407                         */
 408                        if (sk->sk_state == TCP_LISTEN)
 409                                goto out;
 410
 411                        tp->mtu_info = info;
 412                        if (!sock_owned_by_user(sk)) {
 413                                tcp_v4_mtu_reduced(sk);
 414                        } else {
 415                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 416                                        sock_hold(sk);
 417                        }
 418                        goto out;
 419                }
 420
 421                err = icmp_err_convert[code].errno;
 422                /* check if icmp_skb allows revert of backoff
 423                 * (see draft-zimmermann-tcp-lcd) */
 424                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 425                        break;
 426                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 427                    !icsk->icsk_backoff)
 428                        break;
 429
 430                /* XXX (TFO) - revisit the following logic for TFO */
 431
 432                if (sock_owned_by_user(sk))
 433                        break;
 434
 435                icsk->icsk_backoff--;
 436                inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
 437                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 438                tcp_bound_rto(sk);
 439
 440                skb = tcp_write_queue_head(sk);
 441                BUG_ON(!skb);
 442
 443                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 444                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 445
 446                if (remaining) {
 447                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 448                                                  remaining, TCP_RTO_MAX);
 449                } else {
 450                        /* RTO revert clocked out retransmission.
 451                         * Will retransmit now */
 452                        tcp_retransmit_timer(sk);
 453                }
 454
 455                break;
 456        case ICMP_TIME_EXCEEDED:
 457                err = EHOSTUNREACH;
 458                break;
 459        default:
 460                goto out;
 461        }
 462
 463        /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
 464         * than following the TCP_SYN_RECV case and closing the socket,
 465         * we ignore the ICMP error and keep trying like a fully established
 466         * socket. Is this the right thing to do?
 467         */
 468        if (req && req->sk == NULL)
 469                goto out;
 470
 471        switch (sk->sk_state) {
 472                struct request_sock *req, **prev;
 473        case TCP_LISTEN:
 474                if (sock_owned_by_user(sk))
 475                        goto out;
 476
 477                req = inet_csk_search_req(sk, &prev, th->dest,
 478                                          iph->daddr, iph->saddr);
 479                if (!req)
 480                        goto out;
 481
 482                /* ICMPs are not backlogged, hence we cannot get
 483                   an established socket here.
 484                 */
 485                WARN_ON(req->sk);
 486
 487                if (seq != tcp_rsk(req)->snt_isn) {
 488                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 489                        goto out;
 490                }
 491
 492                /*
 493                 * Still in SYN_RECV, just remove it silently.
 494                 * There is no good way to pass the error to the newly
 495                 * created socket, and POSIX does not want network
 496                 * errors returned from accept().
 497                 */
 498                inet_csk_reqsk_queue_drop(sk, req, prev);
 499                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 500                goto out;
 501
 502        case TCP_SYN_SENT:
 503        case TCP_SYN_RECV:  /* Cannot happen.
 504                               It can f.e. if SYNs crossed,
 505                               or Fast Open.
 506                             */
 507                if (!sock_owned_by_user(sk)) {
 508                        sk->sk_err = err;
 509
 510                        sk->sk_error_report(sk);
 511
 512                        tcp_done(sk);
 513                } else {
 514                        sk->sk_err_soft = err;
 515                }
 516                goto out;
 517        }
 518
 519        /* If we've already connected we will keep trying
 520         * until we time out, or the user gives up.
 521         *
 522         * rfc1122 4.2.3.9 allows to consider as hard errors
 523         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 524         * but it is obsoleted by pmtu discovery).
 525         *
 526         * Note, that in modern internet, where routing is unreliable
 527         * and in each dark corner broken firewalls sit, sending random
 528         * errors ordered by their masters even this two messages finally lose
 529         * their original sense (even Linux sends invalid PORT_UNREACHs)
 530         *
 531         * Now we are in compliance with RFCs.
 532         *                                                      --ANK (980905)
 533         */
 534
 535        inet = inet_sk(sk);
 536        if (!sock_owned_by_user(sk) && inet->recverr) {
 537                sk->sk_err = err;
 538                sk->sk_error_report(sk);
 539        } else  { /* Only an error on timeout */
 540                sk->sk_err_soft = err;
 541        }
 542
 543out:
 544        bh_unlock_sock(sk);
 545        sock_put(sk);
 546}
 547
 548static void __tcp_v4_send_check(struct sk_buff *skb,
 549                                __be32 saddr, __be32 daddr)
 550{
 551        struct tcphdr *th = tcp_hdr(skb);
 552
 553        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 554                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 555                skb->csum_start = skb_transport_header(skb) - skb->head;
 556                skb->csum_offset = offsetof(struct tcphdr, check);
 557        } else {
 558                th->check = tcp_v4_check(skb->len, saddr, daddr,
 559                                         csum_partial(th,
 560                                                      th->doff << 2,
 561                                                      skb->csum));
 562        }
 563}
 564
 565/* This routine computes an IPv4 TCP checksum. */
 566void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 567{
 568        const struct inet_sock *inet = inet_sk(sk);
 569
 570        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 571}
 572EXPORT_SYMBOL(tcp_v4_send_check);
 573
 574int tcp_v4_gso_send_check(struct sk_buff *skb)
 575{
 576        const struct iphdr *iph;
 577        struct tcphdr *th;
 578
 579        if (!pskb_may_pull(skb, sizeof(*th)))
 580                return -EINVAL;
 581
 582        iph = ip_hdr(skb);
 583        th = tcp_hdr(skb);
 584
 585        th->check = 0;
 586        skb->ip_summed = CHECKSUM_PARTIAL;
 587        __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
 588        return 0;
 589}
 590
 591/*
 592 *      This routine will send an RST to the other tcp.
 593 *
 594 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 595 *                    for reset.
 596 *      Answer: if a packet caused RST, it is not for a socket
 597 *              existing in our system, if it is matched to a socket,
 598 *              it is just duplicate segment or bug in other side's TCP.
 599 *              So that we build reply only basing on parameters
 600 *              arrived with segment.
 601 *      Exception: precedence violation. We do not implement it in any case.
 602 */
 603
 604static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 605{
 606        const struct tcphdr *th = tcp_hdr(skb);
 607        struct {
 608                struct tcphdr th;
 609#ifdef CONFIG_TCP_MD5SIG
 610                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 611#endif
 612        } rep;
 613        struct ip_reply_arg arg;
 614#ifdef CONFIG_TCP_MD5SIG
 615        struct tcp_md5sig_key *key;
 616        const __u8 *hash_location = NULL;
 617        unsigned char newhash[16];
 618        int genhash;
 619        struct sock *sk1 = NULL;
 620#endif
 621        struct net *net;
 622
 623        /* Never send a reset in response to a reset. */
 624        if (th->rst)
 625                return;
 626
 627        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 628                return;
 629
 630        /* Swap the send and the receive. */
 631        memset(&rep, 0, sizeof(rep));
 632        rep.th.dest   = th->source;
 633        rep.th.source = th->dest;
 634        rep.th.doff   = sizeof(struct tcphdr) / 4;
 635        rep.th.rst    = 1;
 636
 637        if (th->ack) {
 638                rep.th.seq = th->ack_seq;
 639        } else {
 640                rep.th.ack = 1;
 641                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 642                                       skb->len - (th->doff << 2));
 643        }
 644
 645        memset(&arg, 0, sizeof(arg));
 646        arg.iov[0].iov_base = (unsigned char *)&rep;
 647        arg.iov[0].iov_len  = sizeof(rep.th);
 648
 649#ifdef CONFIG_TCP_MD5SIG
 650        hash_location = tcp_parse_md5sig_option(th);
 651        if (!sk && hash_location) {
 652                /*
 653                 * active side is lost. Try to find listening socket through
 654                 * source port, and then find md5 key through listening socket.
 655                 * we are not loose security here:
 656                 * Incoming packet is checked with md5 hash with finding key,
 657                 * no RST generated if md5 hash doesn't match.
 658                 */
 659                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 660                                             &tcp_hashinfo, ip_hdr(skb)->saddr,
 661                                             th->source, ip_hdr(skb)->daddr,
 662                                             ntohs(th->source), inet_iif(skb));
 663                /* don't send rst if it can't find key */
 664                if (!sk1)
 665                        return;
 666                rcu_read_lock();
 667                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 668                                        &ip_hdr(skb)->saddr, AF_INET);
 669                if (!key)
 670                        goto release_sk1;
 671
 672                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 673                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 674                        goto release_sk1;
 675        } else {
 676                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 677                                             &ip_hdr(skb)->saddr,
 678                                             AF_INET) : NULL;
 679        }
 680
 681        if (key) {
 682                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 683                                   (TCPOPT_NOP << 16) |
 684                                   (TCPOPT_MD5SIG << 8) |
 685                                   TCPOLEN_MD5SIG);
 686                /* Update length and the length the header thinks exists */
 687                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 688                rep.th.doff = arg.iov[0].iov_len / 4;
 689
 690                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 691                                     key, ip_hdr(skb)->saddr,
 692                                     ip_hdr(skb)->daddr, &rep.th);
 693        }
 694#endif
 695        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 696                                      ip_hdr(skb)->saddr, /* XXX */
 697                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 698        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 699        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 700        /* When socket is gone, all binding information is lost.
 701         * routing might fail in this case. No choice here, if we choose to force
 702         * input interface, we will misroute in case of asymmetric route.
 703         */
 704        if (sk)
 705                arg.bound_dev_if = sk->sk_bound_dev_if;
 706
 707        net = dev_net(skb_dst(skb)->dev);
 708        arg.tos = ip_hdr(skb)->tos;
 709        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 710                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 711
 712        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 713        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 714
 715#ifdef CONFIG_TCP_MD5SIG
 716release_sk1:
 717        if (sk1) {
 718                rcu_read_unlock();
 719                sock_put(sk1);
 720        }
 721#endif
 722}
 723
 724/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 725   outside socket context is ugly, certainly. What can I do?
 726 */
 727
 728static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 729                            u32 win, u32 tsval, u32 tsecr, int oif,
 730                            struct tcp_md5sig_key *key,
 731                            int reply_flags, u8 tos)
 732{
 733        const struct tcphdr *th = tcp_hdr(skb);
 734        struct {
 735                struct tcphdr th;
 736                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 737#ifdef CONFIG_TCP_MD5SIG
 738                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 739#endif
 740                        ];
 741        } rep;
 742        struct ip_reply_arg arg;
 743        struct net *net = dev_net(skb_dst(skb)->dev);
 744
 745        memset(&rep.th, 0, sizeof(struct tcphdr));
 746        memset(&arg, 0, sizeof(arg));
 747
 748        arg.iov[0].iov_base = (unsigned char *)&rep;
 749        arg.iov[0].iov_len  = sizeof(rep.th);
 750        if (tsecr) {
 751                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 752                                   (TCPOPT_TIMESTAMP << 8) |
 753                                   TCPOLEN_TIMESTAMP);
 754                rep.opt[1] = htonl(tsval);
 755                rep.opt[2] = htonl(tsecr);
 756                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 757        }
 758
 759        /* Swap the send and the receive. */
 760        rep.th.dest    = th->source;
 761        rep.th.source  = th->dest;
 762        rep.th.doff    = arg.iov[0].iov_len / 4;
 763        rep.th.seq     = htonl(seq);
 764        rep.th.ack_seq = htonl(ack);
 765        rep.th.ack     = 1;
 766        rep.th.window  = htons(win);
 767
 768#ifdef CONFIG_TCP_MD5SIG
 769        if (key) {
 770                int offset = (tsecr) ? 3 : 0;
 771
 772                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 773                                          (TCPOPT_NOP << 16) |
 774                                          (TCPOPT_MD5SIG << 8) |
 775                                          TCPOLEN_MD5SIG);
 776                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 777                rep.th.doff = arg.iov[0].iov_len/4;
 778
 779                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 780                                    key, ip_hdr(skb)->saddr,
 781                                    ip_hdr(skb)->daddr, &rep.th);
 782        }
 783#endif
 784        arg.flags = reply_flags;
 785        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 786                                      ip_hdr(skb)->saddr, /* XXX */
 787                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 788        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 789        if (oif)
 790                arg.bound_dev_if = oif;
 791        arg.tos = tos;
 792        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 793                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 794
 795        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 796}
 797
 798static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 799{
 800        struct inet_timewait_sock *tw = inet_twsk(sk);
 801        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 802
 803        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 804                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 805                        tcp_time_stamp + tcptw->tw_ts_offset,
 806                        tcptw->tw_ts_recent,
 807                        tw->tw_bound_dev_if,
 808                        tcp_twsk_md5_key(tcptw),
 809                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 810                        tw->tw_tos
 811                        );
 812
 813        inet_twsk_put(tw);
 814}
 815
 816static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 817                                  struct request_sock *req)
 818{
 819        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 820         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 821         */
 822        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 823                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 824                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 825                        tcp_time_stamp,
 826                        req->ts_recent,
 827                        0,
 828                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 829                                          AF_INET),
 830                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 831                        ip_hdr(skb)->tos);
 832}
 833
 834/*
 835 *      Send a SYN-ACK after having received a SYN.
 836 *      This still operates on a request_sock only, not on a big
 837 *      socket.
 838 */
 839static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 840                              struct request_sock *req,
 841                              u16 queue_mapping,
 842                              bool nocache)
 843{
 844        const struct inet_request_sock *ireq = inet_rsk(req);
 845        struct flowi4 fl4;
 846        int err = -1;
 847        struct sk_buff * skb;
 848
 849        /* First, grab a route. */
 850        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 851                return -1;
 852
 853        skb = tcp_make_synack(sk, dst, req, NULL);
 854
 855        if (skb) {
 856                __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 857
 858                skb_set_queue_mapping(skb, queue_mapping);
 859                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 860                                            ireq->rmt_addr,
 861                                            ireq->opt);
 862                err = net_xmit_eval(err);
 863                if (!tcp_rsk(req)->snt_synack && !err)
 864                        tcp_rsk(req)->snt_synack = tcp_time_stamp;
 865        }
 866
 867        return err;
 868}
 869
 870static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
 871{
 872        int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
 873
 874        if (!res)
 875                TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 876        return res;
 877}
 878
 879/*
 880 *      IPv4 request_sock destructor.
 881 */
 882static void tcp_v4_reqsk_destructor(struct request_sock *req)
 883{
 884        kfree(inet_rsk(req)->opt);
 885}
 886
 887/*
 888 * Return true if a syncookie should be sent
 889 */
 890bool tcp_syn_flood_action(struct sock *sk,
 891                         const struct sk_buff *skb,
 892                         const char *proto)
 893{
 894        const char *msg = "Dropping request";
 895        bool want_cookie = false;
 896        struct listen_sock *lopt;
 897
 898
 899
 900#ifdef CONFIG_SYN_COOKIES
 901        if (sysctl_tcp_syncookies) {
 902                msg = "Sending cookies";
 903                want_cookie = true;
 904                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 905        } else
 906#endif
 907                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 908
 909        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 910        if (!lopt->synflood_warned) {
 911                lopt->synflood_warned = 1;
 912                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 913                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 914        }
 915        return want_cookie;
 916}
 917EXPORT_SYMBOL(tcp_syn_flood_action);
 918
 919/*
 920 * Save and compile IPv4 options into the request_sock if needed.
 921 */
 922static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 923{
 924        const struct ip_options *opt = &(IPCB(skb)->opt);
 925        struct ip_options_rcu *dopt = NULL;
 926
 927        if (opt && opt->optlen) {
 928                int opt_size = sizeof(*dopt) + opt->optlen;
 929
 930                dopt = kmalloc(opt_size, GFP_ATOMIC);
 931                if (dopt) {
 932                        if (ip_options_echo(&dopt->opt, skb)) {
 933                                kfree(dopt);
 934                                dopt = NULL;
 935                        }
 936                }
 937        }
 938        return dopt;
 939}
 940
 941#ifdef CONFIG_TCP_MD5SIG
 942/*
 943 * RFC2385 MD5 checksumming requires a mapping of
 944 * IP address->MD5 Key.
 945 * We need to maintain these in the sk structure.
 946 */
 947
 948/* Find the Key structure for an address.  */
 949struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 950                                         const union tcp_md5_addr *addr,
 951                                         int family)
 952{
 953        struct tcp_sock *tp = tcp_sk(sk);
 954        struct tcp_md5sig_key *key;
 955        unsigned int size = sizeof(struct in_addr);
 956        struct tcp_md5sig_info *md5sig;
 957
 958        /* caller either holds rcu_read_lock() or socket lock */
 959        md5sig = rcu_dereference_check(tp->md5sig_info,
 960                                       sock_owned_by_user(sk) ||
 961                                       lockdep_is_held(&sk->sk_lock.slock));
 962        if (!md5sig)
 963                return NULL;
 964#if IS_ENABLED(CONFIG_IPV6)
 965        if (family == AF_INET6)
 966                size = sizeof(struct in6_addr);
 967#endif
 968        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 969                if (key->family != family)
 970                        continue;
 971                if (!memcmp(&key->addr, addr, size))
 972                        return key;
 973        }
 974        return NULL;
 975}
 976EXPORT_SYMBOL(tcp_md5_do_lookup);
 977
 978struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 979                                         struct sock *addr_sk)
 980{
 981        union tcp_md5_addr *addr;
 982
 983        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 984        return tcp_md5_do_lookup(sk, addr, AF_INET);
 985}
 986EXPORT_SYMBOL(tcp_v4_md5_lookup);
 987
 988static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 989                                                      struct request_sock *req)
 990{
 991        union tcp_md5_addr *addr;
 992
 993        addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
 994        return tcp_md5_do_lookup(sk, addr, AF_INET);
 995}
 996
 997/* This can be called on a newly created socket, from other files */
 998int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 999                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1000{
1001        /* Add Key to the list */
1002        struct tcp_md5sig_key *key;
1003        struct tcp_sock *tp = tcp_sk(sk);
1004        struct tcp_md5sig_info *md5sig;
1005
1006        key = tcp_md5_do_lookup(sk, addr, family);
1007        if (key) {
1008                /* Pre-existing entry - just update that one. */
1009                memcpy(key->key, newkey, newkeylen);
1010                key->keylen = newkeylen;
1011                return 0;
1012        }
1013
1014        md5sig = rcu_dereference_protected(tp->md5sig_info,
1015                                           sock_owned_by_user(sk));
1016        if (!md5sig) {
1017                md5sig = kmalloc(sizeof(*md5sig), gfp);
1018                if (!md5sig)
1019                        return -ENOMEM;
1020
1021                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1022                INIT_HLIST_HEAD(&md5sig->head);
1023                rcu_assign_pointer(tp->md5sig_info, md5sig);
1024        }
1025
1026        key = sock_kmalloc(sk, sizeof(*key), gfp);
1027        if (!key)
1028                return -ENOMEM;
1029        if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1030                sock_kfree_s(sk, key, sizeof(*key));
1031                return -ENOMEM;
1032        }
1033
1034        memcpy(key->key, newkey, newkeylen);
1035        key->keylen = newkeylen;
1036        key->family = family;
1037        memcpy(&key->addr, addr,
1038               (family == AF_INET6) ? sizeof(struct in6_addr) :
1039                                      sizeof(struct in_addr));
1040        hlist_add_head_rcu(&key->node, &md5sig->head);
1041        return 0;
1042}
1043EXPORT_SYMBOL(tcp_md5_do_add);
1044
1045int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1046{
1047        struct tcp_sock *tp = tcp_sk(sk);
1048        struct tcp_md5sig_key *key;
1049        struct tcp_md5sig_info *md5sig;
1050
1051        key = tcp_md5_do_lookup(sk, addr, family);
1052        if (!key)
1053                return -ENOENT;
1054        hlist_del_rcu(&key->node);
1055        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056        kfree_rcu(key, rcu);
1057        md5sig = rcu_dereference_protected(tp->md5sig_info,
1058                                           sock_owned_by_user(sk));
1059        if (hlist_empty(&md5sig->head))
1060                tcp_free_md5sig_pool();
1061        return 0;
1062}
1063EXPORT_SYMBOL(tcp_md5_do_del);
1064
1065static void tcp_clear_md5_list(struct sock *sk)
1066{
1067        struct tcp_sock *tp = tcp_sk(sk);
1068        struct tcp_md5sig_key *key;
1069        struct hlist_node *n;
1070        struct tcp_md5sig_info *md5sig;
1071
1072        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1073
1074        if (!hlist_empty(&md5sig->head))
1075                tcp_free_md5sig_pool();
1076        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1077                hlist_del_rcu(&key->node);
1078                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1079                kfree_rcu(key, rcu);
1080        }
1081}
1082
1083static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1084                                 int optlen)
1085{
1086        struct tcp_md5sig cmd;
1087        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1088
1089        if (optlen < sizeof(cmd))
1090                return -EINVAL;
1091
1092        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1093                return -EFAULT;
1094
1095        if (sin->sin_family != AF_INET)
1096                return -EINVAL;
1097
1098        if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1099                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1100                                      AF_INET);
1101
1102        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1103                return -EINVAL;
1104
1105        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1106                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1107                              GFP_KERNEL);
1108}
1109
1110static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1111                                        __be32 daddr, __be32 saddr, int nbytes)
1112{
1113        struct tcp4_pseudohdr *bp;
1114        struct scatterlist sg;
1115
1116        bp = &hp->md5_blk.ip4;
1117
1118        /*
1119         * 1. the TCP pseudo-header (in the order: source IP address,
1120         * destination IP address, zero-padded protocol number, and
1121         * segment length)
1122         */
1123        bp->saddr = saddr;
1124        bp->daddr = daddr;
1125        bp->pad = 0;
1126        bp->protocol = IPPROTO_TCP;
1127        bp->len = cpu_to_be16(nbytes);
1128
1129        sg_init_one(&sg, bp, sizeof(*bp));
1130        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1131}
1132
1133static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1134                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1135{
1136        struct tcp_md5sig_pool *hp;
1137        struct hash_desc *desc;
1138
1139        hp = tcp_get_md5sig_pool();
1140        if (!hp)
1141                goto clear_hash_noput;
1142        desc = &hp->md5_desc;
1143
1144        if (crypto_hash_init(desc))
1145                goto clear_hash;
1146        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1147                goto clear_hash;
1148        if (tcp_md5_hash_header(hp, th))
1149                goto clear_hash;
1150        if (tcp_md5_hash_key(hp, key))
1151                goto clear_hash;
1152        if (crypto_hash_final(desc, md5_hash))
1153                goto clear_hash;
1154
1155        tcp_put_md5sig_pool();
1156        return 0;
1157
1158clear_hash:
1159        tcp_put_md5sig_pool();
1160clear_hash_noput:
1161        memset(md5_hash, 0, 16);
1162        return 1;
1163}
1164
1165int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1166                        const struct sock *sk, const struct request_sock *req,
1167                        const struct sk_buff *skb)
1168{
1169        struct tcp_md5sig_pool *hp;
1170        struct hash_desc *desc;
1171        const struct tcphdr *th = tcp_hdr(skb);
1172        __be32 saddr, daddr;
1173
1174        if (sk) {
1175                saddr = inet_sk(sk)->inet_saddr;
1176                daddr = inet_sk(sk)->inet_daddr;
1177        } else if (req) {
1178                saddr = inet_rsk(req)->loc_addr;
1179                daddr = inet_rsk(req)->rmt_addr;
1180        } else {
1181                const struct iphdr *iph = ip_hdr(skb);
1182                saddr = iph->saddr;
1183                daddr = iph->daddr;
1184        }
1185
1186        hp = tcp_get_md5sig_pool();
1187        if (!hp)
1188                goto clear_hash_noput;
1189        desc = &hp->md5_desc;
1190
1191        if (crypto_hash_init(desc))
1192                goto clear_hash;
1193
1194        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1195                goto clear_hash;
1196        if (tcp_md5_hash_header(hp, th))
1197                goto clear_hash;
1198        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1199                goto clear_hash;
1200        if (tcp_md5_hash_key(hp, key))
1201                goto clear_hash;
1202        if (crypto_hash_final(desc, md5_hash))
1203                goto clear_hash;
1204
1205        tcp_put_md5sig_pool();
1206        return 0;
1207
1208clear_hash:
1209        tcp_put_md5sig_pool();
1210clear_hash_noput:
1211        memset(md5_hash, 0, 16);
1212        return 1;
1213}
1214EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1215
1216static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1217{
1218        /*
1219         * This gets called for each TCP segment that arrives
1220         * so we want to be efficient.
1221         * We have 3 drop cases:
1222         * o No MD5 hash and one expected.
1223         * o MD5 hash and we're not expecting one.
1224         * o MD5 hash and its wrong.
1225         */
1226        const __u8 *hash_location = NULL;
1227        struct tcp_md5sig_key *hash_expected;
1228        const struct iphdr *iph = ip_hdr(skb);
1229        const struct tcphdr *th = tcp_hdr(skb);
1230        int genhash;
1231        unsigned char newhash[16];
1232
1233        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1234                                          AF_INET);
1235        hash_location = tcp_parse_md5sig_option(th);
1236
1237        /* We've parsed the options - do we have a hash? */
1238        if (!hash_expected && !hash_location)
1239                return false;
1240
1241        if (hash_expected && !hash_location) {
1242                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1243                return true;
1244        }
1245
1246        if (!hash_expected && hash_location) {
1247                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1248                return true;
1249        }
1250
1251        /* Okay, so this is hash_expected and hash_location -
1252         * so we need to calculate the checksum.
1253         */
1254        genhash = tcp_v4_md5_hash_skb(newhash,
1255                                      hash_expected,
1256                                      NULL, NULL, skb);
1257
1258        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1259                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1260                                     &iph->saddr, ntohs(th->source),
1261                                     &iph->daddr, ntohs(th->dest),
1262                                     genhash ? " tcp_v4_calc_md5_hash failed"
1263                                     : "");
1264                return true;
1265        }
1266        return false;
1267}
1268
1269#endif
1270
1271struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1272        .family         =       PF_INET,
1273        .obj_size       =       sizeof(struct tcp_request_sock),
1274        .rtx_syn_ack    =       tcp_v4_rtx_synack,
1275        .send_ack       =       tcp_v4_reqsk_send_ack,
1276        .destructor     =       tcp_v4_reqsk_destructor,
1277        .send_reset     =       tcp_v4_send_reset,
1278        .syn_ack_timeout =      tcp_syn_ack_timeout,
1279};
1280
1281#ifdef CONFIG_TCP_MD5SIG
1282static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1283        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1284        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1285};
1286#endif
1287
1288static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1289                               struct request_sock *req,
1290                               struct tcp_fastopen_cookie *foc,
1291                               struct tcp_fastopen_cookie *valid_foc)
1292{
1293        bool skip_cookie = false;
1294        struct fastopen_queue *fastopenq;
1295
1296        if (likely(!fastopen_cookie_present(foc))) {
1297                /* See include/net/tcp.h for the meaning of these knobs */
1298                if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1299                    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1300                    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1301                        skip_cookie = true; /* no cookie to validate */
1302                else
1303                        return false;
1304        }
1305        fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1306        /* A FO option is present; bump the counter. */
1307        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1308
1309        /* Make sure the listener has enabled fastopen, and we don't
1310         * exceed the max # of pending TFO requests allowed before trying
1311         * to validating the cookie in order to avoid burning CPU cycles
1312         * unnecessarily.
1313         *
1314         * XXX (TFO) - The implication of checking the max_qlen before
1315         * processing a cookie request is that clients can't differentiate
1316         * between qlen overflow causing Fast Open to be disabled
1317         * temporarily vs a server not supporting Fast Open at all.
1318         */
1319        if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1320            fastopenq == NULL || fastopenq->max_qlen == 0)
1321                return false;
1322
1323        if (fastopenq->qlen >= fastopenq->max_qlen) {
1324                struct request_sock *req1;
1325                spin_lock(&fastopenq->lock);
1326                req1 = fastopenq->rskq_rst_head;
1327                if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1328                        spin_unlock(&fastopenq->lock);
1329                        NET_INC_STATS_BH(sock_net(sk),
1330                            LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1331                        /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1332                        foc->len = -1;
1333                        return false;
1334                }
1335                fastopenq->rskq_rst_head = req1->dl_next;
1336                fastopenq->qlen--;
1337                spin_unlock(&fastopenq->lock);
1338                reqsk_free(req1);
1339        }
1340        if (skip_cookie) {
1341                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1342                return true;
1343        }
1344        if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1345                if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1346                        tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1347                        if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1348                            memcmp(&foc->val[0], &valid_foc->val[0],
1349                            TCP_FASTOPEN_COOKIE_SIZE) != 0)
1350                                return false;
1351                        valid_foc->len = -1;
1352                }
1353                /* Acknowledge the data received from the peer. */
1354                tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1355                return true;
1356        } else if (foc->len == 0) { /* Client requesting a cookie */
1357                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1358                NET_INC_STATS_BH(sock_net(sk),
1359                    LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1360        } else {
1361                /* Client sent a cookie with wrong size. Treat it
1362                 * the same as invalid and return a valid one.
1363                 */
1364                tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365        }
1366        return false;
1367}
1368
1369static int tcp_v4_conn_req_fastopen(struct sock *sk,
1370                                    struct sk_buff *skb,
1371                                    struct sk_buff *skb_synack,
1372                                    struct request_sock *req)
1373{
1374        struct tcp_sock *tp = tcp_sk(sk);
1375        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1376        const struct inet_request_sock *ireq = inet_rsk(req);
1377        struct sock *child;
1378        int err;
1379
1380        req->num_retrans = 0;
1381        req->num_timeout = 0;
1382        req->sk = NULL;
1383
1384        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1385        if (child == NULL) {
1386                NET_INC_STATS_BH(sock_net(sk),
1387                                 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1388                kfree_skb(skb_synack);
1389                return -1;
1390        }
1391        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1392                                    ireq->rmt_addr, ireq->opt);
1393        err = net_xmit_eval(err);
1394        if (!err)
1395                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1396        /* XXX (TFO) - is it ok to ignore error and continue? */
1397
1398        spin_lock(&queue->fastopenq->lock);
1399        queue->fastopenq->qlen++;
1400        spin_unlock(&queue->fastopenq->lock);
1401
1402        /* Initialize the child socket. Have to fix some values to take
1403         * into account the child is a Fast Open socket and is created
1404         * only out of the bits carried in the SYN packet.
1405         */
1406        tp = tcp_sk(child);
1407
1408        tp->fastopen_rsk = req;
1409        /* Do a hold on the listner sk so that if the listener is being
1410         * closed, the child that has been accepted can live on and still
1411         * access listen_lock.
1412         */
1413        sock_hold(sk);
1414        tcp_rsk(req)->listener = sk;
1415
1416        /* RFC1323: The window in SYN & SYN/ACK segments is never
1417         * scaled. So correct it appropriately.
1418         */
1419        tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1420
1421        /* Activate the retrans timer so that SYNACK can be retransmitted.
1422         * The request socket is not added to the SYN table of the parent
1423         * because it's been added to the accept queue directly.
1424         */
1425        inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1426            TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1427
1428        /* Add the child socket directly into the accept queue */
1429        inet_csk_reqsk_queue_add(sk, req, child);
1430
1431        /* Now finish processing the fastopen child socket. */
1432        inet_csk(child)->icsk_af_ops->rebuild_header(child);
1433        tcp_init_congestion_control(child);
1434        tcp_mtup_init(child);
1435        tcp_init_buffer_space(child);
1436        tcp_init_metrics(child);
1437
1438        /* Queue the data carried in the SYN packet. We need to first
1439         * bump skb's refcnt because the caller will attempt to free it.
1440         *
1441         * XXX (TFO) - we honor a zero-payload TFO request for now.
1442         * (Any reason not to?)
1443         */
1444        if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1445                /* Don't queue the skb if there is no payload in SYN.
1446                 * XXX (TFO) - How about SYN+FIN?
1447                 */
1448                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1449        } else {
1450                skb = skb_get(skb);
1451                skb_dst_drop(skb);
1452                __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1453                skb_set_owner_r(skb, child);
1454                __skb_queue_tail(&child->sk_receive_queue, skb);
1455                tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456                tp->syn_data_acked = 1;
1457        }
1458        sk->sk_data_ready(sk, 0);
1459        bh_unlock_sock(child);
1460        sock_put(child);
1461        WARN_ON(req->sk == NULL);
1462        return 0;
1463}
1464
1465int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1466{
1467        struct tcp_options_received tmp_opt;
1468        struct request_sock *req;
1469        struct inet_request_sock *ireq;
1470        struct tcp_sock *tp = tcp_sk(sk);
1471        struct dst_entry *dst = NULL;
1472        __be32 saddr = ip_hdr(skb)->saddr;
1473        __be32 daddr = ip_hdr(skb)->daddr;
1474        __u32 isn = TCP_SKB_CB(skb)->when;
1475        bool want_cookie = false;
1476        struct flowi4 fl4;
1477        struct tcp_fastopen_cookie foc = { .len = -1 };
1478        struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1479        struct sk_buff *skb_synack;
1480        int do_fastopen;
1481
1482        /* Never answer to SYNs send to broadcast or multicast */
1483        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1484                goto drop;
1485
1486        /* TW buckets are converted to open requests without
1487         * limitations, they conserve resources and peer is
1488         * evidently real one.
1489         */
1490        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1491                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1492                if (!want_cookie)
1493                        goto drop;
1494        }
1495
1496        /* Accept backlog is full. If we have already queued enough
1497         * of warm entries in syn queue, drop request. It is better than
1498         * clogging syn queue with openreqs with exponentially increasing
1499         * timeout.
1500         */
1501        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1502                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1503                goto drop;
1504        }
1505
1506        req = inet_reqsk_alloc(&tcp_request_sock_ops);
1507        if (!req)
1508                goto drop;
1509
1510#ifdef CONFIG_TCP_MD5SIG
1511        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1512#endif
1513
1514        tcp_clear_options(&tmp_opt);
1515        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1516        tmp_opt.user_mss  = tp->rx_opt.user_mss;
1517        tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1518
1519        if (want_cookie && !tmp_opt.saw_tstamp)
1520                tcp_clear_options(&tmp_opt);
1521
1522        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1523        tcp_openreq_init(req, &tmp_opt, skb);
1524
1525        ireq = inet_rsk(req);
1526        ireq->loc_addr = daddr;
1527        ireq->rmt_addr = saddr;
1528        ireq->no_srccheck = inet_sk(sk)->transparent;
1529        ireq->opt = tcp_v4_save_options(skb);
1530
1531        if (security_inet_conn_request(sk, skb, req))
1532                goto drop_and_free;
1533
1534        if (!want_cookie || tmp_opt.tstamp_ok)
1535                TCP_ECN_create_request(req, skb, sock_net(sk));
1536
1537        if (want_cookie) {
1538                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1539                req->cookie_ts = tmp_opt.tstamp_ok;
1540        } else if (!isn) {
1541                /* VJ's idea. We save last timestamp seen
1542                 * from the destination in peer table, when entering
1543                 * state TIME-WAIT, and check against it before
1544                 * accepting new connection request.
1545                 *
1546                 * If "isn" is not zero, this request hit alive
1547                 * timewait bucket, so that all the necessary checks
1548                 * are made in the function processing timewait state.
1549                 */
1550                if (tmp_opt.saw_tstamp &&
1551                    tcp_death_row.sysctl_tw_recycle &&
1552                    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1553                    fl4.daddr == saddr) {
1554                        if (!tcp_peer_is_proven(req, dst, true)) {
1555                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1556                                goto drop_and_release;
1557                        }
1558                }
1559                /* Kill the following clause, if you dislike this way. */
1560                else if (!sysctl_tcp_syncookies &&
1561                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1562                          (sysctl_max_syn_backlog >> 2)) &&
1563                         !tcp_peer_is_proven(req, dst, false)) {
1564                        /* Without syncookies last quarter of
1565                         * backlog is filled with destinations,
1566                         * proven to be alive.
1567                         * It means that we continue to communicate
1568                         * to destinations, already remembered
1569                         * to the moment of synflood.
1570                         */
1571                        LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1572                                       &saddr, ntohs(tcp_hdr(skb)->source));
1573                        goto drop_and_release;
1574                }
1575
1576                isn = tcp_v4_init_sequence(skb);
1577        }
1578        tcp_rsk(req)->snt_isn = isn;
1579
1580        if (dst == NULL) {
1581                dst = inet_csk_route_req(sk, &fl4, req);
1582                if (dst == NULL)
1583                        goto drop_and_free;
1584        }
1585        do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1586
1587        /* We don't call tcp_v4_send_synack() directly because we need
1588         * to make sure a child socket can be created successfully before
1589         * sending back synack!
1590         *
1591         * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1592         * (or better yet, call tcp_send_synack() in the child context
1593         * directly, but will have to fix bunch of other code first)
1594         * after syn_recv_sock() except one will need to first fix the
1595         * latter to remove its dependency on the current implementation
1596         * of tcp_v4_send_synack()->tcp_select_initial_window().
1597         */
1598        skb_synack = tcp_make_synack(sk, dst, req,
1599            fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1600
1601        if (skb_synack) {
1602                __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1603                skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1604        } else
1605                goto drop_and_free;
1606
1607        if (likely(!do_fastopen)) {
1608                int err;
1609                err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1610                     ireq->rmt_addr, ireq->opt);
1611                err = net_xmit_eval(err);
1612                if (err || want_cookie)
1613                        goto drop_and_free;
1614
1615                tcp_rsk(req)->snt_synack = tcp_time_stamp;
1616                tcp_rsk(req)->listener = NULL;
1617                /* Add the request_sock to the SYN table */
1618                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1619                if (fastopen_cookie_present(&foc) && foc.len != 0)
1620                        NET_INC_STATS_BH(sock_net(sk),
1621                            LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1622        } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1623                goto drop_and_free;
1624
1625        return 0;
1626
1627drop_and_release:
1628        dst_release(dst);
1629drop_and_free:
1630        reqsk_free(req);
1631drop:
1632        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1633        return 0;
1634}
1635EXPORT_SYMBOL(tcp_v4_conn_request);
1636
1637
1638/*
1639 * The three way handshake has completed - we got a valid synack -
1640 * now create the new socket.
1641 */
1642struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1643                                  struct request_sock *req,
1644                                  struct dst_entry *dst)
1645{
1646        struct inet_request_sock *ireq;
1647        struct inet_sock *newinet;
1648        struct tcp_sock *newtp;
1649        struct sock *newsk;
1650#ifdef CONFIG_TCP_MD5SIG
1651        struct tcp_md5sig_key *key;
1652#endif
1653        struct ip_options_rcu *inet_opt;
1654
1655        if (sk_acceptq_is_full(sk))
1656                goto exit_overflow;
1657
1658        newsk = tcp_create_openreq_child(sk, req, skb);
1659        if (!newsk)
1660                goto exit_nonewsk;
1661
1662        newsk->sk_gso_type = SKB_GSO_TCPV4;
1663        inet_sk_rx_dst_set(newsk, skb);
1664
1665        newtp                 = tcp_sk(newsk);
1666        newinet               = inet_sk(newsk);
1667        ireq                  = inet_rsk(req);
1668        newinet->inet_daddr   = ireq->rmt_addr;
1669        newinet->inet_rcv_saddr = ireq->loc_addr;
1670        newinet->inet_saddr           = ireq->loc_addr;
1671        inet_opt              = ireq->opt;
1672        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1673        ireq->opt             = NULL;
1674        newinet->mc_index     = inet_iif(skb);
1675        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1676        newinet->rcv_tos      = ip_hdr(skb)->tos;
1677        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1678        if (inet_opt)
1679                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1680        newinet->inet_id = newtp->write_seq ^ jiffies;
1681
1682        if (!dst) {
1683                dst = inet_csk_route_child_sock(sk, newsk, req);
1684                if (!dst)
1685                        goto put_and_exit;
1686        } else {
1687                /* syncookie case : see end of cookie_v4_check() */
1688        }
1689        sk_setup_caps(newsk, dst);
1690
1691        tcp_mtup_init(newsk);
1692        tcp_sync_mss(newsk, dst_mtu(dst));
1693        newtp->advmss = dst_metric_advmss(dst);
1694        if (tcp_sk(sk)->rx_opt.user_mss &&
1695            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1696                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1697
1698        tcp_initialize_rcv_mss(newsk);
1699        tcp_synack_rtt_meas(newsk, req);
1700        newtp->total_retrans = req->num_retrans;
1701
1702#ifdef CONFIG_TCP_MD5SIG
1703        /* Copy over the MD5 key from the original socket */
1704        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1705                                AF_INET);
1706        if (key != NULL) {
1707                /*
1708                 * We're using one, so create a matching key
1709                 * on the newsk structure. If we fail to get
1710                 * memory, then we end up not copying the key
1711                 * across. Shucks.
1712                 */
1713                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1714                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1715                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1716        }
1717#endif
1718
1719        if (__inet_inherit_port(sk, newsk) < 0)
1720                goto put_and_exit;
1721        __inet_hash_nolisten(newsk, NULL);
1722
1723        return newsk;
1724
1725exit_overflow:
1726        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1727exit_nonewsk:
1728        dst_release(dst);
1729exit:
1730        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1731        return NULL;
1732put_and_exit:
1733        inet_csk_prepare_forced_close(newsk);
1734        tcp_done(newsk);
1735        goto exit;
1736}
1737EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1738
1739static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1740{
1741        struct tcphdr *th = tcp_hdr(skb);
1742        const struct iphdr *iph = ip_hdr(skb);
1743        struct sock *nsk;
1744        struct request_sock **prev;
1745        /* Find possible connection requests. */
1746        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1747                                                       iph->saddr, iph->daddr);
1748        if (req)
1749                return tcp_check_req(sk, skb, req, prev, false);
1750
1751        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1752                        th->source, iph->daddr, th->dest, inet_iif(skb));
1753
1754        if (nsk) {
1755                if (nsk->sk_state != TCP_TIME_WAIT) {
1756                        bh_lock_sock(nsk);
1757                        return nsk;
1758                }
1759                inet_twsk_put(inet_twsk(nsk));
1760                return NULL;
1761        }
1762
1763#ifdef CONFIG_SYN_COOKIES
1764        if (!th->syn)
1765                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1766#endif
1767        return sk;
1768}
1769
1770static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1771{
1772        const struct iphdr *iph = ip_hdr(skb);
1773
1774        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1775                if (!tcp_v4_check(skb->len, iph->saddr,
1776                                  iph->daddr, skb->csum)) {
1777                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1778                        return 0;
1779                }
1780        }
1781
1782        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1783                                       skb->len, IPPROTO_TCP, 0);
1784
1785        if (skb->len <= 76) {
1786                return __skb_checksum_complete(skb);
1787        }
1788        return 0;
1789}
1790
1791
1792/* The socket must have it's spinlock held when we get
1793 * here.
1794 *
1795 * We have a potential double-lock case here, so even when
1796 * doing backlog processing we use the BH locking scheme.
1797 * This is because we cannot sleep with the original spinlock
1798 * held.
1799 */
1800int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1801{
1802        struct sock *rsk;
1803#ifdef CONFIG_TCP_MD5SIG
1804        /*
1805         * We really want to reject the packet as early as possible
1806         * if:
1807         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1808         *  o There is an MD5 option and we're not expecting one
1809         */
1810        if (tcp_v4_inbound_md5_hash(sk, skb))
1811                goto discard;
1812#endif
1813
1814        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1815                struct dst_entry *dst = sk->sk_rx_dst;
1816
1817                sock_rps_save_rxhash(sk, skb);
1818                if (dst) {
1819                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1820                            dst->ops->check(dst, 0) == NULL) {
1821                                dst_release(dst);
1822                                sk->sk_rx_dst = NULL;
1823                        }
1824                }
1825                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1826                        rsk = sk;
1827                        goto reset;
1828                }
1829                return 0;
1830        }
1831
1832        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1833                goto csum_err;
1834
1835        if (sk->sk_state == TCP_LISTEN) {
1836                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1837                if (!nsk)
1838                        goto discard;
1839
1840                if (nsk != sk) {
1841                        sock_rps_save_rxhash(nsk, skb);
1842                        if (tcp_child_process(sk, nsk, skb)) {
1843                                rsk = nsk;
1844                                goto reset;
1845                        }
1846                        return 0;
1847                }
1848        } else
1849                sock_rps_save_rxhash(sk, skb);
1850
1851        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1852                rsk = sk;
1853                goto reset;
1854        }
1855        return 0;
1856
1857reset:
1858        tcp_v4_send_reset(rsk, skb);
1859discard:
1860        kfree_skb(skb);
1861        /* Be careful here. If this function gets more complicated and
1862         * gcc suffers from register pressure on the x86, sk (in %ebx)
1863         * might be destroyed here. This current version compiles correctly,
1864         * but you have been warned.
1865         */
1866        return 0;
1867
1868csum_err:
1869        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1870        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1871        goto discard;
1872}
1873EXPORT_SYMBOL(tcp_v4_do_rcv);
1874
1875void tcp_v4_early_demux(struct sk_buff *skb)
1876{
1877        const struct iphdr *iph;
1878        const struct tcphdr *th;
1879        struct sock *sk;
1880
1881        if (skb->pkt_type != PACKET_HOST)
1882                return;
1883
1884        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1885                return;
1886
1887        iph = ip_hdr(skb);
1888        th = tcp_hdr(skb);
1889
1890        if (th->doff < sizeof(struct tcphdr) / 4)
1891                return;
1892
1893        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1894                                       iph->saddr, th->source,
1895                                       iph->daddr, ntohs(th->dest),
1896                                       skb->skb_iif);
1897        if (sk) {
1898                skb->sk = sk;
1899                skb->destructor = sock_edemux;
1900                if (sk->sk_state != TCP_TIME_WAIT) {
1901                        struct dst_entry *dst = sk->sk_rx_dst;
1902
1903                        if (dst)
1904                                dst = dst_check(dst, 0);
1905                        if (dst &&
1906                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1907                                skb_dst_set_noref(skb, dst);
1908                }
1909        }
1910}
1911
1912/* Packet is added to VJ-style prequeue for processing in process
1913 * context, if a reader task is waiting. Apparently, this exciting
1914 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1915 * failed somewhere. Latency? Burstiness? Well, at least now we will
1916 * see, why it failed. 8)8)                               --ANK
1917 *
1918 */
1919bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920{
1921        struct tcp_sock *tp = tcp_sk(sk);
1922
1923        if (sysctl_tcp_low_latency || !tp->ucopy.task)
1924                return false;
1925
1926        if (skb->len <= tcp_hdrlen(skb) &&
1927            skb_queue_len(&tp->ucopy.prequeue) == 0)
1928                return false;
1929
1930        skb_dst_force(skb);
1931        __skb_queue_tail(&tp->ucopy.prequeue, skb);
1932        tp->ucopy.memory += skb->truesize;
1933        if (tp->ucopy.memory > sk->sk_rcvbuf) {
1934                struct sk_buff *skb1;
1935
1936                BUG_ON(sock_owned_by_user(sk));
1937
1938                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1939                        sk_backlog_rcv(sk, skb1);
1940                        NET_INC_STATS_BH(sock_net(sk),
1941                                         LINUX_MIB_TCPPREQUEUEDROPPED);
1942                }
1943
1944                tp->ucopy.memory = 0;
1945        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1946                wake_up_interruptible_sync_poll(sk_sleep(sk),
1947                                           POLLIN | POLLRDNORM | POLLRDBAND);
1948                if (!inet_csk_ack_scheduled(sk))
1949                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1950                                                  (3 * tcp_rto_min(sk)) / 4,
1951                                                  TCP_RTO_MAX);
1952        }
1953        return true;
1954}
1955EXPORT_SYMBOL(tcp_prequeue);
1956
1957/*
1958 *      From tcp_input.c
1959 */
1960
1961int tcp_v4_rcv(struct sk_buff *skb)
1962{
1963        const struct iphdr *iph;
1964        const struct tcphdr *th;
1965        struct sock *sk;
1966        int ret;
1967        struct net *net = dev_net(skb->dev);
1968
1969        if (skb->pkt_type != PACKET_HOST)
1970                goto discard_it;
1971
1972        /* Count it even if it's bad */
1973        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1974
1975        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1976                goto discard_it;
1977
1978        th = tcp_hdr(skb);
1979
1980        if (th->doff < sizeof(struct tcphdr) / 4)
1981                goto bad_packet;
1982        if (!pskb_may_pull(skb, th->doff * 4))
1983                goto discard_it;
1984
1985        /* An explanation is required here, I think.
1986         * Packet length and doff are validated by header prediction,
1987         * provided case of th->doff==0 is eliminated.
1988         * So, we defer the checks. */
1989        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1990                goto csum_error;
1991
1992        th = tcp_hdr(skb);
1993        iph = ip_hdr(skb);
1994        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1995        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1996                                    skb->len - th->doff * 4);
1997        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1998        TCP_SKB_CB(skb)->when    = 0;
1999        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2000        TCP_SKB_CB(skb)->sacked  = 0;
2001
2002        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2003        if (!sk)
2004                goto no_tcp_socket;
2005
2006process:
2007        if (sk->sk_state == TCP_TIME_WAIT)
2008                goto do_time_wait;
2009
2010        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2011                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2012                goto discard_and_relse;
2013        }
2014
2015        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2016                goto discard_and_relse;
2017        nf_reset(skb);
2018
2019        if (sk_filter(sk, skb))
2020                goto discard_and_relse;
2021
2022        skb->dev = NULL;
2023
2024        bh_lock_sock_nested(sk);
2025        ret = 0;
2026        if (!sock_owned_by_user(sk)) {
2027#ifdef CONFIG_NET_DMA
2028                struct tcp_sock *tp = tcp_sk(sk);
2029                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2030                        tp->ucopy.dma_chan = net_dma_find_channel();
2031                if (tp->ucopy.dma_chan)
2032                        ret = tcp_v4_do_rcv(sk, skb);
2033                else
2034#endif
2035                {
2036                        if (!tcp_prequeue(sk, skb))
2037                                ret = tcp_v4_do_rcv(sk, skb);
2038                }
2039        } else if (unlikely(sk_add_backlog(sk, skb,
2040                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
2041                bh_unlock_sock(sk);
2042                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2043                goto discard_and_relse;
2044        }
2045        bh_unlock_sock(sk);
2046
2047        sock_put(sk);
2048
2049        return ret;
2050
2051no_tcp_socket:
2052        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2053                goto discard_it;
2054
2055        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2056csum_error:
2057                TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2058bad_packet:
2059                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2060        } else {
2061                tcp_v4_send_reset(NULL, skb);
2062        }
2063
2064discard_it:
2065        /* Discard frame. */
2066        kfree_skb(skb);
2067        return 0;
2068
2069discard_and_relse:
2070        sock_put(sk);
2071        goto discard_it;
2072
2073do_time_wait:
2074        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2075                inet_twsk_put(inet_twsk(sk));
2076                goto discard_it;
2077        }
2078
2079        if (skb->len < (th->doff << 2)) {
2080                inet_twsk_put(inet_twsk(sk));
2081                goto bad_packet;
2082        }
2083        if (tcp_checksum_complete(skb)) {
2084                inet_twsk_put(inet_twsk(sk));
2085                goto csum_error;
2086        }
2087        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2088        case TCP_TW_SYN: {
2089                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2090                                                        &tcp_hashinfo,
2091                                                        iph->saddr, th->source,
2092                                                        iph->daddr, th->dest,
2093                                                        inet_iif(skb));
2094                if (sk2) {
2095                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2096                        inet_twsk_put(inet_twsk(sk));
2097                        sk = sk2;
2098                        goto process;
2099                }
2100                /* Fall through to ACK */
2101        }
2102        case TCP_TW_ACK:
2103                tcp_v4_timewait_ack(sk, skb);
2104                break;
2105        case TCP_TW_RST:
2106                goto no_tcp_socket;
2107        case TCP_TW_SUCCESS:;
2108        }
2109        goto discard_it;
2110}
2111
2112static struct timewait_sock_ops tcp_timewait_sock_ops = {
2113        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2114        .twsk_unique    = tcp_twsk_unique,
2115        .twsk_destructor= tcp_twsk_destructor,
2116};
2117
2118void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2119{
2120        struct dst_entry *dst = skb_dst(skb);
2121
2122        dst_hold(dst);
2123        sk->sk_rx_dst = dst;
2124        inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2125}
2126EXPORT_SYMBOL(inet_sk_rx_dst_set);
2127
2128const struct inet_connection_sock_af_ops ipv4_specific = {
2129        .queue_xmit        = ip_queue_xmit,
2130        .send_check        = tcp_v4_send_check,
2131        .rebuild_header    = inet_sk_rebuild_header,
2132        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2133        .conn_request      = tcp_v4_conn_request,
2134        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2135        .net_header_len    = sizeof(struct iphdr),
2136        .setsockopt        = ip_setsockopt,
2137        .getsockopt        = ip_getsockopt,
2138        .addr2sockaddr     = inet_csk_addr2sockaddr,
2139        .sockaddr_len      = sizeof(struct sockaddr_in),
2140        .bind_conflict     = inet_csk_bind_conflict,
2141#ifdef CONFIG_COMPAT
2142        .compat_setsockopt = compat_ip_setsockopt,
2143        .compat_getsockopt = compat_ip_getsockopt,
2144#endif
2145};
2146EXPORT_SYMBOL(ipv4_specific);
2147
2148#ifdef CONFIG_TCP_MD5SIG
2149static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2150        .md5_lookup             = tcp_v4_md5_lookup,
2151        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2152        .md5_parse              = tcp_v4_parse_md5_keys,
2153};
2154#endif
2155
2156/* NOTE: A lot of things set to zero explicitly by call to
2157 *       sk_alloc() so need not be done here.
2158 */
2159static int tcp_v4_init_sock(struct sock *sk)
2160{
2161        struct inet_connection_sock *icsk = inet_csk(sk);
2162
2163        tcp_init_sock(sk);
2164
2165        icsk->icsk_af_ops = &ipv4_specific;
2166
2167#ifdef CONFIG_TCP_MD5SIG
2168        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2169#endif
2170
2171        return 0;
2172}
2173
2174void tcp_v4_destroy_sock(struct sock *sk)
2175{
2176        struct tcp_sock *tp = tcp_sk(sk);
2177
2178        tcp_clear_xmit_timers(sk);
2179
2180        tcp_cleanup_congestion_control(sk);
2181
2182        /* Cleanup up the write buffer. */
2183        tcp_write_queue_purge(sk);
2184
2185        /* Cleans up our, hopefully empty, out_of_order_queue. */
2186        __skb_queue_purge(&tp->out_of_order_queue);
2187
2188#ifdef CONFIG_TCP_MD5SIG
2189        /* Clean up the MD5 key list, if any */
2190        if (tp->md5sig_info) {
2191                tcp_clear_md5_list(sk);
2192                kfree_rcu(tp->md5sig_info, rcu);
2193                tp->md5sig_info = NULL;
2194        }
2195#endif
2196
2197#ifdef CONFIG_NET_DMA
2198        /* Cleans up our sk_async_wait_queue */
2199        __skb_queue_purge(&sk->sk_async_wait_queue);
2200#endif
2201
2202        /* Clean prequeue, it must be empty really */
2203        __skb_queue_purge(&tp->ucopy.prequeue);
2204
2205        /* Clean up a referenced TCP bind bucket. */
2206        if (inet_csk(sk)->icsk_bind_hash)
2207                inet_put_port(sk);
2208
2209        BUG_ON(tp->fastopen_rsk != NULL);
2210
2211        /* If socket is aborted during connect operation */
2212        tcp_free_fastopen_req(tp);
2213
2214        sk_sockets_allocated_dec(sk);
2215        sock_release_memcg(sk);
2216}
2217EXPORT_SYMBOL(tcp_v4_destroy_sock);
2218
2219#ifdef CONFIG_PROC_FS
2220/* Proc filesystem TCP sock list dumping. */
2221
2222static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2223{
2224        return hlist_nulls_empty(head) ? NULL :
2225                list_entry(head->first, struct inet_timewait_sock, tw_node);
2226}
2227
2228static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2229{
2230        return !is_a_nulls(tw->tw_node.next) ?
2231                hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2232}
2233
2234/*
2235 * Get next listener socket follow cur.  If cur is NULL, get first socket
2236 * starting from bucket given in st->bucket; when st->bucket is zero the
2237 * very first socket in the hash table is returned.
2238 */
2239static void *listening_get_next(struct seq_file *seq, void *cur)
2240{
2241        struct inet_connection_sock *icsk;
2242        struct hlist_nulls_node *node;
2243        struct sock *sk = cur;
2244        struct inet_listen_hashbucket *ilb;
2245        struct tcp_iter_state *st = seq->private;
2246        struct net *net = seq_file_net(seq);
2247
2248        if (!sk) {
2249                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2250                spin_lock_bh(&ilb->lock);
2251                sk = sk_nulls_head(&ilb->head);
2252                st->offset = 0;
2253                goto get_sk;
2254        }
2255        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2256        ++st->num;
2257        ++st->offset;
2258
2259        if (st->state == TCP_SEQ_STATE_OPENREQ) {
2260                struct request_sock *req = cur;
2261
2262                icsk = inet_csk(st->syn_wait_sk);
2263                req = req->dl_next;
2264                while (1) {
2265                        while (req) {
2266                                if (req->rsk_ops->family == st->family) {
2267                                        cur = req;
2268                                        goto out;
2269                                }
2270                                req = req->dl_next;
2271                        }
2272                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2273                                break;
2274get_req:
2275                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2276                }
2277                sk        = sk_nulls_next(st->syn_wait_sk);
2278                st->state = TCP_SEQ_STATE_LISTENING;
2279                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2280        } else {
2281                icsk = inet_csk(sk);
2282                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2283                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2284                        goto start_req;
2285                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2286                sk = sk_nulls_next(sk);
2287        }
2288get_sk:
2289        sk_nulls_for_each_from(sk, node) {
2290                if (!net_eq(sock_net(sk), net))
2291                        continue;
2292                if (sk->sk_family == st->family) {
2293                        cur = sk;
2294                        goto out;
2295                }
2296                icsk = inet_csk(sk);
2297                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2298                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2299start_req:
2300                        st->uid         = sock_i_uid(sk);
2301                        st->syn_wait_sk = sk;
2302                        st->state       = TCP_SEQ_STATE_OPENREQ;
2303                        st->sbucket     = 0;
2304                        goto get_req;
2305                }
2306                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2307        }
2308        spin_unlock_bh(&ilb->lock);
2309        st->offset = 0;
2310        if (++st->bucket < INET_LHTABLE_SIZE) {
2311                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2312                spin_lock_bh(&ilb->lock);
2313                sk = sk_nulls_head(&ilb->head);
2314                goto get_sk;
2315        }
2316        cur = NULL;
2317out:
2318        return cur;
2319}
2320
2321static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2322{
2323        struct tcp_iter_state *st = seq->private;
2324        void *rc;
2325
2326        st->bucket = 0;
2327        st->offset = 0;
2328        rc = listening_get_next(seq, NULL);
2329
2330        while (rc && *pos) {
2331                rc = listening_get_next(seq, rc);
2332                --*pos;
2333        }
2334        return rc;
2335}
2336
2337static inline bool empty_bucket(struct tcp_iter_state *st)
2338{
2339        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2340                hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2341}
2342
2343/*
2344 * Get first established socket starting from bucket given in st->bucket.
2345 * If st->bucket is zero, the very first socket in the hash is returned.
2346 */
2347static void *established_get_first(struct seq_file *seq)
2348{
2349        struct tcp_iter_state *st = seq->private;
2350        struct net *net = seq_file_net(seq);
2351        void *rc = NULL;
2352
2353        st->offset = 0;
2354        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2355                struct sock *sk;
2356                struct hlist_nulls_node *node;
2357                struct inet_timewait_sock *tw;
2358                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2359
2360                /* Lockless fast path for the common case of empty buckets */
2361                if (empty_bucket(st))
2362                        continue;
2363
2364                spin_lock_bh(lock);
2365                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2366                        if (sk->sk_family != st->family ||
2367                            !net_eq(sock_net(sk), net)) {
2368                                continue;
2369                        }
2370                        rc = sk;
2371                        goto out;
2372                }
2373                st->state = TCP_SEQ_STATE_TIME_WAIT;
2374                inet_twsk_for_each(tw, node,
2375                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2376                        if (tw->tw_family != st->family ||
2377                            !net_eq(twsk_net(tw), net)) {
2378                                continue;
2379                        }
2380                        rc = tw;
2381                        goto out;
2382                }
2383                spin_unlock_bh(lock);
2384                st->state = TCP_SEQ_STATE_ESTABLISHED;
2385        }
2386out:
2387        return rc;
2388}
2389
2390static void *established_get_next(struct seq_file *seq, void *cur)
2391{
2392        struct sock *sk = cur;
2393        struct inet_timewait_sock *tw;
2394        struct hlist_nulls_node *node;
2395        struct tcp_iter_state *st = seq->private;
2396        struct net *net = seq_file_net(seq);
2397
2398        ++st->num;
2399        ++st->offset;
2400
2401        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2402                tw = cur;
2403                tw = tw_next(tw);
2404get_tw:
2405                while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2406                        tw = tw_next(tw);
2407                }
2408                if (tw) {
2409                        cur = tw;
2410                        goto out;
2411                }
2412                spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2413                st->state = TCP_SEQ_STATE_ESTABLISHED;
2414
2415                /* Look for next non empty bucket */
2416                st->offset = 0;
2417                while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2418                                empty_bucket(st))
2419                        ;
2420                if (st->bucket > tcp_hashinfo.ehash_mask)
2421                        return NULL;
2422
2423                spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2424                sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2425        } else
2426                sk = sk_nulls_next(sk);
2427
2428        sk_nulls_for_each_from(sk, node) {
2429                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2430                        goto found;
2431        }
2432
2433        st->state = TCP_SEQ_STATE_TIME_WAIT;
2434        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2435        goto get_tw;
2436found:
2437        cur = sk;
2438out:
2439        return cur;
2440}
2441
2442static void *established_get_idx(struct seq_file *seq, loff_t pos)
2443{
2444        struct tcp_iter_state *st = seq->private;
2445        void *rc;
2446
2447        st->bucket = 0;
2448        rc = established_get_first(seq);
2449
2450        while (rc && pos) {
2451                rc = established_get_next(seq, rc);
2452                --pos;
2453        }
2454        return rc;
2455}
2456
2457static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2458{
2459        void *rc;
2460        struct tcp_iter_state *st = seq->private;
2461
2462        st->state = TCP_SEQ_STATE_LISTENING;
2463        rc        = listening_get_idx(seq, &pos);
2464
2465        if (!rc) {
2466                st->state = TCP_SEQ_STATE_ESTABLISHED;
2467                rc        = established_get_idx(seq, pos);
2468        }
2469
2470        return rc;
2471}
2472
2473static void *tcp_seek_last_pos(struct seq_file *seq)
2474{
2475        struct tcp_iter_state *st = seq->private;
2476        int offset = st->offset;
2477        int orig_num = st->num;
2478        void *rc = NULL;
2479
2480        switch (st->state) {
2481        case TCP_SEQ_STATE_OPENREQ:
2482        case TCP_SEQ_STATE_LISTENING:
2483                if (st->bucket >= INET_LHTABLE_SIZE)
2484                        break;
2485                st->state = TCP_SEQ_STATE_LISTENING;
2486                rc = listening_get_next(seq, NULL);
2487                while (offset-- && rc)
2488                        rc = listening_get_next(seq, rc);
2489                if (rc)
2490                        break;
2491                st->bucket = 0;
2492                /* Fallthrough */
2493        case TCP_SEQ_STATE_ESTABLISHED:
2494        case TCP_SEQ_STATE_TIME_WAIT:
2495                st->state = TCP_SEQ_STATE_ESTABLISHED;
2496                if (st->bucket > tcp_hashinfo.ehash_mask)
2497                        break;
2498                rc = established_get_first(seq);
2499                while (offset-- && rc)
2500                        rc = established_get_next(seq, rc);
2501        }
2502
2503        st->num = orig_num;
2504
2505        return rc;
2506}
2507
2508static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2509{
2510        struct tcp_iter_state *st = seq->private;
2511        void *rc;
2512
2513        if (*pos && *pos == st->last_pos) {
2514                rc = tcp_seek_last_pos(seq);
2515                if (rc)
2516                        goto out;
2517        }
2518
2519        st->state = TCP_SEQ_STATE_LISTENING;
2520        st->num = 0;
2521        st->bucket = 0;
2522        st->offset = 0;
2523        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2524
2525out:
2526        st->last_pos = *pos;
2527        return rc;
2528}
2529
2530static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2531{
2532        struct tcp_iter_state *st = seq->private;
2533        void *rc = NULL;
2534
2535        if (v == SEQ_START_TOKEN) {
2536                rc = tcp_get_idx(seq, 0);
2537                goto out;
2538        }
2539
2540        switch (st->state) {
2541        case TCP_SEQ_STATE_OPENREQ:
2542        case TCP_SEQ_STATE_LISTENING:
2543                rc = listening_get_next(seq, v);
2544                if (!rc) {
2545                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2546                        st->bucket = 0;
2547                        st->offset = 0;
2548                        rc        = established_get_first(seq);
2549                }
2550                break;
2551        case TCP_SEQ_STATE_ESTABLISHED:
2552        case TCP_SEQ_STATE_TIME_WAIT:
2553                rc = established_get_next(seq, v);
2554                break;
2555        }
2556out:
2557        ++*pos;
2558        st->last_pos = *pos;
2559        return rc;
2560}
2561
2562static void tcp_seq_stop(struct seq_file *seq, void *v)
2563{
2564        struct tcp_iter_state *st = seq->private;
2565
2566        switch (st->state) {
2567        case TCP_SEQ_STATE_OPENREQ:
2568                if (v) {
2569                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2570                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2571                }
2572        case TCP_SEQ_STATE_LISTENING:
2573                if (v != SEQ_START_TOKEN)
2574                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2575                break;
2576        case TCP_SEQ_STATE_TIME_WAIT:
2577        case TCP_SEQ_STATE_ESTABLISHED:
2578                if (v)
2579                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2580                break;
2581        }
2582}
2583
2584int tcp_seq_open(struct inode *inode, struct file *file)
2585{
2586        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2587        struct tcp_iter_state *s;
2588        int err;
2589
2590        err = seq_open_net(inode, file, &afinfo->seq_ops,
2591                          sizeof(struct tcp_iter_state));
2592        if (err < 0)
2593                return err;
2594
2595        s = ((struct seq_file *)file->private_data)->private;
2596        s->family               = afinfo->family;
2597        s->last_pos             = 0;
2598        return 0;
2599}
2600EXPORT_SYMBOL(tcp_seq_open);
2601
2602int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2603{
2604        int rc = 0;
2605        struct proc_dir_entry *p;
2606
2607        afinfo->seq_ops.start           = tcp_seq_start;
2608        afinfo->seq_ops.next            = tcp_seq_next;
2609        afinfo->seq_ops.stop            = tcp_seq_stop;
2610
2611        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2612                             afinfo->seq_fops, afinfo);
2613        if (!p)
2614                rc = -ENOMEM;
2615        return rc;
2616}
2617EXPORT_SYMBOL(tcp_proc_register);
2618
2619void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2620{
2621        remove_proc_entry(afinfo->name, net->proc_net);
2622}
2623EXPORT_SYMBOL(tcp_proc_unregister);
2624
2625static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2626                         struct seq_file *f, int i, kuid_t uid, int *len)
2627{
2628        const struct inet_request_sock *ireq = inet_rsk(req);
2629        long delta = req->expires - jiffies;
2630
2631        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2632                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2633                i,
2634                ireq->loc_addr,
2635                ntohs(inet_sk(sk)->inet_sport),
2636                ireq->rmt_addr,
2637                ntohs(ireq->rmt_port),
2638                TCP_SYN_RECV,
2639                0, 0, /* could print option size, but that is af dependent. */
2640                1,    /* timers active (only the expire timer) */
2641                jiffies_delta_to_clock_t(delta),
2642                req->num_timeout,
2643                from_kuid_munged(seq_user_ns(f), uid),
2644                0,  /* non standard timer */
2645                0, /* open_requests have no inode */
2646                atomic_read(&sk->sk_refcnt),
2647                req,
2648                len);
2649}
2650
2651static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2652{
2653        int timer_active;
2654        unsigned long timer_expires;
2655        const struct tcp_sock *tp = tcp_sk(sk);
2656        const struct inet_connection_sock *icsk = inet_csk(sk);
2657        const struct inet_sock *inet = inet_sk(sk);
2658        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2659        __be32 dest = inet->inet_daddr;
2660        __be32 src = inet->inet_rcv_saddr;
2661        __u16 destp = ntohs(inet->inet_dport);
2662        __u16 srcp = ntohs(inet->inet_sport);
2663        int rx_queue;
2664
2665        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2666            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2667            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2668                timer_active    = 1;
2669                timer_expires   = icsk->icsk_timeout;
2670        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2671                timer_active    = 4;
2672                timer_expires   = icsk->icsk_timeout;
2673        } else if (timer_pending(&sk->sk_timer)) {
2674                timer_active    = 2;
2675                timer_expires   = sk->sk_timer.expires;
2676        } else {
2677                timer_active    = 0;
2678                timer_expires = jiffies;
2679        }
2680
2681        if (sk->sk_state == TCP_LISTEN)
2682                rx_queue = sk->sk_ack_backlog;
2683        else
2684                /*
2685                 * because we dont lock socket, we might find a transient negative value
2686                 */
2687                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2688
2689        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2690                        "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2691                i, src, srcp, dest, destp, sk->sk_state,
2692                tp->write_seq - tp->snd_una,
2693                rx_queue,
2694                timer_active,
2695                jiffies_delta_to_clock_t(timer_expires - jiffies),
2696                icsk->icsk_retransmits,
2697                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2698                icsk->icsk_probes_out,
2699                sock_i_ino(sk),
2700                atomic_read(&sk->sk_refcnt), sk,
2701                jiffies_to_clock_t(icsk->icsk_rto),
2702                jiffies_to_clock_t(icsk->icsk_ack.ato),
2703                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2704                tp->snd_cwnd,
2705                sk->sk_state == TCP_LISTEN ?
2706                    (fastopenq ? fastopenq->max_qlen : 0) :
2707                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2708                len);
2709}
2710
2711static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2712                               struct seq_file *f, int i, int *len)
2713{
2714        __be32 dest, src;
2715        __u16 destp, srcp;
2716        long delta = tw->tw_ttd - jiffies;
2717
2718        dest  = tw->tw_daddr;
2719        src   = tw->tw_rcv_saddr;
2720        destp = ntohs(tw->tw_dport);
2721        srcp  = ntohs(tw->tw_sport);
2722
2723        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2724                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2725                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2726                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2727                atomic_read(&tw->tw_refcnt), tw, len);
2728}
2729
2730#define TMPSZ 150
2731
2732static int tcp4_seq_show(struct seq_file *seq, void *v)
2733{
2734        struct tcp_iter_state *st;
2735        int len;
2736
2737        if (v == SEQ_START_TOKEN) {
2738                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2739                           "  sl  local_address rem_address   st tx_queue "
2740                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2741                           "inode");
2742                goto out;
2743        }
2744        st = seq->private;
2745
2746        switch (st->state) {
2747        case TCP_SEQ_STATE_LISTENING:
2748        case TCP_SEQ_STATE_ESTABLISHED:
2749                get_tcp4_sock(v, seq, st->num, &len);
2750                break;
2751        case TCP_SEQ_STATE_OPENREQ:
2752                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2753                break;
2754        case TCP_SEQ_STATE_TIME_WAIT:
2755                get_timewait4_sock(v, seq, st->num, &len);
2756                break;
2757        }
2758        seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2759out:
2760        return 0;
2761}
2762
2763static const struct file_operations tcp_afinfo_seq_fops = {
2764        .owner   = THIS_MODULE,
2765        .open    = tcp_seq_open,
2766        .read    = seq_read,
2767        .llseek  = seq_lseek,
2768        .release = seq_release_net
2769};
2770
2771static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2772        .name           = "tcp",
2773        .family         = AF_INET,
2774        .seq_fops       = &tcp_afinfo_seq_fops,
2775        .seq_ops        = {
2776                .show           = tcp4_seq_show,
2777        },
2778};
2779
2780static int __net_init tcp4_proc_init_net(struct net *net)
2781{
2782        return tcp_proc_register(net, &tcp4_seq_afinfo);
2783}
2784
2785static void __net_exit tcp4_proc_exit_net(struct net *net)
2786{
2787        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2788}
2789
2790static struct pernet_operations tcp4_net_ops = {
2791        .init = tcp4_proc_init_net,
2792        .exit = tcp4_proc_exit_net,
2793};
2794
2795int __init tcp4_proc_init(void)
2796{
2797        return register_pernet_subsys(&tcp4_net_ops);
2798}
2799
2800void tcp4_proc_exit(void)
2801{
2802        unregister_pernet_subsys(&tcp4_net_ops);
2803}
2804#endif /* CONFIG_PROC_FS */
2805
2806struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2807{
2808        const struct iphdr *iph = skb_gro_network_header(skb);
2809        __wsum wsum;
2810        __sum16 sum;
2811
2812        switch (skb->ip_summed) {
2813        case CHECKSUM_COMPLETE:
2814                if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2815                                  skb->csum)) {
2816                        skb->ip_summed = CHECKSUM_UNNECESSARY;
2817                        break;
2818                }
2819flush:
2820                NAPI_GRO_CB(skb)->flush = 1;
2821                return NULL;
2822
2823        case CHECKSUM_NONE:
2824                wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2825                                          skb_gro_len(skb), IPPROTO_TCP, 0);
2826                sum = csum_fold(skb_checksum(skb,
2827                                             skb_gro_offset(skb),
2828                                             skb_gro_len(skb),
2829                                             wsum));
2830                if (sum)
2831                        goto flush;
2832
2833                skb->ip_summed = CHECKSUM_UNNECESSARY;
2834                break;
2835        }
2836
2837        return tcp_gro_receive(head, skb);
2838}
2839
2840int tcp4_gro_complete(struct sk_buff *skb)
2841{
2842        const struct iphdr *iph = ip_hdr(skb);
2843        struct tcphdr *th = tcp_hdr(skb);
2844
2845        th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2846                                  iph->saddr, iph->daddr, 0);
2847        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2848
2849        return tcp_gro_complete(skb);
2850}
2851
2852struct proto tcp_prot = {
2853        .name                   = "TCP",
2854        .owner                  = THIS_MODULE,
2855        .close                  = tcp_close,
2856        .connect                = tcp_v4_connect,
2857        .disconnect             = tcp_disconnect,
2858        .accept                 = inet_csk_accept,
2859        .ioctl                  = tcp_ioctl,
2860        .init                   = tcp_v4_init_sock,
2861        .destroy                = tcp_v4_destroy_sock,
2862        .shutdown               = tcp_shutdown,
2863        .setsockopt             = tcp_setsockopt,
2864        .getsockopt             = tcp_getsockopt,
2865        .recvmsg                = tcp_recvmsg,
2866        .sendmsg                = tcp_sendmsg,
2867        .sendpage               = tcp_sendpage,
2868        .backlog_rcv            = tcp_v4_do_rcv,
2869        .release_cb             = tcp_release_cb,
2870        .mtu_reduced            = tcp_v4_mtu_reduced,
2871        .hash                   = inet_hash,
2872        .unhash                 = inet_unhash,
2873        .get_port               = inet_csk_get_port,
2874        .enter_memory_pressure  = tcp_enter_memory_pressure,
2875        .sockets_allocated      = &tcp_sockets_allocated,
2876        .orphan_count           = &tcp_orphan_count,
2877        .memory_allocated       = &tcp_memory_allocated,
2878        .memory_pressure        = &tcp_memory_pressure,
2879        .sysctl_wmem            = sysctl_tcp_wmem,
2880        .sysctl_rmem            = sysctl_tcp_rmem,
2881        .max_header             = MAX_TCP_HEADER,
2882        .obj_size               = sizeof(struct tcp_sock),
2883        .slab_flags             = SLAB_DESTROY_BY_RCU,
2884        .twsk_prot              = &tcp_timewait_sock_ops,
2885        .rsk_prot               = &tcp_request_sock_ops,
2886        .h.hashinfo             = &tcp_hashinfo,
2887        .no_autobind            = true,
2888#ifdef CONFIG_COMPAT
2889        .compat_setsockopt      = compat_tcp_setsockopt,
2890        .compat_getsockopt      = compat_tcp_getsockopt,
2891#endif
2892#ifdef CONFIG_MEMCG_KMEM
2893        .init_cgroup            = tcp_init_cgroup,
2894        .destroy_cgroup         = tcp_destroy_cgroup,
2895        .proto_cgroup           = tcp_proto_cgroup,
2896#endif
2897};
2898EXPORT_SYMBOL(tcp_prot);
2899
2900static int __net_init tcp_sk_init(struct net *net)
2901{
2902        net->ipv4.sysctl_tcp_ecn = 2;
2903        return 0;
2904}
2905
2906static void __net_exit tcp_sk_exit(struct net *net)
2907{
2908}
2909
2910static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2911{
2912        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2913}
2914
2915static struct pernet_operations __net_initdata tcp_sk_ops = {
2916       .init       = tcp_sk_init,
2917       .exit       = tcp_sk_exit,
2918       .exit_batch = tcp_sk_exit_batch,
2919};
2920
2921void __init tcp_v4_init(void)
2922{
2923        inet_hashinfo_init(&tcp_hashinfo);
2924        if (register_pernet_subsys(&tcp_sk_ops))
2925                panic("Failed to create the TCP control socket.\n");
2926}
2927