LXR linux/net/ipv4/tcp

/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */
  23
/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/netdma.h>
  76#include <net/secure_seq.h>
  77#include <net/tcp_memcontrol.h>
  78#include <net/busy_poll.h>
  79
  80#include <linux/inet.h>
  81#include <linux/ipv6.h>
  82#include <linux/stddef.h>
  83#include <linux/proc_fs.h>
  84#include <linux/seq_file.h>
  85
  86#include <linux/crypto.h>
  87#include <linux/scatterlist.h>
  88
  89int sysctl_tcp_tw_reuse __read_mostly;
  90int sysctl_tcp_low_latency __read_mostly;
  91EXPORT_SYMBOL(sysctl_tcp_low_latency);
  92
  93
  94#ifdef CONFIG_TCP_MD5SIG
  95static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  96                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  97#endif
  98
  99struct inet_hashinfo tcp_hashinfo;
 100EXPORT_SYMBOL(tcp_hashinfo);
 101
 102static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 103{
 104        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 105                                          ip_hdr(skb)->saddr,
 106                                          tcp_hdr(skb)->dest,
 107                                          tcp_hdr(skb)->source);
 108}
 109
 110int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 111{
 112        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 113        struct tcp_sock *tp = tcp_sk(sk);
 114
 115        /* With PAWS, it is safe from the viewpoint
 116           of data integrity. Even without PAWS it is safe provided sequence
 117           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 118
 119           Actually, the idea is close to VJ's one, only timestamp cache is
 120           held not per host, but per port pair and TW bucket is used as state
 121           holder.
 122
 123           If TW bucket has been already destroyed we fall back to VJ's scheme
 124           and use initial timestamp retrieved from peer table.
 125         */
 126        if (tcptw->tw_ts_recent_stamp &&
 127            (twp == NULL || (sysctl_tcp_tw_reuse &&
 128                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 129                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 130                if (tp->write_seq == 0)
 131                        tp->write_seq = 1;
 132                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 133                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 134                sock_hold(sktw);
 135                return 1;
 136        }
 137
 138        return 0;
 139}
 140EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 141
 142/* This will initiate an outgoing connection. */
 143int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 144{
 145        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 146        struct inet_sock *inet = inet_sk(sk);
 147        struct tcp_sock *tp = tcp_sk(sk);
 148        __be16 orig_sport, orig_dport;
 149        __be32 daddr, nexthop;
 150        struct flowi4 *fl4;
 151        struct rtable *rt;
 152        int err;
 153        struct ip_options_rcu *inet_opt;
 154
 155        if (addr_len < sizeof(struct sockaddr_in))
 156                return -EINVAL;
 157
 158        if (usin->sin_family != AF_INET)
 159                return -EAFNOSUPPORT;
 160
 161        nexthop = daddr = usin->sin_addr.s_addr;
 162        inet_opt = rcu_dereference_protected(inet->inet_opt,
 163                                             sock_owned_by_user(sk));
 164        if (inet_opt && inet_opt->opt.srr) {
 165                if (!daddr)
 166                        return -EINVAL;
 167                nexthop = inet_opt->opt.faddr;
 168        }
 169
 170        orig_sport = inet->inet_sport;
 171        orig_dport = usin->sin_port;
 172        fl4 = &inet->cork.fl.u.ip4;
 173        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 174                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 175                              IPPROTO_TCP,
 176                              orig_sport, orig_dport, sk);
 177        if (IS_ERR(rt)) {
 178                err = PTR_ERR(rt);
 179                if (err == -ENETUNREACH)
 180                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 181                return err;
 182        }
 183
 184        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 185                ip_rt_put(rt);
 186                return -ENETUNREACH;
 187        }
 188
 189        if (!inet_opt || !inet_opt->opt.srr)
 190                daddr = fl4->daddr;
 191
 192        if (!inet->inet_saddr)
 193                inet->inet_saddr = fl4->saddr;
 194        inet->inet_rcv_saddr = inet->inet_saddr;
 195
 196        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 197                /* Reset inherited state */
 198                tp->rx_opt.ts_recent       = 0;
 199                tp->rx_opt.ts_recent_stamp = 0;
 200                if (likely(!tp->repair))
 201                        tp->write_seq      = 0;
 202        }
 203
 204        if (tcp_death_row.sysctl_tw_recycle &&
 205            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 206                tcp_fetch_timewait_stamp(sk, &rt->dst);
 207
 208        inet->inet_dport = usin->sin_port;
 209        inet->inet_daddr = daddr;
 210
 211        inet_set_txhash(sk);
 212
 213        inet_csk(sk)->icsk_ext_hdr_len = 0;
 214        if (inet_opt)
 215                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 216
 217        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 218
 219        /* Socket identity is still unknown (sport may be zero).
 220         * However we set state to SYN-SENT and not releasing socket
 221         * lock select source port, enter ourselves into the hash tables and
 222         * complete initialization after this.
 223         */
 224        tcp_set_state(sk, TCP_SYN_SENT);
 225        err = inet_hash_connect(&tcp_death_row, sk);
 226        if (err)
 227                goto failure;
 228
 229        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 230                               inet->inet_sport, inet->inet_dport, sk);
 231        if (IS_ERR(rt)) {
 232                err = PTR_ERR(rt);
 233                rt = NULL;
 234                goto failure;
 235        }
 236        /* OK, now commit destination to socket.  */
 237        sk->sk_gso_type = SKB_GSO_TCPV4;
 238        sk_setup_caps(sk, &rt->dst);
 239
 240        if (!tp->write_seq && likely(!tp->repair))
 241                tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 242                                                           inet->inet_daddr,
 243                                                           inet->inet_sport,
 244                                                           usin->sin_port);
 245
 246        inet->inet_id = tp->write_seq ^ jiffies;
 247
 248        err = tcp_connect(sk);
 249
 250        rt = NULL;
 251        if (err)
 252                goto failure;
 253
 254        return 0;
 255
 256failure:
 257        /*
 258         * This unhashes the socket and releases the local port,
 259         * if necessary.
 260         */
 261        tcp_set_state(sk, TCP_CLOSE);
 262        ip_rt_put(rt);
 263        sk->sk_route_caps = 0;
 264        inet->inet_dport = 0;
 265        return err;
 266}
 267EXPORT_SYMBOL(tcp_v4_connect);
 268
 269/*
 270 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 271 * It can be called through tcp_release_cb() if socket was owned by user
 272 * at the time tcp_v4_err() was called to handle ICMP message.
 273 */
 274void tcp_v4_mtu_reduced(struct sock *sk)
 275{
 276        struct dst_entry *dst;
 277        struct inet_sock *inet = inet_sk(sk);
 278        u32 mtu = tcp_sk(sk)->mtu_info;
 279
 280        dst = inet_csk_update_pmtu(sk, mtu);
 281        if (!dst)
 282                return;
 283
 284        /* Something is about to be wrong... Remember soft error
 285         * for the case, if this connection will not able to recover.
 286         */
 287        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 288                sk->sk_err_soft = EMSGSIZE;
 289
 290        mtu = dst_mtu(dst);
 291
 292        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 293            ip_sk_accept_pmtu(sk) &&
 294            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 295                tcp_sync_mss(sk, mtu);
 296
 297                /* Resend the TCP packet because it's
 298                 * clear that the old packet has been
 299                 * dropped. This is the new "fast" path mtu
 300                 * discovery.
 301                 */
 302                tcp_simple_retransmit(sk);
 303        } /* else let the usual retransmit timer handle it */
 304}
 305EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 306
 307static void do_redirect(struct sk_buff *skb, struct sock *sk)
 308{
 309        struct dst_entry *dst = __sk_dst_check(sk, 0);
 310
 311        if (dst)
 312                dst->ops->redirect(dst, sk, skb);
 313}
 314
 315/*
 316 * This routine is called by the ICMP module when it gets some
 317 * sort of error condition.  If err < 0 then the socket should
 318 * be closed and the error returned to the user.  If err > 0
 319 * it's just the icmp type << 8 | icmp code.  After adjustment
 320 * header points to the first 8 bytes of the tcp header.  We need
 321 * to find the appropriate port.
 322 *
 323 * The locking strategy used here is very "optimistic". When
 324 * someone else accesses the socket the ICMP is just dropped
 325 * and for some paths there is no check at all.
 326 * A more general error queue to queue errors for later handling
 327 * is probably better.
 328 *
 329 */
 330
 331void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 332{
 333        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 334        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 335        struct inet_connection_sock *icsk;
 336        struct tcp_sock *tp;
 337        struct inet_sock *inet;
 338        const int type = icmp_hdr(icmp_skb)->type;
 339        const int code = icmp_hdr(icmp_skb)->code;
 340        struct sock *sk;
 341        struct sk_buff *skb;
 342        struct request_sock *fastopen;
 343        __u32 seq, snd_una;
 344        __u32 remaining;
 345        int err;
 346        struct net *net = dev_net(icmp_skb->dev);
 347
 348        sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 349                        iph->saddr, th->source, inet_iif(icmp_skb));
 350        if (!sk) {
 351                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 352                return;
 353        }
 354        if (sk->sk_state == TCP_TIME_WAIT) {
 355                inet_twsk_put(inet_twsk(sk));
 356                return;
 357        }
 358
 359        bh_lock_sock(sk);
 360        /* If too many ICMPs get dropped on busy
 361         * servers this needs to be solved differently.
 362         * We do take care of PMTU discovery (RFC1191) special case :
 363         * we can receive locally generated ICMP messages while socket is held.
 364         */
 365        if (sock_owned_by_user(sk)) {
 366                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 367                        NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 368        }
 369        if (sk->sk_state == TCP_CLOSE)
 370                goto out;
 371
 372        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 373                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 374                goto out;
 375        }
 376
 377        icsk = inet_csk(sk);
 378        tp = tcp_sk(sk);
 379        seq = ntohl(th->seq);
 380        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 381        fastopen = tp->fastopen_rsk;
 382        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 383        if (sk->sk_state != TCP_LISTEN &&
 384            !between(seq, snd_una, tp->snd_nxt)) {
 385                NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 386                goto out;
 387        }
 388
 389        switch (type) {
 390        case ICMP_REDIRECT:
 391                do_redirect(icmp_skb, sk);
 392                goto out;
 393        case ICMP_SOURCE_QUENCH:
 394                /* Just silently ignore these. */
 395                goto out;
 396        case ICMP_PARAMETERPROB:
 397                err = EPROTO;
 398                break;
 399        case ICMP_DEST_UNREACH:
 400                if (code > NR_ICMP_UNREACH)
 401                        goto out;
 402
 403                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 404                        /* We are not interested in TCP_LISTEN and open_requests
 405                         * (SYN-ACKs send out by Linux are always <576bytes so
 406                         * they should go through unfragmented).
 407                         */
 408                        if (sk->sk_state == TCP_LISTEN)
 409                                goto out;
 410
 411                        tp->mtu_info = info;
 412                        if (!sock_owned_by_user(sk)) {
 413                                tcp_v4_mtu_reduced(sk);
 414                        } else {
 415                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 416                                        sock_hold(sk);
 417                        }
 418                        goto out;
 419                }
 420
 421                err = icmp_err_convert[code].errno;
 422                /* check if icmp_skb allows revert of backoff
 423                 * (see draft-zimmermann-tcp-lcd) */
 424                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 425                        break;
 426                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 427                    !icsk->icsk_backoff || fastopen)
 428                        break;
 429
 430                if (sock_owned_by_user(sk))
 431                        break;
 432
 433                icsk->icsk_backoff--;
 434                inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
 435                        TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 436                tcp_bound_rto(sk);
 437
 438                skb = tcp_write_queue_head(sk);
 439                BUG_ON(!skb);
 440
 441                remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 442                                tcp_time_stamp - TCP_SKB_CB(skb)->when);
 443
 444                if (remaining) {
 445                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 446                                                  remaining, TCP_RTO_MAX);
 447                } else {
 448                        /* RTO revert clocked out retransmission.
 449                         * Will retransmit now */
 450                        tcp_retransmit_timer(sk);
 451                }
 452
 453                break;
 454        case ICMP_TIME_EXCEEDED:
 455                err = EHOSTUNREACH;
 456                break;
 457        default:
 458                goto out;
 459        }
 460
 461        switch (sk->sk_state) {
 462                struct request_sock *req, **prev;
 463        case TCP_LISTEN:
 464                if (sock_owned_by_user(sk))
 465                        goto out;
 466
 467                req = inet_csk_search_req(sk, &prev, th->dest,
 468                                          iph->daddr, iph->saddr);
 469                if (!req)
 470                        goto out;
 471
 472                /* ICMPs are not backlogged, hence we cannot get
 473                   an established socket here.
 474                 */
 475                WARN_ON(req->sk);
 476
 477                if (seq != tcp_rsk(req)->snt_isn) {
 478                        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 479                        goto out;
 480                }
 481
 482                /*
 483                 * Still in SYN_RECV, just remove it silently.
 484                 * There is no good way to pass the error to the newly
 485                 * created socket, and POSIX does not want network
 486                 * errors returned from accept().
 487                 */
 488                inet_csk_reqsk_queue_drop(sk, req, prev);
 489                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 490                goto out;
 491
 492        case TCP_SYN_SENT:
 493        case TCP_SYN_RECV:
 494                /* Only in fast or simultaneous open. If a fast open socket is
 495                 * is already accepted it is treated as a connected one below.
 496                 */
 497                if (fastopen && fastopen->sk == NULL)
 498                        break;
 499
 500                if (!sock_owned_by_user(sk)) {
 501                        sk->sk_err = err;
 502
 503                        sk->sk_error_report(sk);
 504
 505                        tcp_done(sk);
 506                } else {
 507                        sk->sk_err_soft = err;
 508                }
 509                goto out;
 510        }
 511
 512        /* If we've already connected we will keep trying
 513         * until we time out, or the user gives up.
 514         *
 515         * rfc1122 4.2.3.9 allows to consider as hard errors
 516         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 517         * but it is obsoleted by pmtu discovery).
 518         *
 519         * Note, that in modern internet, where routing is unreliable
 520         * and in each dark corner broken firewalls sit, sending random
 521         * errors ordered by their masters even this two messages finally lose
 522         * their original sense (even Linux sends invalid PORT_UNREACHs)
 523         *
 524         * Now we are in compliance with RFCs.
 525         *                                                      --ANK (980905)
 526         */
 527
 528        inet = inet_sk(sk);
 529        if (!sock_owned_by_user(sk) && inet->recverr) {
 530                sk->sk_err = err;
 531                sk->sk_error_report(sk);
 532        } else  { /* Only an error on timeout */
 533                sk->sk_err_soft = err;
 534        }
 535
 536out:
 537        bh_unlock_sock(sk);
 538        sock_put(sk);
 539}
 540
 541void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 542{
 543        struct tcphdr *th = tcp_hdr(skb);
 544
 545        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 546                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 547                skb->csum_start = skb_transport_header(skb) - skb->head;
 548                skb->csum_offset = offsetof(struct tcphdr, check);
 549        } else {
 550                th->check = tcp_v4_check(skb->len, saddr, daddr,
 551                                         csum_partial(th,
 552                                                      th->doff << 2,
 553                                                      skb->csum));
 554        }
 555}
 556
 557/* This routine computes an IPv4 TCP checksum. */
 558void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 559{
 560        const struct inet_sock *inet = inet_sk(sk);
 561
 562        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 563}
 564EXPORT_SYMBOL(tcp_v4_send_check);
 565
 566/*
 567 *      This routine will send an RST to the other tcp.
 568 *
 569 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 570 *                    for reset.
 571 *      Answer: if a packet caused RST, it is not for a socket
 572 *              existing in our system, if it is matched to a socket,
 573 *              it is just duplicate segment or bug in other side's TCP.
 574 *              So that we build reply only basing on parameters
 575 *              arrived with segment.
 576 *      Exception: precedence violation. We do not implement it in any case.
 577 */
 578
 579static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 580{
 581        const struct tcphdr *th = tcp_hdr(skb);
 582        struct {
 583                struct tcphdr th;
 584#ifdef CONFIG_TCP_MD5SIG
 585                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 586#endif
 587        } rep;
 588        struct ip_reply_arg arg;
 589#ifdef CONFIG_TCP_MD5SIG
 590        struct tcp_md5sig_key *key;
 591        const __u8 *hash_location = NULL;
 592        unsigned char newhash[16];
 593        int genhash;
 594        struct sock *sk1 = NULL;
 595#endif
 596        struct net *net;
 597
 598        /* Never send a reset in response to a reset. */
 599        if (th->rst)
 600                return;
 601
 602        if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 603                return;
 604
 605        /* Swap the send and the receive. */
 606        memset(&rep, 0, sizeof(rep));
 607        rep.th.dest   = th->source;
 608        rep.th.source = th->dest;
 609        rep.th.doff   = sizeof(struct tcphdr) / 4;
 610        rep.th.rst    = 1;
 611
 612        if (th->ack) {
 613                rep.th.seq = th->ack_seq;
 614        } else {
 615                rep.th.ack = 1;
 616                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 617                                       skb->len - (th->doff << 2));
 618        }
 619
 620        memset(&arg, 0, sizeof(arg));
 621        arg.iov[0].iov_base = (unsigned char *)&rep;
 622        arg.iov[0].iov_len  = sizeof(rep.th);
 623
 624#ifdef CONFIG_TCP_MD5SIG
 625        hash_location = tcp_parse_md5sig_option(th);
 626        if (!sk && hash_location) {
 627                /*
 628                 * active side is lost. Try to find listening socket through
 629                 * source port, and then find md5 key through listening socket.
 630                 * we are not loose security here:
 631                 * Incoming packet is checked with md5 hash with finding key,
 632                 * no RST generated if md5 hash doesn't match.
 633                 */
 634                sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
 635                                             &tcp_hashinfo, ip_hdr(skb)->saddr,
 636                                             th->source, ip_hdr(skb)->daddr,
 637                                             ntohs(th->source), inet_iif(skb));
 638                /* don't send rst if it can't find key */
 639                if (!sk1)
 640                        return;
 641                rcu_read_lock();
 642                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 643                                        &ip_hdr(skb)->saddr, AF_INET);
 644                if (!key)
 645                        goto release_sk1;
 646
 647                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
 648                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 649                        goto release_sk1;
 650        } else {
 651                key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 652                                             &ip_hdr(skb)->saddr,
 653                                             AF_INET) : NULL;
 654        }
 655
 656        if (key) {
 657                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 658                                   (TCPOPT_NOP << 16) |
 659                                   (TCPOPT_MD5SIG << 8) |
 660                                   TCPOLEN_MD5SIG);
 661                /* Update length and the length the header thinks exists */
 662                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 663                rep.th.doff = arg.iov[0].iov_len / 4;
 664
 665                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 666                                     key, ip_hdr(skb)->saddr,
 667                                     ip_hdr(skb)->daddr, &rep.th);
 668        }
 669#endif
 670        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 671                                      ip_hdr(skb)->saddr, /* XXX */
 672                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 673        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 674        arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 675        /* When socket is gone, all binding information is lost.
 676         * routing might fail in this case. No choice here, if we choose to force
 677         * input interface, we will misroute in case of asymmetric route.
 678         */
 679        if (sk)
 680                arg.bound_dev_if = sk->sk_bound_dev_if;
 681
 682        net = dev_net(skb_dst(skb)->dev);
 683        arg.tos = ip_hdr(skb)->tos;
 684        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 685                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 686
 687        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 688        TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 689
 690#ifdef CONFIG_TCP_MD5SIG
 691release_sk1:
 692        if (sk1) {
 693                rcu_read_unlock();
 694                sock_put(sk1);
 695        }
 696#endif
 697}
 698
 699/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 700   outside socket context is ugly, certainly. What can I do?
 701 */
 702
 703static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 704                            u32 win, u32 tsval, u32 tsecr, int oif,
 705                            struct tcp_md5sig_key *key,
 706                            int reply_flags, u8 tos)
 707{
 708        const struct tcphdr *th = tcp_hdr(skb);
 709        struct {
 710                struct tcphdr th;
 711                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 712#ifdef CONFIG_TCP_MD5SIG
 713                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 714#endif
 715                        ];
 716        } rep;
 717        struct ip_reply_arg arg;
 718        struct net *net = dev_net(skb_dst(skb)->dev);
 719
 720        memset(&rep.th, 0, sizeof(struct tcphdr));
 721        memset(&arg, 0, sizeof(arg));
 722
 723        arg.iov[0].iov_base = (unsigned char *)&rep;
 724        arg.iov[0].iov_len  = sizeof(rep.th);
 725        if (tsecr) {
 726                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 727                                   (TCPOPT_TIMESTAMP << 8) |
 728                                   TCPOLEN_TIMESTAMP);
 729                rep.opt[1] = htonl(tsval);
 730                rep.opt[2] = htonl(tsecr);
 731                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 732        }
 733
 734        /* Swap the send and the receive. */
 735        rep.th.dest    = th->source;
 736        rep.th.source  = th->dest;
 737        rep.th.doff    = arg.iov[0].iov_len / 4;
 738        rep.th.seq     = htonl(seq);
 739        rep.th.ack_seq = htonl(ack);
 740        rep.th.ack     = 1;
 741        rep.th.window  = htons(win);
 742
 743#ifdef CONFIG_TCP_MD5SIG
 744        if (key) {
 745                int offset = (tsecr) ? 3 : 0;
 746
 747                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 748                                          (TCPOPT_NOP << 16) |
 749                                          (TCPOPT_MD5SIG << 8) |
 750                                          TCPOLEN_MD5SIG);
 751                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 752                rep.th.doff = arg.iov[0].iov_len/4;
 753
 754                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 755                                    key, ip_hdr(skb)->saddr,
 756                                    ip_hdr(skb)->daddr, &rep.th);
 757        }
 758#endif
 759        arg.flags = reply_flags;
 760        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 761                                      ip_hdr(skb)->saddr, /* XXX */
 762                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 763        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 764        if (oif)
 765                arg.bound_dev_if = oif;
 766        arg.tos = tos;
 767        ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
 768                              ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
 769
 770        TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 771}
 772
 773static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 774{
 775        struct inet_timewait_sock *tw = inet_twsk(sk);
 776        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 777
 778        tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 779                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 780                        tcp_time_stamp + tcptw->tw_ts_offset,
 781                        tcptw->tw_ts_recent,
 782                        tw->tw_bound_dev_if,
 783                        tcp_twsk_md5_key(tcptw),
 784                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 785                        tw->tw_tos
 786                        );
 787
 788        inet_twsk_put(tw);
 789}
 790
 791static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 792                                  struct request_sock *req)
 793{
 794        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 795         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 796         */
 797        tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
 798                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 799                        tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 800                        tcp_time_stamp,
 801                        req->ts_recent,
 802                        0,
 803                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 804                                          AF_INET),
 805                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 806                        ip_hdr(skb)->tos);
 807}
 808
 809/*
 810 *      Send a SYN-ACK after having received a SYN.
 811 *      This still operates on a request_sock only, not on a big
 812 *      socket.
 813 */
 814static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 815                              struct flowi *fl,
 816                              struct request_sock *req,
 817                              u16 queue_mapping,
 818                              struct tcp_fastopen_cookie *foc)
 819{
 820        const struct inet_request_sock *ireq = inet_rsk(req);
 821        struct flowi4 fl4;
 822        int err = -1;
 823        struct sk_buff *skb;
 824
 825        /* First, grab a route. */
 826        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 827                return -1;
 828
 829        skb = tcp_make_synack(sk, dst, req, foc);
 830
 831        if (skb) {
 832                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 833
 834                skb_set_queue_mapping(skb, queue_mapping);
 835                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 836                                            ireq->ir_rmt_addr,
 837                                            ireq->opt);
 838                err = net_xmit_eval(err);
 839        }
 840
 841        return err;
 842}
 843
 844/*
 845 *      IPv4 request_sock destructor.
 846 */
 847static void tcp_v4_reqsk_destructor(struct request_sock *req)
 848{
 849        kfree(inet_rsk(req)->opt);
 850}
 851
 852/*
 853 * Return true if a syncookie should be sent
 854 */
 855bool tcp_syn_flood_action(struct sock *sk,
 856                         const struct sk_buff *skb,
 857                         const char *proto)
 858{
 859        const char *msg = "Dropping request";
 860        bool want_cookie = false;
 861        struct listen_sock *lopt;
 862
 863#ifdef CONFIG_SYN_COOKIES
 864        if (sysctl_tcp_syncookies) {
 865                msg = "Sending cookies";
 866                want_cookie = true;
 867                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
 868        } else
 869#endif
 870                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
 871
 872        lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
 873        if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
 874                lopt->synflood_warned = 1;
 875                pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
 876                        proto, ntohs(tcp_hdr(skb)->dest), msg);
 877        }
 878        return want_cookie;
 879}
 880EXPORT_SYMBOL(tcp_syn_flood_action);
 881
 882/*
 883 * Save and compile IPv4 options into the request_sock if needed.
 884 */
 885static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 886{
 887        const struct ip_options *opt = &(IPCB(skb)->opt);
 888        struct ip_options_rcu *dopt = NULL;
 889
 890        if (opt && opt->optlen) {
 891                int opt_size = sizeof(*dopt) + opt->optlen;
 892
 893                dopt = kmalloc(opt_size, GFP_ATOMIC);
 894                if (dopt) {
 895                        if (ip_options_echo(&dopt->opt, skb)) {
 896                                kfree(dopt);
 897                                dopt = NULL;
 898                        }
 899                }
 900        }
 901        return dopt;
 902}
 903
 904#ifdef CONFIG_TCP_MD5SIG
 905/*
 906 * RFC2385 MD5 checksumming requires a mapping of
 907 * IP address->MD5 Key.
 908 * We need to maintain these in the sk structure.
 909 */
 910
 911/* Find the Key structure for an address.  */
 912struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
 913                                         const union tcp_md5_addr *addr,
 914                                         int family)
 915{
 916        struct tcp_sock *tp = tcp_sk(sk);
 917        struct tcp_md5sig_key *key;
 918        unsigned int size = sizeof(struct in_addr);
 919        struct tcp_md5sig_info *md5sig;
 920
 921        /* caller either holds rcu_read_lock() or socket lock */
 922        md5sig = rcu_dereference_check(tp->md5sig_info,
 923                                       sock_owned_by_user(sk) ||
 924                                       lockdep_is_held(&sk->sk_lock.slock));
 925        if (!md5sig)
 926                return NULL;
 927#if IS_ENABLED(CONFIG_IPV6)
 928        if (family == AF_INET6)
 929                size = sizeof(struct in6_addr);
 930#endif
 931        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 932                if (key->family != family)
 933                        continue;
 934                if (!memcmp(&key->addr, addr, size))
 935                        return key;
 936        }
 937        return NULL;
 938}
 939EXPORT_SYMBOL(tcp_md5_do_lookup);
 940
 941struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 942                                         struct sock *addr_sk)
 943{
 944        union tcp_md5_addr *addr;
 945
 946        addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
 947        return tcp_md5_do_lookup(sk, addr, AF_INET);
 948}
 949EXPORT_SYMBOL(tcp_v4_md5_lookup);
 950
 951static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 952                                                      struct request_sock *req)
 953{
 954        union tcp_md5_addr *addr;
 955
 956        addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
 957        return tcp_md5_do_lookup(sk, addr, AF_INET);
 958}
 959
 960/* This can be called on a newly created socket, from other files */
 961int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 962                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 963{
 964        /* Add Key to the list */
 965        struct tcp_md5sig_key *key;
 966        struct tcp_sock *tp = tcp_sk(sk);
 967        struct tcp_md5sig_info *md5sig;
 968
 969        key = tcp_md5_do_lookup(sk, addr, family);
 970        if (key) {
 971                /* Pre-existing entry - just update that one. */
 972                memcpy(key->key, newkey, newkeylen);
 973                key->keylen = newkeylen;
 974                return 0;
 975        }
 976
 977        md5sig = rcu_dereference_protected(tp->md5sig_info,
 978                                           sock_owned_by_user(sk));
 979        if (!md5sig) {
 980                md5sig = kmalloc(sizeof(*md5sig), gfp);
 981                if (!md5sig)
 982                        return -ENOMEM;
 983
 984                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 985                INIT_HLIST_HEAD(&md5sig->head);
 986                rcu_assign_pointer(tp->md5sig_info, md5sig);
 987        }
 988
 989        key = sock_kmalloc(sk, sizeof(*key), gfp);
 990        if (!key)
 991                return -ENOMEM;
 992        if (!tcp_alloc_md5sig_pool()) {
 993                sock_kfree_s(sk, key, sizeof(*key));
 994                return -ENOMEM;
 995        }
 996
 997        memcpy(key->key, newkey, newkeylen);
 998        key->keylen = newkeylen;
 999        key->family = family;
1000        memcpy(&key->addr, addr,

1001               (family == AF_INET6) ? sizeof(struct in6_addr) :
1002                                      sizeof(struct in_addr));
1003        hlist_add_head_rcu(&key->node, &md5sig->head);
1004        return 0;
1005}
1006EXPORT_SYMBOL(tcp_md5_do_add);
1007
1008int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1009{
1010        struct tcp_md5sig_key *key;
1011
1012        key = tcp_md5_do_lookup(sk, addr, family);
1013        if (!key)
1014                return -ENOENT;
1015        hlist_del_rcu(&key->node);
1016        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1017        kfree_rcu(key, rcu);
1018        return 0;
1019}
1020EXPORT_SYMBOL(tcp_md5_do_del);
1021
1022static void tcp_clear_md5_list(struct sock *sk)
1023{
1024        struct tcp_sock *tp = tcp_sk(sk);
1025        struct tcp_md5sig_key *key;
1026        struct hlist_node *n;
1027        struct tcp_md5sig_info *md5sig;
1028
1029        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1030
1031        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1032                hlist_del_rcu(&key->node);
1033                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1034                kfree_rcu(key, rcu);
1035        }
1036}
1037
1038static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1039                                 int optlen)
1040{
1041        struct tcp_md5sig cmd;
1042        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1043
1044        if (optlen < sizeof(cmd))
1045                return -EINVAL;
1046
1047        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1048                return -EFAULT;
1049
1050        if (sin->sin_family != AF_INET)
1051                return -EINVAL;
1052
1053        if (!cmd.tcpm_keylen)
1054                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1055                                      AF_INET);
1056
1057        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1058                return -EINVAL;
1059
1060        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1061                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1062                              GFP_KERNEL);
1063}
1064
1065static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1066                                        __be32 daddr, __be32 saddr, int nbytes)
1067{
1068        struct tcp4_pseudohdr *bp;
1069        struct scatterlist sg;
1070
1071        bp = &hp->md5_blk.ip4;
1072
1073        /*
1074         * 1. the TCP pseudo-header (in the order: source IP address,
1075         * destination IP address, zero-padded protocol number, and
1076         * segment length)
1077         */
1078        bp->saddr = saddr;
1079        bp->daddr = daddr;
1080        bp->pad = 0;
1081        bp->protocol = IPPROTO_TCP;
1082        bp->len = cpu_to_be16(nbytes);
1083
1084        sg_init_one(&sg, bp, sizeof(*bp));
1085        return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1086}
1087
1088static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1089                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1090{
1091        struct tcp_md5sig_pool *hp;
1092        struct hash_desc *desc;
1093
1094        hp = tcp_get_md5sig_pool();
1095        if (!hp)
1096                goto clear_hash_noput;
1097        desc = &hp->md5_desc;
1098
1099        if (crypto_hash_init(desc))
1100                goto clear_hash;
1101        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1102                goto clear_hash;
1103        if (tcp_md5_hash_header(hp, th))
1104                goto clear_hash;
1105        if (tcp_md5_hash_key(hp, key))
1106                goto clear_hash;
1107        if (crypto_hash_final(desc, md5_hash))
1108                goto clear_hash;
1109
1110        tcp_put_md5sig_pool();
1111        return 0;
1112
1113clear_hash:
1114        tcp_put_md5sig_pool();
1115clear_hash_noput:
1116        memset(md5_hash, 0, 16);
1117        return 1;
1118}
1119
1120int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1121                        const struct sock *sk, const struct request_sock *req,
1122                        const struct sk_buff *skb)
1123{
1124        struct tcp_md5sig_pool *hp;
1125        struct hash_desc *desc;
1126        const struct tcphdr *th = tcp_hdr(skb);
1127        __be32 saddr, daddr;
1128
1129        if (sk) {
1130                saddr = inet_sk(sk)->inet_saddr;
1131                daddr = inet_sk(sk)->inet_daddr;
1132        } else if (req) {
1133                saddr = inet_rsk(req)->ir_loc_addr;
1134                daddr = inet_rsk(req)->ir_rmt_addr;
1135        } else {
1136                const struct iphdr *iph = ip_hdr(skb);
1137                saddr = iph->saddr;
1138                daddr = iph->daddr;
1139        }
1140
1141        hp = tcp_get_md5sig_pool();
1142        if (!hp)
1143                goto clear_hash_noput;
1144        desc = &hp->md5_desc;
1145
1146        if (crypto_hash_init(desc))
1147                goto clear_hash;
1148
1149        if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1150                goto clear_hash;
1151        if (tcp_md5_hash_header(hp, th))
1152                goto clear_hash;
1153        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1154                goto clear_hash;
1155        if (tcp_md5_hash_key(hp, key))
1156                goto clear_hash;
1157        if (crypto_hash_final(desc, md5_hash))
1158                goto clear_hash;
1159
1160        tcp_put_md5sig_pool();
1161        return 0;
1162
1163clear_hash:
1164        tcp_put_md5sig_pool();
1165clear_hash_noput:
1166        memset(md5_hash, 0, 16);
1167        return 1;
1168}
1169EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1170
1171static bool __tcp_v4_inbound_md5_hash(struct sock *sk,
1172                                      const struct sk_buff *skb)
1173{
1174        /*
1175         * This gets called for each TCP segment that arrives
1176         * so we want to be efficient.
1177         * We have 3 drop cases:
1178         * o No MD5 hash and one expected.
1179         * o MD5 hash and we're not expecting one.
1180         * o MD5 hash and its wrong.
1181         */
1182        const __u8 *hash_location = NULL;
1183        struct tcp_md5sig_key *hash_expected;
1184        const struct iphdr *iph = ip_hdr(skb);
1185        const struct tcphdr *th = tcp_hdr(skb);
1186        int genhash;
1187        unsigned char newhash[16];
1188
1189        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1190                                          AF_INET);
1191        hash_location = tcp_parse_md5sig_option(th);
1192
1193        /* We've parsed the options - do we have a hash? */
1194        if (!hash_expected && !hash_location)
1195                return false;
1196
1197        if (hash_expected && !hash_location) {
1198                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1199                return true;
1200        }
1201
1202        if (!hash_expected && hash_location) {
1203                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1204                return true;
1205        }
1206
1207        /* Okay, so this is hash_expected and hash_location -
1208         * so we need to calculate the checksum.
1209         */
1210        genhash = tcp_v4_md5_hash_skb(newhash,
1211                                      hash_expected,
1212                                      NULL, NULL, skb);
1213
1214        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1215                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1216                                     &iph->saddr, ntohs(th->source),
1217                                     &iph->daddr, ntohs(th->dest),
1218                                     genhash ? " tcp_v4_calc_md5_hash failed"
1219                                     : "");
1220                return true;
1221        }
1222        return false;
1223}
1224
1225static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1226{
1227        bool ret;
1228
1229        rcu_read_lock();
1230        ret = __tcp_v4_inbound_md5_hash(sk, skb);
1231        rcu_read_unlock();
1232
1233        return ret;
1234}
1235
1236#endif
1237
1238static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
1239                            struct sk_buff *skb)
1240{
1241        struct inet_request_sock *ireq = inet_rsk(req);
1242
1243        ireq->ir_loc_addr = ip_hdr(skb)->daddr;
1244        ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
1245        ireq->no_srccheck = inet_sk(sk)->transparent;
1246        ireq->opt = tcp_v4_save_options(skb);
1247}
1248
1249static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
1250                                          const struct request_sock *req,
1251                                          bool *strict)
1252{
1253        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1254
1255        if (strict) {
1256                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1257                        *strict = true;
1258                else
1259                        *strict = false;
1260        }
1261
1262        return dst;
1263}
1264
1265struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1266        .family         =       PF_INET,
1267        .obj_size       =       sizeof(struct tcp_request_sock),
1268        .rtx_syn_ack    =       tcp_rtx_synack,
1269        .send_ack       =       tcp_v4_reqsk_send_ack,
1270        .destructor     =       tcp_v4_reqsk_destructor,
1271        .send_reset     =       tcp_v4_send_reset,
1272        .syn_ack_timeout =      tcp_syn_ack_timeout,
1273};
1274
1275static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1276        .mss_clamp      =       TCP_MSS_DEFAULT,
1277#ifdef CONFIG_TCP_MD5SIG
1278        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1279        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1280#endif
1281        .init_req       =       tcp_v4_init_req,
1282#ifdef CONFIG_SYN_COOKIES
1283        .cookie_init_seq =      cookie_v4_init_sequence,
1284#endif
1285        .route_req      =       tcp_v4_route_req,
1286        .init_seq       =       tcp_v4_init_sequence,
1287        .send_synack    =       tcp_v4_send_synack,
1288        .queue_hash_add =       inet_csk_reqsk_queue_hash_add,
1289};
1290
1291int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1292{
1293        /* Never answer to SYNs send to broadcast or multicast */
1294        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1295                goto drop;
1296
1297        return tcp_conn_request(&tcp_request_sock_ops,
1298                                &tcp_request_sock_ipv4_ops, sk, skb);
1299
1300drop:
1301        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1302        return 0;
1303}
1304EXPORT_SYMBOL(tcp_v4_conn_request);
1305
1306
1307/*
1308 * The three way handshake has completed - we got a valid synack -
1309 * now create the new socket.
1310 */
1311struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1312                                  struct request_sock *req,
1313                                  struct dst_entry *dst)
1314{
1315        struct inet_request_sock *ireq;
1316        struct inet_sock *newinet;
1317        struct tcp_sock *newtp;
1318        struct sock *newsk;
1319#ifdef CONFIG_TCP_MD5SIG
1320        struct tcp_md5sig_key *key;
1321#endif
1322        struct ip_options_rcu *inet_opt;
1323
1324        if (sk_acceptq_is_full(sk))
1325                goto exit_overflow;
1326
1327        newsk = tcp_create_openreq_child(sk, req, skb);
1328        if (!newsk)
1329                goto exit_nonewsk;
1330
1331        newsk->sk_gso_type = SKB_GSO_TCPV4;
1332        inet_sk_rx_dst_set(newsk, skb);
1333
1334        newtp                 = tcp_sk(newsk);
1335        newinet               = inet_sk(newsk);
1336        ireq                  = inet_rsk(req);
1337        newinet->inet_daddr   = ireq->ir_rmt_addr;
1338        newinet->inet_rcv_saddr = ireq->ir_loc_addr;
1339        newinet->inet_saddr           = ireq->ir_loc_addr;
1340        inet_opt              = ireq->opt;
1341        rcu_assign_pointer(newinet->inet_opt, inet_opt);
1342        ireq->opt             = NULL;
1343        newinet->mc_index     = inet_iif(skb);
1344        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1345        newinet->rcv_tos      = ip_hdr(skb)->tos;
1346        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1347        inet_set_txhash(newsk);
1348        if (inet_opt)
1349                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1350        newinet->inet_id = newtp->write_seq ^ jiffies;
1351
1352        if (!dst) {
1353                dst = inet_csk_route_child_sock(sk, newsk, req);
1354                if (!dst)
1355                        goto put_and_exit;
1356        } else {
1357                /* syncookie case : see end of cookie_v4_check() */
1358        }
1359        sk_setup_caps(newsk, dst);
1360
1361        tcp_sync_mss(newsk, dst_mtu(dst));
1362        newtp->advmss = dst_metric_advmss(dst);
1363        if (tcp_sk(sk)->rx_opt.user_mss &&
1364            tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1365                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1366
1367        tcp_initialize_rcv_mss(newsk);
1368
1369#ifdef CONFIG_TCP_MD5SIG
1370        /* Copy over the MD5 key from the original socket */
1371        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1372                                AF_INET);
1373        if (key != NULL) {
1374                /*
1375                 * We're using one, so create a matching key
1376                 * on the newsk structure. If we fail to get
1377                 * memory, then we end up not copying the key
1378                 * across. Shucks.
1379                 */
1380                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1381                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
1382                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1383        }
1384#endif
1385
1386        if (__inet_inherit_port(sk, newsk) < 0)
1387                goto put_and_exit;
1388        __inet_hash_nolisten(newsk, NULL);
1389
1390        return newsk;
1391
1392exit_overflow:
1393        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1394exit_nonewsk:
1395        dst_release(dst);
1396exit:
1397        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1398        return NULL;
1399put_and_exit:
1400        inet_csk_prepare_forced_close(newsk);
1401        tcp_done(newsk);
1402        goto exit;
1403}
1404EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1405
1406static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1407{
1408        struct tcphdr *th = tcp_hdr(skb);
1409        const struct iphdr *iph = ip_hdr(skb);
1410        struct sock *nsk;
1411        struct request_sock **prev;
1412        /* Find possible connection requests. */
1413        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1414                                                       iph->saddr, iph->daddr);
1415        if (req)
1416                return tcp_check_req(sk, skb, req, prev, false);
1417
1418        nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1419                        th->source, iph->daddr, th->dest, inet_iif(skb));
1420
1421        if (nsk) {
1422                if (nsk->sk_state != TCP_TIME_WAIT) {
1423                        bh_lock_sock(nsk);
1424                        return nsk;
1425                }
1426                inet_twsk_put(inet_twsk(nsk));
1427                return NULL;
1428        }
1429
1430#ifdef CONFIG_SYN_COOKIES
1431        if (!th->syn)
1432                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1433#endif
1434        return sk;
1435}
1436
1437/* The socket must have it's spinlock held when we get
1438 * here.
1439 *
1440 * We have a potential double-lock case here, so even when
1441 * doing backlog processing we use the BH locking scheme.
1442 * This is because we cannot sleep with the original spinlock
1443 * held.
1444 */
1445int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1446{
1447        struct sock *rsk;
1448
1449        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1450                struct dst_entry *dst = sk->sk_rx_dst;
1451
1452                sock_rps_save_rxhash(sk, skb);
1453                if (dst) {
1454                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1455                            dst->ops->check(dst, 0) == NULL) {
1456                                dst_release(dst);
1457                                sk->sk_rx_dst = NULL;
1458                        }
1459                }
1460                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1461                return 0;
1462        }
1463
1464        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1465                goto csum_err;
1466
1467        if (sk->sk_state == TCP_LISTEN) {
1468                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1469                if (!nsk)
1470                        goto discard;
1471
1472                if (nsk != sk) {
1473                        sock_rps_save_rxhash(nsk, skb);
1474                        if (tcp_child_process(sk, nsk, skb)) {
1475                                rsk = nsk;
1476                                goto reset;
1477                        }
1478                        return 0;
1479                }
1480        } else
1481                sock_rps_save_rxhash(sk, skb);
1482
1483        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1484                rsk = sk;
1485                goto reset;
1486        }
1487        return 0;
1488
1489reset:
1490        tcp_v4_send_reset(rsk, skb);
1491discard:
1492        kfree_skb(skb);
1493        /* Be careful here. If this function gets more complicated and
1494         * gcc suffers from register pressure on the x86, sk (in %ebx)
1495         * might be destroyed here. This current version compiles correctly,
1496         * but you have been warned.
1497         */
1498        return 0;
1499
1500csum_err:
1501        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1502        TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1503        goto discard;
1504}
1505EXPORT_SYMBOL(tcp_v4_do_rcv);
1506
1507void tcp_v4_early_demux(struct sk_buff *skb)
1508{
1509        const struct iphdr *iph;
1510        const struct tcphdr *th;
1511        struct sock *sk;
1512
1513        if (skb->pkt_type != PACKET_HOST)
1514                return;
1515
1516        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1517                return;
1518
1519        iph = ip_hdr(skb);
1520        th = tcp_hdr(skb);
1521
1522        if (th->doff < sizeof(struct tcphdr) / 4)
1523                return;
1524
1525        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1526                                       iph->saddr, th->source,
1527                                       iph->daddr, ntohs(th->dest),
1528                                       skb->skb_iif);
1529        if (sk) {
1530                skb->sk = sk;
1531                skb->destructor = sock_edemux;
1532                if (sk->sk_state != TCP_TIME_WAIT) {
1533                        struct dst_entry *dst = sk->sk_rx_dst;
1534
1535                        if (dst)
1536                                dst = dst_check(dst, 0);
1537                        if (dst &&
1538                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1539                                skb_dst_set_noref(skb, dst);
1540                }
1541        }
1542}
1543
1544/* Packet is added to VJ-style prequeue for processing in process
1545 * context, if a reader task is waiting. Apparently, this exciting
1546 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1547 * failed somewhere. Latency? Burstiness? Well, at least now we will
1548 * see, why it failed. 8)8)                               --ANK
1549 *
1550 */
1551bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1552{
1553        struct tcp_sock *tp = tcp_sk(sk);
1554
1555        if (sysctl_tcp_low_latency || !tp->ucopy.task)
1556                return false;
1557
1558        if (skb->len <= tcp_hdrlen(skb) &&
1559            skb_queue_len(&tp->ucopy.prequeue) == 0)
1560                return false;
1561
1562        skb_dst_force(skb);
1563        __skb_queue_tail(&tp->ucopy.prequeue, skb);
1564        tp->ucopy.memory += skb->truesize;
1565        if (tp->ucopy.memory > sk->sk_rcvbuf) {
1566                struct sk_buff *skb1;
1567
1568                BUG_ON(sock_owned_by_user(sk));
1569
1570                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1571                        sk_backlog_rcv(sk, skb1);
1572                        NET_INC_STATS_BH(sock_net(sk),
1573                                         LINUX_MIB_TCPPREQUEUEDROPPED);
1574                }
1575
1576                tp->ucopy.memory = 0;
1577        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1578                wake_up_interruptible_sync_poll(sk_sleep(sk),
1579                                           POLLIN | POLLRDNORM | POLLRDBAND);
1580                if (!inet_csk_ack_scheduled(sk))
1581                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1582                                                  (3 * tcp_rto_min(sk)) / 4,
1583                                                  TCP_RTO_MAX);
1584        }
1585        return true;
1586}
1587EXPORT_SYMBOL(tcp_prequeue);
1588
1589/*
1590 *      From tcp_input.c
1591 */
1592
/*
 * tcp_v4_rcv - receive entry point for one incoming IPv4 TCP segment.
 *
 * Checks the TCP header length and checksum, fills the TCP control block
 * (TCP_SKB_CB) from the header, looks up the owning socket and then either
 * processes the segment directly (tcp_v4_do_rcv), prequeues it, or adds it
 * to the socket backlog when the socket is owned by user context.  Sockets
 * found in TCP_TIME_WAIT state take the separate do_time_wait path.
 *
 * Returns the value of tcp_v4_do_rcv() when that function is called here,
 * and 0 in every other case (prequeued, backlogged or discarded).
 */
1593int tcp_v4_rcv(struct sk_buff *skb)
1594{
1595        const struct iphdr *iph;
1596        const struct tcphdr *th;
1597        struct sock *sk;
1598        int ret;
1599        struct net *net = dev_net(skb->dev);
1600
            /* Anything not addressed to this host is dropped uncounted. */
1601        if (skb->pkt_type != PACKET_HOST)
1602                goto discard_it;
1603
1604        /* Count it even if it's bad */
1605        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1606
            /* Give up unless a minimal TCP header can be pulled. */
1607        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1608                goto discard_it;
1609
1610        th = tcp_hdr(skb);
1611
            /*
             * doff counts 4-byte units (it is multiplied by 4 below to get
             * bytes); a value smaller than struct tcphdr is a bad packet.
             */
1612        if (th->doff < sizeof(struct tcphdr) / 4)
1613                goto bad_packet;
            /* Now require the full header, options included. */
1614        if (!pskb_may_pull(skb, th->doff * 4))
1615                goto discard_it;
1616
1617        /* An explanation is required here, I think.
1618         * Packet length and doff are validated by header prediction,
1619         * provided case of th->doff==0 is eliminated.
1620         * So, we defer the checks. */
1621
1622        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1623                goto csum_error;
1624
            /*
             * Header pointers are fetched again after the second pull.
             * NOTE(review): presumably because pskb_may_pull() may move the
             * header data -- confirm against its implementation.
             */
1625        th = tcp_hdr(skb);
1626        iph = ip_hdr(skb);
            /*
             * end_seq = seq + payload length, where SYN and FIN each
             * add one to the sequence space.
             */
1627        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1628        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1629                                    skb->len - th->doff * 4);
1630        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1631        TCP_SKB_CB(skb)->when    = 0;
1632        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1633        TCP_SKB_CB(skb)->sacked  = 0;
1634
1635        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1636        if (!sk)
1637                goto no_tcp_socket;
1638
            /*
             * Also re-entered from do_time_wait with sk replaced by a
             * listener when a SYN hits a TIME_WAIT socket.
             */
1639process:
1640        if (sk->sk_state == TCP_TIME_WAIT)
1641                goto do_time_wait;
1642
            /* Drop segments whose TTL is below the socket's min_ttl. */
1643        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1644                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1645                goto discard_and_relse;
1646        }
1647
1648        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1649                goto discard_and_relse;
1650
1651#ifdef CONFIG_TCP_MD5SIG
1652        /*
1653         * We really want to reject the packet as early as possible
1654         * if:
1655         *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1656         *  o There is an MD5 option and we're not expecting one
1657         */
1658        if (tcp_v4_inbound_md5_hash(sk, skb))
1659                goto discard_and_relse;
1660#endif
1661
1662        nf_reset(skb);
1663
1664        if (sk_filter(sk, skb))
1665                goto discard_and_relse;
1666
1667        sk_mark_napi_id(sk, skb);
            /* From here on the skb no longer points at its device. */
1668        skb->dev = NULL;
1669
1670        bh_lock_sock_nested(sk);
            /* Default result for the prequeue and backlog paths. */
1671        ret = 0;
1672        if (!sock_owned_by_user(sk)) {
1673#ifdef CONFIG_NET_DMA
1674                struct tcp_sock *tp = tcp_sk(sk);
1675                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1676                        tp->ucopy.dma_chan = net_dma_find_channel();
                    /* With a DMA channel the prequeue is bypassed. */
1677                if (tp->ucopy.dma_chan)
1678                        ret = tcp_v4_do_rcv(sk, skb);
1679                else
1680#endif
1681                {
                            /* Process directly if prequeueing is refused. */
1682                        if (!tcp_prequeue(sk, skb))
1683                                ret = tcp_v4_do_rcv(sk, skb);
1684                }
1685        } else if (unlikely(sk_add_backlog(sk, skb,
1686                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
                    /*
                     * Socket is owned by user context and the backlog
                     * (limit: rcvbuf + sndbuf) rejected the skb.
                     */
1687                bh_unlock_sock(sk);
1688                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1689                goto discard_and_relse;
1690        }
1691        bh_unlock_sock(sk);
1692
            /* Drop the socket reference held since the lookup. */
1693        sock_put(sk);
1694
1695        return ret;
1696
1697no_tcp_socket:
1698        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1699                goto discard_it;
1700
            /*
             * csum_error and bad_packet live inside this if-body and are
             * also reached by goto from elsewhere; csum_error falls through
             * into bad_packet, so a checksum error bumps both counters.
             * A well-formed segment with no socket gets a reset instead.
             */
1701        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1702csum_error:
1703                TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1704bad_packet:
1705                TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1706        } else {
1707                tcp_v4_send_reset(NULL, skb);
1708        }
1709
1710discard_it:
1711        /* Discard frame. */
1712        kfree_skb(skb);
1713        return 0;
1714
1715discard_and_relse:
1716        sock_put(sk);
1717        goto discard_it;
1718
            /*
             * sk is really a timewait socket here; the early exits below
             * drop its reference with inet_twsk_put() before leaving.
             */
1719do_time_wait:
1720        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1721                inet_twsk_put(inet_twsk(sk));
1722                goto discard_it;
1723        }
1724
1725        if (skb->len < (th->doff << 2)) {
1726                inet_twsk_put(inet_twsk(sk));
1727                goto bad_packet;
1728        }
1729        if (tcp_checksum_complete(skb)) {
1730                inet_twsk_put(inet_twsk(sk));
1731                goto csum_error;
1732        }
1733        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1734        case TCP_TW_SYN: {
                    /*
                     * Look for a listener matching this segment; if one
                     * exists the timewait socket is descheduled and released
                     * and processing restarts with the listener.
                     */
1735                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1736                                                        &tcp_hashinfo,
1737                                                        iph->saddr, th->source,
1738                                                        iph->daddr, th->dest,
1739                                                        inet_iif(skb));
1740                if (sk2) {
1741                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1742                        inet_twsk_put(inet_twsk(sk));
1743                        sk = sk2;
1744                        goto process;
1745                }
1746                /* Fall through to ACK */
1747        }
1748        case TCP_TW_ACK:
1749                tcp_v4_timewait_ack(sk, skb);
1750                break;
1751        case TCP_TW_RST:
1752                goto no_tcp_socket;
            /* Nothing more to do; the skb is freed below. */
1753        case TCP_TW_SUCCESS:;
1754        }
1755        goto discard_it;
1756}
1757
1758static struct timewait_sock_ops tcp_timewait_sock_ops = {
1759        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1760        .twsk_unique    = tcp_twsk_unique,
1761        .twsk_destructor= tcp_twsk_destructor,
1762};
1763
1764void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1765{
1766        struct dst_entry *dst = skb_dst(skb);
1767
1768        dst_hold(dst);
1769        sk->sk_rx_dst = dst;
1770        inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1771}
1772EXPORT_SYMBOL(inet_sk_rx_dst_set);
1773
1774const struct inet_connection_sock_af_ops ipv4_specific = {
1775        .queue_xmit        = ip_queue_xmit,
1776        .send_check        = tcp_v4_send_check,
1777        .rebuild_header    = inet_sk_rebuild_header,
1778        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1779        .conn_request      = tcp_v4_conn_request,
1780        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1781        .net_header_len    = sizeof(struct iphdr),
1782        .setsockopt        = ip_setsockopt,
1783        .getsockopt        = ip_getsockopt,
1784        .addr2sockaddr     = inet_csk_addr2sockaddr,
1785        .sockaddr_len      = sizeof(struct sockaddr_in),
1786        .bind_conflict     = inet_csk_bind_conflict,
1787#ifdef CONFIG_COMPAT
1788        .compat_setsockopt = compat_ip_setsockopt,
1789        .compat_getsockopt = compat_ip_getsockopt,
1790#endif
1791        .mtu_reduced       = tcp_v4_mtu_reduced,
1792};
1793EXPORT_SYMBOL(ipv4_specific);
1794
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature hooks (lookup, hash, key parsing). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup    = tcp_v4_md5_lookup,
        .calc_md5_hash = tcp_v4_md5_hash_skb,
        .md5_parse     = tcp_v4_parse_md5_keys,
};
#endif
1802
1803/* NOTE: A lot of things set to zero explicitly by call to
1804 *       sk_alloc() so need not be done here.
1805 */
1806static int tcp_v4_init_sock(struct sock *sk)
1807{
1808        struct inet_connection_sock *icsk = inet_csk(sk);
1809
1810        tcp_init_sock(sk);
1811
1812        icsk->icsk_af_ops = &ipv4_specific;
1813
1814#ifdef CONFIG_TCP_MD5SIG
1815        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1816#endif
1817
1818        return 0;
1819}
1820
1821void tcp_v4_destroy_sock(struct sock *sk)
1822{
1823        struct tcp_sock *tp = tcp_sk(sk);
1824
1825        tcp_clear_xmit_timers(sk);
1826
1827        tcp_cleanup_congestion_control(sk);
1828
1829        /* Cleanup up the write buffer. */
1830        tcp_write_queue_purge(sk);
1831
1832        /* Cleans up our, hopefully empty, out_of_order_queue. */
1833        __skb_queue_purge(&tp->out_of_order_queue);
1834
1835#ifdef CONFIG_TCP_MD5SIG
1836        /* Clean up the MD5 key list, if any */
1837        if (tp->md5sig_info) {
1838                tcp_clear_md5_list(sk);
1839                kfree_rcu(tp->md5sig_info, rcu);
1840                tp->md5sig_info = NULL;
1841        }
1842#endif
1843
1844#ifdef CONFIG_NET_DMA
1845        /* Cleans up our sk_async_wait_queue */
1846        __skb_queue_purge(&sk->sk_async_wait_queue);
1847#endif
1848
1849        /* Clean prequeue, it must be empty really */
1850        __skb_queue_purge(&tp->ucopy.prequeue);
1851
1852        /* Clean up a referenced TCP bind bucket. */
1853        if (inet_csk(sk)->icsk_bind_hash)
1854                inet_put_port(sk);
1855
1856        BUG_ON(tp->fastopen_rsk != NULL);
1857
1858        /* If socket is aborted during connect operation */
1859        tcp_free_fastopen_req(tp);
1860
1861        sk_sockets_allocated_dec(sk);
1862        sock_release_memcg(sk);
1863}
1864EXPORT_SYMBOL(tcp_v4_destroy_sock);
1865
1866#ifdef CONFIG_PROC_FS
1867/* Proc filesystem TCP sock list dumping. */
1868
1869/*
1870 * Get the next listener socket following cur.  If cur is NULL, get the
1871 * first socket starting from the bucket given in st->bucket; when
1872 * st->bucket is zero the very first socket in the hash table is returned.
1873 */
/*
 * The returned pointer is either a listening socket (st->state ==
 * TCP_SEQ_STATE_LISTENING) or a request_sock from a listener's SYN table
 * (st->state == TCP_SEQ_STATE_OPENREQ, owner in st->syn_wait_sk).
 *
 * Locks held on return, as visible in this function:
 *  - socket returned:  the bucket lock of listening_hash[st->bucket];
 *  - request returned: that bucket lock plus the owner's syn_wait_lock
 *                      (read side);
 *  - NULL returned:    none.
 */
1874static void *listening_get_next(struct seq_file *seq, void *cur)
1875{
1876        struct inet_connection_sock *icsk;
1877        struct hlist_nulls_node *node;
1878        struct sock *sk = cur;
1879        struct inet_listen_hashbucket *ilb;
1880        struct tcp_iter_state *st = seq->private;
1881        struct net *net = seq_file_net(seq);
1882
            /* Fresh start: lock bucket st->bucket and begin at its head. */
1883        if (!sk) {
1884                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1885                spin_lock_bh(&ilb->lock);
1886                sk = sk_nulls_head(&ilb->head);
1887                st->offset = 0;
1888                goto get_sk;
1889        }
            /* Continuing: the bucket lock is not taken again on this path. */
1890        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1891        ++st->num;
1892        ++st->offset;
1893
1894        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                    /* cur is a request_sock belonging to st->syn_wait_sk. */
1895                struct request_sock *req = cur;
1896
1897                icsk = inet_csk(st->syn_wait_sk);
1898                req = req->dl_next;
1899                while (1) {
                            /* Scan the current chain for a matching family. */
1900                        while (req) {
1901                                if (req->rsk_ops->family == st->family) {
1902                                        cur = req;
1903                                        goto out;
1904                                }
1905                                req = req->dl_next;
1906                        }
                            /* Chain exhausted: move to the next SYN table slot. */
1907                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1908                                break;
                            /*
                             * Also entered by goto from start_req with
                             * st->sbucket == 0 and syn_wait_lock read-held.
                             */
1909get_req:
1910                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1911                }
                    /*
                     * No more requests: resume the socket walk after the
                     * owner and release its syn_wait_lock.
                     */
1912                sk        = sk_nulls_next(st->syn_wait_sk);
1913                st->state = TCP_SEQ_STATE_LISTENING;
1914                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1915        } else {
                    /*
                     * cur is a listening socket: visit its pending requests
                     * first, if it has any, otherwise step to the next socket.
                     */
1916                icsk = inet_csk(sk);
1917                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1918                if (reqsk_queue_len(&icsk->icsk_accept_queue))
1919                        goto start_req;
1920                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1921                sk = sk_nulls_next(sk);
1922        }
1923get_sk:
1924        sk_nulls_for_each_from(sk, node) {
                    /* Sockets from other network namespaces are skipped. */
1925                if (!net_eq(sock_net(sk), net))
1926                        continue;
1927                if (sk->sk_family == st->family) {
1928                        cur = sk;
1929                        goto out;
1930                }
                    /*
                     * Family mismatch on the socket itself: still look at
                     * its request queue, whose entries are filtered by
                     * family separately in the OPENREQ loop above.
                     */
1931                icsk = inet_csk(sk);
1932                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1933                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1934start_req:
1935                        st->uid         = sock_i_uid(sk);
1936                        st->syn_wait_sk = sk;
1937                        st->state       = TCP_SEQ_STATE_OPENREQ;
1938                        st->sbucket     = 0;
1939                        goto get_req;
1940                }
1941                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1942        }
            /* Bucket exhausted: unlock it and try the next one, if any. */
1943        spin_unlock_bh(&ilb->lock);
1944        st->offset = 0;
1945        if (++st->bucket < INET_LHTABLE_SIZE) {
1946                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1947                spin_lock_bh(&ilb->lock);
1948                sk = sk_nulls_head(&ilb->head);
1949                goto get_sk;
1950        }
            /* Every listening bucket has been walked. */
1951        cur = NULL;
1952out:
1953        return cur;
1954}
1955
1956static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1957{
1958        struct tcp_iter_state *st = seq->private;
1959        void *rc;
1960
1961        st->bucket = 0;
1962        st->offset = 0;
1963        rc = listening_get_next(seq, NULL);
1964
1965        while (rc && *pos) {
1966                rc = listening_get_next(seq, rc);
1967                --*pos;
1968        }
1969        return rc;
1970}
1971
1972static inline bool empty_bucket(const struct tcp_iter_state *st)
1973{
1974        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1975}
1976
1977/*
1978 * Get first established socket starting from bucket given in st->bucket.
1979 * If st->bucket is zero, the very first socket in the hash is returned.
1980 */
1981static void *established_get_first(struct seq_file *seq)
1982{
1983        struct tcp_iter_state *st = seq->private;
1984        struct net *net = seq_file_net(seq);
1985        void *rc = NULL;
1986
1987        st->offset = 0;
1988        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1989                struct sock *sk;
1990                struct hlist_nulls_node *node;
1991                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1992
1993                /* Lockless fast path for the common case of empty buckets */
1994                if (empty_bucket(st))
1995                        continue;
1996
1997                spin_lock_bh(lock);
1998                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1999                        if (sk->sk_family != st->family ||
2000                            !net_eq(sock_net(sk), net)) {

2001                                continue;
2002                        }
2003                        rc = sk;
2004                        goto out;
2005                }
2006                spin_unlock_bh(lock);
2007        }
2008out:
2009        return rc;
2010}
2011
2012static void *established_get_next(struct seq_file *seq, void *cur)
2013{
2014        struct sock *sk = cur;
2015        struct hlist_nulls_node *node;
2016        struct tcp_iter_state *st = seq->private;
2017        struct net *net = seq_file_net(seq);
2018
2019        ++st->num;
2020        ++st->offset;
2021
2022        sk = sk_nulls_next(sk);
2023
2024        sk_nulls_for_each_from(sk, node) {
2025                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2026                        return sk;
2027        }
2028
2029        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2030        ++st->bucket;
2031        return established_get_first(seq);
2032}
2033
2034static void *established_get_idx(struct seq_file *seq, loff_t pos)
2035{
2036        struct tcp_iter_state *st = seq->private;
2037        void *rc;
2038
2039        st->bucket = 0;
2040        rc = established_get_first(seq);
2041
2042        while (rc && pos) {
2043                rc = established_get_next(seq, rc);
2044                --pos;
2045        }
2046        return rc;
2047}
2048
2049static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2050{
2051        void *rc;
2052        struct tcp_iter_state *st = seq->private;
2053
2054        st->state = TCP_SEQ_STATE_LISTENING;
2055        rc        = listening_get_idx(seq, &pos);
2056
2057        if (!rc) {
2058                st->state = TCP_SEQ_STATE_ESTABLISHED;
2059                rc        = established_get_idx(seq, pos);
2060        }
2061
2062        return rc;
2063}
2064
2065static void *tcp_seek_last_pos(struct seq_file *seq)
2066{
2067        struct tcp_iter_state *st = seq->private;
2068        int offset = st->offset;
2069        int orig_num = st->num;
2070        void *rc = NULL;
2071
2072        switch (st->state) {
2073        case TCP_SEQ_STATE_OPENREQ:
2074        case TCP_SEQ_STATE_LISTENING:
2075                if (st->bucket >= INET_LHTABLE_SIZE)
2076                        break;
2077                st->state = TCP_SEQ_STATE_LISTENING;
2078                rc = listening_get_next(seq, NULL);
2079                while (offset-- && rc)
2080                        rc = listening_get_next(seq, rc);
2081                if (rc)
2082                        break;
2083                st->bucket = 0;
2084                st->state = TCP_SEQ_STATE_ESTABLISHED;
2085                /* Fallthrough */
2086        case TCP_SEQ_STATE_ESTABLISHED:
2087                if (st->bucket > tcp_hashinfo.ehash_mask)
2088                        break;
2089                rc = established_get_first(seq);
2090                while (offset-- && rc)
2091                        rc = established_get_next(seq, rc);
2092        }
2093
2094        st->num = orig_num;
2095
2096        return rc;
2097}
2098
2099static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2100{
2101        struct tcp_iter_state *st = seq->private;
2102        void *rc;
2103
2104        if (*pos && *pos == st->last_pos) {
2105                rc = tcp_seek_last_pos(seq);
2106                if (rc)
2107                        goto out;
2108        }
2109
2110        st->state = TCP_SEQ_STATE_LISTENING;
2111        st->num = 0;
2112        st->bucket = 0;
2113        st->offset = 0;
2114        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2115
2116out:
2117        st->last_pos = *pos;
2118        return rc;
2119}
2120
2121static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2122{
2123        struct tcp_iter_state *st = seq->private;
2124        void *rc = NULL;
2125
2126        if (v == SEQ_START_TOKEN) {
2127                rc = tcp_get_idx(seq, 0);
2128                goto out;
2129        }
2130
2131        switch (st->state) {
2132        case TCP_SEQ_STATE_OPENREQ:
2133        case TCP_SEQ_STATE_LISTENING:
2134                rc = listening_get_next(seq, v);
2135                if (!rc) {
2136                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2137                        st->bucket = 0;
2138                        st->offset = 0;
2139                        rc        = established_get_first(seq);
2140                }
2141                break;
2142        case TCP_SEQ_STATE_ESTABLISHED:
2143                rc = established_get_next(seq, v);
2144                break;
2145        }
2146out:
2147        ++*pos;
2148        st->last_pos = *pos;
2149        return rc;
2150}
2151
2152static void tcp_seq_stop(struct seq_file *seq, void *v)
2153{
2154        struct tcp_iter_state *st = seq->private;
2155
2156        switch (st->state) {
2157        case TCP_SEQ_STATE_OPENREQ:
2158                if (v) {
2159                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2160                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2161                }
2162        case TCP_SEQ_STATE_LISTENING:
2163                if (v != SEQ_START_TOKEN)
2164                        spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2165                break;
2166        case TCP_SEQ_STATE_ESTABLISHED:
2167                if (v)
2168                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2169                break;
2170        }
2171}
2172
2173int tcp_seq_open(struct inode *inode, struct file *file)
2174{
2175        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2176        struct tcp_iter_state *s;
2177        int err;
2178
2179        err = seq_open_net(inode, file, &afinfo->seq_ops,
2180                          sizeof(struct tcp_iter_state));
2181        if (err < 0)
2182                return err;
2183
2184        s = ((struct seq_file *)file->private_data)->private;
2185        s->family               = afinfo->family;
2186        s->last_pos             = 0;
2187        return 0;
2188}
2189EXPORT_SYMBOL(tcp_seq_open);
2190
2191int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2192{
2193        int rc = 0;
2194        struct proc_dir_entry *p;
2195
2196        afinfo->seq_ops.start           = tcp_seq_start;
2197        afinfo->seq_ops.next            = tcp_seq_next;
2198        afinfo->seq_ops.stop            = tcp_seq_stop;
2199
2200        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2201                             afinfo->seq_fops, afinfo);
2202        if (!p)
2203                rc = -ENOMEM;
2204        return rc;
2205}
2206EXPORT_SYMBOL(tcp_proc_register);
2207
2208void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2209{
2210        remove_proc_entry(afinfo->name, net->proc_net);
2211}
2212EXPORT_SYMBOL(tcp_proc_unregister);
2213
2214static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2215                         struct seq_file *f, int i, kuid_t uid)
2216{
2217        const struct inet_request_sock *ireq = inet_rsk(req);
2218        long delta = req->expires - jiffies;
2219
2220        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2221                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2222                i,
2223                ireq->ir_loc_addr,
2224                ntohs(inet_sk(sk)->inet_sport),
2225                ireq->ir_rmt_addr,
2226                ntohs(ireq->ir_rmt_port),
2227                TCP_SYN_RECV,
2228                0, 0, /* could print option size, but that is af dependent. */
2229                1,    /* timers active (only the expire timer) */
2230                jiffies_delta_to_clock_t(delta),
2231                req->num_timeout,
2232                from_kuid_munged(seq_user_ns(f), uid),
2233                0,  /* non standard timer */
2234                0, /* open_requests have no inode */
2235                atomic_read(&sk->sk_refcnt),
2236                req);
2237}
2238
2239static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2240{
2241        int timer_active;
2242        unsigned long timer_expires;
2243        const struct tcp_sock *tp = tcp_sk(sk);
2244        const struct inet_connection_sock *icsk = inet_csk(sk);
2245        const struct inet_sock *inet = inet_sk(sk);
2246        struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2247        __be32 dest = inet->inet_daddr;
2248        __be32 src = inet->inet_rcv_saddr;
2249        __u16 destp = ntohs(inet->inet_dport);
2250        __u16 srcp = ntohs(inet->inet_sport);
2251        int rx_queue;
2252
2253        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2254            icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2255            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2256                timer_active    = 1;
2257                timer_expires   = icsk->icsk_timeout;
2258        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2259                timer_active    = 4;
2260                timer_expires   = icsk->icsk_timeout;
2261        } else if (timer_pending(&sk->sk_timer)) {
2262                timer_active    = 2;
2263                timer_expires   = sk->sk_timer.expires;
2264        } else {
2265                timer_active    = 0;
2266                timer_expires = jiffies;
2267        }
2268
2269        if (sk->sk_state == TCP_LISTEN)
2270                rx_queue = sk->sk_ack_backlog;
2271        else
2272                /*
2273                 * because we dont lock socket, we might find a transient negative value
2274                 */
2275                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2276
2277        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2278                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2279                i, src, srcp, dest, destp, sk->sk_state,
2280                tp->write_seq - tp->snd_una,
2281                rx_queue,
2282                timer_active,
2283                jiffies_delta_to_clock_t(timer_expires - jiffies),
2284                icsk->icsk_retransmits,
2285                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2286                icsk->icsk_probes_out,
2287                sock_i_ino(sk),
2288                atomic_read(&sk->sk_refcnt), sk,
2289                jiffies_to_clock_t(icsk->icsk_rto),
2290                jiffies_to_clock_t(icsk->icsk_ack.ato),
2291                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2292                tp->snd_cwnd,
2293                sk->sk_state == TCP_LISTEN ?
2294                    (fastopenq ? fastopenq->max_qlen : 0) :
2295                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2296}
2297
2298static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2299                               struct seq_file *f, int i)
2300{
2301        __be32 dest, src;
2302        __u16 destp, srcp;
2303        s32 delta = tw->tw_ttd - inet_tw_time_stamp();
2304
2305        dest  = tw->tw_daddr;
2306        src   = tw->tw_rcv_saddr;
2307        destp = ntohs(tw->tw_dport);
2308        srcp  = ntohs(tw->tw_sport);
2309
2310        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2311                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2312                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2313                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2314                atomic_read(&tw->tw_refcnt), tw);
2315}
2316
2317#define TMPSZ 150
2318
2319static int tcp4_seq_show(struct seq_file *seq, void *v)
2320{
2321        struct tcp_iter_state *st;
2322        struct sock *sk = v;
2323
2324        seq_setwidth(seq, TMPSZ - 1);
2325        if (v == SEQ_START_TOKEN) {
2326                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2327                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2328                           "inode");
2329                goto out;
2330        }
2331        st = seq->private;
2332
2333        switch (st->state) {
2334        case TCP_SEQ_STATE_LISTENING:
2335        case TCP_SEQ_STATE_ESTABLISHED:
2336                if (sk->sk_state == TCP_TIME_WAIT)
2337                        get_timewait4_sock(v, seq, st->num);
2338                else
2339                        get_tcp4_sock(v, seq, st->num);
2340                break;
2341        case TCP_SEQ_STATE_OPENREQ:
2342                get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
2343                break;
2344        }
2345out:
2346        seq_pad(seq, '\n');
2347        return 0;
2348}
2349
2350static const struct file_operations tcp_afinfo_seq_fops = {
2351        .owner   = THIS_MODULE,
2352        .open    = tcp_seq_open,
2353        .read    = seq_read,
2354        .llseek  = seq_lseek,
2355        .release = seq_release_net
2356};
2357
2358static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2359        .name           = "tcp",
2360        .family         = AF_INET,
2361        .seq_fops       = &tcp_afinfo_seq_fops,
2362        .seq_ops        = {
2363                .show           = tcp4_seq_show,
2364        },
2365};
2366
2367static int __net_init tcp4_proc_init_net(struct net *net)
2368{
2369        return tcp_proc_register(net, &tcp4_seq_afinfo);
2370}
2371
2372static void __net_exit tcp4_proc_exit_net(struct net *net)
2373{
2374        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2375}
2376
2377static struct pernet_operations tcp4_net_ops = {
2378        .init = tcp4_proc_init_net,
2379        .exit = tcp4_proc_exit_net,
2380};
2381
2382int __init tcp4_proc_init(void)
2383{
2384        return register_pernet_subsys(&tcp4_net_ops);
2385}
2386
2387void tcp4_proc_exit(void)
2388{
2389        unregister_pernet_subsys(&tcp4_net_ops);
2390}
2391#endif /* CONFIG_PROC_FS */
2392
2393struct proto tcp_prot = {
2394        .name                   = "TCP",
2395        .owner                  = THIS_MODULE,
2396        .close                  = tcp_close,
2397        .connect                = tcp_v4_connect,
2398        .disconnect             = tcp_disconnect,
2399        .accept                 = inet_csk_accept,
2400        .ioctl                  = tcp_ioctl,
2401        .init                   = tcp_v4_init_sock,
2402        .destroy                = tcp_v4_destroy_sock,
2403        .shutdown               = tcp_shutdown,
2404        .setsockopt             = tcp_setsockopt,
2405        .getsockopt             = tcp_getsockopt,
2406        .recvmsg                = tcp_recvmsg,
2407        .sendmsg                = tcp_sendmsg,
2408        .sendpage               = tcp_sendpage,
2409        .backlog_rcv            = tcp_v4_do_rcv,
2410        .release_cb             = tcp_release_cb,
2411        .hash                   = inet_hash,
2412        .unhash                 = inet_unhash,
2413        .get_port               = inet_csk_get_port,
2414        .enter_memory_pressure  = tcp_enter_memory_pressure,
2415        .stream_memory_free     = tcp_stream_memory_free,
2416        .sockets_allocated      = &tcp_sockets_allocated,
2417        .orphan_count           = &tcp_orphan_count,
2418        .memory_allocated       = &tcp_memory_allocated,
2419        .memory_pressure        = &tcp_memory_pressure,
2420        .sysctl_mem             = sysctl_tcp_mem,
2421        .sysctl_wmem            = sysctl_tcp_wmem,
2422        .sysctl_rmem            = sysctl_tcp_rmem,
2423        .max_header             = MAX_TCP_HEADER,
2424        .obj_size               = sizeof(struct tcp_sock),
2425        .slab_flags             = SLAB_DESTROY_BY_RCU,
2426        .twsk_prot              = &tcp_timewait_sock_ops,
2427        .rsk_prot               = &tcp_request_sock_ops,
2428        .h.hashinfo             = &tcp_hashinfo,
2429        .no_autobind            = true,
2430#ifdef CONFIG_COMPAT
2431        .compat_setsockopt      = compat_tcp_setsockopt,
2432        .compat_getsockopt      = compat_tcp_getsockopt,
2433#endif
2434#ifdef CONFIG_MEMCG_KMEM
2435        .init_cgroup            = tcp_init_cgroup,
2436        .destroy_cgroup         = tcp_destroy_cgroup,
2437        .proto_cgroup           = tcp_proto_cgroup,
2438#endif
2439};
2440EXPORT_SYMBOL(tcp_prot);
2441
2442static int __net_init tcp_sk_init(struct net *net)
2443{
2444        net->ipv4.sysctl_tcp_ecn = 2;
2445        return 0;
2446}
2447
2448static void __net_exit tcp_sk_exit(struct net *net)
2449{
2450}
2451
2452static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2453{
2454        inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2455}
2456
2457static struct pernet_operations __net_initdata tcp_sk_ops = {
2458       .init       = tcp_sk_init,
2459       .exit       = tcp_sk_exit,
2460       .exit_batch = tcp_sk_exit_batch,
2461};
2462
2463void __init tcp_v4_init(void)
2464{
2465        inet_hashinfo_init(&tcp_hashinfo);
2466        if (register_pernet_subsys(&tcp_sk_ops))
2467                panic("Failed to create the TCP control socket.\n");
2468}
2469