/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

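/* The initial sequence number and timestamp offset are derived from the
 * connection 4-tuple, so they are hard to predict off-path yet stable
 * for a given flow.
 */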
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

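/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection to the same 4-tuple.  With timestamps (PAWS) this is safe
 * for data integrity: the timestamp state cached in the TIME-WAIT
 * bucket is carried over, so the peer can tell the old and the new
 * incarnation of the connection apart.
 */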
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

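/* This will initiate an outgoing connection. */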
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

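/* An ICMP redirect was received for this flow: have the routing layer
 * update the cached destination, if the socket still holds one.
 */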
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);
			/* Wake people up to see the error
			 * (see connect in sock.c)
			 */
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		provided by the arriving segment, not on socket state.
 */
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr,
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr,
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

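/* Exact-match variant of the lookup above: address, family and prefix
 * length must all match, so add/del operate on a single entry.
 */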
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

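/* Feed the TCP pseudo-header (addresses, protocol, segment length) and a
 * copy of the TCP header with a zeroed checksum field into the MD5 hash,
 * as RFC 2385 requires.
 */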
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shame.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

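/* For a non-SYN segment that reached a listener (typically the final ACK
 * of a handshake whose request state was dropped under SYN flood), try
 * to validate it as a SYN cookie; on success a child socket is returned.
 */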
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles only
	 * as much as necessary to avoid that.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

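/* Early demux: look up an established socket already at the IP layer,
 * before routing, so the socket's cached rx dst can be reused and the
 * second hash lookup in tcp_v4_rcv() is avoided.
 */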
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

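/* Run the socket filter and, if it trimmed the packet, shrink end_seq so
 * the TCP sequence space stays consistent with the shortened payload.
 * The filter may not trim below the TCP header (th->doff * 4 bytes).
 */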
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into
	 * TCP_SKB_CB(); barrier() makes sure compiler won't play
	 * aliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		if (tcp_filter(sk, skb))
			goto discard_and_relse;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get next listener socket follow cur.  If cur is NULL, get first socket
 * starting from bucket given in st->bucket; when st->bucket is zero the
 * very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family	= afinfo->family;
	s->last_pos	= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start	= tcp_seq_start;
	afinfo->seq_ops.next	= tcp_seq_next;
	afinfo->seq_ops.stop	= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}

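/* Fixed width of one /proc/net/tcp record: seq_setwidth()/seq_pad()
 * below pad shorter rows so every line occupies the same number of bytes.
 */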
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

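/* Each netns keeps one kernel control socket per possible CPU; these are
 * what tcp_v4_send_reset()/tcp_v4_send_ack() use, via
 * ip_send_unicast_reply(), to emit replies without a full socket context.
 */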
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}