linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   49 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83#include <linux/inetdevice.h>
  84
  85#include <crypto/hash.h>
  86#include <linux/scatterlist.h>
  87
  88#include <trace/events/tcp.h>
  89
  90#ifdef CONFIG_TCP_MD5SIG
  91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93#endif
  94
  95struct inet_hashinfo tcp_hashinfo;
  96EXPORT_SYMBOL(tcp_hashinfo);
  97
  98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99{
 100        return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                              ip_hdr(skb)->saddr,
 102                              tcp_hdr(skb)->dest,
 103                              tcp_hdr(skb)->source);
 104}
 105
 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107{
 108        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109}
 110
 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112{
 113        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115        struct tcp_sock *tp = tcp_sk(sk);
 116        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
 118        if (reuse == 2) {
 119                /* Still does not detect *everything* that goes through
 120                 * lo, since we require a loopback src or dst address
 121                 * or direct binding to 'lo' interface.
 122                 */
 123                bool loopback = false;
 124                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125                        loopback = true;
 126#if IS_ENABLED(CONFIG_IPV6)
 127                if (tw->tw_family == AF_INET6) {
 128                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134                                loopback = true;
 135                } else
 136#endif
 137                {
 138                        if (ipv4_is_loopback(tw->tw_daddr) ||
 139                            ipv4_is_loopback(tw->tw_rcv_saddr))
 140                                loopback = true;
 141                }
 142                if (!loopback)
 143                        reuse = 0;
 144        }
 145
 146        /* With PAWS, it is safe from the viewpoint
 147           of data integrity. Even without PAWS it is safe provided sequence
 148           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 149
  150           Actually, the idea is close to VJ's one, only the timestamp cache is
  151           held not per host but per port pair, and the TW bucket is used as the
  152           state holder.
  153
  154           If the TW bucket has already been destroyed we fall back to VJ's scheme
  155           and use the initial timestamp retrieved from the peer table.
 156         */
 157        if (tcptw->tw_ts_recent_stamp &&
 158            (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 159                /* In case of repair and re-using TIME-WAIT sockets we still
 160                 * want to be sure that it is safe as above but honor the
 161                 * sequence numbers and time stamps set as part of the repair
 162                 * process.
 163                 *
 164                 * Without this check re-using a TIME-WAIT socket with TCP
 165                 * repair would accumulate a -1 on the repair assigned
 166                 * sequence number. The first time it is reused the sequence
 167                 * is -1, the second time -2, etc. This fixes that issue
 168                 * without appearing to create any others.
 169                 */
 170                if (likely(!tp->repair)) {
 171                        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 172                        if (tp->write_seq == 0)
 173                                tp->write_seq = 1;
 174                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 175                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 176                }
 177                sock_hold(sktw);
 178                return 1;
 179        }
 180
 181        return 0;
 182}
 183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
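
/*
 * For illustration (a sketch, not from the original file): the "reuse == 2"
 * branch above corresponds to the net.ipv4.tcp_tw_reuse sysctl, which for
 * this kernel generation is documented roughly as: 0 - never reuse TIME-WAIT
 * sockets for new outgoing connections, 1 - reuse when it is safe from the
 * protocol point of view, 2 - reuse only for loopback traffic (hence the
 * loopback checks in tcp_twsk_unique()). For example:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=2
 */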
 184
 185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 186                              int addr_len)
 187{
  188        /* This check is replicated from tcp_v4_connect() and intended to
  189         * prevent the BPF program called below from accessing bytes that
  190         * are out of the bounds specified by the user in addr_len.
  191         */
 192        if (addr_len < sizeof(struct sockaddr_in))
 193                return -EINVAL;
 194
 195        sock_owned_by_me(sk);
 196
 197        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 198}
 199
 200/* This will initiate an outgoing connection. */
 201int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 202{
 203        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 204        struct inet_sock *inet = inet_sk(sk);
 205        struct tcp_sock *tp = tcp_sk(sk);
 206        __be16 orig_sport, orig_dport;
 207        __be32 daddr, nexthop;
 208        struct flowi4 *fl4;
 209        struct rtable *rt;
 210        int err;
 211        struct ip_options_rcu *inet_opt;
 212        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 213
 214        if (addr_len < sizeof(struct sockaddr_in))
 215                return -EINVAL;
 216
 217        if (usin->sin_family != AF_INET)
 218                return -EAFNOSUPPORT;
 219
 220        nexthop = daddr = usin->sin_addr.s_addr;
 221        inet_opt = rcu_dereference_protected(inet->inet_opt,
 222                                             lockdep_sock_is_held(sk));
 223        if (inet_opt && inet_opt->opt.srr) {
 224                if (!daddr)
 225                        return -EINVAL;
 226                nexthop = inet_opt->opt.faddr;
 227        }
 228
 229        orig_sport = inet->inet_sport;
 230        orig_dport = usin->sin_port;
 231        fl4 = &inet->cork.fl.u.ip4;
 232        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 233                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 234                              IPPROTO_TCP,
 235                              orig_sport, orig_dport, sk);
 236        if (IS_ERR(rt)) {
 237                err = PTR_ERR(rt);
 238                if (err == -ENETUNREACH)
 239                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 240                return err;
 241        }
 242
 243        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 244                ip_rt_put(rt);
 245                return -ENETUNREACH;
 246        }
 247
 248        if (!inet_opt || !inet_opt->opt.srr)
 249                daddr = fl4->daddr;
 250
 251        if (!inet->inet_saddr)
 252                inet->inet_saddr = fl4->saddr;
 253        sk_rcv_saddr_set(sk, inet->inet_saddr);
 254
 255        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 256                /* Reset inherited state */
 257                tp->rx_opt.ts_recent       = 0;
 258                tp->rx_opt.ts_recent_stamp = 0;
 259                if (likely(!tp->repair))
 260                        tp->write_seq      = 0;
 261        }
 262
 263        inet->inet_dport = usin->sin_port;
 264        sk_daddr_set(sk, daddr);
 265
 266        inet_csk(sk)->icsk_ext_hdr_len = 0;
 267        if (inet_opt)
 268                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 269
 270        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 271
  272        /* Socket identity is still unknown (sport may be zero).
  273         * However we set the state to SYN-SENT and, without releasing the
  274         * socket lock, select a source port, enter ourselves into the hash
  275         * tables and complete initialization afterwards.
  276         */
 277        tcp_set_state(sk, TCP_SYN_SENT);
 278        err = inet_hash_connect(tcp_death_row, sk);
 279        if (err)
 280                goto failure;
 281
 282        sk_set_txhash(sk);
 283
 284        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 285                               inet->inet_sport, inet->inet_dport, sk);
 286        if (IS_ERR(rt)) {
 287                err = PTR_ERR(rt);
 288                rt = NULL;
 289                goto failure;
 290        }
 291        /* OK, now commit destination to socket.  */
 292        sk->sk_gso_type = SKB_GSO_TCPV4;
 293        sk_setup_caps(sk, &rt->dst);
 294        rt = NULL;
 295
 296        if (likely(!tp->repair)) {
 297                if (!tp->write_seq)
 298                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 299                                                       inet->inet_daddr,
 300                                                       inet->inet_sport,
 301                                                       usin->sin_port);
 302                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 303                                                 inet->inet_saddr,
 304                                                 inet->inet_daddr);
 305        }
 306
 307        inet->inet_id = tp->write_seq ^ jiffies;
 308
 309        if (tcp_fastopen_defer_connect(sk, &err))
 310                return err;
 311        if (err)
 312                goto failure;
 313
 314        err = tcp_connect(sk);
 315
 316        if (err)
 317                goto failure;
 318
 319        return 0;
 320
 321failure:
 322        /*
 323         * This unhashes the socket and releases the local port,
 324         * if necessary.
 325         */
 326        tcp_set_state(sk, TCP_CLOSE);
 327        ip_rt_put(rt);
 328        sk->sk_route_caps = 0;
 329        inet->inet_dport = 0;
 330        return err;
 331}
 332EXPORT_SYMBOL(tcp_v4_connect);
 333
 334/*
 335 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  336 * It can be called through tcp_release_cb() if the socket was owned by the
  337 * user at the time tcp_v4_err() was called to handle the ICMP message.
 338 */
 339void tcp_v4_mtu_reduced(struct sock *sk)
 340{
 341        struct inet_sock *inet = inet_sk(sk);
 342        struct dst_entry *dst;
 343        u32 mtu;
 344
 345        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 346                return;
 347        mtu = tcp_sk(sk)->mtu_info;
 348        dst = inet_csk_update_pmtu(sk, mtu);
 349        if (!dst)
 350                return;
 351
  352        /* Something is about to go wrong... Remember the soft error
  353         * in case this connection is not able to recover.
  354         */
 355        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 356                sk->sk_err_soft = EMSGSIZE;
 357
 358        mtu = dst_mtu(dst);
 359
 360        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 361            ip_sk_accept_pmtu(sk) &&
 362            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 363                tcp_sync_mss(sk, mtu);
 364
 365                /* Resend the TCP packet because it's
 366                 * clear that the old packet has been
 367                 * dropped. This is the new "fast" path mtu
 368                 * discovery.
 369                 */
 370                tcp_simple_retransmit(sk);
 371        } /* else let the usual retransmit timer handle it */
 372}
 373EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 374
 375static void do_redirect(struct sk_buff *skb, struct sock *sk)
 376{
 377        struct dst_entry *dst = __sk_dst_check(sk, 0);
 378
 379        if (dst)
 380                dst->ops->redirect(dst, sk, skb);
 381}
 382
 383
 384/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 385void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 386{
 387        struct request_sock *req = inet_reqsk(sk);
 388        struct net *net = sock_net(sk);
 389
 390        /* ICMPs are not backlogged, hence we cannot get
 391         * an established socket here.
 392         */
 393        if (seq != tcp_rsk(req)->snt_isn) {
 394                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 395        } else if (abort) {
 396                /*
 397                 * Still in SYN_RECV, just remove it silently.
 398                 * There is no good way to pass the error to the newly
 399                 * created socket, and POSIX does not want network
 400                 * errors returned from accept().
 401                 */
 402                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 403                tcp_listendrop(req->rsk_listener);
 404        }
 405        reqsk_put(req);
 406}
 407EXPORT_SYMBOL(tcp_req_err);
 408
 409/*
 410 * This routine is called by the ICMP module when it gets some
 411 * sort of error condition.  If err < 0 then the socket should
 412 * be closed and the error returned to the user.  If err > 0
 413 * it's just the icmp type << 8 | icmp code.  After adjustment
 414 * header points to the first 8 bytes of the tcp header.  We need
 415 * to find the appropriate port.
 416 *
 417 * The locking strategy used here is very "optimistic". When
 418 * someone else accesses the socket the ICMP is just dropped
 419 * and for some paths there is no check at all.
 420 * A more general error queue to queue errors for later handling
 421 * is probably better.
 422 *
 423 */
 424
 425void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 426{
 427        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 428        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 429        struct inet_connection_sock *icsk;
 430        struct tcp_sock *tp;
 431        struct inet_sock *inet;
 432        const int type = icmp_hdr(icmp_skb)->type;
 433        const int code = icmp_hdr(icmp_skb)->code;
 434        struct sock *sk;
 435        struct sk_buff *skb;
 436        struct request_sock *fastopen;
 437        u32 seq, snd_una;
 438        s32 remaining;
 439        u32 delta_us;
 440        int err;
 441        struct net *net = dev_net(icmp_skb->dev);
 442
 443        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 444                                       th->dest, iph->saddr, ntohs(th->source),
 445                                       inet_iif(icmp_skb), 0);
 446        if (!sk) {
 447                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 448                return;
 449        }
 450        if (sk->sk_state == TCP_TIME_WAIT) {
 451                inet_twsk_put(inet_twsk(sk));
 452                return;
 453        }
 454        seq = ntohl(th->seq);
 455        if (sk->sk_state == TCP_NEW_SYN_RECV)
 456                return tcp_req_err(sk, seq,
 457                                  type == ICMP_PARAMETERPROB ||
 458                                  type == ICMP_TIME_EXCEEDED ||
 459                                  (type == ICMP_DEST_UNREACH &&
 460                                   (code == ICMP_NET_UNREACH ||
 461                                    code == ICMP_HOST_UNREACH)));
 462
 463        bh_lock_sock(sk);
 464        /* If too many ICMPs get dropped on busy
 465         * servers this needs to be solved differently.
  466         * We do take care of the PMTU discovery (RFC1191) special case:
  467         * we can receive locally generated ICMP messages while the socket is held.
 468         */
 469        if (sock_owned_by_user(sk)) {
 470                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 471                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 472        }
 473        if (sk->sk_state == TCP_CLOSE)
 474                goto out;
 475
 476        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 477                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 478                goto out;
 479        }
 480
 481        icsk = inet_csk(sk);
 482        tp = tcp_sk(sk);
  483        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 484        fastopen = tp->fastopen_rsk;
 485        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 486        if (sk->sk_state != TCP_LISTEN &&
 487            !between(seq, snd_una, tp->snd_nxt)) {
 488                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 489                goto out;
 490        }
 491
 492        switch (type) {
 493        case ICMP_REDIRECT:
 494                if (!sock_owned_by_user(sk))
 495                        do_redirect(icmp_skb, sk);
 496                goto out;
 497        case ICMP_SOURCE_QUENCH:
 498                /* Just silently ignore these. */
 499                goto out;
 500        case ICMP_PARAMETERPROB:
 501                err = EPROTO;
 502                break;
 503        case ICMP_DEST_UNREACH:
 504                if (code > NR_ICMP_UNREACH)
 505                        goto out;
 506
 507                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 508                        /* We are not interested in TCP_LISTEN and open_requests
  509                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 510                         * they should go through unfragmented).
 511                         */
 512                        if (sk->sk_state == TCP_LISTEN)
 513                                goto out;
 514
 515                        tp->mtu_info = info;
 516                        if (!sock_owned_by_user(sk)) {
 517                                tcp_v4_mtu_reduced(sk);
 518                        } else {
 519                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 520                                        sock_hold(sk);
 521                        }
 522                        goto out;
 523                }
 524
 525                err = icmp_err_convert[code].errno;
 526                /* check if icmp_skb allows revert of backoff
 527                 * (see draft-zimmermann-tcp-lcd) */
 528                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 529                        break;
 530                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 531                    !icsk->icsk_backoff || fastopen)
 532                        break;
 533
 534                if (sock_owned_by_user(sk))
 535                        break;
 536
 537                icsk->icsk_backoff--;
 538                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 539                                               TCP_TIMEOUT_INIT;
 540                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 541
 542                skb = tcp_rtx_queue_head(sk);
 543                BUG_ON(!skb);
 544
 545                tcp_mstamp_refresh(tp);
 546                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
 547                remaining = icsk->icsk_rto -
 548                            usecs_to_jiffies(delta_us);
 549
 550                if (remaining > 0) {
 551                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 552                                                  remaining, TCP_RTO_MAX);
 553                } else {
 554                        /* RTO revert clocked out retransmission.
 555                         * Will retransmit now */
 556                        tcp_retransmit_timer(sk);
 557                }
 558
 559                break;
 560        case ICMP_TIME_EXCEEDED:
 561                err = EHOSTUNREACH;
 562                break;
 563        default:
 564                goto out;
 565        }
 566
 567        switch (sk->sk_state) {
 568        case TCP_SYN_SENT:
 569        case TCP_SYN_RECV:
 570                /* Only in fast or simultaneous open. If a fast open socket is
  571                 * already accepted it is treated as a connected one below.
 572                 */
 573                if (fastopen && !fastopen->sk)
 574                        break;
 575
 576                if (!sock_owned_by_user(sk)) {
 577                        sk->sk_err = err;
 578
 579                        sk->sk_error_report(sk);
 580
 581                        tcp_done(sk);
 582                } else {
 583                        sk->sk_err_soft = err;
 584                }
 585                goto out;
 586        }
 587
 588        /* If we've already connected we will keep trying
 589         * until we time out, or the user gives up.
 590         *
  591         * rfc1122 4.2.3.9 allows us to consider as hard errors
  592         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  593         * but it is obsoleted by pmtu discovery).
  594         *
  595         * Note that in the modern internet, where routing is unreliable
  596         * and broken firewalls sit in every dark corner sending random
  597         * errors ordered by their masters, even these two messages finally
  598         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 599         *
 600         * Now we are in compliance with RFCs.
 601         *                                                      --ANK (980905)
 602         */
 603
 604        inet = inet_sk(sk);
 605        if (!sock_owned_by_user(sk) && inet->recverr) {
 606                sk->sk_err = err;
 607                sk->sk_error_report(sk);
 608        } else  { /* Only an error on timeout */
 609                sk->sk_err_soft = err;
 610        }
 611
 612out:
 613        bh_unlock_sock(sk);
 614        sock_put(sk);
 615}
 616
 617void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 618{
 619        struct tcphdr *th = tcp_hdr(skb);
 620
 621        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 622        skb->csum_start = skb_transport_header(skb) - skb->head;
 623        skb->csum_offset = offsetof(struct tcphdr, check);
 624}
 625
 626/* This routine computes an IPv4 TCP checksum. */
 627void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 628{
 629        const struct inet_sock *inet = inet_sk(sk);
 630
 631        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 632}
 633EXPORT_SYMBOL(tcp_v4_send_check);
 634
 635/*
 636 *      This routine will send an RST to the other tcp.
 637 *
  638 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  639 *                    for the reset?
  640 *      Answer: if a packet caused an RST, it is not for a socket
  641 *              existing in our system; if it is matched to a socket,
  642 *              it is just a duplicate segment or a bug in the other side's TCP.
  643 *              So we build the reply based only on parameters that
  644 *              arrived with the segment.
 645 *      Exception: precedence violation. We do not implement it in any case.
 646 */
 647
 648static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 649{
 650        const struct tcphdr *th = tcp_hdr(skb);
 651        struct {
 652                struct tcphdr th;
 653#ifdef CONFIG_TCP_MD5SIG
 654                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 655#endif
 656        } rep;
 657        struct ip_reply_arg arg;
 658#ifdef CONFIG_TCP_MD5SIG
 659        struct tcp_md5sig_key *key = NULL;
 660        const __u8 *hash_location = NULL;
 661        unsigned char newhash[16];
 662        int genhash;
 663        struct sock *sk1 = NULL;
 664#endif
 665        struct net *net;
 666        struct sock *ctl_sk;
 667
 668        /* Never send a reset in response to a reset. */
 669        if (th->rst)
 670                return;
 671
  672        /* If sk is not NULL, it means we did a successful lookup and the
  673         * incoming route had to be correct. prequeue might have dropped our dst.
 674         */
 675        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 676                return;
 677
 678        /* Swap the send and the receive. */
 679        memset(&rep, 0, sizeof(rep));
 680        rep.th.dest   = th->source;
 681        rep.th.source = th->dest;
 682        rep.th.doff   = sizeof(struct tcphdr) / 4;
 683        rep.th.rst    = 1;
 684
 685        if (th->ack) {
 686                rep.th.seq = th->ack_seq;
 687        } else {
 688                rep.th.ack = 1;
 689                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 690                                       skb->len - (th->doff << 2));
 691        }
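        /* For example: a bare incoming SYN with seq = 1000 has th->syn = 1,
         * th->fin = 0 and no payload (skb->len == th->doff << 2), so the RST
         * acknowledges 1000 + 1 = 1001, exactly the sequence space the
         * segment consumed.
         */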
 692
 693        memset(&arg, 0, sizeof(arg));
 694        arg.iov[0].iov_base = (unsigned char *)&rep;
 695        arg.iov[0].iov_len  = sizeof(rep.th);
 696
 697        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 698#ifdef CONFIG_TCP_MD5SIG
 699        rcu_read_lock();
 700        hash_location = tcp_parse_md5sig_option(th);
 701        if (sk && sk_fullsock(sk)) {
 702                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 703                                        &ip_hdr(skb)->saddr, AF_INET);
 704        } else if (hash_location) {
 705                /*
  706                 * The active side is lost. Try to find the listening socket through
  707                 * the source port, and then find the md5 key through the listening
  708                 * socket. We do not lose security here:
  709                 * the incoming packet is checked against the md5 hash of the found
  710                 * key, and no RST is generated if the md5 hash doesn't match.
 711                 */
 712                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 713                                             ip_hdr(skb)->saddr,
 714                                             th->source, ip_hdr(skb)->daddr,
 715                                             ntohs(th->source), inet_iif(skb),
 716                                             tcp_v4_sdif(skb));
  717                /* don't send an rst if we can't find the key */
 718                if (!sk1)
 719                        goto out;
 720
 721                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 722                                        &ip_hdr(skb)->saddr, AF_INET);
 723                if (!key)
 724                        goto out;
 725
 726
 727                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 728                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 729                        goto out;
 730
 731        }
 732
 733        if (key) {
 734                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 735                                   (TCPOPT_NOP << 16) |
 736                                   (TCPOPT_MD5SIG << 8) |
 737                                   TCPOLEN_MD5SIG);
 738                /* Update length and the length the header thinks exists */
 739                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 740                rep.th.doff = arg.iov[0].iov_len / 4;
 741
 742                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 743                                     key, ip_hdr(skb)->saddr,
 744                                     ip_hdr(skb)->daddr, &rep.th);
 745        }
 746#endif
 747        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 748                                      ip_hdr(skb)->saddr, /* XXX */
 749                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 750        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 751        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 752
  753        /* When the socket is gone, all binding information is lost and
  754         * routing might fail in this case. No choice here: if we choose to force
  755         * the input interface, we will misroute in case of an asymmetric route.
 756         */
 757        if (sk) {
 758                arg.bound_dev_if = sk->sk_bound_dev_if;
 759                if (sk_fullsock(sk))
 760                        trace_tcp_send_reset(sk, skb);
 761        }
 762
 763        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 764                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 765
 766        arg.tos = ip_hdr(skb)->tos;
 767        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 768        local_bh_disable();
 769        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 770        if (sk)
 771                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 772                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 773        ip_send_unicast_reply(ctl_sk,
 774                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 775                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 776                              &arg, arg.iov[0].iov_len);
 777
 778        ctl_sk->sk_mark = 0;
 779        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 780        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 781        local_bh_enable();
 782
 783#ifdef CONFIG_TCP_MD5SIG
 784out:
 785        rcu_read_unlock();
 786#endif
 787}
 788
  789/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  790   outside of socket context, is certainly ugly. What can I do?
 791 */
 792
 793static void tcp_v4_send_ack(const struct sock *sk,
 794                            struct sk_buff *skb, u32 seq, u32 ack,
 795                            u32 win, u32 tsval, u32 tsecr, int oif,
 796                            struct tcp_md5sig_key *key,
 797                            int reply_flags, u8 tos)
 798{
 799        const struct tcphdr *th = tcp_hdr(skb);
 800        struct {
 801                struct tcphdr th;
 802                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 803#ifdef CONFIG_TCP_MD5SIG
 804                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 805#endif
 806                        ];
 807        } rep;
 808        struct net *net = sock_net(sk);
 809        struct ip_reply_arg arg;
 810        struct sock *ctl_sk;
 811
 812        memset(&rep.th, 0, sizeof(struct tcphdr));
 813        memset(&arg, 0, sizeof(arg));
 814
 815        arg.iov[0].iov_base = (unsigned char *)&rep;
 816        arg.iov[0].iov_len  = sizeof(rep.th);
 817        if (tsecr) {
 818                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 819                                   (TCPOPT_TIMESTAMP << 8) |
 820                                   TCPOLEN_TIMESTAMP);
 821                rep.opt[1] = htonl(tsval);
 822                rep.opt[2] = htonl(tsecr);
 823                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 824        }
 825
 826        /* Swap the send and the receive. */
 827        rep.th.dest    = th->source;
 828        rep.th.source  = th->dest;
 829        rep.th.doff    = arg.iov[0].iov_len / 4;
 830        rep.th.seq     = htonl(seq);
 831        rep.th.ack_seq = htonl(ack);
 832        rep.th.ack     = 1;
 833        rep.th.window  = htons(win);
 834
 835#ifdef CONFIG_TCP_MD5SIG
 836        if (key) {
 837                int offset = (tsecr) ? 3 : 0;
 838
 839                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 840                                          (TCPOPT_NOP << 16) |
 841                                          (TCPOPT_MD5SIG << 8) |
 842                                          TCPOLEN_MD5SIG);
 843                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 844                rep.th.doff = arg.iov[0].iov_len/4;
 845
 846                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 847                                    key, ip_hdr(skb)->saddr,
 848                                    ip_hdr(skb)->daddr, &rep.th);
 849        }
 850#endif
 851        arg.flags = reply_flags;
 852        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 853                                      ip_hdr(skb)->saddr, /* XXX */
 854                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 855        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 856        if (oif)
 857                arg.bound_dev_if = oif;
 858        arg.tos = tos;
 859        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 860        local_bh_disable();
 861        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 862        if (sk)
 863                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 864                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 865        ip_send_unicast_reply(ctl_sk,
 866                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 867                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 868                              &arg, arg.iov[0].iov_len);
 869
 870        ctl_sk->sk_mark = 0;
 871        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 872        local_bh_enable();
 873}
 874
 875static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 876{
 877        struct inet_timewait_sock *tw = inet_twsk(sk);
 878        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 879
 880        tcp_v4_send_ack(sk, skb,
 881                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 882                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 883                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 884                        tcptw->tw_ts_recent,
 885                        tw->tw_bound_dev_if,
 886                        tcp_twsk_md5_key(tcptw),
 887                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 888                        tw->tw_tos
 889                        );
 890
 891        inet_twsk_put(tw);
 892}
 893
 894static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 895                                  struct request_sock *req)
 896{
 897        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 898         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 899         */
 900        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 901                                             tcp_sk(sk)->snd_nxt;
 902
 903        /* RFC 7323 2.3
 904         * The window field (SEG.WND) of every outgoing segment, with the
 905         * exception of <SYN> segments, MUST be right-shifted by
 906         * Rcv.Wind.Shift bits:
 907         */
 908        tcp_v4_send_ack(sk, skb, seq,
 909                        tcp_rsk(req)->rcv_nxt,
 910                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 911                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 912                        req->ts_recent,
 913                        0,
 914                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 915                                          AF_INET),
 916                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 917                        ip_hdr(skb)->tos);
 918}
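
/*
 * Worked example for the RFC 7323 shift above (illustrative, not from the
 * original file): with req->rsk_rcv_wnd = 65535 and rcv_wscale = 7, the
 * window field placed in the ACK is 65535 >> 7 = 511; the peer scales it
 * back up by the same factor, per the exchanged window scale option.
 */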
 919
 920/*
 921 *      Send a SYN-ACK after having received a SYN.
 922 *      This still operates on a request_sock only, not on a big
 923 *      socket.
 924 */
 925static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 926                              struct flowi *fl,
 927                              struct request_sock *req,
 928                              struct tcp_fastopen_cookie *foc,
 929                              enum tcp_synack_type synack_type)
 930{
 931        const struct inet_request_sock *ireq = inet_rsk(req);
 932        struct flowi4 fl4;
 933        int err = -1;
 934        struct sk_buff *skb;
 935
 936        /* First, grab a route. */
 937        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 938                return -1;
 939
 940        skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 941
 942        if (skb) {
 943                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 944
 945                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 946                                            ireq->ir_rmt_addr,
 947                                            ireq_opt_deref(ireq));
 948                err = net_xmit_eval(err);
 949        }
 950
 951        return err;
 952}
 953
 954/*
 955 *      IPv4 request_sock destructor.
 956 */
 957static void tcp_v4_reqsk_destructor(struct request_sock *req)
 958{
 959        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 960}
 961
 962#ifdef CONFIG_TCP_MD5SIG
 963/*
 964 * RFC2385 MD5 checksumming requires a mapping of
 965 * IP address->MD5 Key.
 966 * We need to maintain these in the sk structure.
 967 */
 968
 969/* Find the Key structure for an address.  */
 970struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 971                                         const union tcp_md5_addr *addr,
 972                                         int family)
 973{
 974        const struct tcp_sock *tp = tcp_sk(sk);
 975        struct tcp_md5sig_key *key;
 976        const struct tcp_md5sig_info *md5sig;
 977        __be32 mask;
 978        struct tcp_md5sig_key *best_match = NULL;
 979        bool match;
 980
 981        /* caller either holds rcu_read_lock() or socket lock */
 982        md5sig = rcu_dereference_check(tp->md5sig_info,
 983                                       lockdep_sock_is_held(sk));
 984        if (!md5sig)
 985                return NULL;
 986
 987        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 988                if (key->family != family)
 989                        continue;
 990
 991                if (family == AF_INET) {
 992                        mask = inet_make_mask(key->prefixlen);
 993                        match = (key->addr.a4.s_addr & mask) ==
 994                                (addr->a4.s_addr & mask);
 995#if IS_ENABLED(CONFIG_IPV6)
 996                } else if (family == AF_INET6) {
 997                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
 998                                                  key->prefixlen);
 999#endif
1000                } else {
1001                        match = false;
1002                }
1003
1004                if (match && (!best_match ||
1005                              key->prefixlen > best_match->prefixlen))
1006                        best_match = key;
1007        }
1008        return best_match;
1009}
1010EXPORT_SYMBOL(tcp_md5_do_lookup);
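
/*
 * For illustration (not from the original file): the lookup above is a
 * longest-prefix match. With one key installed for 10.0.0.0/8 and another
 * for 10.1.2.3/32, a lookup for peer 10.1.2.3 matches both, and the /32 key
 * is returned because its prefixlen is larger.
 */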
1011
1012static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1013                                                      const union tcp_md5_addr *addr,
1014                                                      int family, u8 prefixlen)
1015{
1016        const struct tcp_sock *tp = tcp_sk(sk);
1017        struct tcp_md5sig_key *key;
1018        unsigned int size = sizeof(struct in_addr);
1019        const struct tcp_md5sig_info *md5sig;
1020
1021        /* caller either holds rcu_read_lock() or socket lock */
1022        md5sig = rcu_dereference_check(tp->md5sig_info,
1023                                       lockdep_sock_is_held(sk));
1024        if (!md5sig)
1025                return NULL;
1026#if IS_ENABLED(CONFIG_IPV6)
1027        if (family == AF_INET6)
1028                size = sizeof(struct in6_addr);
1029#endif
1030        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1031                if (key->family != family)
1032                        continue;
1033                if (!memcmp(&key->addr, addr, size) &&
1034                    key->prefixlen == prefixlen)
1035                        return key;
1036        }
1037        return NULL;
1038}
1039
1040struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1041                                         const struct sock *addr_sk)
1042{
1043        const union tcp_md5_addr *addr;
1044
1045        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1046        return tcp_md5_do_lookup(sk, addr, AF_INET);
1047}
1048EXPORT_SYMBOL(tcp_v4_md5_lookup);
1049
1050/* This can be called on a newly created socket, from other files */
1051int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1052                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1053                   gfp_t gfp)
1054{
1055        /* Add Key to the list */
1056        struct tcp_md5sig_key *key;
1057        struct tcp_sock *tp = tcp_sk(sk);
1058        struct tcp_md5sig_info *md5sig;
1059
1060        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1061        if (key) {
1062                /* Pre-existing entry - just update that one. */
1063                memcpy(key->key, newkey, newkeylen);
1064                key->keylen = newkeylen;
1065                return 0;
1066        }
1067
1068        md5sig = rcu_dereference_protected(tp->md5sig_info,
1069                                           lockdep_sock_is_held(sk));
1070        if (!md5sig) {
1071                md5sig = kmalloc(sizeof(*md5sig), gfp);
1072                if (!md5sig)
1073                        return -ENOMEM;
1074
1075                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1076                INIT_HLIST_HEAD(&md5sig->head);
1077                rcu_assign_pointer(tp->md5sig_info, md5sig);
1078        }
1079
1080        key = sock_kmalloc(sk, sizeof(*key), gfp);
1081        if (!key)
1082                return -ENOMEM;
1083        if (!tcp_alloc_md5sig_pool()) {
1084                sock_kfree_s(sk, key, sizeof(*key));
1085                return -ENOMEM;
1086        }
1087
1088        memcpy(key->key, newkey, newkeylen);
1089        key->keylen = newkeylen;
1090        key->family = family;
1091        key->prefixlen = prefixlen;
1092        memcpy(&key->addr, addr,
1093               (family == AF_INET6) ? sizeof(struct in6_addr) :
1094                                      sizeof(struct in_addr));
1095        hlist_add_head_rcu(&key->node, &md5sig->head);
1096        return 0;
1097}
1098EXPORT_SYMBOL(tcp_md5_do_add);
1099
1100int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1101                   u8 prefixlen)
1102{
1103        struct tcp_md5sig_key *key;
1104
1105        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1106        if (!key)
1107                return -ENOENT;
1108        hlist_del_rcu(&key->node);
1109        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1110        kfree_rcu(key, rcu);
1111        return 0;
1112}
1113EXPORT_SYMBOL(tcp_md5_do_del);
1114
1115static void tcp_clear_md5_list(struct sock *sk)
1116{
1117        struct tcp_sock *tp = tcp_sk(sk);
1118        struct tcp_md5sig_key *key;
1119        struct hlist_node *n;
1120        struct tcp_md5sig_info *md5sig;
1121
1122        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1123
1124        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1125                hlist_del_rcu(&key->node);
1126                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1127                kfree_rcu(key, rcu);
1128        }
1129}
1130
1131static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1132                                 char __user *optval, int optlen)
1133{
1134        struct tcp_md5sig cmd;
1135        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1136        u8 prefixlen = 32;
1137
1138        if (optlen < sizeof(cmd))
1139                return -EINVAL;
1140
1141        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1142                return -EFAULT;
1143
1144        if (sin->sin_family != AF_INET)
1145                return -EINVAL;
1146
1147        if (optname == TCP_MD5SIG_EXT &&
1148            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1149                prefixlen = cmd.tcpm_prefixlen;
1150                if (prefixlen > 32)
1151                        return -EINVAL;
1152        }
1153
1154        if (!cmd.tcpm_keylen)
1155                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1156                                      AF_INET, prefixlen);
1157
1158        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1159                return -EINVAL;
1160
1161        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1162                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1163                              GFP_KERNEL);
1164}
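
/*
 * Illustrative userspace sketch of the interface parsed above (an assumption
 * of typical usage, not from the original file), relying on struct tcp_md5sig
 * and TCP_MD5SIG from <linux/tcp.h>:
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)))
 *		perror("setsockopt(TCP_MD5SIG)");
 */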
1165
1166static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1167                                   __be32 daddr, __be32 saddr,
1168                                   const struct tcphdr *th, int nbytes)
1169{
1170        struct tcp4_pseudohdr *bp;
1171        struct scatterlist sg;
1172        struct tcphdr *_th;
1173
1174        bp = hp->scratch;
1175        bp->saddr = saddr;
1176        bp->daddr = daddr;
1177        bp->pad = 0;
1178        bp->protocol = IPPROTO_TCP;
1179        bp->len = cpu_to_be16(nbytes);
1180
1181        _th = (struct tcphdr *)(bp + 1);
1182        memcpy(_th, th, sizeof(*th));
1183        _th->check = 0;
1184
1185        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1186        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1187                                sizeof(*bp) + sizeof(*th));
1188        return crypto_ahash_update(hp->md5_req);
1189}
1190
1191static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1192                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1193{
1194        struct tcp_md5sig_pool *hp;
1195        struct ahash_request *req;
1196
1197        hp = tcp_get_md5sig_pool();
1198        if (!hp)
1199                goto clear_hash_noput;
1200        req = hp->md5_req;
1201
1202        if (crypto_ahash_init(req))
1203                goto clear_hash;
1204        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1205                goto clear_hash;
1206        if (tcp_md5_hash_key(hp, key))
1207                goto clear_hash;
1208        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1209        if (crypto_ahash_final(req))
1210                goto clear_hash;
1211
1212        tcp_put_md5sig_pool();
1213        return 0;
1214
1215clear_hash:
1216        tcp_put_md5sig_pool();
1217clear_hash_noput:
1218        memset(md5_hash, 0, 16);
1219        return 1;
1220}
1221
1222int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1223                        const struct sock *sk,
1224                        const struct sk_buff *skb)
1225{
1226        struct tcp_md5sig_pool *hp;
1227        struct ahash_request *req;
1228        const struct tcphdr *th = tcp_hdr(skb);
1229        __be32 saddr, daddr;
1230
1231        if (sk) { /* valid for establish/request sockets */
1232                saddr = sk->sk_rcv_saddr;
1233                daddr = sk->sk_daddr;
1234        } else {
1235                const struct iphdr *iph = ip_hdr(skb);
1236                saddr = iph->saddr;
1237                daddr = iph->daddr;
1238        }
1239
1240        hp = tcp_get_md5sig_pool();
1241        if (!hp)
1242                goto clear_hash_noput;
1243        req = hp->md5_req;
1244
1245        if (crypto_ahash_init(req))
1246                goto clear_hash;
1247
1248        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1249                goto clear_hash;
1250        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1251                goto clear_hash;
1252        if (tcp_md5_hash_key(hp, key))
1253                goto clear_hash;
1254        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1255        if (crypto_ahash_final(req))
1256                goto clear_hash;
1257
1258        tcp_put_md5sig_pool();
1259        return 0;
1260
1261clear_hash:
1262        tcp_put_md5sig_pool();
1263clear_hash_noput:
1264        memset(md5_hash, 0, 16);
1265        return 1;
1266}
1267EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1268
1269#endif
1270
1271/* Called with rcu_read_lock() */
1272static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1273                                    const struct sk_buff *skb)
1274{
1275#ifdef CONFIG_TCP_MD5SIG
1276        /*
1277         * This gets called for each TCP segment that arrives
1278         * so we want to be efficient.
1279         * We have 3 drop cases:
1280         * o No MD5 hash and one expected.
1281         * o MD5 hash and we're not expecting one.
 1282         * o MD5 hash and it's wrong.
1283         */
1284        const __u8 *hash_location = NULL;
1285        struct tcp_md5sig_key *hash_expected;
1286        const struct iphdr *iph = ip_hdr(skb);
1287        const struct tcphdr *th = tcp_hdr(skb);
1288        int genhash;
1289        unsigned char newhash[16];
1290
1291        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1292                                          AF_INET);
1293        hash_location = tcp_parse_md5sig_option(th);
1294
1295        /* We've parsed the options - do we have a hash? */
1296        if (!hash_expected && !hash_location)
1297                return false;
1298
1299        if (hash_expected && !hash_location) {
1300                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1301                return true;
1302        }
1303
1304        if (!hash_expected && hash_location) {
1305                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1306                return true;
1307        }
1308
1309        /* Okay, so this is hash_expected and hash_location -
1310         * so we need to calculate the checksum.
1311         */
1312        genhash = tcp_v4_md5_hash_skb(newhash,
1313                                      hash_expected,
1314                                      NULL, skb);
1315
1316        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1317                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1318                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1319                                     &iph->saddr, ntohs(th->source),
1320                                     &iph->daddr, ntohs(th->dest),
1321                                     genhash ? " tcp_v4_calc_md5_hash failed"
1322                                     : "");
1323                return true;
1324        }
1325        return false;
1326#endif
1327        return false;
1328}
1329
1330static void tcp_v4_init_req(struct request_sock *req,
1331                            const struct sock *sk_listener,
1332                            struct sk_buff *skb)
1333{
1334        struct inet_request_sock *ireq = inet_rsk(req);
1335        struct net *net = sock_net(sk_listener);
1336
1337        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1338        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1339        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1340}
1341
1342static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1343                                          struct flowi *fl,
1344                                          const struct request_sock *req)
1345{
1346        return inet_csk_route_req(sk, &fl->u.ip4, req);
1347}
1348
1349struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1350        .family         =       PF_INET,
1351        .obj_size       =       sizeof(struct tcp_request_sock),
1352        .rtx_syn_ack    =       tcp_rtx_synack,
1353        .send_ack       =       tcp_v4_reqsk_send_ack,
1354        .destructor     =       tcp_v4_reqsk_destructor,
1355        .send_reset     =       tcp_v4_send_reset,
1356        .syn_ack_timeout =      tcp_syn_ack_timeout,
1357};
1358
1359static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1360        .mss_clamp      =       TCP_MSS_DEFAULT,
1361#ifdef CONFIG_TCP_MD5SIG
1362        .req_md5_lookup =       tcp_v4_md5_lookup,
1363        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1364#endif
1365        .init_req       =       tcp_v4_init_req,
1366#ifdef CONFIG_SYN_COOKIES
1367        .cookie_init_seq =      cookie_v4_init_sequence,
1368#endif
1369        .route_req      =       tcp_v4_route_req,
1370        .init_seq       =       tcp_v4_init_seq,
1371        .init_ts_off    =       tcp_v4_init_ts_off,
1372        .send_synack    =       tcp_v4_send_synack,
1373};
1374
1375int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1376{
 1377        /* Never answer SYNs sent to broadcast or multicast */
1378        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1379                goto drop;
1380
1381        return tcp_conn_request(&tcp_request_sock_ops,
1382                                &tcp_request_sock_ipv4_ops, sk, skb);
1383
1384drop:
1385        tcp_listendrop(sk);
1386        return 0;
1387}
1388EXPORT_SYMBOL(tcp_v4_conn_request);
1389
1390
1391/*
1392 * The three way handshake has completed - we got a valid synack -
1393 * now create the new socket.
1394 */
1395struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1396                                  struct request_sock *req,
1397                                  struct dst_entry *dst,
1398                                  struct request_sock *req_unhash,
1399                                  bool *own_req)
1400{
1401        struct inet_request_sock *ireq;
1402        struct inet_sock *newinet;
1403        struct tcp_sock *newtp;
1404        struct sock *newsk;
1405#ifdef CONFIG_TCP_MD5SIG
1406        struct tcp_md5sig_key *key;
1407#endif
1408        struct ip_options_rcu *inet_opt;
1409
1410        if (sk_acceptq_is_full(sk))
1411                goto exit_overflow;
1412
1413        newsk = tcp_create_openreq_child(sk, req, skb);
1414        if (!newsk)
1415                goto exit_nonewsk;
1416
1417        newsk->sk_gso_type = SKB_GSO_TCPV4;
1418        inet_sk_rx_dst_set(newsk, skb);
1419
1420        newtp                 = tcp_sk(newsk);
1421        newinet               = inet_sk(newsk);
1422        ireq                  = inet_rsk(req);
1423        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1424        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1425        newsk->sk_bound_dev_if = ireq->ir_iif;
1426        newinet->inet_saddr   = ireq->ir_loc_addr;
1427        inet_opt              = rcu_dereference(ireq->ireq_opt);
1428        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1429        newinet->mc_index     = inet_iif(skb);
1430        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1431        newinet->rcv_tos      = ip_hdr(skb)->tos;
1432        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1433        if (inet_opt)
1434                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1435        newinet->inet_id = newtp->write_seq ^ jiffies;
1436
1437        if (!dst) {
1438                dst = inet_csk_route_child_sock(sk, newsk, req);
1439                if (!dst)
1440                        goto put_and_exit;
1441        } else {
1442                /* syncookie case: see end of cookie_v4_check() */
1443        }
1444        sk_setup_caps(newsk, dst);
1445
1446        tcp_ca_openreq_child(newsk, dst);
1447
1448        tcp_sync_mss(newsk, dst_mtu(dst));
1449        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1450
1451        tcp_initialize_rcv_mss(newsk);
1452
1453#ifdef CONFIG_TCP_MD5SIG
1454        /* Copy over the MD5 key from the original socket */
1455        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1456                                AF_INET);
1457        if (key) {
1458                /*
1459                 * We're using one, so create a matching key
1460                 * on the newsk structure. If we fail to get
1461                 * memory, then we end up not copying the key
1462                 * across. Shucks.
1463                 */
1464                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1465                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1466                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1467        }
1468#endif
1469
1470        if (__inet_inherit_port(sk, newsk) < 0)
1471                goto put_and_exit;
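        /* Insert the child into the established hash, atomically replacing
         * the request sock.  *own_req tells the caller whether we won that
         * race; if another CPU already installed a socket for this
         * connection, the caller discards this duplicate child.
         */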
1472        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1473        if (likely(*own_req)) {
1474                tcp_move_syn(newtp, req);
1475                ireq->ireq_opt = NULL;
1476        } else {
1477                newinet->inet_opt = NULL;
1478        }
1479        return newsk;
1480
1481exit_overflow:
1482        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1483exit_nonewsk:
1484        dst_release(dst);
1485exit:
1486        tcp_listendrop(sk);
1487        return NULL;
1488put_and_exit:
1489        newinet->inet_opt = NULL;
1490        inet_csk_prepare_forced_close(newsk);
1491        tcp_done(newsk);
1492        goto exit;
1493}
1494EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1495
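/* With syncookies a flooded listener keeps no per-connection state; the
 * handshake state is encoded in the SYN-ACK sequence number instead.  When
 * a bare ACK later reaches the listener, cookie_v4_check() validates that
 * encoded state and, if genuine, creates the full socket on the spot.
 */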
1496static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1497{
1498#ifdef CONFIG_SYN_COOKIES
1499        const struct tcphdr *th = tcp_hdr(skb);
1500
1501        if (!th->syn)
1502                sk = cookie_v4_check(sk, skb);
1503#endif
1504        return sk;
1505}
1506
1507/* The socket must have its spinlock held when we get
1508 * here, unless it is a TCP_LISTEN socket.
1509 *
1510 * We have a potential double-lock case here, so even when
1511 * doing backlog processing we use the BH locking scheme.
1512 * This is because we cannot sleep with the original spinlock
1513 * held.
1514 */
1515int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1516{
1517        struct sock *rsk;
1518
1519        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1520                struct dst_entry *dst = sk->sk_rx_dst;
1521
1522                sock_rps_save_rxhash(sk, skb);
1523                sk_mark_napi_id(sk, skb);
1524                if (dst) {
1525                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1526                            !dst->ops->check(dst, 0)) {
1527                                dst_release(dst);
1528                                sk->sk_rx_dst = NULL;
1529                        }
1530                }
1531                tcp_rcv_established(sk, skb);
1532                return 0;
1533        }
1534
1535        if (tcp_checksum_complete(skb))
1536                goto csum_err;
1537
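        /* On a listening socket the only new-connection work left here is
         * the syncookie path: tcp_v4_cookie_check() may hand back a freshly
         * created child for a valid cookie ACK, which is then fed through
         * tcp_child_process(); otherwise the segment is processed against
         * the listener itself below.
         */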
1538        if (sk->sk_state == TCP_LISTEN) {
1539                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1540
1541                if (!nsk)
1542                        goto discard;
1543                if (nsk != sk) {
1544                        if (tcp_child_process(sk, nsk, skb)) {
1545                                rsk = nsk;
1546                                goto reset;
1547                        }
1548                        return 0;
1549                }
1550        } else
1551                sock_rps_save_rxhash(sk, skb);
1552
1553        if (tcp_rcv_state_process(sk, skb)) {
1554                rsk = sk;
1555                goto reset;
1556        }
1557        return 0;
1558
1559reset:
1560        tcp_v4_send_reset(rsk, skb);
1561discard:
1562        kfree_skb(skb);
1563        /* Be careful here. If this function gets more complicated and
1564         * gcc suffers from register pressure on the x86, sk (in %ebx)
1565         * might be destroyed here. This current version compiles correctly,
1566         * but you have been warned.
1567         */
1568        return 0;
1569
1570csum_err:
1571        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1572        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1573        goto discard;
1574}
1575EXPORT_SYMBOL(tcp_v4_do_rcv);
1576
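/* Early demux: called from the IP receive path before routing.  If the
 * segment belongs to an established socket, attach that socket (and, when
 * still valid, its cached input route) to the skb so the main receive path
 * can skip the full socket and route lookups.
 */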
1577int tcp_v4_early_demux(struct sk_buff *skb)
1578{
1579        const struct iphdr *iph;
1580        const struct tcphdr *th;
1581        struct sock *sk;
1582
1583        if (skb->pkt_type != PACKET_HOST)
1584                return 0;
1585
1586        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1587                return 0;
1588
1589        iph = ip_hdr(skb);
1590        th = tcp_hdr(skb);
1591
1592        if (th->doff < sizeof(struct tcphdr) / 4)
1593                return 0;
1594
1595        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1596                                       iph->saddr, th->source,
1597                                       iph->daddr, ntohs(th->dest),
1598                                       skb->skb_iif, inet_sdif(skb));
1599        if (sk) {
1600                skb->sk = sk;
1601                skb->destructor = sock_edemux;
1602                if (sk_fullsock(sk)) {
1603                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1604
1605                        if (dst)
1606                                dst = dst_check(dst, 0);
1607                        if (dst &&
1608                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1609                                skb_dst_set_noref(skb, dst);
1610                }
1611        }
1612        return 0;
1613}
1614
1615bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1616{
1617        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1618
1619        /* Only the socket owner can try to collapse/prune the rx queues
1620         * to reduce memory overhead, so add a little headroom here.
1621         * Only a few socket backlogs are likely to be non-empty at once.
1622         */
1623        limit += 64*1024;
1624
1625        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1626         * we can fix skb->truesize to its real value to avoid future drops.
1627         * This is valid because skb is not yet charged to the socket.
1628         * It has been noticed that pure SACK packets were sometimes dropped
1629         * (when cooked by drivers without the copybreak feature).
1630         */
1631        skb_condense(skb);
1632
1633        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1634                bh_unlock_sock(sk);
1635                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1636                return true;
1637        }
1638        return false;
1639}
1640EXPORT_SYMBOL(tcp_add_backlog);
1641
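/* Run the socket's attached filter (e.g. a BPF program) on the segment.
 * The filter may trim payload but never below the TCP header itself, so
 * end_seq is reduced by exactly the number of payload bytes removed.
 */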
1642int tcp_filter(struct sock *sk, struct sk_buff *skb)
1643{
1644        struct tcphdr *th = (struct tcphdr *)skb->data;
1645        unsigned int eaten = skb->len;
1646        int err;
1647
1648        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1649        if (!err) {
1650                eaten -= skb->len;
1651                TCP_SKB_CB(skb)->end_seq -= eaten;
1652        }
1653        return err;
1654}
1655EXPORT_SYMBOL(tcp_filter);
1656
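/* While TCP owns the skb it reuses the IP control block area as
 * TCP_SKB_CB(); tcp_v4_fill_cb() saves the inet_skb_parm out of the way
 * first, and tcp_v4_restore_cb() puts it back whenever the skb may be
 * handed to code that expects IPCB() again (e.g. before redoing a lookup).
 */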
1657static void tcp_v4_restore_cb(struct sk_buff *skb)
1658{
1659        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1660                sizeof(struct inet_skb_parm));
1661}
1662
1663static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1664                           const struct tcphdr *th)
1665{
1666        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1667         * barrier() makes sure the compiler won't play fool^Waliasing games.
1668         */
1669        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1670                sizeof(struct inet_skb_parm));
1671        barrier();
1672
1673        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1674        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1675                                    skb->len - th->doff * 4);
1676        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1677        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1678        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1679        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1680        TCP_SKB_CB(skb)->sacked  = 0;
1681        TCP_SKB_CB(skb)->has_rxtstamp =
1682                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1683}
1684
1685/*
1686 *      From tcp_input.c
1687 */
1688
1689int tcp_v4_rcv(struct sk_buff *skb)
1690{
1691        struct net *net = dev_net(skb->dev);
1692        int sdif = inet_sdif(skb);
1693        const struct iphdr *iph;
1694        const struct tcphdr *th;
1695        bool refcounted;
1696        struct sock *sk;
1697        int ret;
1698
1699        if (skb->pkt_type != PACKET_HOST)
1700                goto discard_it;
1701
1702        /* Count it even if it's bad */
1703        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1704
1705        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1706                goto discard_it;
1707
1708        th = (const struct tcphdr *)skb->data;
1709
1710        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1711                goto bad_packet;
1712        if (!pskb_may_pull(skb, th->doff * 4))
1713                goto discard_it;
1714
1715        /* An explanation is required here, I think.
1716         * Packet length and doff are validated by header prediction,
1717         * provided the case of th->doff == 0 is eliminated.
1718         * So, we defer the checks. */
1719
1720        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1721                goto csum_error;
1722
1723        th = (const struct tcphdr *)skb->data;
1724        iph = ip_hdr(skb);
1725lookup:
1726        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1727                               th->dest, sdif, &refcounted);
1728        if (!sk)
1729                goto no_tcp_socket;
1730
1731process:
1732        if (sk->sk_state == TCP_TIME_WAIT)
1733                goto do_time_wait;
1734
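        /* The lookup returned a request sock (mini socket) created by an
         * earlier SYN.  Process this segment against its listener:
         * tcp_check_req() either completes the handshake and produces the
         * full child socket, or tells us to drop the segment.
         */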
1735        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1736                struct request_sock *req = inet_reqsk(sk);
1737                bool req_stolen = false;
1738                struct sock *nsk;
1739
1740                sk = req->rsk_listener;
1741                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1742                        sk_drops_add(sk, skb);
1743                        reqsk_put(req);
1744                        goto discard_it;
1745                }
1746                if (tcp_checksum_complete(skb)) {
1747                        reqsk_put(req);
1748                        goto csum_error;
1749                }
1750                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1751                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1752                        goto lookup;
1753                }
1754                /* We own a reference on the listener, increase it again
1755                 * as we might lose it too soon.
1756                 */
1757                sock_hold(sk);
1758                refcounted = true;
1759                nsk = NULL;
1760                if (!tcp_filter(sk, skb)) {
1761                        th = (const struct tcphdr *)skb->data;
1762                        iph = ip_hdr(skb);
1763                        tcp_v4_fill_cb(skb, iph, th);
1764                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1765                }
1766                if (!nsk) {
1767                        reqsk_put(req);
1768                        if (req_stolen) {
1769                                /* Another cpu got exclusive access to req
1770                                 * and created a full blown socket.
1771                                 * Try to feed this packet to this socket
1772                                 * instead of discarding it.
1773                                 */
1774                                tcp_v4_restore_cb(skb);
1775                                sock_put(sk);
1776                                goto lookup;
1777                        }
1778                        goto discard_and_relse;
1779                }
1780                if (nsk == sk) {
1781                        reqsk_put(req);
1782                        tcp_v4_restore_cb(skb);
1783                } else if (tcp_child_process(sk, nsk, skb)) {
1784                        tcp_v4_send_reset(nsk, skb);
1785                        goto discard_and_relse;
1786                } else {
1787                        sock_put(sk);
1788                        return 0;
1789                }
1790        }
1791        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1792                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1793                goto discard_and_relse;
1794        }
1795
1796        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1797                goto discard_and_relse;
1798
1799        if (tcp_v4_inbound_md5_hash(sk, skb))
1800                goto discard_and_relse;
1801
1802        nf_reset(skb);
1803
1804        if (tcp_filter(sk, skb))
1805                goto discard_and_relse;
1806        th = (const struct tcphdr *)skb->data;
1807        iph = ip_hdr(skb);
1808        tcp_v4_fill_cb(skb, iph, th);
1809
1810        skb->dev = NULL;
1811
1812        if (sk->sk_state == TCP_LISTEN) {
1813                ret = tcp_v4_do_rcv(sk, skb);
1814                goto put_and_return;
1815        }
1816
1817        sk_incoming_cpu_update(sk);
1818
1819        bh_lock_sock_nested(sk);
1820        tcp_segs_in(tcp_sk(sk), skb);
1821        ret = 0;
1822        if (!sock_owned_by_user(sk)) {
1823                ret = tcp_v4_do_rcv(sk, skb);
1824        } else if (tcp_add_backlog(sk, skb)) {
1825                goto discard_and_relse;
1826        }
1827        bh_unlock_sock(sk);
1828
1829put_and_return:
1830        if (refcounted)
1831                sock_put(sk);
1832
1833        return ret;
1834
1835no_tcp_socket:
1836        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1837                goto discard_it;
1838
1839        tcp_v4_fill_cb(skb, iph, th);
1840
1841        if (tcp_checksum_complete(skb)) {
1842csum_error:
1843                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1844bad_packet:
1845                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1846        } else {
1847                tcp_v4_send_reset(NULL, skb);
1848        }
1849
1850discard_it:
1851        /* Discard frame. */
1852        kfree_skb(skb);
1853        return 0;
1854
1855discard_and_relse:
1856        sk_drops_add(sk, skb);
1857        if (refcounted)
1858                sock_put(sk);
1859        goto discard_it;
1860
1861do_time_wait:
1862        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1863                inet_twsk_put(inet_twsk(sk));
1864                goto discard_it;
1865        }
1866
1867        tcp_v4_fill_cb(skb, iph, th);
1868
1869        if (tcp_checksum_complete(skb)) {
1870                inet_twsk_put(inet_twsk(sk));
1871                goto csum_error;
1872        }
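        /* A segment hitting a TIME_WAIT socket is either a retransmission
         * we should ACK, something bogus we should reset, a new SYN that
         * may legitimately reuse the old 4-tuple (redo the lookup against
         * the listeners), or noise to be dropped silently.
         */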
1873        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1874        case TCP_TW_SYN: {
1875                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1876                                                        &tcp_hashinfo, skb,
1877                                                        __tcp_hdrlen(th),
1878                                                        iph->saddr, th->source,
1879                                                        iph->daddr, th->dest,
1880                                                        inet_iif(skb),
1881                                                        sdif);
1882                if (sk2) {
1883                        inet_twsk_deschedule_put(inet_twsk(sk));
1884                        sk = sk2;
1885                        tcp_v4_restore_cb(skb);
1886                        refcounted = false;
1887                        goto process;
1888                }
1889        }
1890                /* to ACK */
1891                /* fall through */
1892        case TCP_TW_ACK:
1893                tcp_v4_timewait_ack(sk, skb);
1894                break;
1895        case TCP_TW_RST:
1896                tcp_v4_send_reset(sk, skb);
1897                inet_twsk_deschedule_put(inet_twsk(sk));
1898                goto discard_it;
1899        case TCP_TW_SUCCESS:;
1900        }
1901        goto discard_it;
1902}
1903
1904static struct timewait_sock_ops tcp_timewait_sock_ops = {
1905        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1906        .twsk_unique    = tcp_twsk_unique,
1907        .twsk_destructor= tcp_twsk_destructor,
1908};
1909
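/* Cache the incoming route and ingress ifindex on the socket (if a
 * reference on the dst can be taken safely) so that early demux and the
 * established fast path can reuse it instead of re-routing every segment.
 */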
1910void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1911{
1912        struct dst_entry *dst = skb_dst(skb);
1913
1914        if (dst && dst_hold_safe(dst)) {
1915                sk->sk_rx_dst = dst;
1916                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1917        }
1918}
1919EXPORT_SYMBOL(inet_sk_rx_dst_set);
1920
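/* The address-family specific operations TCP uses when the network layer
 * underneath is IPv4.  The IPv6 code provides parallel tables and selects
 * the appropriate one per socket.
 */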
1921const struct inet_connection_sock_af_ops ipv4_specific = {
1922        .queue_xmit        = ip_queue_xmit,
1923        .send_check        = tcp_v4_send_check,
1924        .rebuild_header    = inet_sk_rebuild_header,
1925        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1926        .conn_request      = tcp_v4_conn_request,
1927        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1928        .net_header_len    = sizeof(struct iphdr),
1929        .setsockopt        = ip_setsockopt,
1930        .getsockopt        = ip_getsockopt,
1931        .addr2sockaddr     = inet_csk_addr2sockaddr,
1932        .sockaddr_len      = sizeof(struct sockaddr_in),
1933#ifdef CONFIG_COMPAT
1934        .compat_setsockopt = compat_ip_setsockopt,
1935        .compat_getsockopt = compat_ip_getsockopt,
1936#endif
1937        .mtu_reduced       = tcp_v4_mtu_reduced,
1938};
1939EXPORT_SYMBOL(ipv4_specific);
1940
1941#ifdef CONFIG_TCP_MD5SIG
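/* Callbacks backing the TCP_MD5SIG socket option (RFC 2385 segment
 * signing).  As an illustration only (this is the userspace API, and the
 * names fd, peer, key and keylen are hypothetical), an application would
 * typically arm a key for a peer roughly like:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = keylen };
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));	(peer: struct sockaddr_in)
 *	memcpy(md5.tcpm_key, key, keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which reaches tcp_v4_parse_md5_keys() via the .md5_parse hook.
 */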
1942static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1943        .md5_lookup             = tcp_v4_md5_lookup,
1944        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1945        .md5_parse              = tcp_v4_parse_md5_keys,
1946};
1947#endif
1948
1949/* NOTE: A lot of things are set to zero explicitly by the call to
1950 *       sk_alloc(), so they need not be done here.
1951 */
1952static int tcp_v4_init_sock(struct sock *sk)
1953{
1954        struct inet_connection_sock *icsk = inet_csk(sk);
1955
1956        tcp_init_sock(sk);
1957
1958        icsk->icsk_af_ops = &ipv4_specific;
1959
1960#ifdef CONFIG_TCP_MD5SIG
1961        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1962#endif
1963
1964        return 0;
1965}
1966
1967void tcp_v4_destroy_sock(struct sock *sk)
1968{
1969        struct tcp_sock *tp = tcp_sk(sk);
1970
1971        trace_tcp_destroy_sock(sk);
1972
1973        tcp_clear_xmit_timers(sk);
1974
1975        tcp_cleanup_congestion_control(sk);
1976
1977        tcp_cleanup_ulp(sk);
1978
1979        /* Clean up the write buffer. */
1980        tcp_write_queue_purge(sk);
1981
1982        /* Check if we want to disable active TFO */
1983        tcp_fastopen_active_disable_ofo_check(sk);
1984
1985        /* Cleans up our, hopefully empty, out_of_order_queue. */
1986        skb_rbtree_purge(&tp->out_of_order_queue);
1987
1988#ifdef CONFIG_TCP_MD5SIG
1989        /* Clean up the MD5 key list, if any */
1990        if (tp->md5sig_info) {
1991                tcp_clear_md5_list(sk);
1992                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1993                tp->md5sig_info = NULL;
1994        }
1995#endif
1996
1997        /* Clean up a referenced TCP bind bucket. */
1998        if (inet_csk(sk)->icsk_bind_hash)
1999                inet_put_port(sk);
2000
2001        BUG_ON(tp->fastopen_rsk);
2002
2003        /* If socket is aborted during connect operation */
2004        tcp_free_fastopen_req(tp);
2005        tcp_fastopen_destroy_cipher(sk);
2006        tcp_saved_syn_free(tp);
2007
2008        sk_sockets_allocated_dec(sk);
2009}
2010EXPORT_SYMBOL(tcp_v4_destroy_sock);
2011
2012#ifdef CONFIG_PROC_FS
2013/* Proc filesystem TCP sock list dumping. */
2014
2015/*
2016 * Get the next listener socket following cur.  If cur is NULL, get the first
2017 * socket starting from the bucket given in st->bucket; when st->bucket is
2018 * zero, the very first socket in the hash table is returned.
2019 */
2020static void *listening_get_next(struct seq_file *seq, void *cur)
2021{
2022        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2023        struct tcp_iter_state *st = seq->private;
2024        struct net *net = seq_file_net(seq);
2025        struct inet_listen_hashbucket *ilb;
2026        struct sock *sk = cur;
2027
2028        if (!sk) {
2029get_head:
2030                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2031                spin_lock(&ilb->lock);
2032                sk = sk_head(&ilb->head);
2033                st->offset = 0;
2034                goto get_sk;
2035        }
2036        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2037        ++st->num;
2038        ++st->offset;
2039
2040        sk = sk_next(sk);
2041get_sk:
2042        sk_for_each_from(sk) {
2043                if (!net_eq(sock_net(sk), net))
2044                        continue;
2045                if (sk->sk_family == afinfo->family)
2046                        return sk;
2047        }
2048        spin_unlock(&ilb->lock);
2049        st->offset = 0;
2050        if (++st->bucket < INET_LHTABLE_SIZE)
2051                goto get_head;
2052        return NULL;
2053}
2054
2055static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2056{
2057        struct tcp_iter_state *st = seq->private;
2058        void *rc;
2059
2060        st->bucket = 0;
2061        st->offset = 0;
2062        rc = listening_get_next(seq, NULL);
2063
2064        while (rc && *pos) {
2065                rc = listening_get_next(seq, rc);
2066                --*pos;
2067        }
2068        return rc;
2069}
2070
2071static inline bool empty_bucket(const struct tcp_iter_state *st)
2072{
2073        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2074}
2075
2076/*
2077 * Get first established socket starting from bucket given in st->bucket.
2078 * If st->bucket is zero, the very first socket in the hash is returned.
2079 */
2080static void *established_get_first(struct seq_file *seq)
2081{
2082        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2083        struct tcp_iter_state *st = seq->private;
2084        struct net *net = seq_file_net(seq);
2085        void *rc = NULL;
2086
2087        st->offset = 0;
2088        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2089                struct sock *sk;
2090                struct hlist_nulls_node *node;
2091                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2092
2093                /* Lockless fast path for the common case of empty buckets */
2094                if (empty_bucket(st))
2095                        continue;
2096
2097                spin_lock_bh(lock);
2098                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2099                        if (sk->sk_family != afinfo->family ||
2100                            !net_eq(sock_net(sk), net)) {
2101                                continue;
2102                        }
2103                        rc = sk;
2104                        goto out;
2105                }
2106                spin_unlock_bh(lock);
2107        }
2108out:
2109        return rc;
2110}
2111
2112static void *established_get_next(struct seq_file *seq, void *cur)
2113{
2114        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2115        struct sock *sk = cur;
2116        struct hlist_nulls_node *node;
2117        struct tcp_iter_state *st = seq->private;
2118        struct net *net = seq_file_net(seq);
2119
2120        ++st->num;
2121        ++st->offset;
2122
2123        sk = sk_nulls_next(sk);
2124
2125        sk_nulls_for_each_from(sk, node) {
2126                if (sk->sk_family == afinfo->family &&
2127                    net_eq(sock_net(sk), net))
2128                        return sk;
2129        }
2130
2131        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2132        ++st->bucket;
2133        return established_get_first(seq);
2134}
2135
2136static void *established_get_idx(struct seq_file *seq, loff_t pos)
2137{
2138        struct tcp_iter_state *st = seq->private;
2139        void *rc;
2140
2141        st->bucket = 0;
2142        rc = established_get_first(seq);
2143
2144        while (rc && pos) {
2145                rc = established_get_next(seq, rc);
2146                --pos;
2147        }
2148        return rc;
2149}
2150
2151static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2152{
2153        void *rc;
2154        struct tcp_iter_state *st = seq->private;
2155
2156        st->state = TCP_SEQ_STATE_LISTENING;
2157        rc        = listening_get_idx(seq, &pos);
2158
2159        if (!rc) {
2160                st->state = TCP_SEQ_STATE_ESTABLISHED;
2161                rc        = established_get_idx(seq, pos);
2162        }
2163
2164        return rc;
2165}
2166
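/* seq_file may restart iteration for every chunk of a read; last_pos,
 * bucket and offset in tcp_iter_state remember where the previous pass
 * stopped so tcp_seq_start() can usually resume near that point instead
 * of rescanning both hash tables from the beginning.
 */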
2167static void *tcp_seek_last_pos(struct seq_file *seq)
2168{
2169        struct tcp_iter_state *st = seq->private;
2170        int offset = st->offset;
2171        int orig_num = st->num;
2172        void *rc = NULL;
2173
2174        switch (st->state) {
2175        case TCP_SEQ_STATE_LISTENING:
2176                if (st->bucket >= INET_LHTABLE_SIZE)
2177                        break;
2178                st->state = TCP_SEQ_STATE_LISTENING;
2179                rc = listening_get_next(seq, NULL);
2180                while (offset-- && rc)
2181                        rc = listening_get_next(seq, rc);
2182                if (rc)
2183                        break;
2184                st->bucket = 0;
2185                st->state = TCP_SEQ_STATE_ESTABLISHED;
2186                /* Fallthrough */
2187        case TCP_SEQ_STATE_ESTABLISHED:
2188                if (st->bucket > tcp_hashinfo.ehash_mask)
2189                        break;
2190                rc = established_get_first(seq);
2191                while (offset-- && rc)
2192                        rc = established_get_next(seq, rc);
2193        }
2194
2195        st->num = orig_num;
2196
2197        return rc;
2198}
2199
2200void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2201{
2202        struct tcp_iter_state *st = seq->private;
2203        void *rc;
2204
2205        if (*pos && *pos == st->last_pos) {
2206                rc = tcp_seek_last_pos(seq);
2207                if (rc)
2208                        goto out;
2209        }
2210
2211        st->state = TCP_SEQ_STATE_LISTENING;
2212        st->num = 0;
2213        st->bucket = 0;
2214        st->offset = 0;
2215        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2216
2217out:
2218        st->last_pos = *pos;
2219        return rc;
2220}
2221EXPORT_SYMBOL(tcp_seq_start);
2222
2223void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2224{
2225        struct tcp_iter_state *st = seq->private;
2226        void *rc = NULL;
2227
2228        if (v == SEQ_START_TOKEN) {
2229                rc = tcp_get_idx(seq, 0);
2230                goto out;
2231        }
2232
2233        switch (st->state) {
2234        case TCP_SEQ_STATE_LISTENING:
2235                rc = listening_get_next(seq, v);
2236                if (!rc) {
2237                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2238                        st->bucket = 0;
2239                        st->offset = 0;
2240                        rc        = established_get_first(seq);
2241                }
2242                break;
2243        case TCP_SEQ_STATE_ESTABLISHED:
2244                rc = established_get_next(seq, v);
2245                break;
2246        }
2247out:
2248        ++*pos;
2249        st->last_pos = *pos;
2250        return rc;
2251}
2252EXPORT_SYMBOL(tcp_seq_next);
2253
2254void tcp_seq_stop(struct seq_file *seq, void *v)
2255{
2256        struct tcp_iter_state *st = seq->private;
2257
2258        switch (st->state) {
2259        case TCP_SEQ_STATE_LISTENING:
2260                if (v != SEQ_START_TOKEN)
2261                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2262                break;
2263        case TCP_SEQ_STATE_ESTABLISHED:
2264                if (v)
2265                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2266                break;
2267        }
2268}
2269EXPORT_SYMBOL(tcp_seq_stop);
2270
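/* The three helpers below each format one line of /proc/net/tcp:
 * get_openreq4() for embryonic (SYN_RECV) connections, get_tcp4_sock()
 * for full sockets and get_timewait4_sock() for TIME_WAIT entries.
 * Addresses and ports are printed as raw hex words, so on a little-endian
 * host 127.0.0.1:22 appears as 0100007F:0016; netstat and ss parse this.
 */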
2271static void get_openreq4(const struct request_sock *req,
2272                         struct seq_file *f, int i)
2273{
2274        const struct inet_request_sock *ireq = inet_rsk(req);
2275        long delta = req->rsk_timer.expires - jiffies;
2276
2277        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2278                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2279                i,
2280                ireq->ir_loc_addr,
2281                ireq->ir_num,
2282                ireq->ir_rmt_addr,
2283                ntohs(ireq->ir_rmt_port),
2284                TCP_SYN_RECV,
2285                0, 0, /* could print option size, but that is af dependent. */
2286                1,    /* timers active (only the expire timer) */
2287                jiffies_delta_to_clock_t(delta),
2288                req->num_timeout,
2289                from_kuid_munged(seq_user_ns(f),
2290                                 sock_i_uid(req->rsk_listener)),
2291                0,  /* non standard timer */
2292                0, /* open_requests have no inode */
2293                0,
2294                req);
2295}
2296
2297static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2298{
2299        int timer_active;
2300        unsigned long timer_expires;
2301        const struct tcp_sock *tp = tcp_sk(sk);
2302        const struct inet_connection_sock *icsk = inet_csk(sk);
2303        const struct inet_sock *inet = inet_sk(sk);
2304        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2305        __be32 dest = inet->inet_daddr;
2306        __be32 src = inet->inet_rcv_saddr;
2307        __u16 destp = ntohs(inet->inet_dport);
2308        __u16 srcp = ntohs(inet->inet_sport);
2309        int rx_queue;
2310        int state;
2311
2312        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2313            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2314            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2315                timer_active    = 1;
2316                timer_expires   = icsk->icsk_timeout;
2317        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2318                timer_active    = 4;
2319                timer_expires   = icsk->icsk_timeout;
2320        } else if (timer_pending(&sk->sk_timer)) {
2321                timer_active    = 2;
2322                timer_expires   = sk->sk_timer.expires;
2323        } else {
2324                timer_active    = 0;
2325                timer_expires = jiffies;
2326        }
2327
2328        state = inet_sk_state_load(sk);
2329        if (state == TCP_LISTEN)
2330                rx_queue = sk->sk_ack_backlog;
2331        else
2332                /* Because we don't lock the socket,
2333                 * we might find a transient negative value.
2334                 */
2335                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2336
2337        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2338                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2339                i, src, srcp, dest, destp, state,
2340                tp->write_seq - tp->snd_una,
2341                rx_queue,
2342                timer_active,
2343                jiffies_delta_to_clock_t(timer_expires - jiffies),
2344                icsk->icsk_retransmits,
2345                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2346                icsk->icsk_probes_out,
2347                sock_i_ino(sk),
2348                refcount_read(&sk->sk_refcnt), sk,
2349                jiffies_to_clock_t(icsk->icsk_rto),
2350                jiffies_to_clock_t(icsk->icsk_ack.ato),
2351                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2352                tp->snd_cwnd,
2353                state == TCP_LISTEN ?
2354                    fastopenq->max_qlen :
2355                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2356}
2357
2358static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2359                               struct seq_file *f, int i)
2360{
2361        long delta = tw->tw_timer.expires - jiffies;
2362        __be32 dest, src;
2363        __u16 destp, srcp;
2364
2365        dest  = tw->tw_daddr;
2366        src   = tw->tw_rcv_saddr;
2367        destp = ntohs(tw->tw_dport);
2368        srcp  = ntohs(tw->tw_sport);
2369
2370        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2371                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2372                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2373                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2374                refcount_read(&tw->tw_refcnt), tw);
2375}
2376
2377#define TMPSZ 150
2378
2379static int tcp4_seq_show(struct seq_file *seq, void *v)
2380{
2381        struct tcp_iter_state *st;
2382        struct sock *sk = v;
2383
2384        seq_setwidth(seq, TMPSZ - 1);
2385        if (v == SEQ_START_TOKEN) {
2386                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2387                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2388                           "inode");
2389                goto out;
2390        }
2391        st = seq->private;
2392
2393        if (sk->sk_state == TCP_TIME_WAIT)
2394                get_timewait4_sock(v, seq, st->num);
2395        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2396                get_openreq4(v, seq, st->num);
2397        else
2398                get_tcp4_sock(v, seq, st->num);
2399out:
2400        seq_pad(seq, '\n');
2401        return 0;
2402}
2403
2404static const struct seq_operations tcp4_seq_ops = {
2405        .show           = tcp4_seq_show,
2406        .start          = tcp_seq_start,
2407        .next           = tcp_seq_next,
2408        .stop           = tcp_seq_stop,
2409};
2410
2411static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2412        .family         = AF_INET,
2413};
2414
2415static int __net_init tcp4_proc_init_net(struct net *net)
2416{
2417        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2418                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2419                return -ENOMEM;
2420        return 0;
2421}
2422
2423static void __net_exit tcp4_proc_exit_net(struct net *net)
2424{
2425        remove_proc_entry("tcp", net->proc_net);
2426}
2427
2428static struct pernet_operations tcp4_net_ops = {
2429        .init = tcp4_proc_init_net,
2430        .exit = tcp4_proc_exit_net,
2431};
2432
2433int __init tcp4_proc_init(void)
2434{
2435        return register_pernet_subsys(&tcp4_net_ops);
2436}
2437
2438void tcp4_proc_exit(void)
2439{
2440        unregister_pernet_subsys(&tcp4_net_ops);
2441}
2442#endif /* CONFIG_PROC_FS */
2443
2444struct proto tcp_prot = {
2445        .name                   = "TCP",
2446        .owner                  = THIS_MODULE,
2447        .close                  = tcp_close,
2448        .pre_connect            = tcp_v4_pre_connect,
2449        .connect                = tcp_v4_connect,
2450        .disconnect             = tcp_disconnect,
2451        .accept                 = inet_csk_accept,
2452        .ioctl                  = tcp_ioctl,
2453        .init                   = tcp_v4_init_sock,
2454        .destroy                = tcp_v4_destroy_sock,
2455        .shutdown               = tcp_shutdown,
2456        .setsockopt             = tcp_setsockopt,
2457        .getsockopt             = tcp_getsockopt,
2458        .keepalive              = tcp_set_keepalive,
2459        .recvmsg                = tcp_recvmsg,
2460        .sendmsg                = tcp_sendmsg,
2461        .sendpage               = tcp_sendpage,
2462        .backlog_rcv            = tcp_v4_do_rcv,
2463        .release_cb             = tcp_release_cb,
2464        .hash                   = inet_hash,
2465        .unhash                 = inet_unhash,
2466        .get_port               = inet_csk_get_port,
2467        .enter_memory_pressure  = tcp_enter_memory_pressure,
2468        .leave_memory_pressure  = tcp_leave_memory_pressure,
2469        .stream_memory_free     = tcp_stream_memory_free,
2470        .sockets_allocated      = &tcp_sockets_allocated,
2471        .orphan_count           = &tcp_orphan_count,
2472        .memory_allocated       = &tcp_memory_allocated,
2473        .memory_pressure        = &tcp_memory_pressure,
2474        .sysctl_mem             = sysctl_tcp_mem,
2475        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2476        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2477        .max_header             = MAX_TCP_HEADER,
2478        .obj_size               = sizeof(struct tcp_sock),
2479        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2480        .twsk_prot              = &tcp_timewait_sock_ops,
2481        .rsk_prot               = &tcp_request_sock_ops,
2482        .h.hashinfo             = &tcp_hashinfo,
2483        .no_autobind            = true,
2484#ifdef CONFIG_COMPAT
2485        .compat_setsockopt      = compat_tcp_setsockopt,
2486        .compat_getsockopt      = compat_tcp_getsockopt,
2487#endif
2488        .diag_destroy           = tcp_abort,
2489};
2490EXPORT_SYMBOL(tcp_prot);
2491
2492static void __net_exit tcp_sk_exit(struct net *net)
2493{
2494        int cpu;
2495
2496        module_put(net->ipv4.tcp_congestion_control->owner);
2497
2498        for_each_possible_cpu(cpu)
2499                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2500        free_percpu(net->ipv4.tcp_sk);
2501}
2502
2503static int __net_init tcp_sk_init(struct net *net)
2504{
2505        int res, cpu, cnt;
2506
2507        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2508        if (!net->ipv4.tcp_sk)
2509                return -ENOMEM;
2510
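        /* Per-cpu kernel control sockets: tcp_v4_send_reset() and
         * tcp_v4_send_ack() transmit RSTs and ACKs on behalf of sockets we
         * do not (or no longer) own through these, one per CPU so that no
         * extra locking is needed.
         */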
2511        for_each_possible_cpu(cpu) {
2512                struct sock *sk;
2513
2514                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2515                                           IPPROTO_TCP, net);
2516                if (res)
2517                        goto fail;
2518                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2519                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2520        }
2521
2522        net->ipv4.sysctl_tcp_ecn = 2;
2523        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2524
2525        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2526        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2527        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2528
2529        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2530        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2531        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2532
2533        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2534        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2535        net->ipv4.sysctl_tcp_syncookies = 1;
2536        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2537        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2538        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2539        net->ipv4.sysctl_tcp_orphan_retries = 0;
2540        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2541        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2542        net->ipv4.sysctl_tcp_tw_reuse = 2;
2543
2544        cnt = tcp_hashinfo.ehash_mask + 1;
2545        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2546        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2547
2548        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2549        net->ipv4.sysctl_tcp_sack = 1;
2550        net->ipv4.sysctl_tcp_window_scaling = 1;
2551        net->ipv4.sysctl_tcp_timestamps = 1;
2552        net->ipv4.sysctl_tcp_early_retrans = 3;
2553        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2554        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2555        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2556        net->ipv4.sysctl_tcp_max_reordering = 300;
2557        net->ipv4.sysctl_tcp_dsack = 1;
2558        net->ipv4.sysctl_tcp_app_win = 31;
2559        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2560        net->ipv4.sysctl_tcp_frto = 2;
2561        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2562        /* This limits the percentage of the congestion window which we
2563         * will allow a single TSO frame to consume.  Building TSO frames
2564         * which are too large can cause TCP streams to be bursty.
2565         */
2566        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2567        /* Default TSQ limit of four TSO segments */
2568        net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2569        /* rfc5961 challenge ack rate limiting */
2570        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2571        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2572        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2573        net->ipv4.sysctl_tcp_autocorking = 1;
2574        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2575        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2576        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2577        if (net != &init_net) {
2578                memcpy(net->ipv4.sysctl_tcp_rmem,
2579                       init_net.ipv4.sysctl_tcp_rmem,
2580                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2581                memcpy(net->ipv4.sysctl_tcp_wmem,
2582                       init_net.ipv4.sysctl_tcp_wmem,
2583                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2584        }
2585        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2586        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2587        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2588        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2589        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2590        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2591
2592        /* Reno is always built in */
2593        if (!net_eq(net, &init_net) &&
2594            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2595                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2596        else
2597                net->ipv4.tcp_congestion_control = &tcp_reno;
2598
2599        return 0;
2600fail:
2601        tcp_sk_exit(net);
2602
2603        return res;
2604}
2605
2606static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2607{
2608        struct net *net;
2609
2610        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2611
2612        list_for_each_entry(net, net_exit_list, exit_list)
2613                tcp_fastopen_ctx_destroy(net);
2614}
2615
2616static struct pernet_operations __net_initdata tcp_sk_ops = {
2617       .init       = tcp_sk_init,
2618       .exit       = tcp_sk_exit,
2619       .exit_batch = tcp_sk_exit_batch,
2620};
2621
2622void __init tcp_v4_init(void)
2623{
2624        if (register_pernet_subsys(&tcp_sk_ops))
2625                panic("Failed to create the TCP control socket.\n");
2626}
2627