linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   49 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83#include <linux/inetdevice.h>
  84
  85#include <crypto/hash.h>
  86#include <linux/scatterlist.h>
  87
  88#include <trace/events/tcp.h>
  89
  90#ifdef CONFIG_TCP_MD5SIG
  91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93#endif
  94
  95struct inet_hashinfo tcp_hashinfo;
  96EXPORT_SYMBOL(tcp_hashinfo);
  97
  98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99{
 100        return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                              ip_hdr(skb)->saddr,
 102                              tcp_hdr(skb)->dest,
 103                              tcp_hdr(skb)->source);
 104}
 105
 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107{
 108        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109}
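/*
 * Note: secure_tcp_seq() generates RFC 6528 style initial sequence
 * numbers: a keyed hash over the connection 4-tuple plus a clock
 * component, so connections reusing the same 4-tuple start in distinct
 * sequence spaces. secure_tcp_ts_off() likewise derives a per address
 * pair timestamp offset so raw timestamps do not leak uptime or
 * correlate across peers.
 */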
 110
 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112{
 113        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115        struct tcp_sock *tp = tcp_sk(sk);
 116        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
 118        if (reuse == 2) {
 119                /* Still does not detect *everything* that goes through
 120                 * lo, since we require a loopback src or dst address
 121                 * or direct binding to 'lo' interface.
 122                 */
 123                bool loopback = false;
 124                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125                        loopback = true;
 126#if IS_ENABLED(CONFIG_IPV6)
 127                if (tw->tw_family == AF_INET6) {
 128                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134                                loopback = true;
 135                } else
 136#endif
 137                {
 138                        if (ipv4_is_loopback(tw->tw_daddr) ||
 139                            ipv4_is_loopback(tw->tw_rcv_saddr))
 140                                loopback = true;
 141                }
 142                if (!loopback)
 143                        reuse = 0;
 144        }
 145
  146        /* With PAWS, it is safe from the viewpoint
  147           of data integrity. Even without PAWS it is safe provided sequence
  148           spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
  149
  150           Actually, the idea is close to VJ's: only the timestamp cache is
  151           held not per host, but per port pair, and the TW bucket is used
  152           as the state holder.
  153
  154           If the TW bucket has already been destroyed we fall back to VJ's
  155           scheme and use the initial timestamp retrieved from the peer table.
  156         */
 157        if (tcptw->tw_ts_recent_stamp &&
 158            (!twp || (reuse && time_after32(ktime_get_seconds(),
 159                                            tcptw->tw_ts_recent_stamp)))) {
 160                /* In case of repair and re-using TIME-WAIT sockets we still
 161                 * want to be sure that it is safe as above but honor the
 162                 * sequence numbers and time stamps set as part of the repair
 163                 * process.
 164                 *
 165                 * Without this check re-using a TIME-WAIT socket with TCP
 166                 * repair would accumulate a -1 on the repair assigned
 167                 * sequence number. The first time it is reused the sequence
 168                 * is -1, the second time -2, etc. This fixes that issue
 169                 * without appearing to create any others.
 170                 */
 171                if (likely(!tp->repair)) {
 172                        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 173                        if (tp->write_seq == 0)
 174                                tp->write_seq = 1;
 175                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 176                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 177                }
 178                sock_hold(sktw);
 179                return 1;
 180        }
 181
 182        return 0;
 183}
 184EXPORT_SYMBOL_GPL(tcp_twsk_unique);
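/*
 * Informal example for the reuse rule above: with tcp_tw_reuse enabled,
 * a TIME-WAIT bucket whose last timestamp was recorded at second N may
 * be reused for an outgoing connection once ktime_get_seconds() has
 * advanced past N, i.e. after roughly one second, since the peer's PAWS
 * check will then reject stray old duplicates carrying smaller
 * timestamps. The "+ 65535 + 2" bump of write_seq additionally keeps
 * the new sequence space clear of the old connection's.
 */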
 185
 186static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 187                              int addr_len)
 188{
  189        /* This check is replicated from tcp_v4_connect() and intended to
  190         * prevent the BPF program called below from accessing bytes that
  191         * are outside of the bound specified by the user in addr_len.
  192         */
 193        if (addr_len < sizeof(struct sockaddr_in))
 194                return -EINVAL;
 195
 196        sock_owned_by_me(sk);
 197
 198        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 199}
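/*
 * The BPF_CGROUP_RUN_PROG_INET4_CONNECT() hook above allows a cgroup
 * BPF program of type BPF_CGROUP_INET4_CONNECT to inspect, and possibly
 * rewrite, the destination address before the actual connect runs,
 * which is why the addr_len bound must be validated here first.
 */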
 200
 201/* This will initiate an outgoing connection. */
 202int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 203{
 204        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 205        struct inet_sock *inet = inet_sk(sk);
 206        struct tcp_sock *tp = tcp_sk(sk);
 207        __be16 orig_sport, orig_dport;
 208        __be32 daddr, nexthop;
 209        struct flowi4 *fl4;
 210        struct rtable *rt;
 211        int err;
 212        struct ip_options_rcu *inet_opt;
 213        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 214
 215        if (addr_len < sizeof(struct sockaddr_in))
 216                return -EINVAL;
 217
 218        if (usin->sin_family != AF_INET)
 219                return -EAFNOSUPPORT;
 220
 221        nexthop = daddr = usin->sin_addr.s_addr;
 222        inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                             lockdep_sock_is_held(sk));
 224        if (inet_opt && inet_opt->opt.srr) {
 225                if (!daddr)
 226                        return -EINVAL;
 227                nexthop = inet_opt->opt.faddr;
 228        }
 229
 230        orig_sport = inet->inet_sport;
 231        orig_dport = usin->sin_port;
 232        fl4 = &inet->cork.fl.u.ip4;
 233        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 235                              IPPROTO_TCP,
 236                              orig_sport, orig_dport, sk);
 237        if (IS_ERR(rt)) {
 238                err = PTR_ERR(rt);
 239                if (err == -ENETUNREACH)
 240                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 241                return err;
 242        }
 243
 244        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 245                ip_rt_put(rt);
 246                return -ENETUNREACH;
 247        }
 248
 249        if (!inet_opt || !inet_opt->opt.srr)
 250                daddr = fl4->daddr;
 251
 252        if (!inet->inet_saddr)
 253                inet->inet_saddr = fl4->saddr;
 254        sk_rcv_saddr_set(sk, inet->inet_saddr);
 255
 256        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 257                /* Reset inherited state */
 258                tp->rx_opt.ts_recent       = 0;
 259                tp->rx_opt.ts_recent_stamp = 0;
 260                if (likely(!tp->repair))
 261                        tp->write_seq      = 0;
 262        }
 263
 264        inet->inet_dport = usin->sin_port;
 265        sk_daddr_set(sk, daddr);
 266
 267        inet_csk(sk)->icsk_ext_hdr_len = 0;
 268        if (inet_opt)
 269                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 270
 271        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 272
  273        /* Socket identity is still unknown (sport may be zero).
  274         * However we set state to SYN-SENT and, without releasing the
  275         * socket lock, select a source port, enter ourselves into the
  276         * hash tables and complete initialization afterwards.
  277         */
 278        tcp_set_state(sk, TCP_SYN_SENT);
 279        err = inet_hash_connect(tcp_death_row, sk);
 280        if (err)
 281                goto failure;
 282
 283        sk_set_txhash(sk);
 284
 285        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 286                               inet->inet_sport, inet->inet_dport, sk);
 287        if (IS_ERR(rt)) {
 288                err = PTR_ERR(rt);
 289                rt = NULL;
 290                goto failure;
 291        }
 292        /* OK, now commit destination to socket.  */
 293        sk->sk_gso_type = SKB_GSO_TCPV4;
 294        sk_setup_caps(sk, &rt->dst);
 295        rt = NULL;
 296
 297        if (likely(!tp->repair)) {
 298                if (!tp->write_seq)
 299                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 300                                                       inet->inet_daddr,
 301                                                       inet->inet_sport,
 302                                                       usin->sin_port);
 303                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 304                                                 inet->inet_saddr,
 305                                                 inet->inet_daddr);
 306        }
 307
 308        inet->inet_id = tp->write_seq ^ jiffies;
 309
 310        if (tcp_fastopen_defer_connect(sk, &err))
 311                return err;
 312        if (err)
 313                goto failure;
 314
 315        err = tcp_connect(sk);
 316
 317        if (err)
 318                goto failure;
 319
 320        return 0;
 321
 322failure:
 323        /*
 324         * This unhashes the socket and releases the local port,
 325         * if necessary.
 326         */
 327        tcp_set_state(sk, TCP_CLOSE);
 328        ip_rt_put(rt);
 329        sk->sk_route_caps = 0;
 330        inet->inet_dport = 0;
 331        return err;
 332}
 333EXPORT_SYMBOL(tcp_v4_connect);
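/*
 * How tcp_v4_connect() is reached, in brief: a user space connect(2) on
 * an AF_INET stream socket goes through inet_stream_connect(), which
 * takes the socket lock and calls sk->sk_prot->connect, i.e. this
 * function for TCP.
 */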
 334
 335/*
 336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  337 * It can be called through tcp_release_cb() if the socket was owned by
  338 * the user at the time tcp_v4_err() was called to handle the ICMP message.
 339 */
 340void tcp_v4_mtu_reduced(struct sock *sk)
 341{
 342        struct inet_sock *inet = inet_sk(sk);
 343        struct dst_entry *dst;
 344        u32 mtu;
 345
 346        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347                return;
 348        mtu = tcp_sk(sk)->mtu_info;
 349        dst = inet_csk_update_pmtu(sk, mtu);
 350        if (!dst)
 351                return;
 352
  353        /* Something is about to go wrong... Remember the soft error
  354         * in case this connection is not able to recover.
  355         */
 356        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357                sk->sk_err_soft = EMSGSIZE;
 358
 359        mtu = dst_mtu(dst);
 360
 361        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362            ip_sk_accept_pmtu(sk) &&
 363            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364                tcp_sync_mss(sk, mtu);
 365
 366                /* Resend the TCP packet because it's
 367                 * clear that the old packet has been
 368                 * dropped. This is the new "fast" path mtu
 369                 * discovery.
 370                 */
 371                tcp_simple_retransmit(sk);
 372        } /* else let the usual retransmit timer handle it */
 373}
 374EXPORT_SYMBOL(tcp_v4_mtu_reduced);
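/*
 * Informal example for the path above: if the ICMP reported an MTU of
 * 1400 while the cached icsk_pmtu_cookie was 1500, tcp_sync_mss()
 * shrinks the effective MSS to roughly 1400 minus the IP and TCP header
 * overhead (1360 with plain 20 byte headers), and
 * tcp_simple_retransmit() re-sends the segments that were built for the
 * larger MTU.
 */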
 375
 376static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377{
 378        struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380        if (dst)
 381                dst->ops->redirect(dst, sk, skb);
 382}
 383
 384
 385/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 386void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387{
 388        struct request_sock *req = inet_reqsk(sk);
 389        struct net *net = sock_net(sk);
 390
 391        /* ICMPs are not backlogged, hence we cannot get
 392         * an established socket here.
 393         */
 394        if (seq != tcp_rsk(req)->snt_isn) {
 395                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396        } else if (abort) {
 397                /*
 398                 * Still in SYN_RECV, just remove it silently.
 399                 * There is no good way to pass the error to the newly
 400                 * created socket, and POSIX does not want network
 401                 * errors returned from accept().
 402                 */
 403                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404                tcp_listendrop(req->rsk_listener);
 405        }
 406        reqsk_put(req);
 407}
 408EXPORT_SYMBOL(tcp_req_err);
 409
 410/*
 411 * This routine is called by the ICMP module when it gets some
 412 * sort of error condition.  If err < 0 then the socket should
 413 * be closed and the error returned to the user.  If err > 0
 414 * it's just the icmp type << 8 | icmp code.  After adjustment
 415 * header points to the first 8 bytes of the tcp header.  We need
 416 * to find the appropriate port.
 417 *
 418 * The locking strategy used here is very "optimistic". When
 419 * someone else accesses the socket the ICMP is just dropped
 420 * and for some paths there is no check at all.
 421 * A more general error queue to queue errors for later handling
 422 * is probably better.
 423 *
 424 */
 425
 426void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427{
 428        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430        struct inet_connection_sock *icsk;
 431        struct tcp_sock *tp;
 432        struct inet_sock *inet;
 433        const int type = icmp_hdr(icmp_skb)->type;
 434        const int code = icmp_hdr(icmp_skb)->code;
 435        struct sock *sk;
 436        struct sk_buff *skb;
 437        struct request_sock *fastopen;
 438        u32 seq, snd_una;
 439        s32 remaining;
 440        u32 delta_us;
 441        int err;
 442        struct net *net = dev_net(icmp_skb->dev);
 443
 444        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445                                       th->dest, iph->saddr, ntohs(th->source),
 446                                       inet_iif(icmp_skb), 0);
 447        if (!sk) {
 448                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449                return;
 450        }
 451        if (sk->sk_state == TCP_TIME_WAIT) {
 452                inet_twsk_put(inet_twsk(sk));
 453                return;
 454        }
 455        seq = ntohl(th->seq);
 456        if (sk->sk_state == TCP_NEW_SYN_RECV)
 457                return tcp_req_err(sk, seq,
 458                                  type == ICMP_PARAMETERPROB ||
 459                                  type == ICMP_TIME_EXCEEDED ||
 460                                  (type == ICMP_DEST_UNREACH &&
 461                                   (code == ICMP_NET_UNREACH ||
 462                                    code == ICMP_HOST_UNREACH)));
 463
 464        bh_lock_sock(sk);
 465        /* If too many ICMPs get dropped on busy
 466         * servers this needs to be solved differently.
  467         * We do take care of the PMTU discovery (RFC 1191) special case:
  468         * we can receive locally generated ICMP messages while the socket is held.
 469         */
 470        if (sock_owned_by_user(sk)) {
 471                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 472                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 473        }
 474        if (sk->sk_state == TCP_CLOSE)
 475                goto out;
 476
 477        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 478                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 479                goto out;
 480        }
 481
 482        icsk = inet_csk(sk);
 483        tp = tcp_sk(sk);
  484        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 485        fastopen = tp->fastopen_rsk;
 486        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 487        if (sk->sk_state != TCP_LISTEN &&
 488            !between(seq, snd_una, tp->snd_nxt)) {
 489                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 490                goto out;
 491        }
 492
 493        switch (type) {
 494        case ICMP_REDIRECT:
 495                if (!sock_owned_by_user(sk))
 496                        do_redirect(icmp_skb, sk);
 497                goto out;
 498        case ICMP_SOURCE_QUENCH:
 499                /* Just silently ignore these. */
 500                goto out;
 501        case ICMP_PARAMETERPROB:
 502                err = EPROTO;
 503                break;
 504        case ICMP_DEST_UNREACH:
 505                if (code > NR_ICMP_UNREACH)
 506                        goto out;
 507
 508                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 509                        /* We are not interested in TCP_LISTEN and open_requests
  510                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 511                         * they should go through unfragmented).
 512                         */
 513                        if (sk->sk_state == TCP_LISTEN)
 514                                goto out;
 515
 516                        tp->mtu_info = info;
 517                        if (!sock_owned_by_user(sk)) {
 518                                tcp_v4_mtu_reduced(sk);
 519                        } else {
 520                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 521                                        sock_hold(sk);
 522                        }
 523                        goto out;
 524                }
 525
 526                err = icmp_err_convert[code].errno;
 527                /* check if icmp_skb allows revert of backoff
 528                 * (see draft-zimmermann-tcp-lcd) */
 529                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 530                        break;
 531                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 532                    !icsk->icsk_backoff || fastopen)
 533                        break;
 534
 535                if (sock_owned_by_user(sk))
 536                        break;
 537
 538                icsk->icsk_backoff--;
 539                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540                                               TCP_TIMEOUT_INIT;
 541                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
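                /*
                 * Informal example: after three backoffs an RTO that
                 * started at 200ms has grown to 1.6s; decrementing
                 * icsk_backoff re-derives it as 800ms, and the code
                 * below re-arms the timer with whatever portion of that
                 * has not already elapsed since the last
                 * (re)transmission.
                 */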
 542
 543                skb = tcp_rtx_queue_head(sk);
 544                BUG_ON(!skb);
 545
 546                tcp_mstamp_refresh(tp);
 547                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
 548                remaining = icsk->icsk_rto -
 549                            usecs_to_jiffies(delta_us);
 550
 551                if (remaining > 0) {
 552                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 553                                                  remaining, TCP_RTO_MAX);
 554                } else {
  555                        /* The RTO revert clocked out the retransmission;
  556                         * retransmit now. */
 557                        tcp_retransmit_timer(sk);
 558                }
 559
 560                break;
 561        case ICMP_TIME_EXCEEDED:
 562                err = EHOSTUNREACH;
 563                break;
 564        default:
 565                goto out;
 566        }
 567
 568        switch (sk->sk_state) {
 569        case TCP_SYN_SENT:
 570        case TCP_SYN_RECV:
  571                /* Only in fast or simultaneous open. If a fast open socket
  572                 * is already accepted it is treated as a connected one below.
 573                 */
 574                if (fastopen && !fastopen->sk)
 575                        break;
 576
 577                if (!sock_owned_by_user(sk)) {
 578                        sk->sk_err = err;
 579
 580                        sk->sk_error_report(sk);
 581
 582                        tcp_done(sk);
 583                } else {
 584                        sk->sk_err_soft = err;
 585                }
 586                goto out;
 587        }
 588
 589        /* If we've already connected we will keep trying
 590         * until we time out, or the user gives up.
 591         *
  592         * RFC 1122 4.2.3.9 allows us to consider as hard errors
  593         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  594         * but it is obsoleted by pmtu discovery).
  595         *
  596         * Note that in the modern internet, where routing is unreliable
  597         * and broken firewalls sit in every dark corner sending random
  598         * errors ordered by their masters, even these two messages finally
  599         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 600         *
 601         * Now we are in compliance with RFCs.
 602         *                                                      --ANK (980905)
 603         */
 604
 605        inet = inet_sk(sk);
 606        if (!sock_owned_by_user(sk) && inet->recverr) {
 607                sk->sk_err = err;
 608                sk->sk_error_report(sk);
 609        } else  { /* Only an error on timeout */
 610                sk->sk_err_soft = err;
 611        }
 612
 613out:
 614        bh_unlock_sock(sk);
 615        sock_put(sk);
 616}
 617
 618void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 619{
 620        struct tcphdr *th = tcp_hdr(skb);
 621
 622        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 623        skb->csum_start = skb_transport_header(skb) - skb->head;
 624        skb->csum_offset = offsetof(struct tcphdr, check);
 625}
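/*
 * This prepares the segment for checksum offload: th->check is seeded
 * with only the complemented pseudo-header sum, while csum_start and
 * csum_offset tell the device (or skb_checksum_help() as a fallback)
 * where to fold in the one's complement sum over the TCP header and
 * payload.
 */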
 626
 627/* This routine computes an IPv4 TCP checksum. */
 628void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 629{
 630        const struct inet_sock *inet = inet_sk(sk);
 631
 632        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 633}
 634EXPORT_SYMBOL(tcp_v4_send_check);
 635
 636/*
 637 *      This routine will send an RST to the other tcp.
 638 *
  639 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  640 *                    for the reset?
  641 *      Answer: if a packet caused a RST, it is not for a socket
  642 *              existing in our system; if it does match a socket,
  643 *              it is just a duplicate segment or a bug in the other side's
  644 *              TCP. Therefore we build the reply based only on parameters
  645 *              that arrived with the segment.
 646 *      Exception: precedence violation. We do not implement it in any case.
 647 */
 648
 649static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 650{
 651        const struct tcphdr *th = tcp_hdr(skb);
 652        struct {
 653                struct tcphdr th;
 654#ifdef CONFIG_TCP_MD5SIG
 655                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 656#endif
 657        } rep;
 658        struct ip_reply_arg arg;
 659#ifdef CONFIG_TCP_MD5SIG
 660        struct tcp_md5sig_key *key = NULL;
 661        const __u8 *hash_location = NULL;
 662        unsigned char newhash[16];
 663        int genhash;
 664        struct sock *sk1 = NULL;
 665#endif
 666        struct net *net;
 667        struct sock *ctl_sk;
 668
 669        /* Never send a reset in response to a reset. */
 670        if (th->rst)
 671                return;
 672
  673        /* If sk is not NULL, it means we did a successful lookup and the
  674         * incoming route had to be correct. The prequeue might have dropped our dst.
 675         */
 676        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677                return;
 678
 679        /* Swap the send and the receive. */
 680        memset(&rep, 0, sizeof(rep));
 681        rep.th.dest   = th->source;
 682        rep.th.source = th->dest;
 683        rep.th.doff   = sizeof(struct tcphdr) / 4;
 684        rep.th.rst    = 1;
 685
 686        if (th->ack) {
 687                rep.th.seq = th->ack_seq;
 688        } else {
 689                rep.th.ack = 1;
 690                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691                                       skb->len - (th->doff << 2));
 692        }
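        /*
         * This mirrors RFC 793's reset generation rules: if the
         * offending segment carried an ACK, the RST takes its sequence
         * number from that ACK; otherwise the RST must acknowledge
         * exactly what the segment occupied, i.e. SEG.SEQ + SEG.LEN,
         * with SYN and FIN each counting for one sequence number.
         */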
 693
 694        memset(&arg, 0, sizeof(arg));
 695        arg.iov[0].iov_base = (unsigned char *)&rep;
 696        arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699#ifdef CONFIG_TCP_MD5SIG
 700        rcu_read_lock();
 701        hash_location = tcp_parse_md5sig_option(th);
 702        if (sk && sk_fullsock(sk)) {
 703                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704                                        &ip_hdr(skb)->saddr, AF_INET);
 705        } else if (hash_location) {
  706                /*
  707                 * The active side is lost. Try to find the listening socket
  708                 * through the source port, and then find the md5 key through
  709                 * the listening socket. We do not loosen security here:
  710                 * the incoming packet is checked against the md5 hash of the
  711                 * key we find; no RST is generated if the md5 hash doesn't match.
 712                 */
 713                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714                                             ip_hdr(skb)->saddr,
 715                                             th->source, ip_hdr(skb)->daddr,
 716                                             ntohs(th->source), inet_iif(skb),
 717                                             tcp_v4_sdif(skb));
  718                /* don't send an RST if we can't find a key */
 719                if (!sk1)
 720                        goto out;
 721
 722                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723                                        &ip_hdr(skb)->saddr, AF_INET);
 724                if (!key)
 725                        goto out;
 726
 727
 728                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730                        goto out;
 731
 732        }
 733
 734        if (key) {
 735                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736                                   (TCPOPT_NOP << 16) |
 737                                   (TCPOPT_MD5SIG << 8) |
 738                                   TCPOLEN_MD5SIG);
  739                /* Update the actual length and the length the TCP header advertises */
 740                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741                rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744                                     key, ip_hdr(skb)->saddr,
 745                                     ip_hdr(skb)->daddr, &rep.th);
 746        }
 747#endif
 748        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749                                      ip_hdr(skb)->saddr, /* XXX */
 750                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
  754        /* When the socket is gone, all binding information is lost.
  755         * Routing might fail in this case. No choice here: if we choose to force
  756         * the input interface, we will misroute in case of an asymmetric route.
  757         */
 758        if (sk) {
 759                arg.bound_dev_if = sk->sk_bound_dev_if;
 760                if (sk_fullsock(sk))
 761                        trace_tcp_send_reset(sk, skb);
 762        }
 763
 764        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767        arg.tos = ip_hdr(skb)->tos;
 768        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 769        local_bh_disable();
 770        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 771        if (sk)
 772                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 774        ip_send_unicast_reply(ctl_sk,
 775                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 776                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 777                              &arg, arg.iov[0].iov_len);
 778
 779        ctl_sk->sk_mark = 0;
 780        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 781        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 782        local_bh_enable();
 783
 784#ifdef CONFIG_TCP_MD5SIG
 785out:
 786        rcu_read_unlock();
 787#endif
 788}
 789
  790/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  791   outside socket context, is certainly ugly. What can I do?
 792 */
 793
 794static void tcp_v4_send_ack(const struct sock *sk,
 795                            struct sk_buff *skb, u32 seq, u32 ack,
 796                            u32 win, u32 tsval, u32 tsecr, int oif,
 797                            struct tcp_md5sig_key *key,
 798                            int reply_flags, u8 tos)
 799{
 800        const struct tcphdr *th = tcp_hdr(skb);
 801        struct {
 802                struct tcphdr th;
 803                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 804#ifdef CONFIG_TCP_MD5SIG
 805                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 806#endif
 807                        ];
 808        } rep;
 809        struct net *net = sock_net(sk);
 810        struct ip_reply_arg arg;
 811        struct sock *ctl_sk;
 812
 813        memset(&rep.th, 0, sizeof(struct tcphdr));
 814        memset(&arg, 0, sizeof(arg));
 815
 816        arg.iov[0].iov_base = (unsigned char *)&rep;
 817        arg.iov[0].iov_len  = sizeof(rep.th);
 818        if (tsecr) {
 819                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 820                                   (TCPOPT_TIMESTAMP << 8) |
 821                                   TCPOLEN_TIMESTAMP);
 822                rep.opt[1] = htonl(tsval);
 823                rep.opt[2] = htonl(tsecr);
 824                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 825        }
 826
 827        /* Swap the send and the receive. */
 828        rep.th.dest    = th->source;
 829        rep.th.source  = th->dest;
 830        rep.th.doff    = arg.iov[0].iov_len / 4;
 831        rep.th.seq     = htonl(seq);
 832        rep.th.ack_seq = htonl(ack);
 833        rep.th.ack     = 1;
 834        rep.th.window  = htons(win);
 835
 836#ifdef CONFIG_TCP_MD5SIG
 837        if (key) {
 838                int offset = (tsecr) ? 3 : 0;
 839
 840                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 841                                          (TCPOPT_NOP << 16) |
 842                                          (TCPOPT_MD5SIG << 8) |
 843                                          TCPOLEN_MD5SIG);
 844                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 845                rep.th.doff = arg.iov[0].iov_len/4;
 846
 847                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 848                                    key, ip_hdr(skb)->saddr,
 849                                    ip_hdr(skb)->daddr, &rep.th);
 850        }
 851#endif
 852        arg.flags = reply_flags;
 853        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 854                                      ip_hdr(skb)->saddr, /* XXX */
 855                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 856        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 857        if (oif)
 858                arg.bound_dev_if = oif;
 859        arg.tos = tos;
 860        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 861        local_bh_disable();
 862        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 863        if (sk)
 864                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 865                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 866        ip_send_unicast_reply(ctl_sk,
 867                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 868                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 869                              &arg, arg.iov[0].iov_len);
 870
 871        ctl_sk->sk_mark = 0;
 872        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 873        local_bh_enable();
 874}
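/*
 * Both the RST and ACK paths above transmit through a per-cpu control
 * socket (net->ipv4.tcp_sk) rather than through the matched socket,
 * because the reply often has to be sent outside full socket context
 * (TIME-WAIT buckets, request sockets, or no socket at all).
 */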
 875
 876static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 877{
 878        struct inet_timewait_sock *tw = inet_twsk(sk);
 879        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 880
 881        tcp_v4_send_ack(sk, skb,
 882                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 883                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 884                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 885                        tcptw->tw_ts_recent,
 886                        tw->tw_bound_dev_if,
 887                        tcp_twsk_md5_key(tcptw),
 888                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 889                        tw->tw_tos
 890                        );
 891
 892        inet_twsk_put(tw);
 893}
 894
 895static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 896                                  struct request_sock *req)
 897{
 898        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 899         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 900         */
 901        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 902                                             tcp_sk(sk)->snd_nxt;
 903
 904        /* RFC 7323 2.3
 905         * The window field (SEG.WND) of every outgoing segment, with the
 906         * exception of <SYN> segments, MUST be right-shifted by
 907         * Rcv.Wind.Shift bits:
 908         */
 909        tcp_v4_send_ack(sk, skb, seq,
 910                        tcp_rsk(req)->rcv_nxt,
 911                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 912                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 913                        req->ts_recent,
 914                        0,
 915                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 916                                          AF_INET),
 917                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 918                        ip_hdr(skb)->tos);
 919}
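/*
 * Example for the RFC 7323 shift above: with req->rsk_rcv_wnd == 65536
 * and rcv_wscale == 7, the window field carried in this ACK is
 * 65536 >> 7 == 512, which the peer scales back up by the same factor.
 */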
 920
 921/*
 922 *      Send a SYN-ACK after having received a SYN.
 923 *      This still operates on a request_sock only, not on a big
 924 *      socket.
 925 */
 926static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 927                              struct flowi *fl,
 928                              struct request_sock *req,
 929                              struct tcp_fastopen_cookie *foc,
 930                              enum tcp_synack_type synack_type)
 931{
 932        const struct inet_request_sock *ireq = inet_rsk(req);
 933        struct flowi4 fl4;
 934        int err = -1;
 935        struct sk_buff *skb;
 936
 937        /* First, grab a route. */
 938        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 939                return -1;
 940
 941        skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 942
 943        if (skb) {
 944                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 945
 946                rcu_read_lock();
 947                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 948                                            ireq->ir_rmt_addr,
 949                                            rcu_dereference(ireq->ireq_opt));
 950                rcu_read_unlock();
 951                err = net_xmit_eval(err);
 952        }
 953
 954        return err;
 955}
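/*
 * Note that at this point only a request_sock exists, so the SYN-ACK is
 * checksummed against the addresses recorded in the request and pushed
 * straight out via ip_build_and_send_pkt() instead of the regular
 * socket transmit path.
 */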
 956
 957/*
 958 *      IPv4 request_sock destructor.
 959 */
 960static void tcp_v4_reqsk_destructor(struct request_sock *req)
 961{
 962        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 963}
 964
 965#ifdef CONFIG_TCP_MD5SIG
 966/*
 967 * RFC2385 MD5 checksumming requires a mapping of
 968 * IP address->MD5 Key.
 969 * We need to maintain these in the sk structure.
 970 */
 971
 972/* Find the Key structure for an address.  */
 973struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 974                                         const union tcp_md5_addr *addr,
 975                                         int family)
 976{
 977        const struct tcp_sock *tp = tcp_sk(sk);
 978        struct tcp_md5sig_key *key;
 979        const struct tcp_md5sig_info *md5sig;
 980        __be32 mask;
 981        struct tcp_md5sig_key *best_match = NULL;
 982        bool match;
 983
 984        /* caller either holds rcu_read_lock() or socket lock */
 985        md5sig = rcu_dereference_check(tp->md5sig_info,
 986                                       lockdep_sock_is_held(sk));
 987        if (!md5sig)
 988                return NULL;
 989
 990        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 991                if (key->family != family)
 992                        continue;
 993
 994                if (family == AF_INET) {
 995                        mask = inet_make_mask(key->prefixlen);
 996                        match = (key->addr.a4.s_addr & mask) ==
 997                                (addr->a4.s_addr & mask);
 998#if IS_ENABLED(CONFIG_IPV6)
 999                } else if (family == AF_INET6) {
1000                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1001                                                  key->prefixlen);
1002#endif
1003                } else {
1004                        match = false;
1005                }
1006
1007                if (match && (!best_match ||
1008                              key->prefixlen > best_match->prefixlen))
1009                        best_match = key;
1010        }
1011        return best_match;
1012}
1013EXPORT_SYMBOL(tcp_md5_do_lookup);
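/*
 * The loop above performs a longest-prefix match. For instance, with
 * keys configured for 10.0.0.0/8 and 10.1.0.0/16, a lookup for peer
 * 10.1.2.3 matches both, but the /16 key is returned because the more
 * specific prefix wins.
 */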
1014
1015static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1016                                                      const union tcp_md5_addr *addr,
1017                                                      int family, u8 prefixlen)
1018{
1019        const struct tcp_sock *tp = tcp_sk(sk);
1020        struct tcp_md5sig_key *key;
1021        unsigned int size = sizeof(struct in_addr);
1022        const struct tcp_md5sig_info *md5sig;
1023
1024        /* caller either holds rcu_read_lock() or socket lock */
1025        md5sig = rcu_dereference_check(tp->md5sig_info,
1026                                       lockdep_sock_is_held(sk));
1027        if (!md5sig)
1028                return NULL;
1029#if IS_ENABLED(CONFIG_IPV6)
1030        if (family == AF_INET6)
1031                size = sizeof(struct in6_addr);
1032#endif
1033        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1034                if (key->family != family)
1035                        continue;
1036                if (!memcmp(&key->addr, addr, size) &&
1037                    key->prefixlen == prefixlen)
1038                        return key;
1039        }
1040        return NULL;
1041}
1042
1043struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1044                                         const struct sock *addr_sk)
1045{
1046        const union tcp_md5_addr *addr;
1047
1048        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1049        return tcp_md5_do_lookup(sk, addr, AF_INET);
1050}
1051EXPORT_SYMBOL(tcp_v4_md5_lookup);
1052
1053/* This can be called on a newly created socket, from other files */
1054int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1055                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1056                   gfp_t gfp)
1057{
1058        /* Add Key to the list */
1059        struct tcp_md5sig_key *key;
1060        struct tcp_sock *tp = tcp_sk(sk);
1061        struct tcp_md5sig_info *md5sig;
1062
1063        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1064        if (key) {
1065                /* Pre-existing entry - just update that one. */
1066                memcpy(key->key, newkey, newkeylen);
1067                key->keylen = newkeylen;
1068                return 0;
1069        }
1070
1071        md5sig = rcu_dereference_protected(tp->md5sig_info,
1072                                           lockdep_sock_is_held(sk));
1073        if (!md5sig) {
1074                md5sig = kmalloc(sizeof(*md5sig), gfp);
1075                if (!md5sig)
1076                        return -ENOMEM;
1077
1078                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1079                INIT_HLIST_HEAD(&md5sig->head);
1080                rcu_assign_pointer(tp->md5sig_info, md5sig);
1081        }
1082
1083        key = sock_kmalloc(sk, sizeof(*key), gfp);
1084        if (!key)
1085                return -ENOMEM;
1086        if (!tcp_alloc_md5sig_pool()) {
1087                sock_kfree_s(sk, key, sizeof(*key));
1088                return -ENOMEM;
1089        }
1090
1091        memcpy(key->key, newkey, newkeylen);
1092        key->keylen = newkeylen;
1093        key->family = family;
1094        key->prefixlen = prefixlen;
1095        memcpy(&key->addr, addr,
1096               (family == AF_INET6) ? sizeof(struct in6_addr) :
1097                                      sizeof(struct in_addr));
1098        hlist_add_head_rcu(&key->node, &md5sig->head);
1099        return 0;
1100}
1101EXPORT_SYMBOL(tcp_md5_do_add);
1102
1103int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1104                   u8 prefixlen)
1105{
1106        struct tcp_md5sig_key *key;
1107
1108        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1109        if (!key)
1110                return -ENOENT;
1111        hlist_del_rcu(&key->node);
1112        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1113        kfree_rcu(key, rcu);
1114        return 0;
1115}
1116EXPORT_SYMBOL(tcp_md5_do_del);
1117
1118static void tcp_clear_md5_list(struct sock *sk)
1119{
1120        struct tcp_sock *tp = tcp_sk(sk);
1121        struct tcp_md5sig_key *key;
1122        struct hlist_node *n;
1123        struct tcp_md5sig_info *md5sig;
1124
1125        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1126
1127        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1128                hlist_del_rcu(&key->node);
1129                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1130                kfree_rcu(key, rcu);
1131        }
1132}
1133
1134static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1135                                 char __user *optval, int optlen)
1136{
1137        struct tcp_md5sig cmd;
1138        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1139        u8 prefixlen = 32;
1140
1141        if (optlen < sizeof(cmd))
1142                return -EINVAL;
1143
1144        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1145                return -EFAULT;
1146
1147        if (sin->sin_family != AF_INET)
1148                return -EINVAL;
1149
1150        if (optname == TCP_MD5SIG_EXT &&
1151            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1152                prefixlen = cmd.tcpm_prefixlen;
1153                if (prefixlen > 32)
1154                        return -EINVAL;
1155        }
1156
1157        if (!cmd.tcpm_keylen)
1158                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1159                                      AF_INET, prefixlen);
1160
1161        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1162                return -EINVAL;
1163
1164        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1165                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1166                              GFP_KERNEL);
1167}
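/*
 * A minimal user space sketch of what lands here (uapi struct
 * tcp_md5sig field names; error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key via
 * tcp_md5_do_del() above.
 */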
1168
1169static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1170                                   __be32 daddr, __be32 saddr,
1171                                   const struct tcphdr *th, int nbytes)
1172{
1173        struct tcp4_pseudohdr *bp;
1174        struct scatterlist sg;
1175        struct tcphdr *_th;
1176
1177        bp = hp->scratch;
1178        bp->saddr = saddr;
1179        bp->daddr = daddr;
1180        bp->pad = 0;
1181        bp->protocol = IPPROTO_TCP;
1182        bp->len = cpu_to_be16(nbytes);
1183
1184        _th = (struct tcphdr *)(bp + 1);
1185        memcpy(_th, th, sizeof(*th));
1186        _th->check = 0;
1187
1188        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1189        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1190                                sizeof(*bp) + sizeof(*th));
1191        return crypto_ahash_update(hp->md5_req);
1192}
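/*
 * The scratch buffer hashed above is laid out as the start of the
 * RFC 2385 MD5 input: the IPv4 pseudo-header (saddr, daddr, zero pad,
 * protocol, segment length) immediately followed by the TCP header with
 * its checksum field zeroed; callers then hash in the payload (if any)
 * and finally the key.
 */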
1193
1194static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1195                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1196{
1197        struct tcp_md5sig_pool *hp;
1198        struct ahash_request *req;
1199
1200        hp = tcp_get_md5sig_pool();
1201        if (!hp)
1202                goto clear_hash_noput;
1203        req = hp->md5_req;
1204
1205        if (crypto_ahash_init(req))
1206                goto clear_hash;
1207        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1208                goto clear_hash;
1209        if (tcp_md5_hash_key(hp, key))
1210                goto clear_hash;
1211        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1212        if (crypto_ahash_final(req))
1213                goto clear_hash;
1214
1215        tcp_put_md5sig_pool();
1216        return 0;
1217
1218clear_hash:
1219        tcp_put_md5sig_pool();
1220clear_hash_noput:
1221        memset(md5_hash, 0, 16);
1222        return 1;
1223}
1224
1225int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1226                        const struct sock *sk,
1227                        const struct sk_buff *skb)
1228{
1229        struct tcp_md5sig_pool *hp;
1230        struct ahash_request *req;
1231        const struct tcphdr *th = tcp_hdr(skb);
1232        __be32 saddr, daddr;
1233
1234        if (sk) { /* valid for establish/request sockets */
1235                saddr = sk->sk_rcv_saddr;
1236                daddr = sk->sk_daddr;
1237        } else {
1238                const struct iphdr *iph = ip_hdr(skb);
1239                saddr = iph->saddr;
1240                daddr = iph->daddr;
1241        }
1242
1243        hp = tcp_get_md5sig_pool();
1244        if (!hp)
1245                goto clear_hash_noput;
1246        req = hp->md5_req;
1247
1248        if (crypto_ahash_init(req))
1249                goto clear_hash;
1250
1251        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1252                goto clear_hash;
1253        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1254                goto clear_hash;
1255        if (tcp_md5_hash_key(hp, key))
1256                goto clear_hash;
1257        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1258        if (crypto_ahash_final(req))
1259                goto clear_hash;
1260
1261        tcp_put_md5sig_pool();
1262        return 0;
1263
1264clear_hash:
1265        tcp_put_md5sig_pool();
1266clear_hash_noput:
1267        memset(md5_hash, 0, 16);
1268        return 1;
1269}
1270EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1271
1272#endif
1273
1274/* Called with rcu_read_lock() */
1275static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1276                                    const struct sk_buff *skb)
1277{
1278#ifdef CONFIG_TCP_MD5SIG
1279        /*
1280         * This gets called for each TCP segment that arrives
1281         * so we want to be efficient.
1282         * We have 3 drop cases:
1283         * o No MD5 hash and one expected.
1284         * o MD5 hash and we're not expecting one.
 1285         * o MD5 hash and it's wrong.
1286         */
1287        const __u8 *hash_location = NULL;
1288        struct tcp_md5sig_key *hash_expected;
1289        const struct iphdr *iph = ip_hdr(skb);
1290        const struct tcphdr *th = tcp_hdr(skb);
1291        int genhash;
1292        unsigned char newhash[16];
1293
1294        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1295                                          AF_INET);
1296        hash_location = tcp_parse_md5sig_option(th);
1297
1298        /* We've parsed the options - do we have a hash? */
1299        if (!hash_expected && !hash_location)
1300                return false;
1301
1302        if (hash_expected && !hash_location) {
1303                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1304                return true;
1305        }
1306
1307        if (!hash_expected && hash_location) {
1308                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1309                return true;
1310        }
1311
1312        /* Okay, so this is hash_expected and hash_location -
1313         * so we need to calculate the checksum.
1314         */
1315        genhash = tcp_v4_md5_hash_skb(newhash,
1316                                      hash_expected,
1317                                      NULL, skb);
1318
1319        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1320                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1321                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1322                                     &iph->saddr, ntohs(th->source),
1323                                     &iph->daddr, ntohs(th->dest),
1324                                     genhash ? " tcp_v4_calc_md5_hash failed"
1325                                     : "");
1326                return true;
1327        }
1328        return false;
1329#endif
1330        return false;
1331}
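/*
 * Note the return convention used above: true means "drop this
 * segment". Without CONFIG_TCP_MD5SIG the function reduces to the
 * unconditional return false, i.e. nothing is ever dropped here.
 */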
1332
1333static void tcp_v4_init_req(struct request_sock *req,
1334                            const struct sock *sk_listener,
1335                            struct sk_buff *skb)
1336{
1337        struct inet_request_sock *ireq = inet_rsk(req);
1338        struct net *net = sock_net(sk_listener);
1339
1340        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1341        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1342        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1343}
1344
1345static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1346                                          struct flowi *fl,
1347                                          const struct request_sock *req)
1348{
1349        return inet_csk_route_req(sk, &fl->u.ip4, req);
1350}
1351
1352struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1353        .family         =       PF_INET,
1354        .obj_size       =       sizeof(struct tcp_request_sock),
1355        .rtx_syn_ack    =       tcp_rtx_synack,
1356        .send_ack       =       tcp_v4_reqsk_send_ack,
1357        .destructor     =       tcp_v4_reqsk_destructor,
1358        .send_reset     =       tcp_v4_send_reset,
1359        .syn_ack_timeout =      tcp_syn_ack_timeout,
1360};
1361
1362static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1363        .mss_clamp      =       TCP_MSS_DEFAULT,
1364#ifdef CONFIG_TCP_MD5SIG
1365        .req_md5_lookup =       tcp_v4_md5_lookup,
1366        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1367#endif
1368        .init_req       =       tcp_v4_init_req,
1369#ifdef CONFIG_SYN_COOKIES
1370        .cookie_init_seq =      cookie_v4_init_sequence,
1371#endif
1372        .route_req      =       tcp_v4_route_req,
1373        .init_seq       =       tcp_v4_init_seq,
1374        .init_ts_off    =       tcp_v4_init_ts_off,
1375        .send_synack    =       tcp_v4_send_synack,
1376};
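/*
 * tcp_conn_request() is address-family independent: everything IPv4
 * specific it needs (ISN generation, route lookup, SYN-ACK
 * transmission, MD5 lookup) is reached through the two ops tables
 * above, which makes tcp_v4_conn_request() below a thin wrapper.
 */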
1377
1378int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1379{
 1380        /* Never answer SYNs sent to broadcast or multicast addresses */
1381        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1382                goto drop;
1383
1384        return tcp_conn_request(&tcp_request_sock_ops,
1385                                &tcp_request_sock_ipv4_ops, sk, skb);
1386
1387drop:
1388        tcp_listendrop(sk);
1389        return 0;
1390}
1391EXPORT_SYMBOL(tcp_v4_conn_request);
1392
1393
1394/*
 1395 * The three way handshake has completed - we got a valid ACK -
1396 * now create the new socket.
1397 */
1398struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1399                                  struct request_sock *req,
1400                                  struct dst_entry *dst,
1401                                  struct request_sock *req_unhash,
1402                                  bool *own_req)
1403{
1404        struct inet_request_sock *ireq;
1405        struct inet_sock *newinet;
1406        struct tcp_sock *newtp;
1407        struct sock *newsk;
1408#ifdef CONFIG_TCP_MD5SIG
1409        struct tcp_md5sig_key *key;
1410#endif
1411        struct ip_options_rcu *inet_opt;
1412
1413        if (sk_acceptq_is_full(sk))
1414                goto exit_overflow;
1415
1416        newsk = tcp_create_openreq_child(sk, req, skb);
1417        if (!newsk)
1418                goto exit_nonewsk;
1419
1420        newsk->sk_gso_type = SKB_GSO_TCPV4;
1421        inet_sk_rx_dst_set(newsk, skb);
1422
1423        newtp                 = tcp_sk(newsk);
1424        newinet               = inet_sk(newsk);
1425        ireq                  = inet_rsk(req);
1426        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1427        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1428        newsk->sk_bound_dev_if = ireq->ir_iif;
1429        newinet->inet_saddr   = ireq->ir_loc_addr;
1430        inet_opt              = rcu_dereference(ireq->ireq_opt);
1431        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1432        newinet->mc_index     = inet_iif(skb);
1433        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1434        newinet->rcv_tos      = ip_hdr(skb)->tos;
1435        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436        if (inet_opt)
1437                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
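        /* Derive an initial IP ID for the child; a cheap mix of the
         * initial write_seq and jiffies is considered good enough here.
         */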
1438        newinet->inet_id = newtp->write_seq ^ jiffies;
1439
1440        if (!dst) {
1441                dst = inet_csk_route_child_sock(sk, newsk, req);
1442                if (!dst)
1443                        goto put_and_exit;
1444        } else {
1445                /* syncookie case: see end of cookie_v4_check() */
1446        }
1447        sk_setup_caps(newsk, dst);
1448
1449        tcp_ca_openreq_child(newsk, dst);
1450
1451        tcp_sync_mss(newsk, dst_mtu(dst));
1452        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1453
1454        tcp_initialize_rcv_mss(newsk);
1455
1456#ifdef CONFIG_TCP_MD5SIG
1457        /* Copy over the MD5 key from the original socket */
1458        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1459                                AF_INET);
1460        if (key) {
1461                /*
1462                 * We're using one, so create a matching key
1463                 * on the newsk structure. If we fail to get
1464                 * memory, then we end up not copying the key
1465                 * across. Shucks.
1466                 */
1467                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1468                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1469                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1470        }
1471#endif
1472
1473        if (__inet_inherit_port(sk, newsk) < 0)
1474                goto put_and_exit;
1475        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1476        if (likely(*own_req)) {
1477                tcp_move_syn(newtp, req);
1478                ireq->ireq_opt = NULL;
1479        } else {
1480                newinet->inet_opt = NULL;
1481        }
1482        return newsk;
1483
1484exit_overflow:
1485        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1486exit_nonewsk:
1487        dst_release(dst);
1488exit:
1489        tcp_listendrop(sk);
1490        return NULL;
1491put_and_exit:
1492        newinet->inet_opt = NULL;
1493        inet_csk_prepare_forced_close(newsk);
1494        tcp_done(newsk);
1495        goto exit;
1496}
1497EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498
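/*
 * With CONFIG_SYN_COOKIES, a non-SYN segment reaching a listener may be
 * the ACK that completes a cookie-based handshake; cookie_v4_check()
 * then returns the newly created child socket (or NULL if the cookie is
 * invalid).  Without syncookies this is a no-op and sk is returned as is.
 */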
1499static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1500{
1501#ifdef CONFIG_SYN_COOKIES
1502        const struct tcphdr *th = tcp_hdr(skb);
1503
1504        if (!th->syn)
1505                sk = cookie_v4_check(sk, skb);
1506#endif
1507        return sk;
1508}
1509
1510/* The socket must have its spinlock held when we get
1511 * here, unless it is a TCP_LISTEN socket.
1512 *
1513 * We have a potential double-lock case here, so even when
1514 * doing backlog processing we use the BH locking scheme.
1515 * This is because we cannot sleep with the original spinlock
1516 * held.
1517 */
1518int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1519{
1520        struct sock *rsk;
1521
1522        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1523                struct dst_entry *dst = sk->sk_rx_dst;
1524
1525                sock_rps_save_rxhash(sk, skb);
1526                sk_mark_napi_id(sk, skb);
1527                if (dst) {
1528                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1529                            !dst->ops->check(dst, 0)) {
1530                                dst_release(dst);
1531                                sk->sk_rx_dst = NULL;
1532                        }
1533                }
1534                tcp_rcv_established(sk, skb);
1535                return 0;
1536        }
1537
1538        if (tcp_checksum_complete(skb))
1539                goto csum_err;
1540
1541        if (sk->sk_state == TCP_LISTEN) {
1542                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1543
1544                if (!nsk)
1545                        goto discard;
1546                if (nsk != sk) {
1547                        if (tcp_child_process(sk, nsk, skb)) {
1548                                rsk = nsk;
1549                                goto reset;
1550                        }
1551                        return 0;
1552                }
1553        } else
1554                sock_rps_save_rxhash(sk, skb);
1555
1556        if (tcp_rcv_state_process(sk, skb)) {
1557                rsk = sk;
1558                goto reset;
1559        }
1560        return 0;
1561
1562reset:
1563        tcp_v4_send_reset(rsk, skb);
1564discard:
1565        kfree_skb(skb);
1566        /* Be careful here. If this function gets more complicated and
1567         * gcc suffers from register pressure on the x86, sk (in %ebx)
1568         * might be destroyed here. This current version compiles correctly,
1569         * but you have been warned.
1570         */
1571        return 0;
1572
1573csum_err:
1574        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1575        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1576        goto discard;
1577}
1578EXPORT_SYMBOL(tcp_v4_do_rcv);
1579
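/*
 * Early demux: called from the IP receive path, before routing, to look
 * up an established socket for this segment.  On a hit the socket is
 * attached to the skb, and its cached rx dst is reused when it matches
 * the incoming interface, letting tcp_v4_rcv() skip both the hash lookup
 * and the routing decision.
 */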
1580int tcp_v4_early_demux(struct sk_buff *skb)
1581{
1582        const struct iphdr *iph;
1583        const struct tcphdr *th;
1584        struct sock *sk;
1585
1586        if (skb->pkt_type != PACKET_HOST)
1587                return 0;
1588
1589        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1590                return 0;
1591
1592        iph = ip_hdr(skb);
1593        th = tcp_hdr(skb);
1594
1595        if (th->doff < sizeof(struct tcphdr) / 4)
1596                return 0;
1597
1598        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1599                                       iph->saddr, th->source,
1600                                       iph->daddr, ntohs(th->dest),
1601                                       skb->skb_iif, inet_sdif(skb));
1602        if (sk) {
1603                skb->sk = sk;
1604                skb->destructor = sock_edemux;
1605                if (sk_fullsock(sk)) {
1606                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1607
1608                        if (dst)
1609                                dst = dst_check(dst, 0);
1610                        if (dst &&
1611                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1612                                skb_dst_set_noref(skb, dst);
1613                }
1614        }
1615        return 0;
1616}
1617
1618bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1619{
1620        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1621
1622        /* Only the socket owner can try to collapse/prune rx queues
1623         * to reduce memory overhead, so add a little headroom here.
1624         * Only a few socket backlogs are likely to be non-empty at any given time.
1625         */
1626        limit += 64*1024;
1627
1628        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1629         * we can fix skb->truesize to its real value to avoid future drops.
1630         * This is valid because skb is not yet charged to the socket.
1631         * It has been noticed that pure SACK packets were sometimes dropped
1632         * (if cooked by drivers without the copybreak feature).
1633         */
1634        skb_condense(skb);
1635
1636        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1637                bh_unlock_sock(sk);
1638                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1639                return true;
1640        }
1641        return false;
1642}
1643EXPORT_SYMBOL(tcp_add_backlog);
1644
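/*
 * Run the socket's attached filter (e.g. a BPF program) on the segment.
 * sk_filter_trim_cap() may trim the skb, but never below the TCP header
 * (th->doff * 4 bytes); any payload bytes it eats are subtracted from
 * end_seq so that sequence accounting stays consistent.
 */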
1645int tcp_filter(struct sock *sk, struct sk_buff *skb)
1646{
1647        struct tcphdr *th = (struct tcphdr *)skb->data;
1648        unsigned int eaten = skb->len;
1649        int err;
1650
1651        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1652        if (!err) {
1653                eaten -= skb->len;
1654                TCP_SKB_CB(skb)->end_seq -= eaten;
1655        }
1656        return err;
1657}
1658EXPORT_SYMBOL(tcp_filter);
1659
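/*
 * Undo tcp_v4_fill_cb(): move the saved IP control block back to its
 * normal location in skb->cb[].  Needed before an skb is looked up again
 * or passed to code that expects a valid IPCB.
 */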
1660static void tcp_v4_restore_cb(struct sk_buff *skb)
1661{
1662        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1663                sizeof(struct inet_skb_parm));
1664}
1665
1666static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1667                           const struct tcphdr *th)
1668{
1669        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1670         * barrier() makes sure the compiler won't play fool^Waliasing games.
1671         */
1672        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1673                sizeof(struct inet_skb_parm));
1674        barrier();
1675
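        /* SYN and FIN each occupy one unit of sequence space, hence the
         * th->syn and th->fin terms in end_seq below.
         */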
1676        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1677        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1678                                    skb->len - th->doff * 4);
1679        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1680        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1681        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1682        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1683        TCP_SKB_CB(skb)->sacked  = 0;
1684        TCP_SKB_CB(skb)->has_rxtstamp =
1685                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1686}
1687
1688/*
1689 *      From tcp_input.c
1690 */
1691
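/*
 * Main IPv4 receive routine: validate the header and checksum, look up
 * the owning socket, give TCP_NEW_SYN_RECV and TCP_TIME_WAIT sockets
 * their special handling, then either process the segment directly or
 * queue it on the backlog if the socket is owned by user context.
 */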
1692int tcp_v4_rcv(struct sk_buff *skb)
1693{
1694        struct net *net = dev_net(skb->dev);
1695        int sdif = inet_sdif(skb);
1696        const struct iphdr *iph;
1697        const struct tcphdr *th;
1698        bool refcounted;
1699        struct sock *sk;
1700        int ret;
1701
1702        if (skb->pkt_type != PACKET_HOST)
1703                goto discard_it;
1704
1705        /* Count it even if it's bad */
1706        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1707
1708        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1709                goto discard_it;
1710
1711        th = (const struct tcphdr *)skb->data;
1712
1713        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1714                goto bad_packet;
1715        if (!pskb_may_pull(skb, th->doff * 4))
1716                goto discard_it;
1717
1718        /* An explanation is required here, I think.
1719         * Packet length and doff are validated by header prediction,
1720         * provided the case of th->doff == 0 is eliminated.
1721         * So, we defer the checks. */
1722
1723        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1724                goto csum_error;
1725
1726        th = (const struct tcphdr *)skb->data;
1727        iph = ip_hdr(skb);
1728lookup:
1729        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1730                               th->dest, sdif, &refcounted);
1731        if (!sk)
1732                goto no_tcp_socket;
1733
1734process:
1735        if (sk->sk_state == TCP_TIME_WAIT)
1736                goto do_time_wait;
1737
1738        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1739                struct request_sock *req = inet_reqsk(sk);
1740                bool req_stolen = false;
1741                struct sock *nsk;
1742
1743                sk = req->rsk_listener;
1744                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1745                        sk_drops_add(sk, skb);
1746                        reqsk_put(req);
1747                        goto discard_it;
1748                }
1749                if (tcp_checksum_complete(skb)) {
1750                        reqsk_put(req);
1751                        goto csum_error;
1752                }
1753                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1754                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1755                        goto lookup;
1756                }
1757                /* We own a reference on the listener, increase it again
1758                 * as we might lose it too soon.
1759                 */
1760                sock_hold(sk);
1761                refcounted = true;
1762                nsk = NULL;
1763                if (!tcp_filter(sk, skb)) {
1764                        th = (const struct tcphdr *)skb->data;
1765                        iph = ip_hdr(skb);
1766                        tcp_v4_fill_cb(skb, iph, th);
1767                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1768                }
1769                if (!nsk) {
1770                        reqsk_put(req);
1771                        if (req_stolen) {
1772                                /* Another cpu got exclusive access to req
1773                                 * and created a full blown socket.
1774                                 * Try to feed this packet to this socket
1775                                 * instead of discarding it.
1776                                 */
1777                                tcp_v4_restore_cb(skb);
1778                                sock_put(sk);
1779                                goto lookup;
1780                        }
1781                        goto discard_and_relse;
1782                }
1783                if (nsk == sk) {
1784                        reqsk_put(req);
1785                        tcp_v4_restore_cb(skb);
1786                } else if (tcp_child_process(sk, nsk, skb)) {
1787                        tcp_v4_send_reset(nsk, skb);
1788                        goto discard_and_relse;
1789                } else {
1790                        sock_put(sk);
1791                        return 0;
1792                }
1793        }
1794        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1795                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1796                goto discard_and_relse;
1797        }
1798
1799        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1800                goto discard_and_relse;
1801
1802        if (tcp_v4_inbound_md5_hash(sk, skb))
1803                goto discard_and_relse;
1804
1805        nf_reset(skb);
1806
1807        if (tcp_filter(sk, skb))
1808                goto discard_and_relse;
1809        th = (const struct tcphdr *)skb->data;
1810        iph = ip_hdr(skb);
1811        tcp_v4_fill_cb(skb, iph, th);
1812
1813        skb->dev = NULL;
1814
1815        if (sk->sk_state == TCP_LISTEN) {
1816                ret = tcp_v4_do_rcv(sk, skb);
1817                goto put_and_return;
1818        }
1819
1820        sk_incoming_cpu_update(sk);
1821
1822        bh_lock_sock_nested(sk);
1823        tcp_segs_in(tcp_sk(sk), skb);
1824        ret = 0;
1825        if (!sock_owned_by_user(sk)) {
1826                ret = tcp_v4_do_rcv(sk, skb);
1827        } else if (tcp_add_backlog(sk, skb)) {
1828                goto discard_and_relse;
1829        }
1830        bh_unlock_sock(sk);
1831
1832put_and_return:
1833        if (refcounted)
1834                sock_put(sk);
1835
1836        return ret;
1837
1838no_tcp_socket:
1839        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1840                goto discard_it;
1841
1842        tcp_v4_fill_cb(skb, iph, th);
1843
1844        if (tcp_checksum_complete(skb)) {
1845csum_error:
1846                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1847bad_packet:
1848                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1849        } else {
1850                tcp_v4_send_reset(NULL, skb);
1851        }
1852
1853discard_it:
1854        /* Discard frame. */
1855        kfree_skb(skb);
1856        return 0;
1857
1858discard_and_relse:
1859        sk_drops_add(sk, skb);
1860        if (refcounted)
1861                sock_put(sk);
1862        goto discard_it;
1863
1864do_time_wait:
1865        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1866                inet_twsk_put(inet_twsk(sk));
1867                goto discard_it;
1868        }
1869
1870        tcp_v4_fill_cb(skb, iph, th);
1871
1872        if (tcp_checksum_complete(skb)) {
1873                inet_twsk_put(inet_twsk(sk));
1874                goto csum_error;
1875        }
1876        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1877        case TCP_TW_SYN: {
1878                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1879                                                        &tcp_hashinfo, skb,
1880                                                        __tcp_hdrlen(th),
1881                                                        iph->saddr, th->source,
1882                                                        iph->daddr, th->dest,
1883                                                        inet_iif(skb),
1884                                                        sdif);
1885                if (sk2) {
1886                        inet_twsk_deschedule_put(inet_twsk(sk));
1887                        sk = sk2;
1888                        tcp_v4_restore_cb(skb);
1889                        refcounted = false;
1890                        goto process;
1891                }
1892        }
1893                /* to ACK */
1894                /* fall through */
1895        case TCP_TW_ACK:
1896                tcp_v4_timewait_ack(sk, skb);
1897                break;
1898        case TCP_TW_RST:
1899                tcp_v4_send_reset(sk, skb);
1900                inet_twsk_deschedule_put(inet_twsk(sk));
1901                goto discard_it;
1902        case TCP_TW_SUCCESS:;
1903        }
1904        goto discard_it;
1905}
1906
1907static struct timewait_sock_ops tcp_timewait_sock_ops = {
1908        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1909        .twsk_unique    = tcp_twsk_unique,
1910        .twsk_destructor= tcp_twsk_destructor,
1911};
1912
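/*
 * Cache the skb's dst and incoming interface on the socket for the rx
 * fast path.  dst_hold_safe() fails when the dst is already on its way
 * to being freed, in which case nothing is cached.
 */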
1913void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1914{
1915        struct dst_entry *dst = skb_dst(skb);
1916
1917        if (dst && dst_hold_safe(dst)) {
1918                sk->sk_rx_dst = dst;
1919                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1920        }
1921}
1922EXPORT_SYMBOL(inet_sk_rx_dst_set);
1923
1924const struct inet_connection_sock_af_ops ipv4_specific = {
1925        .queue_xmit        = ip_queue_xmit,
1926        .send_check        = tcp_v4_send_check,
1927        .rebuild_header    = inet_sk_rebuild_header,
1928        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1929        .conn_request      = tcp_v4_conn_request,
1930        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1931        .net_header_len    = sizeof(struct iphdr),
1932        .setsockopt        = ip_setsockopt,
1933        .getsockopt        = ip_getsockopt,
1934        .addr2sockaddr     = inet_csk_addr2sockaddr,
1935        .sockaddr_len      = sizeof(struct sockaddr_in),
1936#ifdef CONFIG_COMPAT
1937        .compat_setsockopt = compat_ip_setsockopt,
1938        .compat_getsockopt = compat_ip_getsockopt,
1939#endif
1940        .mtu_reduced       = tcp_v4_mtu_reduced,
1941};
1942EXPORT_SYMBOL(ipv4_specific);
1943
1944#ifdef CONFIG_TCP_MD5SIG
1945static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1946        .md5_lookup             = tcp_v4_md5_lookup,
1947        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1948        .md5_parse              = tcp_v4_parse_md5_keys,
1949};
1950#endif
1951
1952/* NOTE: A lot of things are set to zero explicitly by the call to
1953 *       sk_alloc(), so they need not be done here.
1954 */
1955static int tcp_v4_init_sock(struct sock *sk)
1956{
1957        struct inet_connection_sock *icsk = inet_csk(sk);
1958
1959        tcp_init_sock(sk);
1960
1961        icsk->icsk_af_ops = &ipv4_specific;
1962
1963#ifdef CONFIG_TCP_MD5SIG
1964        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1965#endif
1966
1967        return 0;
1968}
1969
1970void tcp_v4_destroy_sock(struct sock *sk)
1971{
1972        struct tcp_sock *tp = tcp_sk(sk);
1973
1974        trace_tcp_destroy_sock(sk);
1975
1976        tcp_clear_xmit_timers(sk);
1977
1978        tcp_cleanup_congestion_control(sk);
1979
1980        tcp_cleanup_ulp(sk);
1981
1982        /* Clean up the write buffer. */
1983        tcp_write_queue_purge(sk);
1984
1985        /* Check if we want to disable active TFO */
1986        tcp_fastopen_active_disable_ofo_check(sk);
1987
1988        /* Cleans up our, hopefully empty, out_of_order_queue. */
1989        skb_rbtree_purge(&tp->out_of_order_queue);
1990
1991#ifdef CONFIG_TCP_MD5SIG
1992        /* Clean up the MD5 key list, if any */
1993        if (tp->md5sig_info) {
1994                tcp_clear_md5_list(sk);
1995                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1996                tp->md5sig_info = NULL;
1997        }
1998#endif
1999
2000        /* Clean up a referenced TCP bind bucket. */
2001        if (inet_csk(sk)->icsk_bind_hash)
2002                inet_put_port(sk);
2003
2004        BUG_ON(tp->fastopen_rsk);
2005
2006        /* If socket is aborted during connect operation */
2007        tcp_free_fastopen_req(tp);
2008        tcp_fastopen_destroy_cipher(sk);
2009        tcp_saved_syn_free(tp);
2010
2011        sk_sockets_allocated_dec(sk);
2012}
2013EXPORT_SYMBOL(tcp_v4_destroy_sock);
2014
2015#ifdef CONFIG_PROC_FS
2016/* Proc filesystem TCP sock list dumping. */
2017
2018/*
2019 * Get the next listener socket following cur.  If cur is NULL, get the
2020 * first socket starting from the bucket given in st->bucket; when
2021 * st->bucket is zero, the very first socket in the hash table is returned.
2022 */
2023static void *listening_get_next(struct seq_file *seq, void *cur)
2024{
2025        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2026        struct tcp_iter_state *st = seq->private;
2027        struct net *net = seq_file_net(seq);
2028        struct inet_listen_hashbucket *ilb;
2029        struct sock *sk = cur;
2030
2031        if (!sk) {
2032get_head:
2033                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2034                spin_lock(&ilb->lock);
2035                sk = sk_head(&ilb->head);
2036                st->offset = 0;
2037                goto get_sk;
2038        }
2039        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2040        ++st->num;
2041        ++st->offset;
2042
2043        sk = sk_next(sk);
2044get_sk:
2045        sk_for_each_from(sk) {
2046                if (!net_eq(sock_net(sk), net))
2047                        continue;
2048                if (sk->sk_family == afinfo->family)
2049                        return sk;
2050        }
2051        spin_unlock(&ilb->lock);
2052        st->offset = 0;
2053        if (++st->bucket < INET_LHTABLE_SIZE)
2054                goto get_head;
2055        return NULL;
2056}
2057
2058static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2059{
2060        struct tcp_iter_state *st = seq->private;
2061        void *rc;
2062
2063        st->bucket = 0;
2064        st->offset = 0;
2065        rc = listening_get_next(seq, NULL);
2066
2067        while (rc && *pos) {
2068                rc = listening_get_next(seq, rc);
2069                --*pos;
2070        }
2071        return rc;
2072}
2073
2074static inline bool empty_bucket(const struct tcp_iter_state *st)
2075{
2076        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2077}
2078
2079/*
2080 * Get the first established socket, starting from the bucket given in st->bucket.
2081 * If st->bucket is zero, the very first socket in the hash is returned.
2082 */
2083static void *established_get_first(struct seq_file *seq)
2084{
2085        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2086        struct tcp_iter_state *st = seq->private;
2087        struct net *net = seq_file_net(seq);
2088        void *rc = NULL;
2089
2090        st->offset = 0;
2091        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2092                struct sock *sk;
2093                struct hlist_nulls_node *node;
2094                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2095
2096                /* Lockless fast path for the common case of empty buckets */
2097                if (empty_bucket(st))
2098                        continue;
2099
2100                spin_lock_bh(lock);
2101                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2102                        if (sk->sk_family != afinfo->family ||
2103                            !net_eq(sock_net(sk), net)) {
2104                                continue;
2105                        }
2106                        rc = sk;
2107                        goto out;
2108                }
2109                spin_unlock_bh(lock);
2110        }
2111out:
2112        return rc;
2113}
2114
2115static void *established_get_next(struct seq_file *seq, void *cur)
2116{
2117        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2118        struct sock *sk = cur;
2119        struct hlist_nulls_node *node;
2120        struct tcp_iter_state *st = seq->private;
2121        struct net *net = seq_file_net(seq);
2122
2123        ++st->num;
2124        ++st->offset;
2125
2126        sk = sk_nulls_next(sk);
2127
2128        sk_nulls_for_each_from(sk, node) {
2129                if (sk->sk_family == afinfo->family &&
2130                    net_eq(sock_net(sk), net))
2131                        return sk;
2132        }
2133
2134        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2135        ++st->bucket;
2136        return established_get_first(seq);
2137}
2138
2139static void *established_get_idx(struct seq_file *seq, loff_t pos)
2140{
2141        struct tcp_iter_state *st = seq->private;
2142        void *rc;
2143
2144        st->bucket = 0;
2145        rc = established_get_first(seq);
2146
2147        while (rc && pos) {
2148                rc = established_get_next(seq, rc);
2149                --pos;
2150        }
2151        return rc;
2152}
2153
2154static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2155{
2156        void *rc;
2157        struct tcp_iter_state *st = seq->private;
2158
2159        st->state = TCP_SEQ_STATE_LISTENING;
2160        rc        = listening_get_idx(seq, &pos);
2161
2162        if (!rc) {
2163                st->state = TCP_SEQ_STATE_ESTABLISHED;
2164                rc        = established_get_idx(seq, pos);
2165        }
2166
2167        return rc;
2168}
2169
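/*
 * Try to resume a /proc dump exactly where the previous read() stopped:
 * walk forward st->offset entries in the bucket recorded in st->bucket,
 * preserving st->num.  Returns NULL if the saved position no longer
 * exists, in which case the caller rescans from the start.
 */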
2170static void *tcp_seek_last_pos(struct seq_file *seq)
2171{
2172        struct tcp_iter_state *st = seq->private;
2173        int offset = st->offset;
2174        int orig_num = st->num;
2175        void *rc = NULL;
2176
2177        switch (st->state) {
2178        case TCP_SEQ_STATE_LISTENING:
2179                if (st->bucket >= INET_LHTABLE_SIZE)
2180                        break;
2181                st->state = TCP_SEQ_STATE_LISTENING;
2182                rc = listening_get_next(seq, NULL);
2183                while (offset-- && rc)
2184                        rc = listening_get_next(seq, rc);
2185                if (rc)
2186                        break;
2187                st->bucket = 0;
2188                st->state = TCP_SEQ_STATE_ESTABLISHED;
2189                /* Fallthrough */
2190        case TCP_SEQ_STATE_ESTABLISHED:
2191                if (st->bucket > tcp_hashinfo.ehash_mask)
2192                        break;
2193                rc = established_get_first(seq);
2194                while (offset-- && rc)
2195                        rc = established_get_next(seq, rc);
2196        }
2197
2198        st->num = orig_num;
2199
2200        return rc;
2201}
2202
2203void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2204{
2205        struct tcp_iter_state *st = seq->private;
2206        void *rc;
2207
2208        if (*pos && *pos == st->last_pos) {
2209                rc = tcp_seek_last_pos(seq);
2210                if (rc)
2211                        goto out;
2212        }
2213
2214        st->state = TCP_SEQ_STATE_LISTENING;
2215        st->num = 0;
2216        st->bucket = 0;
2217        st->offset = 0;
2218        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2219
2220out:
2221        st->last_pos = *pos;
2222        return rc;
2223}
2224EXPORT_SYMBOL(tcp_seq_start);
2225
2226void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2227{
2228        struct tcp_iter_state *st = seq->private;
2229        void *rc = NULL;
2230
2231        if (v == SEQ_START_TOKEN) {
2232                rc = tcp_get_idx(seq, 0);
2233                goto out;
2234        }
2235
2236        switch (st->state) {
2237        case TCP_SEQ_STATE_LISTENING:
2238                rc = listening_get_next(seq, v);
2239                if (!rc) {
2240                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2241                        st->bucket = 0;
2242                        st->offset = 0;
2243                        rc        = established_get_first(seq);
2244                }
2245                break;
2246        case TCP_SEQ_STATE_ESTABLISHED:
2247                rc = established_get_next(seq, v);
2248                break;
2249        }
2250out:
2251        ++*pos;
2252        st->last_pos = *pos;
2253        return rc;
2254}
2255EXPORT_SYMBOL(tcp_seq_next);
2256
2257void tcp_seq_stop(struct seq_file *seq, void *v)
2258{
2259        struct tcp_iter_state *st = seq->private;
2260
2261        switch (st->state) {
2262        case TCP_SEQ_STATE_LISTENING:
2263                if (v != SEQ_START_TOKEN)
2264                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2265                break;
2266        case TCP_SEQ_STATE_ESTABLISHED:
2267                if (v)
2268                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2269                break;
2270        }
2271}
2272EXPORT_SYMBOL(tcp_seq_stop);
2273
2274static void get_openreq4(const struct request_sock *req,
2275                         struct seq_file *f, int i)
2276{
2277        const struct inet_request_sock *ireq = inet_rsk(req);
2278        long delta = req->rsk_timer.expires - jiffies;
2279
2280        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2281                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2282                i,
2283                ireq->ir_loc_addr,
2284                ireq->ir_num,
2285                ireq->ir_rmt_addr,
2286                ntohs(ireq->ir_rmt_port),
2287                TCP_SYN_RECV,
2288                0, 0, /* could print option size, but that is af dependent. */
2289                1,    /* timers active (only the expire timer) */
2290                jiffies_delta_to_clock_t(delta),
2291                req->num_timeout,
2292                from_kuid_munged(seq_user_ns(f),
2293                                 sock_i_uid(req->rsk_listener)),
2294                0,  /* non-standard timer */
2295                0, /* open_requests have no inode */
2296                0,
2297                req);
2298}
2299
2300static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2301{
2302        int timer_active;
2303        unsigned long timer_expires;
2304        const struct tcp_sock *tp = tcp_sk(sk);
2305        const struct inet_connection_sock *icsk = inet_csk(sk);
2306        const struct inet_sock *inet = inet_sk(sk);
2307        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2308        __be32 dest = inet->inet_daddr;
2309        __be32 src = inet->inet_rcv_saddr;
2310        __u16 destp = ntohs(inet->inet_dport);
2311        __u16 srcp = ntohs(inet->inet_sport);
2312        int rx_queue;
2313        int state;
2314
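        /* Classify the pending timer for the "tr" field of /proc/net/tcp:
         * 1 - retransmit/loss-probe/reorder timer, 4 - zero window probe,
         * 2 - keepalive (sk_timer), 0 - nothing pending.  (Timewait
         * sockets report 3 from get_timewait4_sock().)
         */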
2315        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2316            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2317            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2318                timer_active    = 1;
2319                timer_expires   = icsk->icsk_timeout;
2320        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2321                timer_active    = 4;
2322                timer_expires   = icsk->icsk_timeout;
2323        } else if (timer_pending(&sk->sk_timer)) {
2324                timer_active    = 2;
2325                timer_expires   = sk->sk_timer.expires;
2326        } else {
2327                timer_active    = 0;
2328                timer_expires = jiffies;
2329        }
2330
2331        state = inet_sk_state_load(sk);
2332        if (state == TCP_LISTEN)
2333                rx_queue = sk->sk_ack_backlog;
2334        else
2335                /* Because we don't lock the socket,
2336                 * we might find a transient negative value.
2337                 */
2338                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2339
2340        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2341                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2342                i, src, srcp, dest, destp, state,
2343                tp->write_seq - tp->snd_una,
2344                rx_queue,
2345                timer_active,
2346                jiffies_delta_to_clock_t(timer_expires - jiffies),
2347                icsk->icsk_retransmits,
2348                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2349                icsk->icsk_probes_out,
2350                sock_i_ino(sk),
2351                refcount_read(&sk->sk_refcnt), sk,
2352                jiffies_to_clock_t(icsk->icsk_rto),
2353                jiffies_to_clock_t(icsk->icsk_ack.ato),
2354                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2355                tp->snd_cwnd,
2356                state == TCP_LISTEN ?
2357                    fastopenq->max_qlen :
2358                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2359}
2360
2361static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2362                               struct seq_file *f, int i)
2363{
2364        long delta = tw->tw_timer.expires - jiffies;
2365        __be32 dest, src;
2366        __u16 destp, srcp;
2367
2368        dest  = tw->tw_daddr;
2369        src   = tw->tw_rcv_saddr;
2370        destp = ntohs(tw->tw_dport);
2371        srcp  = ntohs(tw->tw_sport);
2372
2373        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2374                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2375                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2376                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2377                refcount_read(&tw->tw_refcnt), tw);
2378}
2379
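/*
 * Each record in /proc/net/tcp is padded to a fixed width of TMPSZ - 1
 * characters plus a newline (see seq_setwidth()/seq_pad() below), so
 * userspace can treat the file as fixed-size records.
 */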
2380#define TMPSZ 150
2381
2382static int tcp4_seq_show(struct seq_file *seq, void *v)
2383{
2384        struct tcp_iter_state *st;
2385        struct sock *sk = v;
2386
2387        seq_setwidth(seq, TMPSZ - 1);
2388        if (v == SEQ_START_TOKEN) {
2389                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2390                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2391                           "inode");
2392                goto out;
2393        }
2394        st = seq->private;
2395
2396        if (sk->sk_state == TCP_TIME_WAIT)
2397                get_timewait4_sock(v, seq, st->num);
2398        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2399                get_openreq4(v, seq, st->num);
2400        else
2401                get_tcp4_sock(v, seq, st->num);
2402out:
2403        seq_pad(seq, '\n');
2404        return 0;
2405}
2406
2407static const struct seq_operations tcp4_seq_ops = {
2408        .show           = tcp4_seq_show,
2409        .start          = tcp_seq_start,
2410        .next           = tcp_seq_next,
2411        .stop           = tcp_seq_stop,
2412};
2413
2414static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2415        .family         = AF_INET,
2416};
2417
2418static int __net_init tcp4_proc_init_net(struct net *net)
2419{
2420        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2421                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2422                return -ENOMEM;
2423        return 0;
2424}
2425
2426static void __net_exit tcp4_proc_exit_net(struct net *net)
2427{
2428        remove_proc_entry("tcp", net->proc_net);
2429}
2430
2431static struct pernet_operations tcp4_net_ops = {
2432        .init = tcp4_proc_init_net,
2433        .exit = tcp4_proc_exit_net,
2434};
2435
2436int __init tcp4_proc_init(void)
2437{
2438        return register_pernet_subsys(&tcp4_net_ops);
2439}
2440
2441void tcp4_proc_exit(void)
2442{
2443        unregister_pernet_subsys(&tcp4_net_ops);
2444}
2445#endif /* CONFIG_PROC_FS */
2446
2447struct proto tcp_prot = {
2448        .name                   = "TCP",
2449        .owner                  = THIS_MODULE,
2450        .close                  = tcp_close,
2451        .pre_connect            = tcp_v4_pre_connect,
2452        .connect                = tcp_v4_connect,
2453        .disconnect             = tcp_disconnect,
2454        .accept                 = inet_csk_accept,
2455        .ioctl                  = tcp_ioctl,
2456        .init                   = tcp_v4_init_sock,
2457        .destroy                = tcp_v4_destroy_sock,
2458        .shutdown               = tcp_shutdown,
2459        .setsockopt             = tcp_setsockopt,
2460        .getsockopt             = tcp_getsockopt,
2461        .keepalive              = tcp_set_keepalive,
2462        .recvmsg                = tcp_recvmsg,
2463        .sendmsg                = tcp_sendmsg,
2464        .sendpage               = tcp_sendpage,
2465        .backlog_rcv            = tcp_v4_do_rcv,
2466        .release_cb             = tcp_release_cb,
2467        .hash                   = inet_hash,
2468        .unhash                 = inet_unhash,
2469        .get_port               = inet_csk_get_port,
2470        .enter_memory_pressure  = tcp_enter_memory_pressure,
2471        .leave_memory_pressure  = tcp_leave_memory_pressure,
2472        .stream_memory_free     = tcp_stream_memory_free,
2473        .sockets_allocated      = &tcp_sockets_allocated,
2474        .orphan_count           = &tcp_orphan_count,
2475        .memory_allocated       = &tcp_memory_allocated,
2476        .memory_pressure        = &tcp_memory_pressure,
2477        .sysctl_mem             = sysctl_tcp_mem,
2478        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2479        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2480        .max_header             = MAX_TCP_HEADER,
2481        .obj_size               = sizeof(struct tcp_sock),
2482        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2483        .twsk_prot              = &tcp_timewait_sock_ops,
2484        .rsk_prot               = &tcp_request_sock_ops,
2485        .h.hashinfo             = &tcp_hashinfo,
2486        .no_autobind            = true,
2487#ifdef CONFIG_COMPAT
2488        .compat_setsockopt      = compat_tcp_setsockopt,
2489        .compat_getsockopt      = compat_tcp_getsockopt,
2490#endif
2491        .diag_destroy           = tcp_abort,
2492};
2493EXPORT_SYMBOL(tcp_prot);
2494
2495static void __net_exit tcp_sk_exit(struct net *net)
2496{
2497        int cpu;
2498
2499        module_put(net->ipv4.tcp_congestion_control->owner);
2500
2501        for_each_possible_cpu(cpu)
2502                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2503        free_percpu(net->ipv4.tcp_sk);
2504}
2505
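/*
 * Per-netns setup: create one control socket per possible CPU (used to
 * send RSTs and ACKs on behalf of sockets we do not own, e.g. from
 * tcp_v4_send_reset()) and initialize this namespace's TCP sysctl
 * defaults.
 */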
2506static int __net_init tcp_sk_init(struct net *net)
2507{
2508        int res, cpu, cnt;
2509
2510        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2511        if (!net->ipv4.tcp_sk)
2512                return -ENOMEM;
2513
2514        for_each_possible_cpu(cpu) {
2515                struct sock *sk;
2516
2517                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2518                                           IPPROTO_TCP, net);
2519                if (res)
2520                        goto fail;
2521                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2522
2523                /* Please enforce IP_DF and IPID==0 for RST and
2524                 * ACK sent in SYN-RECV and TIME-WAIT state.
2525                 */
2526                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2527
2528                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2529        }
2530
2531        net->ipv4.sysctl_tcp_ecn = 2;
2532        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2533
2534        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2535        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2536        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2537
2538        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2539        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2540        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2541
2542        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2543        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2544        net->ipv4.sysctl_tcp_syncookies = 1;
2545        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2546        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2547        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2548        net->ipv4.sysctl_tcp_orphan_retries = 0;
2549        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2550        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2551        net->ipv4.sysctl_tcp_tw_reuse = 2;
2552
2553        cnt = tcp_hashinfo.ehash_mask + 1;
2554        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2555        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2556
2557        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2558        net->ipv4.sysctl_tcp_sack = 1;
2559        net->ipv4.sysctl_tcp_window_scaling = 1;
2560        net->ipv4.sysctl_tcp_timestamps = 1;
2561        net->ipv4.sysctl_tcp_early_retrans = 3;
2562        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2563        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2564        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2565        net->ipv4.sysctl_tcp_max_reordering = 300;
2566        net->ipv4.sysctl_tcp_dsack = 1;
2567        net->ipv4.sysctl_tcp_app_win = 31;
2568        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2569        net->ipv4.sysctl_tcp_frto = 2;
2570        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2571        /* This limits the percentage of the congestion window which we
2572         * will allow a single TSO frame to consume.  Building TSO frames
2573         * which are too large can cause TCP streams to be bursty.
2574         */
2575        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2576        /* Default TSQ limit of four TSO segments */
2577        net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2578        /* rfc5961 challenge ack rate limiting */
2579        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2580        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2581        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2582        net->ipv4.sysctl_tcp_autocorking = 1;
2583        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2584        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2585        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2586        if (net != &init_net) {
2587                memcpy(net->ipv4.sysctl_tcp_rmem,
2588                       init_net.ipv4.sysctl_tcp_rmem,
2589                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2590                memcpy(net->ipv4.sysctl_tcp_wmem,
2591                       init_net.ipv4.sysctl_tcp_wmem,
2592                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2593        }
2594        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2595        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2596        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2597        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2598        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2599        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2600
2601        /* Reno is always built in */
2602        if (!net_eq(net, &init_net) &&
2603            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2604                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2605        else
2606                net->ipv4.tcp_congestion_control = &tcp_reno;
2607
2608        return 0;
2609fail:
2610        tcp_sk_exit(net);
2611
2612        return res;
2613}
2614
2615static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616{
2617        struct net *net;
2618
2619        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2620
2621        list_for_each_entry(net, net_exit_list, exit_list)
2622                tcp_fastopen_ctx_destroy(net);
2623}
2624
2625static struct pernet_operations __net_initdata tcp_sk_ops = {
2626       .init       = tcp_sk_init,
2627       .exit       = tcp_sk_exit,
2628       .exit_batch = tcp_sk_exit_batch,
2629};
2630
2631void __init tcp_v4_init(void)
2632{
2633        if (register_pernet_subsys(&tcp_sk_ops))
2634                panic("Failed to create the TCP control socket.\n");
2635}
2636