linux/net/ipv4/tcp_ipv4.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *              IPv4 specific functions
   9 *
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 *
  18 *      This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *              David S. Miller :       New socket lookup architecture.
  27 *                                      This code is dedicated to John Dyson.
  28 *              David S. Miller :       Change semantics of established hash,
  29 *                                      half is devoted to TIME_WAIT sockets
  30 *                                      and the rest go in the other half.
  31 *              Andi Kleen :            Add support for syncookies and fixed
  32 *                                      some bugs: ip options weren't passed to
  33 *                                      the TCP layer, missed a check for an
  34 *                                      ACK bit.
  35 *              Andi Kleen :            Implemented fast path mtu discovery.
  36 *                                      Fixed many serious bugs in the
  37 *                                      request_sock handling and moved
  38 *                                      most of it into the af independent code.
  39 *                                      Added tail drop and some other bugfixes.
  40 *                                      Added new listen semantics.
  41 *              Mike McLagan    :       Routing by source
  42 *      Juan Jose Ciarlante:            ip_dynaddr bits
  43 *              Andi Kleen:             various fixes.
  44 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45 *                                      coma.
  46 *      Andi Kleen              :       Fix new listen.
  47 *      Andi Kleen              :       Fix accept error reporting.
  48 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50 *                                      a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83#include <linux/inetdevice.h>
  84
  85#include <crypto/hash.h>
  86#include <linux/scatterlist.h>
  87
  88#include <trace/events/tcp.h>
  89
  90#ifdef CONFIG_TCP_MD5SIG
  91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  92                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  93#endif
  94
  95struct inet_hashinfo tcp_hashinfo;
  96EXPORT_SYMBOL(tcp_hashinfo);
  97
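     /* Derive a connection's initial sequence number from the addresses and
      * ports carried in the given skb via secure_tcp_seq().
      */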
  98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99{
 100        return secure_tcp_seq(ip_hdr(skb)->daddr,
 101                              ip_hdr(skb)->saddr,
 102                              tcp_hdr(skb)->dest,
 103                              tcp_hdr(skb)->source);
 104}
 105
 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107{
 108        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109}
 110
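     /* Decide whether the connecting socket sk may safely take over the
      * four-tuple held by the TIME-WAIT socket sktw.  Returns 1 (holding a
      * reference on sktw) when reuse is safe, 0 otherwise.
      */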
 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112{
 113        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115        struct tcp_sock *tp = tcp_sk(sk);
 116        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
 118        if (reuse == 2) {
 119                /* Still does not detect *everything* that goes through
 120                 * lo, since we require a loopback src or dst address
 121                 * or direct binding to 'lo' interface.
 122                 */
 123                bool loopback = false;
 124                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125                        loopback = true;
 126#if IS_ENABLED(CONFIG_IPV6)
 127                if (tw->tw_family == AF_INET6) {
 128                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134                                loopback = true;
 135                } else
 136#endif
 137                {
 138                        if (ipv4_is_loopback(tw->tw_daddr) ||
 139                            ipv4_is_loopback(tw->tw_rcv_saddr))
 140                                loopback = true;
 141                }
 142                if (!loopback)
 143                        reuse = 0;
 144        }
 145
 146        /* With PAWS, it is safe from the viewpoint
 147           of data integrity. Even without PAWS it is safe provided sequence
  148           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
  149
  150           Actually, the idea is close to VJ's one, only the timestamp cache is
  151           held not per host but per port pair, and the TW bucket is used as
  152           the state holder.
  153
  154           If the TW bucket has already been destroyed we fall back to VJ's
  155           scheme and use the initial timestamp retrieved from the peer table.
 156         */
 157        if (tcptw->tw_ts_recent_stamp &&
 158            (!twp || (reuse && time_after32(ktime_get_seconds(),
 159                                            tcptw->tw_ts_recent_stamp)))) {
 160                /* In case of repair and re-using TIME-WAIT sockets we still
 161                 * want to be sure that it is safe as above but honor the
 162                 * sequence numbers and time stamps set as part of the repair
 163                 * process.
 164                 *
 165                 * Without this check re-using a TIME-WAIT socket with TCP
 166                 * repair would accumulate a -1 on the repair assigned
 167                 * sequence number. The first time it is reused the sequence
 168                 * is -1, the second time -2, etc. This fixes that issue
 169                 * without appearing to create any others.
 170                 */
 171                if (likely(!tp->repair)) {
 172                        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 173                        if (tp->write_seq == 0)
 174                                tp->write_seq = 1;
 175                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 176                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 177                }
 178                sock_hold(sktw);
 179                return 1;
 180        }
 181
 182        return 0;
 183}
 184EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 185
 186static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 187                              int addr_len)
 188{
 189        /* This check is replicated from tcp_v4_connect() and intended to
  190         * prevent the BPF program called below from accessing bytes that are
  191         * outside the bound specified by the user in addr_len.
 192         */
 193        if (addr_len < sizeof(struct sockaddr_in))
 194                return -EINVAL;
 195
 196        sock_owned_by_me(sk);
 197
 198        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 199}
 200
 201/* This will initiate an outgoing connection. */
 202int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 203{
 204        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 205        struct inet_sock *inet = inet_sk(sk);
 206        struct tcp_sock *tp = tcp_sk(sk);
 207        __be16 orig_sport, orig_dport;
 208        __be32 daddr, nexthop;
 209        struct flowi4 *fl4;
 210        struct rtable *rt;
 211        int err;
 212        struct ip_options_rcu *inet_opt;
 213        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 214
 215        if (addr_len < sizeof(struct sockaddr_in))
 216                return -EINVAL;
 217
 218        if (usin->sin_family != AF_INET)
 219                return -EAFNOSUPPORT;
 220
 221        nexthop = daddr = usin->sin_addr.s_addr;
 222        inet_opt = rcu_dereference_protected(inet->inet_opt,
 223                                             lockdep_sock_is_held(sk));
 224        if (inet_opt && inet_opt->opt.srr) {
 225                if (!daddr)
 226                        return -EINVAL;
 227                nexthop = inet_opt->opt.faddr;
 228        }
 229
 230        orig_sport = inet->inet_sport;
 231        orig_dport = usin->sin_port;
 232        fl4 = &inet->cork.fl.u.ip4;
 233        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 235                              IPPROTO_TCP,
 236                              orig_sport, orig_dport, sk);
 237        if (IS_ERR(rt)) {
 238                err = PTR_ERR(rt);
 239                if (err == -ENETUNREACH)
 240                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 241                return err;
 242        }
 243
 244        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 245                ip_rt_put(rt);
 246                return -ENETUNREACH;
 247        }
 248
 249        if (!inet_opt || !inet_opt->opt.srr)
 250                daddr = fl4->daddr;
 251
 252        if (!inet->inet_saddr)
 253                inet->inet_saddr = fl4->saddr;
 254        sk_rcv_saddr_set(sk, inet->inet_saddr);
 255
 256        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 257                /* Reset inherited state */
 258                tp->rx_opt.ts_recent       = 0;
 259                tp->rx_opt.ts_recent_stamp = 0;
 260                if (likely(!tp->repair))
 261                        tp->write_seq      = 0;
 262        }
 263
 264        inet->inet_dport = usin->sin_port;
 265        sk_daddr_set(sk, daddr);
 266
 267        inet_csk(sk)->icsk_ext_hdr_len = 0;
 268        if (inet_opt)
 269                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 270
 271        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 272
 273        /* Socket identity is still unknown (sport may be zero).
  274         * However, we set the state to SYN-SENT and, without releasing the
  275         * socket lock, select a source port, enter ourselves into the hash
  276         * tables and complete initialization after this.
 277         */
 278        tcp_set_state(sk, TCP_SYN_SENT);
 279        err = inet_hash_connect(tcp_death_row, sk);
 280        if (err)
 281                goto failure;
 282
 283        sk_set_txhash(sk);
 284
 285        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 286                               inet->inet_sport, inet->inet_dport, sk);
 287        if (IS_ERR(rt)) {
 288                err = PTR_ERR(rt);
 289                rt = NULL;
 290                goto failure;
 291        }
 292        /* OK, now commit destination to socket.  */
 293        sk->sk_gso_type = SKB_GSO_TCPV4;
 294        sk_setup_caps(sk, &rt->dst);
 295        rt = NULL;
 296
 297        if (likely(!tp->repair)) {
 298                if (!tp->write_seq)
 299                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 300                                                       inet->inet_daddr,
 301                                                       inet->inet_sport,
 302                                                       usin->sin_port);
 303                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 304                                                 inet->inet_saddr,
 305                                                 inet->inet_daddr);
 306        }
 307
 308        inet->inet_id = prandom_u32();
 309
 310        if (tcp_fastopen_defer_connect(sk, &err))
 311                return err;
 312        if (err)
 313                goto failure;
 314
 315        err = tcp_connect(sk);
 316
 317        if (err)
 318                goto failure;
 319
 320        return 0;
 321
 322failure:
 323        /*
 324         * This unhashes the socket and releases the local port,
 325         * if necessary.
 326         */
 327        tcp_set_state(sk, TCP_CLOSE);
 328        ip_rt_put(rt);
 329        sk->sk_route_caps = 0;
 330        inet->inet_dport = 0;
 331        return err;
 332}
 333EXPORT_SYMBOL(tcp_v4_connect);
 334
 335/*
 336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  337 * It can be called through tcp_release_cb() if the socket was owned by the
  338 * user at the time tcp_v4_err() was called to handle the ICMP message.
 339 */
 340void tcp_v4_mtu_reduced(struct sock *sk)
 341{
 342        struct inet_sock *inet = inet_sk(sk);
 343        struct dst_entry *dst;
 344        u32 mtu;
 345
 346        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347                return;
 348        mtu = tcp_sk(sk)->mtu_info;
 349        dst = inet_csk_update_pmtu(sk, mtu);
 350        if (!dst)
 351                return;
 352
  353        /* Something is about to go wrong... Remember the soft error
  354         * in case this connection is not able to recover.
 355         */
 356        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357                sk->sk_err_soft = EMSGSIZE;
 358
 359        mtu = dst_mtu(dst);
 360
 361        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362            ip_sk_accept_pmtu(sk) &&
 363            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364                tcp_sync_mss(sk, mtu);
 365
 366                /* Resend the TCP packet because it's
 367                 * clear that the old packet has been
 368                 * dropped. This is the new "fast" path mtu
 369                 * discovery.
 370                 */
 371                tcp_simple_retransmit(sk);
 372        } /* else let the usual retransmit timer handle it */
 373}
 374EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 375
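     /* Handle an ICMP redirect: revalidate the socket's cached dst and, if it
      * is still in place, hand the skb to the dst's redirect handler.
      */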
 376static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377{
 378        struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380        if (dst)
 381                dst->ops->redirect(dst, sk, skb);
 382}
 383
 384
 385/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 386void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387{
 388        struct request_sock *req = inet_reqsk(sk);
 389        struct net *net = sock_net(sk);
 390
 391        /* ICMPs are not backlogged, hence we cannot get
 392         * an established socket here.
 393         */
 394        if (seq != tcp_rsk(req)->snt_isn) {
 395                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396        } else if (abort) {
 397                /*
 398                 * Still in SYN_RECV, just remove it silently.
 399                 * There is no good way to pass the error to the newly
 400                 * created socket, and POSIX does not want network
 401                 * errors returned from accept().
 402                 */
 403                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404                tcp_listendrop(req->rsk_listener);
 405        }
 406        reqsk_put(req);
 407}
 408EXPORT_SYMBOL(tcp_req_err);
 409
 410/*
 411 * This routine is called by the ICMP module when it gets some
 412 * sort of error condition.  If err < 0 then the socket should
 413 * be closed and the error returned to the user.  If err > 0
 414 * it's just the icmp type << 8 | icmp code.  After adjustment
 415 * header points to the first 8 bytes of the tcp header.  We need
 416 * to find the appropriate port.
 417 *
 418 * The locking strategy used here is very "optimistic". When
 419 * someone else accesses the socket the ICMP is just dropped
 420 * and for some paths there is no check at all.
 421 * A more general error queue to queue errors for later handling
 422 * is probably better.
 423 *
 424 */
 425
 426void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427{
 428        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430        struct inet_connection_sock *icsk;
 431        struct tcp_sock *tp;
 432        struct inet_sock *inet;
 433        const int type = icmp_hdr(icmp_skb)->type;
 434        const int code = icmp_hdr(icmp_skb)->code;
 435        struct sock *sk;
 436        struct sk_buff *skb;
 437        struct request_sock *fastopen;
 438        u32 seq, snd_una;
 439        s32 remaining;
 440        u32 delta_us;
 441        int err;
 442        struct net *net = dev_net(icmp_skb->dev);
 443
 444        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445                                       th->dest, iph->saddr, ntohs(th->source),
 446                                       inet_iif(icmp_skb), 0);
 447        if (!sk) {
 448                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449                return;
 450        }
 451        if (sk->sk_state == TCP_TIME_WAIT) {
 452                inet_twsk_put(inet_twsk(sk));
 453                return;
 454        }
 455        seq = ntohl(th->seq);
 456        if (sk->sk_state == TCP_NEW_SYN_RECV)
 457                return tcp_req_err(sk, seq,
 458                                  type == ICMP_PARAMETERPROB ||
 459                                  type == ICMP_TIME_EXCEEDED ||
 460                                  (type == ICMP_DEST_UNREACH &&
 461                                   (code == ICMP_NET_UNREACH ||
 462                                    code == ICMP_HOST_UNREACH)));
 463
 464        bh_lock_sock(sk);
 465        /* If too many ICMPs get dropped on busy
 466         * servers this needs to be solved differently.
  467         * We do take care of the PMTU discovery (RFC 1191) special case:
  468         * we can receive locally generated ICMP messages while the socket is held.
 469         */
 470        if (sock_owned_by_user(sk)) {
 471                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 472                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 473        }
 474        if (sk->sk_state == TCP_CLOSE)
 475                goto out;
 476
 477        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 478                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 479                goto out;
 480        }
 481
 482        icsk = inet_csk(sk);
 483        tp = tcp_sk(sk);
  484        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 485        fastopen = rcu_dereference(tp->fastopen_rsk);
 486        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 487        if (sk->sk_state != TCP_LISTEN &&
 488            !between(seq, snd_una, tp->snd_nxt)) {
 489                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 490                goto out;
 491        }
 492
 493        switch (type) {
 494        case ICMP_REDIRECT:
 495                if (!sock_owned_by_user(sk))
 496                        do_redirect(icmp_skb, sk);
 497                goto out;
 498        case ICMP_SOURCE_QUENCH:
 499                /* Just silently ignore these. */
 500                goto out;
 501        case ICMP_PARAMETERPROB:
 502                err = EPROTO;
 503                break;
 504        case ICMP_DEST_UNREACH:
 505                if (code > NR_ICMP_UNREACH)
 506                        goto out;
 507
 508                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 509                        /* We are not interested in TCP_LISTEN and open_requests
  510                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 511                         * they should go through unfragmented).
 512                         */
 513                        if (sk->sk_state == TCP_LISTEN)
 514                                goto out;
 515
 516                        tp->mtu_info = info;
 517                        if (!sock_owned_by_user(sk)) {
 518                                tcp_v4_mtu_reduced(sk);
 519                        } else {
 520                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 521                                        sock_hold(sk);
 522                        }
 523                        goto out;
 524                }
 525
 526                err = icmp_err_convert[code].errno;
 527                /* check if icmp_skb allows revert of backoff
 528                 * (see draft-zimmermann-tcp-lcd) */
 529                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 530                        break;
 531                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 532                    !icsk->icsk_backoff || fastopen)
 533                        break;
 534
 535                if (sock_owned_by_user(sk))
 536                        break;
 537
 538                skb = tcp_rtx_queue_head(sk);
 539                if (WARN_ON_ONCE(!skb))
 540                        break;
 541
 542                icsk->icsk_backoff--;
 543                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 544                                               TCP_TIMEOUT_INIT;
 545                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 546
 547
 548                tcp_mstamp_refresh(tp);
 549                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
 550                remaining = icsk->icsk_rto -
 551                            usecs_to_jiffies(delta_us);
 552
 553                if (remaining > 0) {
 554                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 555                                                  remaining, TCP_RTO_MAX);
 556                } else {
 557                        /* RTO revert clocked out retransmission.
 558                         * Will retransmit now */
 559                        tcp_retransmit_timer(sk);
 560                }
 561
 562                break;
 563        case ICMP_TIME_EXCEEDED:
 564                err = EHOSTUNREACH;
 565                break;
 566        default:
 567                goto out;
 568        }
 569
 570        switch (sk->sk_state) {
 571        case TCP_SYN_SENT:
 572        case TCP_SYN_RECV:
  573                /* Only in fast or simultaneous open. If a fast open socket
  574                 * is already accepted it is treated as a connected one below.
 575                 */
 576                if (fastopen && !fastopen->sk)
 577                        break;
 578
 579                if (!sock_owned_by_user(sk)) {
 580                        sk->sk_err = err;
 581
 582                        sk->sk_error_report(sk);
 583
 584                        tcp_done(sk);
 585                } else {
 586                        sk->sk_err_soft = err;
 587                }
 588                goto out;
 589        }
 590
 591        /* If we've already connected we will keep trying
 592         * until we time out, or the user gives up.
 593         *
  594         * rfc1122 4.2.3.9 allows us to consider as hard errors
 595         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 596         * but it is obsoleted by pmtu discovery).
 597         *
  598         * Note that in the modern internet, where routing is unreliable
  599         * and broken firewalls sit in each dark corner, sending random
  600         * errors ordered by their masters, even these two messages finally lose
  601         * their original sense (even Linux sends invalid PORT_UNREACHs).
 602         *
 603         * Now we are in compliance with RFCs.
 604         *                                                      --ANK (980905)
 605         */
 606
 607        inet = inet_sk(sk);
 608        if (!sock_owned_by_user(sk) && inet->recverr) {
 609                sk->sk_err = err;
 610                sk->sk_error_report(sk);
 611        } else  { /* Only an error on timeout */
 612                sk->sk_err_soft = err;
 613        }
 614
 615out:
 616        bh_unlock_sock(sk);
 617        sock_put(sk);
 618}
 619
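     /* Set up checksum offload fields: store the pseudo-header checksum in
      * th->check and record where the final TCP checksum must be written.
      */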
 620void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 621{
 622        struct tcphdr *th = tcp_hdr(skb);
 623
 624        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 625        skb->csum_start = skb_transport_header(skb) - skb->head;
 626        skb->csum_offset = offsetof(struct tcphdr, check);
 627}
 628
 629/* This routine computes an IPv4 TCP checksum. */
 630void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 631{
 632        const struct inet_sock *inet = inet_sk(sk);
 633
 634        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 635}
 636EXPORT_SYMBOL(tcp_v4_send_check);
 637
 638/*
 639 *      This routine will send an RST to the other tcp.
 640 *
  641 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  642 *                    for the reset?
  643 *      Answer: if a packet caused an RST, it is not for a socket
  644 *              existing in our system; if it is matched to a socket,
  645 *              it is just a duplicate segment or a bug in the other side's TCP.
  646 *              So we build the reply based only on the parameters that
  647 *              arrived with the segment.
 648 *      Exception: precedence violation. We do not implement it in any case.
 649 */
 650
 651static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 652{
 653        const struct tcphdr *th = tcp_hdr(skb);
 654        struct {
 655                struct tcphdr th;
 656#ifdef CONFIG_TCP_MD5SIG
 657                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 658#endif
 659        } rep;
 660        struct ip_reply_arg arg;
 661#ifdef CONFIG_TCP_MD5SIG
 662        struct tcp_md5sig_key *key = NULL;
 663        const __u8 *hash_location = NULL;
 664        unsigned char newhash[16];
 665        int genhash;
 666        struct sock *sk1 = NULL;
 667#endif
 668        struct net *net;
 669        struct sock *ctl_sk;
 670
 671        /* Never send a reset in response to a reset. */
 672        if (th->rst)
 673                return;
 674
  675        /* If sk is not NULL, it means we did a successful lookup and the
  676         * incoming route had to be correct. prequeue might have dropped our dst.
 677         */
 678        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 679                return;
 680
 681        /* Swap the send and the receive. */
 682        memset(&rep, 0, sizeof(rep));
 683        rep.th.dest   = th->source;
 684        rep.th.source = th->dest;
 685        rep.th.doff   = sizeof(struct tcphdr) / 4;
 686        rep.th.rst    = 1;
 687
 688        if (th->ack) {
 689                rep.th.seq = th->ack_seq;
 690        } else {
 691                rep.th.ack = 1;
 692                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 693                                       skb->len - (th->doff << 2));
 694        }
 695
 696        memset(&arg, 0, sizeof(arg));
 697        arg.iov[0].iov_base = (unsigned char *)&rep;
 698        arg.iov[0].iov_len  = sizeof(rep.th);
 699
 700        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 701#ifdef CONFIG_TCP_MD5SIG
 702        rcu_read_lock();
 703        hash_location = tcp_parse_md5sig_option(th);
 704        if (sk && sk_fullsock(sk)) {
 705                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 706                                        &ip_hdr(skb)->saddr, AF_INET);
 707        } else if (hash_location) {
 708                /*
  709                 * The active side is lost. Try to find the listening socket
  710                 * through the source port, and then find the md5 key through
  711                 * the listening socket. We do not loosen security here:
  712                 * the incoming packet is checked against the md5 hash of the
  713                 * key we find, and no RST is generated if the hash doesn't match.
 714                 */
 715                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 716                                             ip_hdr(skb)->saddr,
 717                                             th->source, ip_hdr(skb)->daddr,
 718                                             ntohs(th->source), inet_iif(skb),
 719                                             tcp_v4_sdif(skb));
 720                /* don't send rst if it can't find key */
 721                if (!sk1)
 722                        goto out;
 723
 724                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 725                                        &ip_hdr(skb)->saddr, AF_INET);
 726                if (!key)
 727                        goto out;
 728
 729
 730                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 731                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 732                        goto out;
 733
 734        }
 735
 736        if (key) {
 737                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 738                                   (TCPOPT_NOP << 16) |
 739                                   (TCPOPT_MD5SIG << 8) |
 740                                   TCPOLEN_MD5SIG);
 741                /* Update length and the length the header thinks exists */
 742                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 743                rep.th.doff = arg.iov[0].iov_len / 4;
 744
 745                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 746                                     key, ip_hdr(skb)->saddr,
 747                                     ip_hdr(skb)->daddr, &rep.th);
 748        }
 749#endif
 750        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 751                                      ip_hdr(skb)->saddr, /* XXX */
 752                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 753        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 754        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 755
  756        /* When the socket is gone, all binding information is lost and
  757         * routing might fail in this case. No choice here: if we choose to force
  758         * the input interface, we will misroute in the case of an asymmetric route.
 759         */
 760        if (sk) {
 761                arg.bound_dev_if = sk->sk_bound_dev_if;
 762                if (sk_fullsock(sk))
 763                        trace_tcp_send_reset(sk, skb);
 764        }
 765
 766        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 767                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 768
 769        arg.tos = ip_hdr(skb)->tos;
 770        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 771        local_bh_disable();
 772        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 773        if (sk)
 774                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 775                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 776        ip_send_unicast_reply(ctl_sk,
 777                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 778                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 779                              &arg, arg.iov[0].iov_len);
 780
 781        ctl_sk->sk_mark = 0;
 782        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 783        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 784        local_bh_enable();
 785
 786#ifdef CONFIG_TCP_MD5SIG
 787out:
 788        rcu_read_unlock();
 789#endif
 790}
 791
  792/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  793   outside socket context, is certainly ugly. What can I do?
 794 */
 795
 796static void tcp_v4_send_ack(const struct sock *sk,
 797                            struct sk_buff *skb, u32 seq, u32 ack,
 798                            u32 win, u32 tsval, u32 tsecr, int oif,
 799                            struct tcp_md5sig_key *key,
 800                            int reply_flags, u8 tos)
 801{
 802        const struct tcphdr *th = tcp_hdr(skb);
 803        struct {
 804                struct tcphdr th;
 805                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 806#ifdef CONFIG_TCP_MD5SIG
 807                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 808#endif
 809                        ];
 810        } rep;
 811        struct net *net = sock_net(sk);
 812        struct ip_reply_arg arg;
 813        struct sock *ctl_sk;
 814
 815        memset(&rep.th, 0, sizeof(struct tcphdr));
 816        memset(&arg, 0, sizeof(arg));
 817
 818        arg.iov[0].iov_base = (unsigned char *)&rep;
 819        arg.iov[0].iov_len  = sizeof(rep.th);
 820        if (tsecr) {
 821                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 822                                   (TCPOPT_TIMESTAMP << 8) |
 823                                   TCPOLEN_TIMESTAMP);
 824                rep.opt[1] = htonl(tsval);
 825                rep.opt[2] = htonl(tsecr);
 826                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 827        }
 828
 829        /* Swap the send and the receive. */
 830        rep.th.dest    = th->source;
 831        rep.th.source  = th->dest;
 832        rep.th.doff    = arg.iov[0].iov_len / 4;
 833        rep.th.seq     = htonl(seq);
 834        rep.th.ack_seq = htonl(ack);
 835        rep.th.ack     = 1;
 836        rep.th.window  = htons(win);
 837
 838#ifdef CONFIG_TCP_MD5SIG
 839        if (key) {
 840                int offset = (tsecr) ? 3 : 0;
 841
 842                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 843                                          (TCPOPT_NOP << 16) |
 844                                          (TCPOPT_MD5SIG << 8) |
 845                                          TCPOLEN_MD5SIG);
 846                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 847                rep.th.doff = arg.iov[0].iov_len/4;
 848
 849                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 850                                    key, ip_hdr(skb)->saddr,
 851                                    ip_hdr(skb)->daddr, &rep.th);
 852        }
 853#endif
 854        arg.flags = reply_flags;
 855        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 856                                      ip_hdr(skb)->saddr, /* XXX */
 857                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 858        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 859        if (oif)
 860                arg.bound_dev_if = oif;
 861        arg.tos = tos;
 862        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 863        local_bh_disable();
 864        ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 865        if (sk)
 866                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 867                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 868        ip_send_unicast_reply(ctl_sk,
 869                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 870                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 871                              &arg, arg.iov[0].iov_len);
 872
 873        ctl_sk->sk_mark = 0;
 874        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 875        local_bh_enable();
 876}
 877
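     /* Send an ACK on behalf of a TIME-WAIT socket, echoing the sequence and
      * timestamp state recorded in the timewait bucket, then drop its reference.
      */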
 878static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 879{
 880        struct inet_timewait_sock *tw = inet_twsk(sk);
 881        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 882
 883        tcp_v4_send_ack(sk, skb,
 884                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 885                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 886                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 887                        tcptw->tw_ts_recent,
 888                        tw->tw_bound_dev_if,
 889                        tcp_twsk_md5_key(tcptw),
 890                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 891                        tw->tw_tos
 892                        );
 893
 894        inet_twsk_put(tw);
 895}
 896
 897static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 898                                  struct request_sock *req)
 899{
 900        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 901         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 902         */
 903        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 904                                             tcp_sk(sk)->snd_nxt;
 905
 906        /* RFC 7323 2.3
 907         * The window field (SEG.WND) of every outgoing segment, with the
 908         * exception of <SYN> segments, MUST be right-shifted by
 909         * Rcv.Wind.Shift bits:
 910         */
 911        tcp_v4_send_ack(sk, skb, seq,
 912                        tcp_rsk(req)->rcv_nxt,
 913                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 914                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 915                        req->ts_recent,
 916                        0,
 917                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 918                                          AF_INET),
 919                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 920                        ip_hdr(skb)->tos);
 921}
 922
 923/*
 924 *      Send a SYN-ACK after having received a SYN.
 925 *      This still operates on a request_sock only, not on a big
 926 *      socket.
 927 */
 928static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 929                              struct flowi *fl,
 930                              struct request_sock *req,
 931                              struct tcp_fastopen_cookie *foc,
 932                              enum tcp_synack_type synack_type)
 933{
 934        const struct inet_request_sock *ireq = inet_rsk(req);
 935        struct flowi4 fl4;
 936        int err = -1;
 937        struct sk_buff *skb;
 938
 939        /* First, grab a route. */
 940        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 941                return -1;
 942
 943        skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 944
 945        if (skb) {
 946                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 947
 948                rcu_read_lock();
 949                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 950                                            ireq->ir_rmt_addr,
 951                                            rcu_dereference(ireq->ireq_opt));
 952                rcu_read_unlock();
 953                err = net_xmit_eval(err);
 954        }
 955
 956        return err;
 957}
 958
 959/*
 960 *      IPv4 request_sock destructor.
 961 */
 962static void tcp_v4_reqsk_destructor(struct request_sock *req)
 963{
 964        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 965}
 966
 967#ifdef CONFIG_TCP_MD5SIG
 968/*
 969 * RFC2385 MD5 checksumming requires a mapping of
 970 * IP address->MD5 Key.
 971 * We need to maintain these in the sk structure.
 972 */
 973
 974/* Find the Key structure for an address.  */
 975struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 976                                         const union tcp_md5_addr *addr,
 977                                         int family)
 978{
 979        const struct tcp_sock *tp = tcp_sk(sk);
 980        struct tcp_md5sig_key *key;
 981        const struct tcp_md5sig_info *md5sig;
 982        __be32 mask;
 983        struct tcp_md5sig_key *best_match = NULL;
 984        bool match;
 985
 986        /* caller either holds rcu_read_lock() or socket lock */
 987        md5sig = rcu_dereference_check(tp->md5sig_info,
 988                                       lockdep_sock_is_held(sk));
 989        if (!md5sig)
 990                return NULL;
 991
 992        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 993                if (key->family != family)
 994                        continue;
 995
 996                if (family == AF_INET) {
 997                        mask = inet_make_mask(key->prefixlen);
 998                        match = (key->addr.a4.s_addr & mask) ==
 999                                (addr->a4.s_addr & mask);
1000#if IS_ENABLED(CONFIG_IPV6)
1001                } else if (family == AF_INET6) {
1002                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1003                                                  key->prefixlen);
1004#endif
1005                } else {
1006                        match = false;
1007                }
1008
1009                if (match && (!best_match ||
1010                              key->prefixlen > best_match->prefixlen))
1011                        best_match = key;
1012        }
1013        return best_match;
1014}
1015EXPORT_SYMBOL(tcp_md5_do_lookup);
1016
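     /* Like tcp_md5_do_lookup(), but require an exact address and prefix
      * length match rather than the longest-prefix match above.
      */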
1017static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1018                                                      const union tcp_md5_addr *addr,
1019                                                      int family, u8 prefixlen)
1020{
1021        const struct tcp_sock *tp = tcp_sk(sk);
1022        struct tcp_md5sig_key *key;
1023        unsigned int size = sizeof(struct in_addr);
1024        const struct tcp_md5sig_info *md5sig;
1025
1026        /* caller either holds rcu_read_lock() or socket lock */
1027        md5sig = rcu_dereference_check(tp->md5sig_info,
1028                                       lockdep_sock_is_held(sk));
1029        if (!md5sig)
1030                return NULL;
1031#if IS_ENABLED(CONFIG_IPV6)
1032        if (family == AF_INET6)
1033                size = sizeof(struct in6_addr);
1034#endif
1035        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1036                if (key->family != family)
1037                        continue;
1038                if (!memcmp(&key->addr, addr, size) &&
1039                    key->prefixlen == prefixlen)
1040                        return key;
1041        }
1042        return NULL;
1043}
1044
1045struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1046                                         const struct sock *addr_sk)
1047{
1048        const union tcp_md5_addr *addr;
1049
1050        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1051        return tcp_md5_do_lookup(sk, addr, AF_INET);
1052}
1053EXPORT_SYMBOL(tcp_v4_md5_lookup);
1054
1055/* This can be called on a newly created socket, from other files */
1056int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1057                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1058                   gfp_t gfp)
1059{
1060        /* Add Key to the list */
1061        struct tcp_md5sig_key *key;
1062        struct tcp_sock *tp = tcp_sk(sk);
1063        struct tcp_md5sig_info *md5sig;
1064
1065        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1066        if (key) {
1067                /* Pre-existing entry - just update that one. */
1068                memcpy(key->key, newkey, newkeylen);
1069                key->keylen = newkeylen;
1070                return 0;
1071        }
1072
1073        md5sig = rcu_dereference_protected(tp->md5sig_info,
1074                                           lockdep_sock_is_held(sk));
1075        if (!md5sig) {
1076                md5sig = kmalloc(sizeof(*md5sig), gfp);
1077                if (!md5sig)
1078                        return -ENOMEM;
1079
1080                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1081                INIT_HLIST_HEAD(&md5sig->head);
1082                rcu_assign_pointer(tp->md5sig_info, md5sig);
1083        }
1084
1085        key = sock_kmalloc(sk, sizeof(*key), gfp);
1086        if (!key)
1087                return -ENOMEM;
1088        if (!tcp_alloc_md5sig_pool()) {
1089                sock_kfree_s(sk, key, sizeof(*key));
1090                return -ENOMEM;
1091        }
1092
1093        memcpy(key->key, newkey, newkeylen);
1094        key->keylen = newkeylen;
1095        key->family = family;
1096        key->prefixlen = prefixlen;
1097        memcpy(&key->addr, addr,
1098               (family == AF_INET6) ? sizeof(struct in6_addr) :
1099                                      sizeof(struct in_addr));
1100        hlist_add_head_rcu(&key->node, &md5sig->head);
1101        return 0;
1102}
1103EXPORT_SYMBOL(tcp_md5_do_add);
1104
1105int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1106                   u8 prefixlen)
1107{
1108        struct tcp_md5sig_key *key;
1109
1110        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1111        if (!key)
1112                return -ENOENT;
1113        hlist_del_rcu(&key->node);
1114        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1115        kfree_rcu(key, rcu);
1116        return 0;
1117}
1118EXPORT_SYMBOL(tcp_md5_do_del);
1119
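     /* Remove and free every MD5 key configured on the socket. */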
1120static void tcp_clear_md5_list(struct sock *sk)
1121{
1122        struct tcp_sock *tp = tcp_sk(sk);
1123        struct tcp_md5sig_key *key;
1124        struct hlist_node *n;
1125        struct tcp_md5sig_info *md5sig;
1126
1127        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1128
1129        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1130                hlist_del_rcu(&key->node);
1131                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1132                kfree_rcu(key, rcu);
1133        }
1134}
1135
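     /* setsockopt() handler for TCP_MD5SIG/TCP_MD5SIG_EXT: copy the request
      * from user space, validate it, then add the key or, if the key length
      * is zero, delete it.
      */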
1136static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1137                                 char __user *optval, int optlen)
1138{
1139        struct tcp_md5sig cmd;
1140        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1141        u8 prefixlen = 32;
1142
1143        if (optlen < sizeof(cmd))
1144                return -EINVAL;
1145
1146        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1147                return -EFAULT;
1148
1149        if (sin->sin_family != AF_INET)
1150                return -EINVAL;
1151
1152        if (optname == TCP_MD5SIG_EXT &&
1153            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1154                prefixlen = cmd.tcpm_prefixlen;
1155                if (prefixlen > 32)
1156                        return -EINVAL;
1157        }
1158
1159        if (!cmd.tcpm_keylen)
1160                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1161                                      AF_INET, prefixlen);
1162
1163        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1164                return -EINVAL;
1165
1166        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1167                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1168                              GFP_KERNEL);
1169}
1170
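     /* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
      * field zeroed) into the MD5 request of the given tcp_md5sig_pool.
      */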
1171static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1172                                   __be32 daddr, __be32 saddr,
1173                                   const struct tcphdr *th, int nbytes)
1174{
1175        struct tcp4_pseudohdr *bp;
1176        struct scatterlist sg;
1177        struct tcphdr *_th;
1178
1179        bp = hp->scratch;
1180        bp->saddr = saddr;
1181        bp->daddr = daddr;
1182        bp->pad = 0;
1183        bp->protocol = IPPROTO_TCP;
1184        bp->len = cpu_to_be16(nbytes);
1185
1186        _th = (struct tcphdr *)(bp + 1);
1187        memcpy(_th, th, sizeof(*th));
1188        _th->check = 0;
1189
1190        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1191        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1192                                sizeof(*bp) + sizeof(*th));
1193        return crypto_ahash_update(hp->md5_req);
1194}
1195
1196static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1197                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1198{
1199        struct tcp_md5sig_pool *hp;
1200        struct ahash_request *req;
1201
1202        hp = tcp_get_md5sig_pool();
1203        if (!hp)
1204                goto clear_hash_noput;
1205        req = hp->md5_req;
1206
1207        if (crypto_ahash_init(req))
1208                goto clear_hash;
1209        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1210                goto clear_hash;
1211        if (tcp_md5_hash_key(hp, key))
1212                goto clear_hash;
1213        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1214        if (crypto_ahash_final(req))
1215                goto clear_hash;
1216
1217        tcp_put_md5sig_pool();
1218        return 0;
1219
1220clear_hash:
1221        tcp_put_md5sig_pool();
1222clear_hash_noput:
1223        memset(md5_hash, 0, 16);
1224        return 1;
1225}
1226
1227int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1228                        const struct sock *sk,
1229                        const struct sk_buff *skb)
1230{
1231        struct tcp_md5sig_pool *hp;
1232        struct ahash_request *req;
1233        const struct tcphdr *th = tcp_hdr(skb);
1234        __be32 saddr, daddr;
1235
1236        if (sk) { /* valid for establish/request sockets */
1237                saddr = sk->sk_rcv_saddr;
1238                daddr = sk->sk_daddr;
1239        } else {
1240                const struct iphdr *iph = ip_hdr(skb);
1241                saddr = iph->saddr;
1242                daddr = iph->daddr;
1243        }
1244
1245        hp = tcp_get_md5sig_pool();
1246        if (!hp)
1247                goto clear_hash_noput;
1248        req = hp->md5_req;
1249
1250        if (crypto_ahash_init(req))
1251                goto clear_hash;
1252
1253        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1254                goto clear_hash;
1255        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1256                goto clear_hash;
1257        if (tcp_md5_hash_key(hp, key))
1258                goto clear_hash;
1259        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1260        if (crypto_ahash_final(req))
1261                goto clear_hash;
1262
1263        tcp_put_md5sig_pool();
1264        return 0;
1265
1266clear_hash:
1267        tcp_put_md5sig_pool();
1268clear_hash_noput:
1269        memset(md5_hash, 0, 16);
1270        return 1;
1271}
1272EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1273
1274#endif
1275
1276/* Called with rcu_read_lock() */
1277static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1278                                    const struct sk_buff *skb)
1279{
1280#ifdef CONFIG_TCP_MD5SIG
1281        /*
1282         * This gets called for each TCP segment that arrives
1283         * so we want to be efficient.
1284         * We have 3 drop cases:
1285         * o No MD5 hash and one expected.
1286         * o MD5 hash and we're not expecting one.
 1287         * o MD5 hash and it's wrong.
1288         */
1289        const __u8 *hash_location = NULL;
1290        struct tcp_md5sig_key *hash_expected;
1291        const struct iphdr *iph = ip_hdr(skb);
1292        const struct tcphdr *th = tcp_hdr(skb);
1293        int genhash;
1294        unsigned char newhash[16];
1295
1296        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1297                                          AF_INET);
1298        hash_location = tcp_parse_md5sig_option(th);
1299
1300        /* We've parsed the options - do we have a hash? */
1301        if (!hash_expected && !hash_location)
1302                return false;
1303
1304        if (hash_expected && !hash_location) {
1305                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1306                return true;
1307        }
1308
1309        if (!hash_expected && hash_location) {
1310                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1311                return true;
1312        }
1313
 1314        /* Okay, so we have both hash_expected and hash_location -
1315         * so we need to calculate the checksum.
1316         */
1317        genhash = tcp_v4_md5_hash_skb(newhash,
1318                                      hash_expected,
1319                                      NULL, skb);
1320
1321        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1322                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1323                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1324                                     &iph->saddr, ntohs(th->source),
1325                                     &iph->daddr, ntohs(th->dest),
1326                                     genhash ? " tcp_v4_calc_md5_hash failed"
1327                                     : "");
1328                return true;
1329        }
1330        return false;
1331#endif
1332        return false;
1333}
1334
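     /* Fill in the IPv4-specific fields of a new request sock from the
      * incoming SYN: swap source/destination addresses and save any IP options.
      */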
1335static void tcp_v4_init_req(struct request_sock *req,
1336                            const struct sock *sk_listener,
1337                            struct sk_buff *skb)
1338{
1339        struct inet_request_sock *ireq = inet_rsk(req);
1340        struct net *net = sock_net(sk_listener);
1341
1342        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1343        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1344        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1345}
1346
1347static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1348                                          struct flowi *fl,
1349                                          const struct request_sock *req)
1350{
1351        return inet_csk_route_req(sk, &fl->u.ip4, req);
1352}
1353
1354struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1355        .family         =       PF_INET,
1356        .obj_size       =       sizeof(struct tcp_request_sock),
1357        .rtx_syn_ack    =       tcp_rtx_synack,
1358        .send_ack       =       tcp_v4_reqsk_send_ack,
1359        .destructor     =       tcp_v4_reqsk_destructor,
1360        .send_reset     =       tcp_v4_send_reset,
1361        .syn_ack_timeout =      tcp_syn_ack_timeout,
1362};
1363
1364static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1365        .mss_clamp      =       TCP_MSS_DEFAULT,
1366#ifdef CONFIG_TCP_MD5SIG
1367        .req_md5_lookup =       tcp_v4_md5_lookup,
1368        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1369#endif
1370        .init_req       =       tcp_v4_init_req,
1371#ifdef CONFIG_SYN_COOKIES
1372        .cookie_init_seq =      cookie_v4_init_sequence,
1373#endif
1374        .route_req      =       tcp_v4_route_req,
1375        .init_seq       =       tcp_v4_init_seq,
1376        .init_ts_off    =       tcp_v4_init_ts_off,
1377        .send_synack    =       tcp_v4_send_synack,
1378};
1379
1380int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1381{
 1382        /* Never answer SYNs sent to broadcast or multicast */
1383        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1384                goto drop;
1385
1386        return tcp_conn_request(&tcp_request_sock_ops,
1387                                &tcp_request_sock_ipv4_ops, sk, skb);
1388
1389drop:
1390        tcp_listendrop(sk);
1391        return 0;
1392}
1393EXPORT_SYMBOL(tcp_v4_conn_request);
1394
1395
1396/*
 1397 * The three-way handshake has completed - we got a valid synack -
1398 * now create the new socket.
1399 */
1400struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1401                                  struct request_sock *req,
1402                                  struct dst_entry *dst,
1403                                  struct request_sock *req_unhash,
1404                                  bool *own_req)
1405{
1406        struct inet_request_sock *ireq;
1407        struct inet_sock *newinet;
1408        struct tcp_sock *newtp;
1409        struct sock *newsk;
1410#ifdef CONFIG_TCP_MD5SIG
1411        struct tcp_md5sig_key *key;
1412#endif
1413        struct ip_options_rcu *inet_opt;
1414
1415        if (sk_acceptq_is_full(sk))
1416                goto exit_overflow;
1417
1418        newsk = tcp_create_openreq_child(sk, req, skb);
1419        if (!newsk)
1420                goto exit_nonewsk;
1421
1422        newsk->sk_gso_type = SKB_GSO_TCPV4;
1423        inet_sk_rx_dst_set(newsk, skb);
1424
1425        newtp                 = tcp_sk(newsk);
1426        newinet               = inet_sk(newsk);
1427        ireq                  = inet_rsk(req);
1428        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1429        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1430        newsk->sk_bound_dev_if = ireq->ir_iif;
1431        newinet->inet_saddr   = ireq->ir_loc_addr;
1432        inet_opt              = rcu_dereference(ireq->ireq_opt);
1433        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1434        newinet->mc_index     = inet_iif(skb);
1435        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1436        newinet->rcv_tos      = ip_hdr(skb)->tos;
1437        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438        if (inet_opt)
1439                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1440        newinet->inet_id = prandom_u32();
1441
1442        if (!dst) {
1443                dst = inet_csk_route_child_sock(sk, newsk, req);
1444                if (!dst)
1445                        goto put_and_exit;
1446        } else {
1447                /* syncookie case: see end of cookie_v4_check() */
1448        }
1449        sk_setup_caps(newsk, dst);
1450
1451        tcp_ca_openreq_child(newsk, dst);
1452
1453        tcp_sync_mss(newsk, dst_mtu(dst));
1454        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1455
1456        tcp_initialize_rcv_mss(newsk);
1457
1458#ifdef CONFIG_TCP_MD5SIG
1459        /* Copy over the MD5 key from the original socket */
1460        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1461                                AF_INET);
1462        if (key) {
1463                /*
1464                 * We're using one, so create a matching key
1465                 * on the newsk structure. If we fail to get
1466                 * memory, then we end up not copying the key
1467                 * across. Shucks.
1468                 */
1469                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1470                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1471                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1472        }
1473#endif
1474
1475        if (__inet_inherit_port(sk, newsk) < 0)
1476                goto put_and_exit;
1477        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1478        if (likely(*own_req)) {
1479                tcp_move_syn(newtp, req);
1480                ireq->ireq_opt = NULL;
1481        } else {
1482                newinet->inet_opt = NULL;
1483        }
1484        return newsk;
1485
1486exit_overflow:
1487        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488exit_nonewsk:
1489        dst_release(dst);
1490exit:
1491        tcp_listendrop(sk);
1492        return NULL;
1493put_and_exit:
1494        newinet->inet_opt = NULL;
1495        inet_csk_prepare_forced_close(newsk);
1496        tcp_done(newsk);
1497        goto exit;
1498}
1499EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1500
1501static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1502{
1503#ifdef CONFIG_SYN_COOKIES
1504        const struct tcphdr *th = tcp_hdr(skb);
1505
1506        if (!th->syn)
1507                sk = cookie_v4_check(sk, skb);
1508#endif
1509        return sk;
1510}
1511
1512/* The socket must have its spinlock held when we get
1513 * here, unless it is a TCP_LISTEN socket.
1514 *
1515 * We have a potential double-lock case here, so even when
1516 * doing backlog processing we use the BH locking scheme.
1517 * This is because we cannot sleep with the original spinlock
1518 * held.
1519 */
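    /* Illustrative caller pattern, mirroring tcp_v4_rcv() below:
     *
     *      bh_lock_sock_nested(sk);
     *      if (!sock_owned_by_user(sk))
     *              ret = tcp_v4_do_rcv(sk, skb);
     *      else if (tcp_add_backlog(sk, skb))
     *              goto discard_and_relse;
     *      bh_unlock_sock(sk);
     */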
1520int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1521{
1522        struct sock *rsk;
1523
1524        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1525                struct dst_entry *dst = sk->sk_rx_dst;
1526
1527                sock_rps_save_rxhash(sk, skb);
1528                sk_mark_napi_id(sk, skb);
1529                if (dst) {
1530                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1531                            !dst->ops->check(dst, 0)) {
1532                                dst_release(dst);
1533                                sk->sk_rx_dst = NULL;
1534                        }
1535                }
1536                tcp_rcv_established(sk, skb);
1537                return 0;
1538        }
1539
1540        if (tcp_checksum_complete(skb))
1541                goto csum_err;
1542
1543        if (sk->sk_state == TCP_LISTEN) {
1544                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1545
1546                if (!nsk)
1547                        goto discard;
1548                if (nsk != sk) {
1549                        if (tcp_child_process(sk, nsk, skb)) {
1550                                rsk = nsk;
1551                                goto reset;
1552                        }
1553                        return 0;
1554                }
1555        } else
1556                sock_rps_save_rxhash(sk, skb);
1557
1558        if (tcp_rcv_state_process(sk, skb)) {
1559                rsk = sk;
1560                goto reset;
1561        }
1562        return 0;
1563
1564reset:
1565        tcp_v4_send_reset(rsk, skb);
1566discard:
1567        kfree_skb(skb);
1568        /* Be careful here. If this function gets more complicated and
1569         * gcc suffers from register pressure on the x86, sk (in %ebx)
1570         * might be destroyed here. This current version compiles correctly,
1571         * but you have been warned.
1572         */
1573        return 0;
1574
1575csum_err:
1576        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1577        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1578        goto discard;
1579}
1580EXPORT_SYMBOL(tcp_v4_do_rcv);
1581
1582int tcp_v4_early_demux(struct sk_buff *skb)
1583{
1584        const struct iphdr *iph;
1585        const struct tcphdr *th;
1586        struct sock *sk;
1587
1588        if (skb->pkt_type != PACKET_HOST)
1589                return 0;
1590
1591        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1592                return 0;
1593
1594        iph = ip_hdr(skb);
1595        th = tcp_hdr(skb);
1596
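            /* th->doff counts 32-bit words; a value below 5 cannot even cover
             * the 20-byte base TCP header, so early demux is skipped for such
             * packets.
             */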
1597        if (th->doff < sizeof(struct tcphdr) / 4)
1598                return 0;
1599
1600        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1601                                       iph->saddr, th->source,
1602                                       iph->daddr, ntohs(th->dest),
1603                                       skb->skb_iif, inet_sdif(skb));
1604        if (sk) {
1605                skb->sk = sk;
1606                skb->destructor = sock_edemux;
1607                if (sk_fullsock(sk)) {
1608                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1609
1610                        if (dst)
1611                                dst = dst_check(dst, 0);
1612                        if (dst &&
1613                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1614                                skb_dst_set_noref(skb, dst);
1615                }
1616        }
1617        return 0;
1618}
1619
1620bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1621{
1622        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1623
1624        /* Only socket owner can try to collapse/prune rx queues
1625         * to reduce memory overhead, so add a little headroom here.
1626         * Only a few socket backlogs are likely to be non-empty at the same time.
1627         */
1628        limit += 64*1024;
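            /* For example, with hypothetical buffer sizes of sk_rcvbuf = 131072
             * and sk_sndbuf = 16384, the combined receive-queue and backlog
             * charge is capped at 131072 + 16384 + 65536 = 212992 bytes of
             * skb truesize.
             */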
1629
1630        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1631         * we can fix skb->truesize to its real value to avoid future drops.
1632         * This is valid because skb is not yet charged to the socket.
1633         * It has been observed that pure SACK packets were sometimes dropped
1634         * (when generated by drivers without the copybreak feature).
1635         */
1636        skb_condense(skb);
1637
1638        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1639                bh_unlock_sock(sk);
1640                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1641                return true;
1642        }
1643        return false;
1644}
1645EXPORT_SYMBOL(tcp_add_backlog);
1646
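    /* Run the socket's attached filter on the segment.  sk_filter_trim_cap()
     * is given th->doff * 4 as the minimum length, so a filter that shortens
     * the packet can never trim away the TCP header itself.
     */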
1647int tcp_filter(struct sock *sk, struct sk_buff *skb)
1648{
1649        struct tcphdr *th = (struct tcphdr *)skb->data;
1650
1651        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1652}
1653EXPORT_SYMBOL(tcp_filter);
1654
1655static void tcp_v4_restore_cb(struct sk_buff *skb)
1656{
1657        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1658                sizeof(struct inet_skb_parm));
1659}
1660
1661static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1662                           const struct tcphdr *th)
1663{
1664        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1665         * barrier() makes sure the compiler won't play aliasing games.
1666         */
1667        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1668                sizeof(struct inet_skb_parm));
1669        barrier();
1670
1671        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
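            /* end_seq covers the payload plus one sequence number each for SYN
             * and FIN.  Illustrative numbers: a SYN with seq 1000 carrying 100
             * bytes of data ends at 1000 + 1 + 0 + 100 = 1101.
             */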
1672        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1673                                    skb->len - th->doff * 4);
1674        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1675        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1676        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1677        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1678        TCP_SKB_CB(skb)->sacked  = 0;
1679        TCP_SKB_CB(skb)->has_rxtstamp =
1680                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1681}
1682
1683/*
1684 *      From tcp_input.c
1685 */
1686
1687int tcp_v4_rcv(struct sk_buff *skb)
1688{
1689        struct net *net = dev_net(skb->dev);
1690        int sdif = inet_sdif(skb);
1691        const struct iphdr *iph;
1692        const struct tcphdr *th;
1693        bool refcounted;
1694        struct sock *sk;
1695        int ret;
1696
1697        if (skb->pkt_type != PACKET_HOST)
1698                goto discard_it;
1699
1700        /* Count it even if it's bad */
1701        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1702
1703        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1704                goto discard_it;
1705
1706        th = (const struct tcphdr *)skb->data;
1707
1708        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1709                goto bad_packet;
1710        if (!pskb_may_pull(skb, th->doff * 4))
1711                goto discard_it;
1712
1713        /* An explanation is required here, I think.
1714         * Packet length and doff are validated by header prediction,
1715         * provided the case of th->doff==0 is eliminated.
1716         * So, we defer the checks. */
1717
1718        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1719                goto csum_error;
1720
1721        th = (const struct tcphdr *)skb->data;
1722        iph = ip_hdr(skb);
1723lookup:
1724        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1725                               th->dest, sdif, &refcounted);
1726        if (!sk)
1727                goto no_tcp_socket;
1728
1729process:
1730        if (sk->sk_state == TCP_TIME_WAIT)
1731                goto do_time_wait;
1732
1733        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1734                struct request_sock *req = inet_reqsk(sk);
1735                bool req_stolen = false;
1736                struct sock *nsk;
1737
1738                sk = req->rsk_listener;
1739                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1740                        sk_drops_add(sk, skb);
1741                        reqsk_put(req);
1742                        goto discard_it;
1743                }
1744                if (tcp_checksum_complete(skb)) {
1745                        reqsk_put(req);
1746                        goto csum_error;
1747                }
1748                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1749                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1750                        goto lookup;
1751                }
1752                /* We own a reference on the listener, increase it again
1753                 * as we might lose it too soon.
1754                 */
1755                sock_hold(sk);
1756                refcounted = true;
1757                nsk = NULL;
1758                if (!tcp_filter(sk, skb)) {
1759                        th = (const struct tcphdr *)skb->data;
1760                        iph = ip_hdr(skb);
1761                        tcp_v4_fill_cb(skb, iph, th);
1762                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1763                }
1764                if (!nsk) {
1765                        reqsk_put(req);
1766                        if (req_stolen) {
1767                                /* Another cpu got exclusive access to req
1768                                 * and created a full blown socket.
1769                                 * Try to feed this packet to this socket
1770                                 * instead of discarding it.
1771                                 */
1772                                tcp_v4_restore_cb(skb);
1773                                sock_put(sk);
1774                                goto lookup;
1775                        }
1776                        goto discard_and_relse;
1777                }
1778                if (nsk == sk) {
1779                        reqsk_put(req);
1780                        tcp_v4_restore_cb(skb);
1781                } else if (tcp_child_process(sk, nsk, skb)) {
1782                        tcp_v4_send_reset(nsk, skb);
1783                        goto discard_and_relse;
1784                } else {
1785                        sock_put(sk);
1786                        return 0;
1787                }
1788        }
1789        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1790                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1791                goto discard_and_relse;
1792        }
1793
1794        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1795                goto discard_and_relse;
1796
1797        if (tcp_v4_inbound_md5_hash(sk, skb))
1798                goto discard_and_relse;
1799
1800        nf_reset(skb);
1801
1802        if (tcp_filter(sk, skb))
1803                goto discard_and_relse;
1804        th = (const struct tcphdr *)skb->data;
1805        iph = ip_hdr(skb);
1806        tcp_v4_fill_cb(skb, iph, th);
1807
1808        skb->dev = NULL;
1809
1810        if (sk->sk_state == TCP_LISTEN) {
1811                ret = tcp_v4_do_rcv(sk, skb);
1812                goto put_and_return;
1813        }
1814
1815        sk_incoming_cpu_update(sk);
1816
1817        bh_lock_sock_nested(sk);
1818        tcp_segs_in(tcp_sk(sk), skb);
1819        ret = 0;
1820        if (!sock_owned_by_user(sk)) {
1821                ret = tcp_v4_do_rcv(sk, skb);
1822        } else if (tcp_add_backlog(sk, skb)) {
1823                goto discard_and_relse;
1824        }
1825        bh_unlock_sock(sk);
1826
1827put_and_return:
1828        if (refcounted)
1829                sock_put(sk);
1830
1831        return ret;
1832
1833no_tcp_socket:
1834        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1835                goto discard_it;
1836
1837        tcp_v4_fill_cb(skb, iph, th);
1838
1839        if (tcp_checksum_complete(skb)) {
1840csum_error:
1841                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1842bad_packet:
1843                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1844        } else {
1845                tcp_v4_send_reset(NULL, skb);
1846        }
1847
1848discard_it:
1849        /* Discard frame. */
1850        kfree_skb(skb);
1851        return 0;
1852
1853discard_and_relse:
1854        sk_drops_add(sk, skb);
1855        if (refcounted)
1856                sock_put(sk);
1857        goto discard_it;
1858
1859do_time_wait:
1860        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1861                inet_twsk_put(inet_twsk(sk));
1862                goto discard_it;
1863        }
1864
1865        tcp_v4_fill_cb(skb, iph, th);
1866
1867        if (tcp_checksum_complete(skb)) {
1868                inet_twsk_put(inet_twsk(sk));
1869                goto csum_error;
1870        }
1871        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1872        case TCP_TW_SYN: {
1873                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1874                                                        &tcp_hashinfo, skb,
1875                                                        __tcp_hdrlen(th),
1876                                                        iph->saddr, th->source,
1877                                                        iph->daddr, th->dest,
1878                                                        inet_iif(skb),
1879                                                        sdif);
1880                if (sk2) {
1881                        inet_twsk_deschedule_put(inet_twsk(sk));
1882                        sk = sk2;
1883                        tcp_v4_restore_cb(skb);
1884                        refcounted = false;
1885                        goto process;
1886                }
1887        }
1888                /* to ACK */
1889                /* fall through */
1890        case TCP_TW_ACK:
1891                tcp_v4_timewait_ack(sk, skb);
1892                break;
1893        case TCP_TW_RST:
1894                tcp_v4_send_reset(sk, skb);
1895                inet_twsk_deschedule_put(inet_twsk(sk));
1896                goto discard_it;
1897        case TCP_TW_SUCCESS:;
1898        }
1899        goto discard_it;
1900}
1901
1902static struct timewait_sock_ops tcp_timewait_sock_ops = {
1903        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1904        .twsk_unique    = tcp_twsk_unique,
1905        .twsk_destructor= tcp_twsk_destructor,
1906};
1907
1908void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1909{
1910        struct dst_entry *dst = skb_dst(skb);
1911
1912        if (dst && dst_hold_safe(dst)) {
1913                sk->sk_rx_dst = dst;
1914                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1915        }
1916}
1917EXPORT_SYMBOL(inet_sk_rx_dst_set);
1918
1919const struct inet_connection_sock_af_ops ipv4_specific = {
1920        .queue_xmit        = ip_queue_xmit,
1921        .send_check        = tcp_v4_send_check,
1922        .rebuild_header    = inet_sk_rebuild_header,
1923        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1924        .conn_request      = tcp_v4_conn_request,
1925        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1926        .net_header_len    = sizeof(struct iphdr),
1927        .setsockopt        = ip_setsockopt,
1928        .getsockopt        = ip_getsockopt,
1929        .addr2sockaddr     = inet_csk_addr2sockaddr,
1930        .sockaddr_len      = sizeof(struct sockaddr_in),
1931#ifdef CONFIG_COMPAT
1932        .compat_setsockopt = compat_ip_setsockopt,
1933        .compat_getsockopt = compat_ip_getsockopt,
1934#endif
1935        .mtu_reduced       = tcp_v4_mtu_reduced,
1936};
1937EXPORT_SYMBOL(ipv4_specific);
1938
1939#ifdef CONFIG_TCP_MD5SIG
1940static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1941        .md5_lookup             = tcp_v4_md5_lookup,
1942        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1943        .md5_parse              = tcp_v4_parse_md5_keys,
1944};
1945#endif
1946
1947/* NOTE: A lot of things are set to zero explicitly by the call to
1948 *       sk_alloc(), so they need not be done here.
1949 */
1950static int tcp_v4_init_sock(struct sock *sk)
1951{
1952        struct inet_connection_sock *icsk = inet_csk(sk);
1953
1954        tcp_init_sock(sk);
1955
1956        icsk->icsk_af_ops = &ipv4_specific;
1957
1958#ifdef CONFIG_TCP_MD5SIG
1959        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1960#endif
1961
1962        return 0;
1963}
1964
1965void tcp_v4_destroy_sock(struct sock *sk)
1966{
1967        struct tcp_sock *tp = tcp_sk(sk);
1968
1969        trace_tcp_destroy_sock(sk);
1970
1971        tcp_clear_xmit_timers(sk);
1972
1973        tcp_cleanup_congestion_control(sk);
1974
1975        tcp_cleanup_ulp(sk);
1976
1977        /* Cleanup up the write buffer. */
1978        tcp_write_queue_purge(sk);
1979
1980        /* Check if we want to disable active TFO */
1981        tcp_fastopen_active_disable_ofo_check(sk);
1982
1983        /* Cleans up our, hopefully empty, out_of_order_queue. */
1984        skb_rbtree_purge(&tp->out_of_order_queue);
1985
1986#ifdef CONFIG_TCP_MD5SIG
1987        /* Clean up the MD5 key list, if any */
1988        if (tp->md5sig_info) {
1989                tcp_clear_md5_list(sk);
1990                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1991                tp->md5sig_info = NULL;
1992        }
1993#endif
1994
1995        /* Clean up a referenced TCP bind bucket. */
1996        if (inet_csk(sk)->icsk_bind_hash)
1997                inet_put_port(sk);
1998
1999        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2000
2001        /* If socket is aborted during connect operation */
2002        tcp_free_fastopen_req(tp);
2003        tcp_fastopen_destroy_cipher(sk);
2004        tcp_saved_syn_free(tp);
2005
2006        sk_sockets_allocated_dec(sk);
2007}
2008EXPORT_SYMBOL(tcp_v4_destroy_sock);
2009
2010#ifdef CONFIG_PROC_FS
2011/* Proc filesystem TCP sock list dumping. */
2012
2013/*
2014 * Get the next listening socket following cur.  If cur is NULL, get the
2015 * first socket starting from the bucket given in st->bucket; when
2016 * st->bucket is zero the very first socket in the hash table is returned.
2017 */
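    /* Descriptive note: st->num counts sockets across the whole walk, while
     * st->offset counts sockets within the current bucket; tcp_seek_last_pos()
     * below replays st->offset steps to resume iteration after the bucket
     * lock has been dropped between reads.
     */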
2018static void *listening_get_next(struct seq_file *seq, void *cur)
2019{
2020        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2021        struct tcp_iter_state *st = seq->private;
2022        struct net *net = seq_file_net(seq);
2023        struct inet_listen_hashbucket *ilb;
2024        struct sock *sk = cur;
2025
2026        if (!sk) {
2027get_head:
2028                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2029                spin_lock(&ilb->lock);
2030                sk = sk_head(&ilb->head);
2031                st->offset = 0;
2032                goto get_sk;
2033        }
2034        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2035        ++st->num;
2036        ++st->offset;
2037
2038        sk = sk_next(sk);
2039get_sk:
2040        sk_for_each_from(sk) {
2041                if (!net_eq(sock_net(sk), net))
2042                        continue;
2043                if (sk->sk_family == afinfo->family)
2044                        return sk;
2045        }
2046        spin_unlock(&ilb->lock);
2047        st->offset = 0;
2048        if (++st->bucket < INET_LHTABLE_SIZE)
2049                goto get_head;
2050        return NULL;
2051}
2052
2053static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2054{
2055        struct tcp_iter_state *st = seq->private;
2056        void *rc;
2057
2058        st->bucket = 0;
2059        st->offset = 0;
2060        rc = listening_get_next(seq, NULL);
2061
2062        while (rc && *pos) {
2063                rc = listening_get_next(seq, rc);
2064                --*pos;
2065        }
2066        return rc;
2067}
2068
2069static inline bool empty_bucket(const struct tcp_iter_state *st)
2070{
2071        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2072}
2073
2074/*
2075 * Get first established socket starting from bucket given in st->bucket.
2076 * If st->bucket is zero, the very first socket in the hash is returned.
2077 */
2078static void *established_get_first(struct seq_file *seq)
2079{
2080        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2081        struct tcp_iter_state *st = seq->private;
2082        struct net *net = seq_file_net(seq);
2083        void *rc = NULL;
2084
2085        st->offset = 0;
2086        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2087                struct sock *sk;
2088                struct hlist_nulls_node *node;
2089                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2090
2091                /* Lockless fast path for the common case of empty buckets */
2092                if (empty_bucket(st))
2093                        continue;
2094
2095                spin_lock_bh(lock);
2096                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2097                        if (sk->sk_family != afinfo->family ||
2098                            !net_eq(sock_net(sk), net)) {
2099                                continue;
2100                        }
2101                        rc = sk;
2102                        goto out;
2103                }
2104                spin_unlock_bh(lock);
2105        }
2106out:
2107        return rc;
2108}
2109
2110static void *established_get_next(struct seq_file *seq, void *cur)
2111{
2112        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2113        struct sock *sk = cur;
2114        struct hlist_nulls_node *node;
2115        struct tcp_iter_state *st = seq->private;
2116        struct net *net = seq_file_net(seq);
2117
2118        ++st->num;
2119        ++st->offset;
2120
2121        sk = sk_nulls_next(sk);
2122
2123        sk_nulls_for_each_from(sk, node) {
2124                if (sk->sk_family == afinfo->family &&
2125                    net_eq(sock_net(sk), net))
2126                        return sk;
2127        }
2128
2129        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2130        ++st->bucket;
2131        return established_get_first(seq);
2132}
2133
2134static void *established_get_idx(struct seq_file *seq, loff_t pos)
2135{
2136        struct tcp_iter_state *st = seq->private;
2137        void *rc;
2138
2139        st->bucket = 0;
2140        rc = established_get_first(seq);
2141
2142        while (rc && pos) {
2143                rc = established_get_next(seq, rc);
2144                --pos;
2145        }
2146        return rc;
2147}
2148
2149static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2150{
2151        void *rc;
2152        struct tcp_iter_state *st = seq->private;
2153
2154        st->state = TCP_SEQ_STATE_LISTENING;
2155        rc        = listening_get_idx(seq, &pos);
2156
2157        if (!rc) {
2158                st->state = TCP_SEQ_STATE_ESTABLISHED;
2159                rc        = established_get_idx(seq, pos);
2160        }
2161
2162        return rc;
2163}
2164
2165static void *tcp_seek_last_pos(struct seq_file *seq)
2166{
2167        struct tcp_iter_state *st = seq->private;
2168        int offset = st->offset;
2169        int orig_num = st->num;
2170        void *rc = NULL;
2171
2172        switch (st->state) {
2173        case TCP_SEQ_STATE_LISTENING:
2174                if (st->bucket >= INET_LHTABLE_SIZE)
2175                        break;
2176                st->state = TCP_SEQ_STATE_LISTENING;
2177                rc = listening_get_next(seq, NULL);
2178                while (offset-- && rc)
2179                        rc = listening_get_next(seq, rc);
2180                if (rc)
2181                        break;
2182                st->bucket = 0;
2183                st->state = TCP_SEQ_STATE_ESTABLISHED;
2184                /* Fallthrough */
2185        case TCP_SEQ_STATE_ESTABLISHED:
2186                if (st->bucket > tcp_hashinfo.ehash_mask)
2187                        break;
2188                rc = established_get_first(seq);
2189                while (offset-- && rc)
2190                        rc = established_get_next(seq, rc);
2191        }
2192
2193        st->num = orig_num;
2194
2195        return rc;
2196}
2197
2198void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2199{
2200        struct tcp_iter_state *st = seq->private;
2201        void *rc;
2202
2203        if (*pos && *pos == st->last_pos) {
2204                rc = tcp_seek_last_pos(seq);
2205                if (rc)
2206                        goto out;
2207        }
2208
2209        st->state = TCP_SEQ_STATE_LISTENING;
2210        st->num = 0;
2211        st->bucket = 0;
2212        st->offset = 0;
2213        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2214
2215out:
2216        st->last_pos = *pos;
2217        return rc;
2218}
2219EXPORT_SYMBOL(tcp_seq_start);
2220
2221void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2222{
2223        struct tcp_iter_state *st = seq->private;
2224        void *rc = NULL;
2225
2226        if (v == SEQ_START_TOKEN) {
2227                rc = tcp_get_idx(seq, 0);
2228                goto out;
2229        }
2230
2231        switch (st->state) {
2232        case TCP_SEQ_STATE_LISTENING:
2233                rc = listening_get_next(seq, v);
2234                if (!rc) {
2235                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2236                        st->bucket = 0;
2237                        st->offset = 0;
2238                        rc        = established_get_first(seq);
2239                }
2240                break;
2241        case TCP_SEQ_STATE_ESTABLISHED:
2242                rc = established_get_next(seq, v);
2243                break;
2244        }
2245out:
2246        ++*pos;
2247        st->last_pos = *pos;
2248        return rc;
2249}
2250EXPORT_SYMBOL(tcp_seq_next);
2251
2252void tcp_seq_stop(struct seq_file *seq, void *v)
2253{
2254        struct tcp_iter_state *st = seq->private;
2255
2256        switch (st->state) {
2257        case TCP_SEQ_STATE_LISTENING:
2258                if (v != SEQ_START_TOKEN)
2259                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2260                break;
2261        case TCP_SEQ_STATE_ESTABLISHED:
2262                if (v)
2263                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2264                break;
2265        }
2266}
2267EXPORT_SYMBOL(tcp_seq_stop);
2268
2269static void get_openreq4(const struct request_sock *req,
2270                         struct seq_file *f, int i)
2271{
2272        const struct inet_request_sock *ireq = inet_rsk(req);
2273        long delta = req->rsk_timer.expires - jiffies;
2274
2275        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2276                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2277                i,
2278                ireq->ir_loc_addr,
2279                ireq->ir_num,
2280                ireq->ir_rmt_addr,
2281                ntohs(ireq->ir_rmt_port),
2282                TCP_SYN_RECV,
2283                0, 0, /* could print option size, but that is af dependent. */
2284                1,    /* timers active (only the expire timer) */
2285                jiffies_delta_to_clock_t(delta),
2286                req->num_timeout,
2287                from_kuid_munged(seq_user_ns(f),
2288                                 sock_i_uid(req->rsk_listener)),
2289                0,  /* non standard timer */
2290                0, /* open_requests have no inode */
2291                0,
2292                req);
2293}
2294
2295static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2296{
2297        int timer_active;
2298        unsigned long timer_expires;
2299        const struct tcp_sock *tp = tcp_sk(sk);
2300        const struct inet_connection_sock *icsk = inet_csk(sk);
2301        const struct inet_sock *inet = inet_sk(sk);
2302        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2303        __be32 dest = inet->inet_daddr;
2304        __be32 src = inet->inet_rcv_saddr;
2305        __u16 destp = ntohs(inet->inet_dport);
2306        __u16 srcp = ntohs(inet->inet_sport);
2307        int rx_queue;
2308        int state;
2309
2310        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2311            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2312            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2313                timer_active    = 1;
2314                timer_expires   = icsk->icsk_timeout;
2315        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2316                timer_active    = 4;
2317                timer_expires   = icsk->icsk_timeout;
2318        } else if (timer_pending(&sk->sk_timer)) {
2319                timer_active    = 2;
2320                timer_expires   = sk->sk_timer.expires;
2321        } else {
2322                timer_active    = 0;
2323                timer_expires = jiffies;
2324        }
2325
2326        state = inet_sk_state_load(sk);
2327        if (state == TCP_LISTEN)
2328                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2329        else
2330                /* Because we don't lock the socket,
2331                 * we might find a transient negative value.
2332                 */
2333                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
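            /* rcv_nxt - copied_seq is the amount of received data the
             * application has not read yet; listeners report the length of
             * their accept queue instead (see above).
             */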
2334
2335        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2336                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2337                i, src, srcp, dest, destp, state,
2338                tp->write_seq - tp->snd_una,
2339                rx_queue,
2340                timer_active,
2341                jiffies_delta_to_clock_t(timer_expires - jiffies),
2342                icsk->icsk_retransmits,
2343                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2344                icsk->icsk_probes_out,
2345                sock_i_ino(sk),
2346                refcount_read(&sk->sk_refcnt), sk,
2347                jiffies_to_clock_t(icsk->icsk_rto),
2348                jiffies_to_clock_t(icsk->icsk_ack.ato),
2349                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2350                tp->snd_cwnd,
2351                state == TCP_LISTEN ?
2352                    fastopenq->max_qlen :
2353                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2354}
2355
2356static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2357                               struct seq_file *f, int i)
2358{
2359        long delta = tw->tw_timer.expires - jiffies;
2360        __be32 dest, src;
2361        __u16 destp, srcp;
2362
2363        dest  = tw->tw_daddr;
2364        src   = tw->tw_rcv_saddr;
2365        destp = ntohs(tw->tw_dport);
2366        srcp  = ntohs(tw->tw_sport);
2367
2368        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2369                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2370                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2371                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2372                refcount_read(&tw->tw_refcnt), tw);
2373}
2374
2375#define TMPSZ 150
2376
2377static int tcp4_seq_show(struct seq_file *seq, void *v)
2378{
2379        struct tcp_iter_state *st;
2380        struct sock *sk = v;
2381
2382        seq_setwidth(seq, TMPSZ - 1);
2383        if (v == SEQ_START_TOKEN) {
2384                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2385                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2386                           "inode");
2387                goto out;
2388        }
2389        st = seq->private;
2390
2391        if (sk->sk_state == TCP_TIME_WAIT)
2392                get_timewait4_sock(v, seq, st->num);
2393        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2394                get_openreq4(v, seq, st->num);
2395        else
2396                get_tcp4_sock(v, seq, st->num);
2397out:
2398        seq_pad(seq, '\n');
2399        return 0;
2400}
2401
2402static const struct seq_operations tcp4_seq_ops = {
2403        .show           = tcp4_seq_show,
2404        .start          = tcp_seq_start,
2405        .next           = tcp_seq_next,
2406        .stop           = tcp_seq_stop,
2407};
2408
2409static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2410        .family         = AF_INET,
2411};
2412
2413static int __net_init tcp4_proc_init_net(struct net *net)
2414{
2415        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2416                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2417                return -ENOMEM;
2418        return 0;
2419}
2420
2421static void __net_exit tcp4_proc_exit_net(struct net *net)
2422{
2423        remove_proc_entry("tcp", net->proc_net);
2424}
2425
2426static struct pernet_operations tcp4_net_ops = {
2427        .init = tcp4_proc_init_net,
2428        .exit = tcp4_proc_exit_net,
2429};
2430
2431int __init tcp4_proc_init(void)
2432{
2433        return register_pernet_subsys(&tcp4_net_ops);
2434}
2435
2436void tcp4_proc_exit(void)
2437{
2438        unregister_pernet_subsys(&tcp4_net_ops);
2439}
2440#endif /* CONFIG_PROC_FS */
2441
2442struct proto tcp_prot = {
2443        .name                   = "TCP",
2444        .owner                  = THIS_MODULE,
2445        .close                  = tcp_close,
2446        .pre_connect            = tcp_v4_pre_connect,
2447        .connect                = tcp_v4_connect,
2448        .disconnect             = tcp_disconnect,
2449        .accept                 = inet_csk_accept,
2450        .ioctl                  = tcp_ioctl,
2451        .init                   = tcp_v4_init_sock,
2452        .destroy                = tcp_v4_destroy_sock,
2453        .shutdown               = tcp_shutdown,
2454        .setsockopt             = tcp_setsockopt,
2455        .getsockopt             = tcp_getsockopt,
2456        .keepalive              = tcp_set_keepalive,
2457        .recvmsg                = tcp_recvmsg,
2458        .sendmsg                = tcp_sendmsg,
2459        .sendpage               = tcp_sendpage,
2460        .backlog_rcv            = tcp_v4_do_rcv,
2461        .release_cb             = tcp_release_cb,
2462        .hash                   = inet_hash,
2463        .unhash                 = inet_unhash,
2464        .get_port               = inet_csk_get_port,
2465        .enter_memory_pressure  = tcp_enter_memory_pressure,
2466        .leave_memory_pressure  = tcp_leave_memory_pressure,
2467        .stream_memory_free     = tcp_stream_memory_free,
2468        .sockets_allocated      = &tcp_sockets_allocated,
2469        .orphan_count           = &tcp_orphan_count,
2470        .memory_allocated       = &tcp_memory_allocated,
2471        .memory_pressure        = &tcp_memory_pressure,
2472        .sysctl_mem             = sysctl_tcp_mem,
2473        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2474        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2475        .max_header             = MAX_TCP_HEADER,
2476        .obj_size               = sizeof(struct tcp_sock),
2477        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2478        .twsk_prot              = &tcp_timewait_sock_ops,
2479        .rsk_prot               = &tcp_request_sock_ops,
2480        .h.hashinfo             = &tcp_hashinfo,
2481        .no_autobind            = true,
2482#ifdef CONFIG_COMPAT
2483        .compat_setsockopt      = compat_tcp_setsockopt,
2484        .compat_getsockopt      = compat_tcp_getsockopt,
2485#endif
2486        .diag_destroy           = tcp_abort,
2487};
2488EXPORT_SYMBOL(tcp_prot);
2489
2490static void __net_exit tcp_sk_exit(struct net *net)
2491{
2492        int cpu;
2493
2494        if (net->ipv4.tcp_congestion_control)
2495                module_put(net->ipv4.tcp_congestion_control->owner);
2496
2497        for_each_possible_cpu(cpu)
2498                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2499        free_percpu(net->ipv4.tcp_sk);
2500}
2501
2502static int __net_init tcp_sk_init(struct net *net)
2503{
2504        int res, cpu, cnt;
2505
2506        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2507        if (!net->ipv4.tcp_sk)
2508                return -ENOMEM;
2509
2510        for_each_possible_cpu(cpu) {
2511                struct sock *sk;
2512
2513                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2514                                           IPPROTO_TCP, net);
2515                if (res)
2516                        goto fail;
2517                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2518
2519                /* Please enforce IP_DF and IPID==0 for RST and
2520                 * ACK sent in SYN-RECV and TIME-WAIT state.
2521                 */
2522                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
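                    /* IP_PMTUDISC_DO forces the DF bit on packets sent from
                     * these control sockets, satisfying the IP_DF requirement
                     * noted above.
                     */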
2523
2524                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2525        }
2526
2527        net->ipv4.sysctl_tcp_ecn = 2;
2528        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2529
2530        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2531        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2532        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2533        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2534
2535        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2536        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2537        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2538
2539        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2540        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2541        net->ipv4.sysctl_tcp_syncookies = 1;
2542        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2543        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2544        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2545        net->ipv4.sysctl_tcp_orphan_retries = 0;
2546        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2547        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2548        net->ipv4.sysctl_tcp_tw_reuse = 2;
2549
2550        cnt = tcp_hashinfo.ehash_mask + 1;
2551        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2552        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2553
2554        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2555        net->ipv4.sysctl_tcp_sack = 1;
2556        net->ipv4.sysctl_tcp_window_scaling = 1;
2557        net->ipv4.sysctl_tcp_timestamps = 1;
2558        net->ipv4.sysctl_tcp_early_retrans = 3;
2559        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2560        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2561        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2562        net->ipv4.sysctl_tcp_max_reordering = 300;
2563        net->ipv4.sysctl_tcp_dsack = 1;
2564        net->ipv4.sysctl_tcp_app_win = 31;
2565        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2566        net->ipv4.sysctl_tcp_frto = 2;
2567        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2568        /* This limits the percentage of the congestion window which we
2569         * will allow a single TSO frame to consume.  Building TSO frames
2570         * which are too large can cause TCP streams to be bursty.
2571         */
2572        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
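            /* With the default of 3, a single TSO frame is limited to roughly
             * one third of the current congestion window (illustrative: for a
             * 90-packet cwnd, at most ~30 segments per frame).
             */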
2573        /* Default TSQ limit of four TSO segments */
2574        net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2575        /* RFC 5961 challenge ACK rate limiting */
2576        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2577        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2578        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2579        net->ipv4.sysctl_tcp_autocorking = 1;
2580        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2581        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2582        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2583        if (net != &init_net) {
2584                memcpy(net->ipv4.sysctl_tcp_rmem,
2585                       init_net.ipv4.sysctl_tcp_rmem,
2586                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2587                memcpy(net->ipv4.sysctl_tcp_wmem,
2588                       init_net.ipv4.sysctl_tcp_wmem,
2589                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2590        }
2591        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2592        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2593        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2594        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2595        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2596        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2597
2598        /* Reno is always built in */
2599        if (!net_eq(net, &init_net) &&
2600            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2601                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2602        else
2603                net->ipv4.tcp_congestion_control = &tcp_reno;
2604
2605        return 0;
2606fail:
2607        tcp_sk_exit(net);
2608
2609        return res;
2610}
2611
2612static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2613{
2614        struct net *net;
2615
2616        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2617
2618        list_for_each_entry(net, net_exit_list, exit_list)
2619                tcp_fastopen_ctx_destroy(net);
2620}
2621
2622static struct pernet_operations __net_initdata tcp_sk_ops = {
2623       .init       = tcp_sk_init,
2624       .exit       = tcp_sk_exit,
2625       .exit_batch = tcp_sk_exit_batch,
2626};
2627
2628void __init tcp_v4_init(void)
2629{
2630        if (register_pernet_subsys(&tcp_sk_ops))
2631                panic("Failed to create the TCP control socket.\n");
2632}
2633