53#define pr_fmt(fmt) "TCP: " fmt
54
55#include <linux/bottom_half.h>
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64#include <linux/slab.h>
65
66#include <net/net_namespace.h>
67#include <net/icmp.h>
68#include <net/inet_hashtables.h>
69#include <net/tcp.h>
70#include <net/transp_v6.h>
71#include <net/ipv6.h>
72#include <net/inet_common.h>
73#include <net/timewait_sock.h>
74#include <net/xfrm.h>
75#include <net/secure_seq.h>
76#include <net/busy_poll.h>
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/inetdevice.h>
84
85#include <crypto/hash.h>
86#include <linux/scatterlist.h>
87
88#include <trace/events/tcp.h>
89
90#ifdef CONFIG_TCP_MD5SIG
91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93#endif
94
95struct inet_hashinfo tcp_hashinfo;
96EXPORT_SYMBOL(tcp_hashinfo);
97
98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99{
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104}
105
106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107{
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
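		/* With tcp_tw_reuse == 2, reuse is only honored for loopback
		 * connections: a loopback source or destination address, or
		 * a socket bound to the loopback interface.
		 */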
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126#if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136#endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
145
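	/* Reuse is only considered safe when the peer's last timestamp is
	 * known (and, for the twp case, at least one second old), so that
	 * PAWS can reject stray segments from the earlier connection.
	 */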
157 if (tcptw->tw_ts_recent_stamp &&
158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
170 if (likely(!tp->repair)) {
171 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
172 if (tp->write_seq == 0)
173 tp->write_seq = 1;
174 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
175 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 }
177 sock_hold(sktw);
178 return 1;
179 }
180
181 return 0;
182}
183EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
185static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 int addr_len)
187{
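	/* Mirror the addr_len check done in tcp_v4_connect() so that the
	 * BPF connect hook below cannot read past the user-supplied
	 * sockaddr.
	 */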
192 if (addr_len < sizeof(struct sockaddr_in))
193 return -EINVAL;
194
195 sock_owned_by_me(sk);
196
197 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198}
199
200
201int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202{
203 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 struct inet_sock *inet = inet_sk(sk);
205 struct tcp_sock *tp = tcp_sk(sk);
206 __be16 orig_sport, orig_dport;
207 __be32 daddr, nexthop;
208 struct flowi4 *fl4;
209 struct rtable *rt;
210 int err;
211 struct ip_options_rcu *inet_opt;
212 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
213
214 if (addr_len < sizeof(struct sockaddr_in))
215 return -EINVAL;
216
217 if (usin->sin_family != AF_INET)
218 return -EAFNOSUPPORT;
219
220 nexthop = daddr = usin->sin_addr.s_addr;
221 inet_opt = rcu_dereference_protected(inet->inet_opt,
222 lockdep_sock_is_held(sk));
223 if (inet_opt && inet_opt->opt.srr) {
224 if (!daddr)
225 return -EINVAL;
226 nexthop = inet_opt->opt.faddr;
227 }
228
229 orig_sport = inet->inet_sport;
230 orig_dport = usin->sin_port;
231 fl4 = &inet->cork.fl.u.ip4;
232 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
234 IPPROTO_TCP,
235 orig_sport, orig_dport, sk);
236 if (IS_ERR(rt)) {
237 err = PTR_ERR(rt);
238 if (err == -ENETUNREACH)
239 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
240 return err;
241 }
242
243 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 ip_rt_put(rt);
245 return -ENETUNREACH;
246 }
247
248 if (!inet_opt || !inet_opt->opt.srr)
249 daddr = fl4->daddr;
250
251 if (!inet->inet_saddr)
252 inet->inet_saddr = fl4->saddr;
253 sk_rcv_saddr_set(sk, inet->inet_saddr);
254
255 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
256
257 tp->rx_opt.ts_recent = 0;
258 tp->rx_opt.ts_recent_stamp = 0;
259 if (likely(!tp->repair))
260 tp->write_seq = 0;
261 }
262
263 inet->inet_dport = usin->sin_port;
264 sk_daddr_set(sk, daddr);
265
266 inet_csk(sk)->icsk_ext_hdr_len = 0;
267 if (inet_opt)
268 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
269
270 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
271
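	/* The socket identity is not yet complete (the source port may
	 * still be zero): move to SYN-SENT, let inet_hash_connect() pick a
	 * port and hash the socket, then finish initialization below.
	 */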
277 tcp_set_state(sk, TCP_SYN_SENT);
278 err = inet_hash_connect(tcp_death_row, sk);
279 if (err)
280 goto failure;
281
282 sk_set_txhash(sk);
283
284 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
285 inet->inet_sport, inet->inet_dport, sk);
286 if (IS_ERR(rt)) {
287 err = PTR_ERR(rt);
288 rt = NULL;
289 goto failure;
290 }
291
292 sk->sk_gso_type = SKB_GSO_TCPV4;
293 sk_setup_caps(sk, &rt->dst);
294 rt = NULL;
295
296 if (likely(!tp->repair)) {
297 if (!tp->write_seq)
298 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
299 inet->inet_daddr,
300 inet->inet_sport,
301 usin->sin_port);
302 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
303 inet->inet_saddr,
304 inet->inet_daddr);
305 }
306
307 inet->inet_id = tp->write_seq ^ jiffies;
308
309 if (tcp_fastopen_defer_connect(sk, &err))
310 return err;
311 if (err)
312 goto failure;
313
314 err = tcp_connect(sk);
315
316 if (err)
317 goto failure;
318
319 return 0;
320
321failure:
326 tcp_set_state(sk, TCP_CLOSE);
327 ip_rt_put(rt);
328 sk->sk_route_caps = 0;
329 inet->inet_dport = 0;
330 return err;
331}
332EXPORT_SYMBOL(tcp_v4_connect);
333
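/* Handle an ICMP_FRAG_NEEDED / path-MTU indication (RFC 1191). This can
 * also run from tcp_release_cb() when the socket was owned by the user at
 * the time tcp_v4_err() saw the ICMP message.
 */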
339void tcp_v4_mtu_reduced(struct sock *sk)
340{
341 struct inet_sock *inet = inet_sk(sk);
342 struct dst_entry *dst;
343 u32 mtu;
344
345 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
346 return;
347 mtu = tcp_sk(sk)->mtu_info;
348 dst = inet_csk_update_pmtu(sk, mtu);
349 if (!dst)
350 return;
351
352
353
354
355 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
356 sk->sk_err_soft = EMSGSIZE;
357
358 mtu = dst_mtu(dst);
359
360 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
361 ip_sk_accept_pmtu(sk) &&
362 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
363 tcp_sync_mss(sk, mtu);
364
365
366
367
368
369
370 tcp_simple_retransmit(sk);
371 }
372}
373EXPORT_SYMBOL(tcp_v4_mtu_reduced);
374
375static void do_redirect(struct sk_buff *skb, struct sock *sk)
376{
377 struct dst_entry *dst = __sk_dst_check(sk, 0);
378
379 if (dst)
380 dst->ops->redirect(dst, sk, skb);
381}
382
383
384
385void tcp_req_err(struct sock *sk, u32 seq, bool abort)
386{
387 struct request_sock *req = inet_reqsk(sk);
388 struct net *net = sock_net(sk);
389
390
391
392
393 if (seq != tcp_rsk(req)->snt_isn) {
394 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 } else if (abort) {
402 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
403 tcp_listendrop(req->rsk_listener);
404 }
405 reqsk_put(req);
406}
407EXPORT_SYMBOL(tcp_req_err);
408
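/*
 * This routine is called by the ICMP module when it gets some sort of
 * error condition related to one of our TCP segments. The locking is
 * deliberately optimistic: if the socket is busy, the ICMP is mostly just
 * counted and error delivery falls back to sk_err_soft; only the path-MTU
 * update is deferred until the socket lock is released.
 */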
425void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
426{
427 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
428 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
429 struct inet_connection_sock *icsk;
430 struct tcp_sock *tp;
431 struct inet_sock *inet;
432 const int type = icmp_hdr(icmp_skb)->type;
433 const int code = icmp_hdr(icmp_skb)->code;
434 struct sock *sk;
435 struct sk_buff *skb;
436 struct request_sock *fastopen;
437 u32 seq, snd_una;
438 s32 remaining;
439 u32 delta_us;
440 int err;
441 struct net *net = dev_net(icmp_skb->dev);
442
443 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
444 th->dest, iph->saddr, ntohs(th->source),
445 inet_iif(icmp_skb), 0);
446 if (!sk) {
447 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
448 return;
449 }
450 if (sk->sk_state == TCP_TIME_WAIT) {
451 inet_twsk_put(inet_twsk(sk));
452 return;
453 }
454 seq = ntohl(th->seq);
455 if (sk->sk_state == TCP_NEW_SYN_RECV)
456 return tcp_req_err(sk, seq,
457 type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
462
463 bh_lock_sock(sk);
464
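	/* If the socket is owned by the user, only account the ICMP here;
	 * ICMP_FRAG_NEEDED is excluded from the counter because the PMTU
	 * update is deferred (see below) rather than lost.
	 */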
469 if (sock_owned_by_user(sk)) {
470 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
471 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
472 }
473 if (sk->sk_state == TCP_CLOSE)
474 goto out;
475
476 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
477 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
478 goto out;
479 }
480
481 icsk = inet_csk(sk);
482 tp = tcp_sk(sk);
483
484 fastopen = tp->fastopen_rsk;
485 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
486 if (sk->sk_state != TCP_LISTEN &&
487 !between(seq, snd_una, tp->snd_nxt)) {
488 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 goto out;
490 }
491
492 switch (type) {
493 case ICMP_REDIRECT:
494 if (!sock_owned_by_user(sk))
495 do_redirect(icmp_skb, sk);
496 goto out;
497 case ICMP_SOURCE_QUENCH:
498
499 goto out;
500 case ICMP_PARAMETERPROB:
501 err = EPROTO;
502 break;
503 case ICMP_DEST_UNREACH:
504 if (code > NR_ICMP_UNREACH)
505 goto out;
506
507 if (code == ICMP_FRAG_NEEDED) {
508
509
510
511
512 if (sk->sk_state == TCP_LISTEN)
513 goto out;
514
515 tp->mtu_info = info;
516 if (!sock_owned_by_user(sk)) {
517 tcp_v4_mtu_reduced(sk);
518 } else {
519 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
520 sock_hold(sk);
521 }
522 goto out;
523 }
524
525 err = icmp_err_convert[code].errno;
526
527
528 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
529 break;
530 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
531 !icsk->icsk_backoff || fastopen)
532 break;
533
534 if (sock_owned_by_user(sk))
535 break;
536
537 skb = tcp_rtx_queue_head(sk);
538 if (WARN_ON_ONCE(!skb))
539 break;
540
541 icsk->icsk_backoff--;
542 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
543 TCP_TIMEOUT_INIT;
544 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
545
546
547 tcp_mstamp_refresh(tp);
548 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
549 remaining = icsk->icsk_rto -
550 usecs_to_jiffies(delta_us);
551
552 if (remaining > 0) {
553 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
554 remaining, TCP_RTO_MAX);
555 } else {
556
557
558 tcp_retransmit_timer(sk);
559 }
560
561 break;
562 case ICMP_TIME_EXCEEDED:
563 err = EHOSTUNREACH;
564 break;
565 default:
566 goto out;
567 }
568
569 switch (sk->sk_state) {
570 case TCP_SYN_SENT:
571 case TCP_SYN_RECV:
572
573
574
575 if (fastopen && !fastopen->sk)
576 break;
577
578 if (!sock_owned_by_user(sk)) {
579 sk->sk_err = err;
580
581 sk->sk_error_report(sk);
582
583 tcp_done(sk);
584 } else {
585 sk->sk_err_soft = err;
586 }
587 goto out;
588 }
589
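	/* For established connections the error is only delivered as a hard
	 * error if the application enabled IP_RECVERR; otherwise it is
	 * recorded as a soft error and the connection keeps retrying.
	 */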
606 inet = inet_sk(sk);
607 if (!sock_owned_by_user(sk) && inet->recverr) {
608 sk->sk_err = err;
609 sk->sk_error_report(sk);
610 } else {
611 sk->sk_err_soft = err;
612 }
613
614out:
615 bh_unlock_sock(sk);
616 sock_put(sk);
617}
618
619void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
620{
621 struct tcphdr *th = tcp_hdr(skb);
622
623 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
624 skb->csum_start = skb_transport_header(skb) - skb->head;
625 skb->csum_offset = offsetof(struct tcphdr, check);
626}
627
628
629void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
630{
631 const struct inet_sock *inet = inet_sk(sk);
632
633 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
634}
635EXPORT_SYMBOL(tcp_v4_send_check);
636
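/*
 *	Send an RST in reply to a segment for which no (full) socket exists
 *	or that must be refused. The reply is built solely from fields of
 *	the offending segment, so no socket lock is taken; an MD5 signature
 *	option is added when a matching key is found.
 */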
650static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
651{
652 const struct tcphdr *th = tcp_hdr(skb);
653 struct {
654 struct tcphdr th;
655#ifdef CONFIG_TCP_MD5SIG
656 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
657#endif
658 } rep;
659 struct ip_reply_arg arg;
660#ifdef CONFIG_TCP_MD5SIG
661 struct tcp_md5sig_key *key = NULL;
662 const __u8 *hash_location = NULL;
663 unsigned char newhash[16];
664 int genhash;
665 struct sock *sk1 = NULL;
666#endif
667 struct net *net;
668 struct sock *ctl_sk;
669
670
671 if (th->rst)
672 return;
673
674
675
676
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 return;
679
680
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
685 rep.th.rst = 1;
686
687 if (th->ack) {
688 rep.th.seq = th->ack_seq;
689 } else {
690 rep.th.ack = 1;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
693 }
694
695 memset(&arg, 0, sizeof(arg));
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
698
699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700#ifdef CONFIG_TCP_MD5SIG
701 rcu_read_lock();
702 hash_location = tcp_parse_md5sig_option(th);
703 if (sk && sk_fullsock(sk)) {
704 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
705 &ip_hdr(skb)->saddr, AF_INET);
706 } else if (hash_location) {
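		/* No full socket, but the segment carries an MD5 option:
		 * find the key via a listener on the destination address and
		 * only send the RST if the segment's signature verifies.
		 */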
714 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
715 ip_hdr(skb)->saddr,
716 th->source, ip_hdr(skb)->daddr,
717 ntohs(th->source), inet_iif(skb),
718 tcp_v4_sdif(skb));
719
720 if (!sk1)
721 goto out;
722
723 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
724 &ip_hdr(skb)->saddr, AF_INET);
725 if (!key)
726 goto out;
727
728
729 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
730 if (genhash || memcmp(hash_location, newhash, 16) != 0)
731 goto out;
732
733 }
734
735 if (key) {
736 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
737 (TCPOPT_NOP << 16) |
738 (TCPOPT_MD5SIG << 8) |
739 TCPOLEN_MD5SIG);
740
741 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
742 rep.th.doff = arg.iov[0].iov_len / 4;
743
744 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
745 key, ip_hdr(skb)->saddr,
746 ip_hdr(skb)->daddr, &rep.th);
747 }
748#endif
749 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
750 ip_hdr(skb)->saddr,
751 arg.iov[0].iov_len, IPPROTO_TCP, 0);
752 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
753 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
754
755
756
757
758
759 if (sk) {
760 arg.bound_dev_if = sk->sk_bound_dev_if;
761 if (sk_fullsock(sk))
762 trace_tcp_send_reset(sk, skb);
763 }
764
765 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
766 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
767
768 arg.tos = ip_hdr(skb)->tos;
769 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
770 local_bh_disable();
771 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
772 if (sk)
773 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
774 inet_twsk(sk)->tw_mark : sk->sk_mark;
775 ip_send_unicast_reply(ctl_sk,
776 skb, &TCP_SKB_CB(skb)->header.h4.opt,
777 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778 &arg, arg.iov[0].iov_len);
779
780 ctl_sk->sk_mark = 0;
781 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
782 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
783 local_bh_enable();
784
785#ifdef CONFIG_TCP_MD5SIG
786out:
787 rcu_read_unlock();
788#endif
789}
790
791
792
793
794
795static void tcp_v4_send_ack(const struct sock *sk,
796 struct sk_buff *skb, u32 seq, u32 ack,
797 u32 win, u32 tsval, u32 tsecr, int oif,
798 struct tcp_md5sig_key *key,
799 int reply_flags, u8 tos)
800{
801 const struct tcphdr *th = tcp_hdr(skb);
802 struct {
803 struct tcphdr th;
804 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
805#ifdef CONFIG_TCP_MD5SIG
806 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
807#endif
808 ];
809 } rep;
810 struct net *net = sock_net(sk);
811 struct ip_reply_arg arg;
812 struct sock *ctl_sk;
813
814 memset(&rep.th, 0, sizeof(struct tcphdr));
815 memset(&arg, 0, sizeof(arg));
816
817 arg.iov[0].iov_base = (unsigned char *)&rep;
818 arg.iov[0].iov_len = sizeof(rep.th);
819 if (tsecr) {
820 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
821 (TCPOPT_TIMESTAMP << 8) |
822 TCPOLEN_TIMESTAMP);
823 rep.opt[1] = htonl(tsval);
824 rep.opt[2] = htonl(tsecr);
825 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
826 }
827
828
829 rep.th.dest = th->source;
830 rep.th.source = th->dest;
831 rep.th.doff = arg.iov[0].iov_len / 4;
832 rep.th.seq = htonl(seq);
833 rep.th.ack_seq = htonl(ack);
834 rep.th.ack = 1;
835 rep.th.window = htons(win);
836
837#ifdef CONFIG_TCP_MD5SIG
838 if (key) {
839 int offset = (tsecr) ? 3 : 0;
840
841 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
842 (TCPOPT_NOP << 16) |
843 (TCPOPT_MD5SIG << 8) |
844 TCPOLEN_MD5SIG);
845 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
846 rep.th.doff = arg.iov[0].iov_len/4;
847
848 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
849 key, ip_hdr(skb)->saddr,
850 ip_hdr(skb)->daddr, &rep.th);
851 }
852#endif
853 arg.flags = reply_flags;
854 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
855 ip_hdr(skb)->saddr,
856 arg.iov[0].iov_len, IPPROTO_TCP, 0);
857 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
858 if (oif)
859 arg.bound_dev_if = oif;
860 arg.tos = tos;
861 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
862 local_bh_disable();
863 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
864 if (sk)
865 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
866 inet_twsk(sk)->tw_mark : sk->sk_mark;
867 ip_send_unicast_reply(ctl_sk,
868 skb, &TCP_SKB_CB(skb)->header.h4.opt,
869 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
870 &arg, arg.iov[0].iov_len);
871
872 ctl_sk->sk_mark = 0;
873 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
874 local_bh_enable();
875}
876
877static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
878{
879 struct inet_timewait_sock *tw = inet_twsk(sk);
880 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
881
882 tcp_v4_send_ack(sk, skb,
883 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
884 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
885 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
886 tcptw->tw_ts_recent,
887 tw->tw_bound_dev_if,
888 tcp_twsk_md5_key(tcptw),
889 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
890 tw->tw_tos
891 );
892
893 inet_twsk_put(tw);
894}
895
896static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
897 struct request_sock *req)
898{
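	/* sk->sk_state == TCP_LISTEN   -> ACK for a regular SYN-RECV request;
	 * sk->sk_state == TCP_SYN_RECV -> ACK for a Fast Open socket.
	 */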
902 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
903 tcp_sk(sk)->snd_nxt;
904
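	/* RFC 7323 2.3: the window field of every outgoing segment, with
	 * the exception of SYN segments, must be right-shifted by the
	 * receive window scale.
	 */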
910 tcp_v4_send_ack(sk, skb, seq,
911 tcp_rsk(req)->rcv_nxt,
912 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
913 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
914 req->ts_recent,
915 0,
916 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
917 AF_INET),
918 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
919 ip_hdr(skb)->tos);
920}
921
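/*
 *	Send a SYN-ACK after having received a SYN. This still operates on
 *	a request_sock only, not on a full socket.
 */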
927static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
928 struct flowi *fl,
929 struct request_sock *req,
930 struct tcp_fastopen_cookie *foc,
931 enum tcp_synack_type synack_type)
932{
933 const struct inet_request_sock *ireq = inet_rsk(req);
934 struct flowi4 fl4;
935 int err = -1;
936 struct sk_buff *skb;
937
938
939 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
940 return -1;
941
942 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
943
944 if (skb) {
945 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
946
947 rcu_read_lock();
948 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
949 ireq->ir_rmt_addr,
950 rcu_dereference(ireq->ireq_opt));
951 rcu_read_unlock();
952 err = net_xmit_eval(err);
953 }
954
955 return err;
956}
957
958
959
960
961static void tcp_v4_reqsk_destructor(struct request_sock *req)
962{
963 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
964}
965
966#ifdef CONFIG_TCP_MD5SIG
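/* RFC 2385 TCP-MD5 support: each socket keeps a mapping from peer address
 * (or address prefix) to MD5 key.
 */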
974struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
975 const union tcp_md5_addr *addr,
976 int family)
977{
978 const struct tcp_sock *tp = tcp_sk(sk);
979 struct tcp_md5sig_key *key;
980 const struct tcp_md5sig_info *md5sig;
981 __be32 mask;
982 struct tcp_md5sig_key *best_match = NULL;
983 bool match;
984
985
986 md5sig = rcu_dereference_check(tp->md5sig_info,
987 lockdep_sock_is_held(sk));
988 if (!md5sig)
989 return NULL;
990
991 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
992 if (key->family != family)
993 continue;
994
995 if (family == AF_INET) {
996 mask = inet_make_mask(key->prefixlen);
997 match = (key->addr.a4.s_addr & mask) ==
998 (addr->a4.s_addr & mask);
999#if IS_ENABLED(CONFIG_IPV6)
1000 } else if (family == AF_INET6) {
1001 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1002 key->prefixlen);
1003#endif
1004 } else {
1005 match = false;
1006 }
1007
1008 if (match && (!best_match ||
1009 key->prefixlen > best_match->prefixlen))
1010 best_match = key;
1011 }
1012 return best_match;
1013}
1014EXPORT_SYMBOL(tcp_md5_do_lookup);
1015
1016static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1017 const union tcp_md5_addr *addr,
1018 int family, u8 prefixlen)
1019{
1020 const struct tcp_sock *tp = tcp_sk(sk);
1021 struct tcp_md5sig_key *key;
1022 unsigned int size = sizeof(struct in_addr);
1023 const struct tcp_md5sig_info *md5sig;
1024
1025
1026 md5sig = rcu_dereference_check(tp->md5sig_info,
1027 lockdep_sock_is_held(sk));
1028 if (!md5sig)
1029 return NULL;
1030#if IS_ENABLED(CONFIG_IPV6)
1031 if (family == AF_INET6)
1032 size = sizeof(struct in6_addr);
1033#endif
1034 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1035 if (key->family != family)
1036 continue;
1037 if (!memcmp(&key->addr, addr, size) &&
1038 key->prefixlen == prefixlen)
1039 return key;
1040 }
1041 return NULL;
1042}
1043
1044struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1045 const struct sock *addr_sk)
1046{
1047 const union tcp_md5_addr *addr;
1048
1049 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1050 return tcp_md5_do_lookup(sk, addr, AF_INET);
1051}
1052EXPORT_SYMBOL(tcp_v4_md5_lookup);
1053
1054
1055int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1056 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1057 gfp_t gfp)
1058{
1059
1060 struct tcp_md5sig_key *key;
1061 struct tcp_sock *tp = tcp_sk(sk);
1062 struct tcp_md5sig_info *md5sig;
1063
1064 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1065 if (key) {
1066
1067 memcpy(key->key, newkey, newkeylen);
1068 key->keylen = newkeylen;
1069 return 0;
1070 }
1071
1072 md5sig = rcu_dereference_protected(tp->md5sig_info,
1073 lockdep_sock_is_held(sk));
1074 if (!md5sig) {
1075 md5sig = kmalloc(sizeof(*md5sig), gfp);
1076 if (!md5sig)
1077 return -ENOMEM;
1078
1079 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1080 INIT_HLIST_HEAD(&md5sig->head);
1081 rcu_assign_pointer(tp->md5sig_info, md5sig);
1082 }
1083
1084 key = sock_kmalloc(sk, sizeof(*key), gfp);
1085 if (!key)
1086 return -ENOMEM;
1087 if (!tcp_alloc_md5sig_pool()) {
1088 sock_kfree_s(sk, key, sizeof(*key));
1089 return -ENOMEM;
1090 }
1091
1092 memcpy(key->key, newkey, newkeylen);
1093 key->keylen = newkeylen;
1094 key->family = family;
1095 key->prefixlen = prefixlen;
1096 memcpy(&key->addr, addr,
1097 (family == AF_INET6) ? sizeof(struct in6_addr) :
1098 sizeof(struct in_addr));
1099 hlist_add_head_rcu(&key->node, &md5sig->head);
1100 return 0;
1101}
1102EXPORT_SYMBOL(tcp_md5_do_add);
1103
1104int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1105 u8 prefixlen)
1106{
1107 struct tcp_md5sig_key *key;
1108
1109 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1110 if (!key)
1111 return -ENOENT;
1112 hlist_del_rcu(&key->node);
1113 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1114 kfree_rcu(key, rcu);
1115 return 0;
1116}
1117EXPORT_SYMBOL(tcp_md5_do_del);
1118
1119static void tcp_clear_md5_list(struct sock *sk)
1120{
1121 struct tcp_sock *tp = tcp_sk(sk);
1122 struct tcp_md5sig_key *key;
1123 struct hlist_node *n;
1124 struct tcp_md5sig_info *md5sig;
1125
1126 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1127
1128 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1129 hlist_del_rcu(&key->node);
1130 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1131 kfree_rcu(key, rcu);
1132 }
1133}
1134
1135static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1136 char __user *optval, int optlen)
1137{
1138 struct tcp_md5sig cmd;
1139 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1140 u8 prefixlen = 32;
1141
1142 if (optlen < sizeof(cmd))
1143 return -EINVAL;
1144
1145 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1146 return -EFAULT;
1147
1148 if (sin->sin_family != AF_INET)
1149 return -EINVAL;
1150
1151 if (optname == TCP_MD5SIG_EXT &&
1152 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1153 prefixlen = cmd.tcpm_prefixlen;
1154 if (prefixlen > 32)
1155 return -EINVAL;
1156 }
1157
1158 if (!cmd.tcpm_keylen)
1159 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1160 AF_INET, prefixlen);
1161
1162 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1163 return -EINVAL;
1164
1165 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1166 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1167 GFP_KERNEL);
1168}
1169
1170static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1171 __be32 daddr, __be32 saddr,
1172 const struct tcphdr *th, int nbytes)
1173{
1174 struct tcp4_pseudohdr *bp;
1175 struct scatterlist sg;
1176 struct tcphdr *_th;
1177
1178 bp = hp->scratch;
1179 bp->saddr = saddr;
1180 bp->daddr = daddr;
1181 bp->pad = 0;
1182 bp->protocol = IPPROTO_TCP;
1183 bp->len = cpu_to_be16(nbytes);
1184
1185 _th = (struct tcphdr *)(bp + 1);
1186 memcpy(_th, th, sizeof(*th));
1187 _th->check = 0;
1188
1189 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1190 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1191 sizeof(*bp) + sizeof(*th));
1192 return crypto_ahash_update(hp->md5_req);
1193}
1194
1195static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1196 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1197{
1198 struct tcp_md5sig_pool *hp;
1199 struct ahash_request *req;
1200
1201 hp = tcp_get_md5sig_pool();
1202 if (!hp)
1203 goto clear_hash_noput;
1204 req = hp->md5_req;
1205
1206 if (crypto_ahash_init(req))
1207 goto clear_hash;
1208 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1209 goto clear_hash;
1210 if (tcp_md5_hash_key(hp, key))
1211 goto clear_hash;
1212 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1213 if (crypto_ahash_final(req))
1214 goto clear_hash;
1215
1216 tcp_put_md5sig_pool();
1217 return 0;
1218
1219clear_hash:
1220 tcp_put_md5sig_pool();
1221clear_hash_noput:
1222 memset(md5_hash, 0, 16);
1223 return 1;
1224}
1225
1226int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1227 const struct sock *sk,
1228 const struct sk_buff *skb)
1229{
1230 struct tcp_md5sig_pool *hp;
1231 struct ahash_request *req;
1232 const struct tcphdr *th = tcp_hdr(skb);
1233 __be32 saddr, daddr;
1234
1235 if (sk) {
1236 saddr = sk->sk_rcv_saddr;
1237 daddr = sk->sk_daddr;
1238 } else {
1239 const struct iphdr *iph = ip_hdr(skb);
1240 saddr = iph->saddr;
1241 daddr = iph->daddr;
1242 }
1243
1244 hp = tcp_get_md5sig_pool();
1245 if (!hp)
1246 goto clear_hash_noput;
1247 req = hp->md5_req;
1248
1249 if (crypto_ahash_init(req))
1250 goto clear_hash;
1251
1252 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1253 goto clear_hash;
1254 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1255 goto clear_hash;
1256 if (tcp_md5_hash_key(hp, key))
1257 goto clear_hash;
1258 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1259 if (crypto_ahash_final(req))
1260 goto clear_hash;
1261
1262 tcp_put_md5sig_pool();
1263 return 0;
1264
1265clear_hash:
1266 tcp_put_md5sig_pool();
1267clear_hash_noput:
1268 memset(md5_hash, 0, 16);
1269 return 1;
1270}
1271EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1272
1273#endif
1274
1275
1276static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1277 const struct sk_buff *skb)
1278{
1279#ifdef CONFIG_TCP_MD5SIG
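	/* Called for every incoming segment, so keep it cheap. Drop when an
	 * MD5 option is expected but absent, present but unexpected, or
	 * present but its signature does not verify.
	 */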
1288 const __u8 *hash_location = NULL;
1289 struct tcp_md5sig_key *hash_expected;
1290 const struct iphdr *iph = ip_hdr(skb);
1291 const struct tcphdr *th = tcp_hdr(skb);
1292 int genhash;
1293 unsigned char newhash[16];
1294
1295 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1296 AF_INET);
1297 hash_location = tcp_parse_md5sig_option(th);
1298
1299
1300 if (!hash_expected && !hash_location)
1301 return false;
1302
1303 if (hash_expected && !hash_location) {
1304 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1305 return true;
1306 }
1307
1308 if (!hash_expected && hash_location) {
1309 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1310 return true;
1311 }
1312
1313
1314
1315
1316 genhash = tcp_v4_md5_hash_skb(newhash,
1317 hash_expected,
1318 NULL, skb);
1319
1320 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1321 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1322 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1323 &iph->saddr, ntohs(th->source),
1324 &iph->daddr, ntohs(th->dest),
1325 genhash ? " tcp_v4_calc_md5_hash failed"
1326 : "");
1327 return true;
1328 }
1329 return false;
1330#endif
1331 return false;
1332}
1333
1334static void tcp_v4_init_req(struct request_sock *req,
1335 const struct sock *sk_listener,
1336 struct sk_buff *skb)
1337{
1338 struct inet_request_sock *ireq = inet_rsk(req);
1339 struct net *net = sock_net(sk_listener);
1340
1341 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1342 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1343 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1344}
1345
1346static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1347 struct flowi *fl,
1348 const struct request_sock *req)
1349{
1350 return inet_csk_route_req(sk, &fl->u.ip4, req);
1351}
1352
1353struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1354 .family = PF_INET,
1355 .obj_size = sizeof(struct tcp_request_sock),
1356 .rtx_syn_ack = tcp_rtx_synack,
1357 .send_ack = tcp_v4_reqsk_send_ack,
1358 .destructor = tcp_v4_reqsk_destructor,
1359 .send_reset = tcp_v4_send_reset,
1360 .syn_ack_timeout = tcp_syn_ack_timeout,
1361};
1362
1363static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1364 .mss_clamp = TCP_MSS_DEFAULT,
1365#ifdef CONFIG_TCP_MD5SIG
1366 .req_md5_lookup = tcp_v4_md5_lookup,
1367 .calc_md5_hash = tcp_v4_md5_hash_skb,
1368#endif
1369 .init_req = tcp_v4_init_req,
1370#ifdef CONFIG_SYN_COOKIES
1371 .cookie_init_seq = cookie_v4_init_sequence,
1372#endif
1373 .route_req = tcp_v4_route_req,
1374 .init_seq = tcp_v4_init_seq,
1375 .init_ts_off = tcp_v4_init_ts_off,
1376 .send_synack = tcp_v4_send_synack,
1377};
1378
1379int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1380{
1381
1382 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1383 goto drop;
1384
1385 return tcp_conn_request(&tcp_request_sock_ops,
1386 &tcp_request_sock_ipv4_ops, sk, skb);
1387
1388drop:
1389 tcp_listendrop(sk);
1390 return 0;
1391}
1392EXPORT_SYMBOL(tcp_v4_conn_request);
1393
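/*
 * The three-way handshake has completed - we got a valid ACK of our
 * SYN-ACK - so create the new (child) socket.
 */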
1399struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1400 struct request_sock *req,
1401 struct dst_entry *dst,
1402 struct request_sock *req_unhash,
1403 bool *own_req)
1404{
1405 struct inet_request_sock *ireq;
1406 struct inet_sock *newinet;
1407 struct tcp_sock *newtp;
1408 struct sock *newsk;
1409#ifdef CONFIG_TCP_MD5SIG
1410 struct tcp_md5sig_key *key;
1411#endif
1412 struct ip_options_rcu *inet_opt;
1413
1414 if (sk_acceptq_is_full(sk))
1415 goto exit_overflow;
1416
1417 newsk = tcp_create_openreq_child(sk, req, skb);
1418 if (!newsk)
1419 goto exit_nonewsk;
1420
1421 newsk->sk_gso_type = SKB_GSO_TCPV4;
1422 inet_sk_rx_dst_set(newsk, skb);
1423
1424 newtp = tcp_sk(newsk);
1425 newinet = inet_sk(newsk);
1426 ireq = inet_rsk(req);
1427 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1428 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1429 newsk->sk_bound_dev_if = ireq->ir_iif;
1430 newinet->inet_saddr = ireq->ir_loc_addr;
1431 inet_opt = rcu_dereference(ireq->ireq_opt);
1432 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1433 newinet->mc_index = inet_iif(skb);
1434 newinet->mc_ttl = ip_hdr(skb)->ttl;
1435 newinet->rcv_tos = ip_hdr(skb)->tos;
1436 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1437 if (inet_opt)
1438 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1439 newinet->inet_id = newtp->write_seq ^ jiffies;
1440
1441 if (!dst) {
1442 dst = inet_csk_route_child_sock(sk, newsk, req);
1443 if (!dst)
1444 goto put_and_exit;
1445 } else {
1446
1447 }
1448 sk_setup_caps(newsk, dst);
1449
1450 tcp_ca_openreq_child(newsk, dst);
1451
1452 tcp_sync_mss(newsk, dst_mtu(dst));
1453 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1454
1455 tcp_initialize_rcv_mss(newsk);
1456
1457#ifdef CONFIG_TCP_MD5SIG
1458
1459 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1460 AF_INET);
1461 if (key) {
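		/* The listener has a key for this peer, so install a copy on
		 * the child. If the copy fails under memory pressure the key
		 * is simply absent on the child; that is not fatal.
		 */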
1468 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1469 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1470 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1471 }
1472#endif
1473
1474 if (__inet_inherit_port(sk, newsk) < 0)
1475 goto put_and_exit;
1476 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1477 if (likely(*own_req)) {
1478 tcp_move_syn(newtp, req);
1479 ireq->ireq_opt = NULL;
1480 } else {
1481 newinet->inet_opt = NULL;
1482 }
1483 return newsk;
1484
1485exit_overflow:
1486 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1487exit_nonewsk:
1488 dst_release(dst);
1489exit:
1490 tcp_listendrop(sk);
1491 return NULL;
1492put_and_exit:
1493 newinet->inet_opt = NULL;
1494 inet_csk_prepare_forced_close(newsk);
1495 tcp_done(newsk);
1496 goto exit;
1497}
1498EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1499
1500static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1501{
1502#ifdef CONFIG_SYN_COOKIES
1503 const struct tcphdr *th = tcp_hdr(skb);
1504
1505 if (!th->syn)
1506 sk = cookie_v4_check(sk, skb);
1507#endif
1508 return sk;
1509}
1510
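/* Except for TCP_LISTEN sockets, the caller must hold the socket spinlock
 * when calling this; backlog processing therefore uses the BH locking
 * scheme.
 */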
1519int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1520{
1521 struct sock *rsk;
1522
1523 if (sk->sk_state == TCP_ESTABLISHED) {
1524 struct dst_entry *dst = sk->sk_rx_dst;
1525
1526 sock_rps_save_rxhash(sk, skb);
1527 sk_mark_napi_id(sk, skb);
1528 if (dst) {
1529 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1530 !dst->ops->check(dst, 0)) {
1531 dst_release(dst);
1532 sk->sk_rx_dst = NULL;
1533 }
1534 }
1535 tcp_rcv_established(sk, skb);
1536 return 0;
1537 }
1538
1539 if (tcp_checksum_complete(skb))
1540 goto csum_err;
1541
1542 if (sk->sk_state == TCP_LISTEN) {
1543 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1544
1545 if (!nsk)
1546 goto discard;
1547 if (nsk != sk) {
1548 if (tcp_child_process(sk, nsk, skb)) {
1549 rsk = nsk;
1550 goto reset;
1551 }
1552 return 0;
1553 }
1554 } else
1555 sock_rps_save_rxhash(sk, skb);
1556
1557 if (tcp_rcv_state_process(sk, skb)) {
1558 rsk = sk;
1559 goto reset;
1560 }
1561 return 0;
1562
1563reset:
1564 tcp_v4_send_reset(rsk, skb);
1565discard:
1566 kfree_skb(skb);
1567
1572 return 0;
1573
1574csum_err:
1575 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1576 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1577 goto discard;
1578}
1579EXPORT_SYMBOL(tcp_v4_do_rcv);
1580
1581int tcp_v4_early_demux(struct sk_buff *skb)
1582{
1583 const struct iphdr *iph;
1584 const struct tcphdr *th;
1585 struct sock *sk;
1586
1587 if (skb->pkt_type != PACKET_HOST)
1588 return 0;
1589
1590 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1591 return 0;
1592
1593 iph = ip_hdr(skb);
1594 th = tcp_hdr(skb);
1595
1596 if (th->doff < sizeof(struct tcphdr) / 4)
1597 return 0;
1598
1599 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1600 iph->saddr, th->source,
1601 iph->daddr, ntohs(th->dest),
1602 skb->skb_iif, inet_sdif(skb));
1603 if (sk) {
1604 skb->sk = sk;
1605 skb->destructor = sock_edemux;
1606 if (sk_fullsock(sk)) {
1607 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1608
1609 if (dst)
1610 dst = dst_check(dst, 0);
1611 if (dst &&
1612 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1613 skb_dst_set_noref(skb, dst);
1614 }
1615 }
1616 return 0;
1617}
1618
1619bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1620{
1621 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1622
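	/* Only the socket owner can collapse or prune the receive queues to
	 * cut memory use, so allow a little headroom on top of the rcvbuf
	 * and sndbuf limits.
	 */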
1627 limit += 64*1024;
1628
1635 skb_condense(skb);
1636
1637 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1638 bh_unlock_sock(sk);
1639 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1640 return true;
1641 }
1642 return false;
1643}
1644EXPORT_SYMBOL(tcp_add_backlog);
1645
1646int tcp_filter(struct sock *sk, struct sk_buff *skb)
1647{
1648 struct tcphdr *th = (struct tcphdr *)skb->data;
1649
1650 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1651}
1652EXPORT_SYMBOL(tcp_filter);
1653
1654static void tcp_v4_restore_cb(struct sk_buff *skb)
1655{
1656 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1657 sizeof(struct inet_skb_parm));
1658}
1659
1660static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1661 const struct tcphdr *th)
1662{
1663
1664
1665
1666 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1667 sizeof(struct inet_skb_parm));
1668 barrier();
1669
1670 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1671 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1672 skb->len - th->doff * 4);
1673 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1674 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1675 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1676 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1677 TCP_SKB_CB(skb)->sacked = 0;
1678 TCP_SKB_CB(skb)->has_rxtstamp =
1679 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1680}
1681
1682
1683
1684
1685
1686int tcp_v4_rcv(struct sk_buff *skb)
1687{
1688 struct net *net = dev_net(skb->dev);
1689 int sdif = inet_sdif(skb);
1690 const struct iphdr *iph;
1691 const struct tcphdr *th;
1692 bool refcounted;
1693 struct sock *sk;
1694 int ret;
1695
1696 if (skb->pkt_type != PACKET_HOST)
1697 goto discard_it;
1698
1699
1700 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1701
1702 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1703 goto discard_it;
1704
1705 th = (const struct tcphdr *)skb->data;
1706
1707 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1708 goto bad_packet;
1709 if (!pskb_may_pull(skb, th->doff * 4))
1710 goto discard_it;
1711
1712
1713
1714
1715
1716
1717 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1718 goto csum_error;
1719
1720 th = (const struct tcphdr *)skb->data;
1721 iph = ip_hdr(skb);
1722lookup:
1723 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1724 th->dest, sdif, &refcounted);
1725 if (!sk)
1726 goto no_tcp_socket;
1727
1728process:
1729 if (sk->sk_state == TCP_TIME_WAIT)
1730 goto do_time_wait;
1731
1732 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1733 struct request_sock *req = inet_reqsk(sk);
1734 bool req_stolen = false;
1735 struct sock *nsk;
1736
1737 sk = req->rsk_listener;
1738 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1739 sk_drops_add(sk, skb);
1740 reqsk_put(req);
1741 goto discard_it;
1742 }
1743 if (tcp_checksum_complete(skb)) {
1744 reqsk_put(req);
1745 goto csum_error;
1746 }
1747 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1748 inet_csk_reqsk_queue_drop_and_put(sk, req);
1749 goto lookup;
1750 }
1751
1752
1753
1754 sock_hold(sk);
1755 refcounted = true;
1756 nsk = NULL;
1757 if (!tcp_filter(sk, skb)) {
1758 th = (const struct tcphdr *)skb->data;
1759 iph = ip_hdr(skb);
1760 tcp_v4_fill_cb(skb, iph, th);
1761 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1762 }
1763 if (!nsk) {
1764 reqsk_put(req);
1765 if (req_stolen) {
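			/* Another CPU got exclusive access to the request and
			 * created a full socket from it: look the socket up
			 * again and feed it this segment instead of dropping
			 * it.
			 */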
1771 tcp_v4_restore_cb(skb);
1772 sock_put(sk);
1773 goto lookup;
1774 }
1775 goto discard_and_relse;
1776 }
1777 if (nsk == sk) {
1778 reqsk_put(req);
1779 tcp_v4_restore_cb(skb);
1780 } else if (tcp_child_process(sk, nsk, skb)) {
1781 tcp_v4_send_reset(nsk, skb);
1782 goto discard_and_relse;
1783 } else {
1784 sock_put(sk);
1785 return 0;
1786 }
1787 }
1788 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1789 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1790 goto discard_and_relse;
1791 }
1792
1793 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1794 goto discard_and_relse;
1795
1796 if (tcp_v4_inbound_md5_hash(sk, skb))
1797 goto discard_and_relse;
1798
1799 nf_reset(skb);
1800
1801 if (tcp_filter(sk, skb))
1802 goto discard_and_relse;
1803 th = (const struct tcphdr *)skb->data;
1804 iph = ip_hdr(skb);
1805 tcp_v4_fill_cb(skb, iph, th);
1806
1807 skb->dev = NULL;
1808
1809 if (sk->sk_state == TCP_LISTEN) {
1810 ret = tcp_v4_do_rcv(sk, skb);
1811 goto put_and_return;
1812 }
1813
1814 sk_incoming_cpu_update(sk);
1815
1816 bh_lock_sock_nested(sk);
1817 tcp_segs_in(tcp_sk(sk), skb);
1818 ret = 0;
1819 if (!sock_owned_by_user(sk)) {
1820 ret = tcp_v4_do_rcv(sk, skb);
1821 } else if (tcp_add_backlog(sk, skb)) {
1822 goto discard_and_relse;
1823 }
1824 bh_unlock_sock(sk);
1825
1826put_and_return:
1827 if (refcounted)
1828 sock_put(sk);
1829
1830 return ret;
1831
1832no_tcp_socket:
1833 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1834 goto discard_it;
1835
1836 tcp_v4_fill_cb(skb, iph, th);
1837
1838 if (tcp_checksum_complete(skb)) {
1839csum_error:
1840 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1841bad_packet:
1842 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1843 } else {
1844 tcp_v4_send_reset(NULL, skb);
1845 }
1846
1847discard_it:
1848
1849 kfree_skb(skb);
1850 return 0;
1851
1852discard_and_relse:
1853 sk_drops_add(sk, skb);
1854 if (refcounted)
1855 sock_put(sk);
1856 goto discard_it;
1857
1858do_time_wait:
1859 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1860 inet_twsk_put(inet_twsk(sk));
1861 goto discard_it;
1862 }
1863
1864 tcp_v4_fill_cb(skb, iph, th);
1865
1866 if (tcp_checksum_complete(skb)) {
1867 inet_twsk_put(inet_twsk(sk));
1868 goto csum_error;
1869 }
1870 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1871 case TCP_TW_SYN: {
1872 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1873 &tcp_hashinfo, skb,
1874 __tcp_hdrlen(th),
1875 iph->saddr, th->source,
1876 iph->daddr, th->dest,
1877 inet_iif(skb),
1878 sdif);
1879 if (sk2) {
1880 inet_twsk_deschedule_put(inet_twsk(sk));
1881 sk = sk2;
1882 tcp_v4_restore_cb(skb);
1883 refcounted = false;
1884 goto process;
1885 }
1886 }
1887
1888
1889 case TCP_TW_ACK:
1890 tcp_v4_timewait_ack(sk, skb);
1891 break;
1892 case TCP_TW_RST:
1893 tcp_v4_send_reset(sk, skb);
1894 inet_twsk_deschedule_put(inet_twsk(sk));
1895 goto discard_it;
1896 case TCP_TW_SUCCESS:;
1897 }
1898 goto discard_it;
1899}
1900
1901static struct timewait_sock_ops tcp_timewait_sock_ops = {
1902 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1903 .twsk_unique = tcp_twsk_unique,
1904 .twsk_destructor= tcp_twsk_destructor,
1905};
1906
1907void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1908{
1909 struct dst_entry *dst = skb_dst(skb);
1910
1911 if (dst && dst_hold_safe(dst)) {
1912 sk->sk_rx_dst = dst;
1913 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1914 }
1915}
1916EXPORT_SYMBOL(inet_sk_rx_dst_set);
1917
1918const struct inet_connection_sock_af_ops ipv4_specific = {
1919 .queue_xmit = ip_queue_xmit,
1920 .send_check = tcp_v4_send_check,
1921 .rebuild_header = inet_sk_rebuild_header,
1922 .sk_rx_dst_set = inet_sk_rx_dst_set,
1923 .conn_request = tcp_v4_conn_request,
1924 .syn_recv_sock = tcp_v4_syn_recv_sock,
1925 .net_header_len = sizeof(struct iphdr),
1926 .setsockopt = ip_setsockopt,
1927 .getsockopt = ip_getsockopt,
1928 .addr2sockaddr = inet_csk_addr2sockaddr,
1929 .sockaddr_len = sizeof(struct sockaddr_in),
1930#ifdef CONFIG_COMPAT
1931 .compat_setsockopt = compat_ip_setsockopt,
1932 .compat_getsockopt = compat_ip_getsockopt,
1933#endif
1934 .mtu_reduced = tcp_v4_mtu_reduced,
1935};
1936EXPORT_SYMBOL(ipv4_specific);
1937
1938#ifdef CONFIG_TCP_MD5SIG
1939static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1940 .md5_lookup = tcp_v4_md5_lookup,
1941 .calc_md5_hash = tcp_v4_md5_hash_skb,
1942 .md5_parse = tcp_v4_parse_md5_keys,
1943};
1944#endif
1945
1946
1947
1948
1949static int tcp_v4_init_sock(struct sock *sk)
1950{
1951 struct inet_connection_sock *icsk = inet_csk(sk);
1952
1953 tcp_init_sock(sk);
1954
1955 icsk->icsk_af_ops = &ipv4_specific;
1956
1957#ifdef CONFIG_TCP_MD5SIG
1958 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1959#endif
1960
1961 return 0;
1962}
1963
1964void tcp_v4_destroy_sock(struct sock *sk)
1965{
1966 struct tcp_sock *tp = tcp_sk(sk);
1967
1968 trace_tcp_destroy_sock(sk);
1969
1970 tcp_clear_xmit_timers(sk);
1971
1972 tcp_cleanup_congestion_control(sk);
1973
1974 tcp_cleanup_ulp(sk);
1975
1976
1977 tcp_write_queue_purge(sk);
1978
1979
1980 tcp_fastopen_active_disable_ofo_check(sk);
1981
1982
1983 skb_rbtree_purge(&tp->out_of_order_queue);
1984
1985#ifdef CONFIG_TCP_MD5SIG
1986
1987 if (tp->md5sig_info) {
1988 tcp_clear_md5_list(sk);
1989 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1990 tp->md5sig_info = NULL;
1991 }
1992#endif
1993
1994
1995 if (inet_csk(sk)->icsk_bind_hash)
1996 inet_put_port(sk);
1997
1998 BUG_ON(tp->fastopen_rsk);
1999
2000
2001 tcp_free_fastopen_req(tp);
2002 tcp_fastopen_destroy_cipher(sk);
2003 tcp_saved_syn_free(tp);
2004
2005 sk_sockets_allocated_dec(sk);
2006}
2007EXPORT_SYMBOL(tcp_v4_destroy_sock);
2008
2009#ifdef CONFIG_PROC_FS
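/* /proc/net/tcp dumping: iterate the listening hash first, then the
 * established hash. listening_get_next() returns the listener after cur,
 * or the first listener in st->bucket when cur is NULL.
 */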
2017static void *listening_get_next(struct seq_file *seq, void *cur)
2018{
2019 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2020 struct tcp_iter_state *st = seq->private;
2021 struct net *net = seq_file_net(seq);
2022 struct inet_listen_hashbucket *ilb;
2023 struct sock *sk = cur;
2024
2025 if (!sk) {
2026get_head:
2027 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2028 spin_lock(&ilb->lock);
2029 sk = sk_head(&ilb->head);
2030 st->offset = 0;
2031 goto get_sk;
2032 }
2033 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2034 ++st->num;
2035 ++st->offset;
2036
2037 sk = sk_next(sk);
2038get_sk:
2039 sk_for_each_from(sk) {
2040 if (!net_eq(sock_net(sk), net))
2041 continue;
2042 if (sk->sk_family == afinfo->family)
2043 return sk;
2044 }
2045 spin_unlock(&ilb->lock);
2046 st->offset = 0;
2047 if (++st->bucket < INET_LHTABLE_SIZE)
2048 goto get_head;
2049 return NULL;
2050}
2051
2052static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2053{
2054 struct tcp_iter_state *st = seq->private;
2055 void *rc;
2056
2057 st->bucket = 0;
2058 st->offset = 0;
2059 rc = listening_get_next(seq, NULL);
2060
2061 while (rc && *pos) {
2062 rc = listening_get_next(seq, rc);
2063 --*pos;
2064 }
2065 return rc;
2066}
2067
2068static inline bool empty_bucket(const struct tcp_iter_state *st)
2069{
2070 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2071}
2072
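/* Get the first established socket, starting from the bucket given in
 * st->bucket; when st->bucket is zero the very first socket in the hash
 * is returned.
 */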
2077static void *established_get_first(struct seq_file *seq)
2078{
2079 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2080 struct tcp_iter_state *st = seq->private;
2081 struct net *net = seq_file_net(seq);
2082 void *rc = NULL;
2083
2084 st->offset = 0;
2085 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2086 struct sock *sk;
2087 struct hlist_nulls_node *node;
2088 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2089
2090
2091 if (empty_bucket(st))
2092 continue;
2093
2094 spin_lock_bh(lock);
2095 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2096 if (sk->sk_family != afinfo->family ||
2097 !net_eq(sock_net(sk), net)) {
2098 continue;
2099 }
2100 rc = sk;
2101 goto out;
2102 }
2103 spin_unlock_bh(lock);
2104 }
2105out:
2106 return rc;
2107}
2108
2109static void *established_get_next(struct seq_file *seq, void *cur)
2110{
2111 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2112 struct sock *sk = cur;
2113 struct hlist_nulls_node *node;
2114 struct tcp_iter_state *st = seq->private;
2115 struct net *net = seq_file_net(seq);
2116
2117 ++st->num;
2118 ++st->offset;
2119
2120 sk = sk_nulls_next(sk);
2121
2122 sk_nulls_for_each_from(sk, node) {
2123 if (sk->sk_family == afinfo->family &&
2124 net_eq(sock_net(sk), net))
2125 return sk;
2126 }
2127
2128 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2129 ++st->bucket;
2130 return established_get_first(seq);
2131}
2132
2133static void *established_get_idx(struct seq_file *seq, loff_t pos)
2134{
2135 struct tcp_iter_state *st = seq->private;
2136 void *rc;
2137
2138 st->bucket = 0;
2139 rc = established_get_first(seq);
2140
2141 while (rc && pos) {
2142 rc = established_get_next(seq, rc);
2143 --pos;
2144 }
2145 return rc;
2146}
2147
2148static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2149{
2150 void *rc;
2151 struct tcp_iter_state *st = seq->private;
2152
2153 st->state = TCP_SEQ_STATE_LISTENING;
2154 rc = listening_get_idx(seq, &pos);
2155
2156 if (!rc) {
2157 st->state = TCP_SEQ_STATE_ESTABLISHED;
2158 rc = established_get_idx(seq, pos);
2159 }
2160
2161 return rc;
2162}
2163
2164static void *tcp_seek_last_pos(struct seq_file *seq)
2165{
2166 struct tcp_iter_state *st = seq->private;
2167 int offset = st->offset;
2168 int orig_num = st->num;
2169 void *rc = NULL;
2170
2171 switch (st->state) {
2172 case TCP_SEQ_STATE_LISTENING:
2173 if (st->bucket >= INET_LHTABLE_SIZE)
2174 break;
2175 st->state = TCP_SEQ_STATE_LISTENING;
2176 rc = listening_get_next(seq, NULL);
2177 while (offset-- && rc)
2178 rc = listening_get_next(seq, rc);
2179 if (rc)
2180 break;
2181 st->bucket = 0;
2182 st->state = TCP_SEQ_STATE_ESTABLISHED;
2183
2184 case TCP_SEQ_STATE_ESTABLISHED:
2185 if (st->bucket > tcp_hashinfo.ehash_mask)
2186 break;
2187 rc = established_get_first(seq);
2188 while (offset-- && rc)
2189 rc = established_get_next(seq, rc);
2190 }
2191
2192 st->num = orig_num;
2193
2194 return rc;
2195}
2196
2197void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2198{
2199 struct tcp_iter_state *st = seq->private;
2200 void *rc;
2201
2202 if (*pos && *pos == st->last_pos) {
2203 rc = tcp_seek_last_pos(seq);
2204 if (rc)
2205 goto out;
2206 }
2207
2208 st->state = TCP_SEQ_STATE_LISTENING;
2209 st->num = 0;
2210 st->bucket = 0;
2211 st->offset = 0;
2212 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2213
2214out:
2215 st->last_pos = *pos;
2216 return rc;
2217}
2218EXPORT_SYMBOL(tcp_seq_start);
2219
2220void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2221{
2222 struct tcp_iter_state *st = seq->private;
2223 void *rc = NULL;
2224
2225 if (v == SEQ_START_TOKEN) {
2226 rc = tcp_get_idx(seq, 0);
2227 goto out;
2228 }
2229
2230 switch (st->state) {
2231 case TCP_SEQ_STATE_LISTENING:
2232 rc = listening_get_next(seq, v);
2233 if (!rc) {
2234 st->state = TCP_SEQ_STATE_ESTABLISHED;
2235 st->bucket = 0;
2236 st->offset = 0;
2237 rc = established_get_first(seq);
2238 }
2239 break;
2240 case TCP_SEQ_STATE_ESTABLISHED:
2241 rc = established_get_next(seq, v);
2242 break;
2243 }
2244out:
2245 ++*pos;
2246 st->last_pos = *pos;
2247 return rc;
2248}
2249EXPORT_SYMBOL(tcp_seq_next);
2250
2251void tcp_seq_stop(struct seq_file *seq, void *v)
2252{
2253 struct tcp_iter_state *st = seq->private;
2254
2255 switch (st->state) {
2256 case TCP_SEQ_STATE_LISTENING:
2257 if (v != SEQ_START_TOKEN)
2258 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2259 break;
2260 case TCP_SEQ_STATE_ESTABLISHED:
2261 if (v)
2262 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2263 break;
2264 }
2265}
2266EXPORT_SYMBOL(tcp_seq_stop);
2267
2268static void get_openreq4(const struct request_sock *req,
2269 struct seq_file *f, int i)
2270{
2271 const struct inet_request_sock *ireq = inet_rsk(req);
2272 long delta = req->rsk_timer.expires - jiffies;
2273
2274 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2275 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2276 i,
2277 ireq->ir_loc_addr,
2278 ireq->ir_num,
2279 ireq->ir_rmt_addr,
2280 ntohs(ireq->ir_rmt_port),
2281 TCP_SYN_RECV,
2282 0, 0,
2283 1,
2284 jiffies_delta_to_clock_t(delta),
2285 req->num_timeout,
2286 from_kuid_munged(seq_user_ns(f),
2287 sock_i_uid(req->rsk_listener)),
2288 0,
2289 0,
2290 0,
2291 req);
2292}
2293
2294static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2295{
2296 int timer_active;
2297 unsigned long timer_expires;
2298 const struct tcp_sock *tp = tcp_sk(sk);
2299 const struct inet_connection_sock *icsk = inet_csk(sk);
2300 const struct inet_sock *inet = inet_sk(sk);
2301 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2302 __be32 dest = inet->inet_daddr;
2303 __be32 src = inet->inet_rcv_saddr;
2304 __u16 destp = ntohs(inet->inet_dport);
2305 __u16 srcp = ntohs(inet->inet_sport);
2306 int rx_queue;
2307 int state;
2308
2309 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2310 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2311 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2312 timer_active = 1;
2313 timer_expires = icsk->icsk_timeout;
2314 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2315 timer_active = 4;
2316 timer_expires = icsk->icsk_timeout;
2317 } else if (timer_pending(&sk->sk_timer)) {
2318 timer_active = 2;
2319 timer_expires = sk->sk_timer.expires;
2320 } else {
2321 timer_active = 0;
2322 timer_expires = jiffies;
2323 }
2324
2325 state = inet_sk_state_load(sk);
2326 if (state == TCP_LISTEN)
2327 rx_queue = sk->sk_ack_backlog;
2328 else
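		/* The socket lock is not held here, so rcv_nxt - copied_seq
		 * can transiently go negative; clamp it at zero.
		 */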
2332 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2333
2334 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2335 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2336 i, src, srcp, dest, destp, state,
2337 tp->write_seq - tp->snd_una,
2338 rx_queue,
2339 timer_active,
2340 jiffies_delta_to_clock_t(timer_expires - jiffies),
2341 icsk->icsk_retransmits,
2342 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2343 icsk->icsk_probes_out,
2344 sock_i_ino(sk),
2345 refcount_read(&sk->sk_refcnt), sk,
2346 jiffies_to_clock_t(icsk->icsk_rto),
2347 jiffies_to_clock_t(icsk->icsk_ack.ato),
2348 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2349 tp->snd_cwnd,
2350 state == TCP_LISTEN ?
2351 fastopenq->max_qlen :
2352 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2353}
2354
2355static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2356 struct seq_file *f, int i)
2357{
2358 long delta = tw->tw_timer.expires - jiffies;
2359 __be32 dest, src;
2360 __u16 destp, srcp;
2361
2362 dest = tw->tw_daddr;
2363 src = tw->tw_rcv_saddr;
2364 destp = ntohs(tw->tw_dport);
2365 srcp = ntohs(tw->tw_sport);
2366
2367 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2368 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2369 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2370 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2371 refcount_read(&tw->tw_refcnt), tw);
2372}
2373
2374#define TMPSZ 150
2375
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

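/* Register /proc/net/tcp for this network namespace; each open file
 * gets its own struct tcp_iter_state for the iterator.
 */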
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

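/* Protocol method table for IPv4 TCP sockets: the generic socket and
 * inet layers call through these hooks for connect(), sendmsg(),
 * setsockopt() and friends.
 */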
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

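/* Per-namespace teardown: release the congestion control module picked
 * in tcp_sk_init() and destroy the per-cpu control sockets.
 */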
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

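/* Per-namespace setup: create one control socket per possible CPU
 * (used to send RSTs and ACKs that are not tied to a user socket) and
 * initialise this namespace's TCP sysctl defaults.
 */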
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1;
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

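/* Batched namespace teardown: purge IPv4 TIME-WAIT sockets once for
 * the whole batch, then free each namespace's TCP fastopen context.
 */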
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

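/* Called at boot to register the per-namespace init/exit operations
 * above; TCP is mandatory, so any failure here is fatal.
 */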
void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}
