linux/net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

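/*
 * Illustrative note (hedged, not the exact implementation in
 * net/core/secure_seq.c): both helpers above key a hash of the
 * connection identifiers with a boot-time secret, in the spirit of
 * RFC 6528, roughly
 *
 *	isn = hash(saddr, daddr, sport, dport, secret) + clock_component
 *
 * so initial sequence numbers advance over time for a given 4-tuple yet
 * remain unpredictable to off-path attackers.
 */
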
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

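/*
 * Worked example for the sequence bump above (illustrative numbers): if
 * the TIME-WAIT socket closed with tw_snd_nxt == 0x1000, the new
 * connection starts at 0x1000 + 65535 + 2 = 0x11001, safely beyond
 * anything the old incarnation may have sent (window plus FIN). If the
 * addition wraps to exactly 0, seq is forced to 1, because a zero
 * write_seq would be treated as "unset" and regenerated from
 * secure_tcp_seq() in tcp_v4_connect().
 */
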
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bounds specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

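/*
 * Illustrative sketch (hedged): BPF_CGROUP_RUN_PROG_INET4_CONNECT() runs
 * any BPF program attached at the BPF_CGROUP_INET4_CONNECT hook. A
 * minimal libbpf-style program (the name "allow_connect4" is ours, not
 * part of this file) that permits every connect() would be:
 *
 *	SEC("cgroup/connect4")
 *	int allow_connect4(struct bpf_sock_addr *ctx)
 *	{
 *		// ctx->user_ip4 / ctx->user_port mirror the sockaddr_in;
 *		// returning 0 instead fails the connect() with -EPERM.
 *		return 1;
 *	}
 */
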
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = prandom_u32();

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

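/*
 * Illustrative userspace sketch (hedged, not kernel code): the argument
 * checks at the top of tcp_v4_connect() surface directly as connect(2)
 * errors.
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,	// anything else: -EAFNOSUPPORT
 *		.sin_port   = htons(80),
 *		.sin_addr   = { htonl(INADDR_LOOPBACK) },
 *	};
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *		perror("connect");	// e.g. ENETUNREACH: no route
 */
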
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

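/*
 * Illustrative sketch (hedged): because of the inet->pmtudisc check
 * above, a socket that opted out of path MTU discovery is left alone
 * here, e.g. from userspace:
 *
 *	int val = IP_PMTUDISC_DONT;	// allow fragmentation, no PMTUD
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */
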
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

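/*
 * Example of the encoding described above: an ICMP "fragmentation
 * needed" error arrives as type ICMP_DEST_UNREACH (3) with code
 * ICMP_FRAG_NEEDED (4), so the positive form of the error is
 * (3 << 8) | 4 == 0x304, and icmp_err_convert[4].errno supplies the
 * errno (EMSGSIZE) when it must be reported.
 */
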
int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case:
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		skb = tcp_rtx_queue_head(sk);
		if (WARN_ON_ONCE(!skb))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now.
			 */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

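/*
 * Illustrative note: this primes the checksum for offload
 * (CHECKSUM_PARTIAL). th->check holds only the folded complement of the
 * pseudo-header sum; whoever finishes the packet (the NIC, or
 * skb_checksum_help() in software) checksums the bytes starting at
 * skb->csum_start and writes the result at csum_start + skb->csum_offset,
 * i.e. into th->check itself (offsetof(struct tcphdr, check) == 16).
 */
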
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset?
 *	Answer: if a packet caused a RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash with the
		 * key we find, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost and
	 * routing might fail. No choice here: if we choose to force the
	 * input interface, we will misroute in case of an asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : sk->sk_priority;
		transmit_time = tcp_transmit_time(sk);
	}
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

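/*
 * Worked example for the reset above (illustrative numbers): an
 * unsolicited segment with seq = 1000, 20 bytes of payload, no SYN/FIN
 * and no ACK bit gets a reset with rep.th.ack = 1 and
 * ack_seq = htonl(1000 + 20), matching RFC 793's
 * <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>. If the offending segment
 * did carry an ACK, the reset instead uses seq = SEG.ACK and no ACK bit.
 */
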
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : sk->sk_priority;
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

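/*
 * Example of the option words built above when tsecr is nonzero: the
 * reply carries a 12-byte, 32-bit-aligned timestamp option,
 *
 *	opt[0] = NOP | NOP | TCPOPT_TIMESTAMP | TCPOLEN_TIMESTAMP (10)
 *	opt[1] = TSval
 *	opt[2] = TSecr
 *
 * which is why doff grows by TCPOLEN_TSTAMP_ALIGNED / 4 == 3 words.
 */
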
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	const union tcp_md5_addr *addr;
	int l3index;

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

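/*
 * Worked example of the RFC 7323 shift above (illustrative numbers):
 * with req->rsk_rcv_wnd == 262140 and rcv_wscale == 2, the ACK
 * advertises 262140 >> 2 == 65535 in the 16-bit window field, and the
 * peer shifts it left again on receipt.
 */
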
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt));
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

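/*
 * Example of the longest-prefix match above (illustrative addresses):
 * with one key for 10.0.0.0/8 and another for 10.1.2.0/24, a lookup for
 * peer 10.1.2.3 matches both and returns the /24 key (larger prefixlen
 * wins); a lookup for 10.9.9.9 matches only the /8 key.
 */
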
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (key->l3index && key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index,
		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}

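/*
 * Illustrative userspace sketch (hedged, not kernel code) exercising the
 * handler above through setsockopt(2):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key; TCP_MD5SIG_EXT with
 * TCP_MD5SIG_FLAG_PREFIX additionally honours cmd.tcpm_prefixlen, as
 * handled above.
 */
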
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

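/*
 * The scratch buffer hashed above follows the RFC 2385 input order: the
 * TCP pseudo-header, then the TCP header with its checksum zeroed,
 *
 *	| saddr (4) | daddr (4) | pad = 0 (1) | IPPROTO_TCP (1) | len (2) |
 *	| struct tcphdr with check = 0 ...                                |
 *
 * Segment data (when present) and the key itself are fed to the hash by
 * the callers via tcp_md5_hash_skb_data() and tcp_md5_hash_key().
 */
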
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb,
				    int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
1350        const __u8 *hash_location = NULL;
1351        struct tcp_md5sig_key *hash_expected;
1352        const struct iphdr *iph = ip_hdr(skb);
1353        const struct tcphdr *th = tcp_hdr(skb);
1354        const union tcp_md5_addr *addr;
1355        unsigned char newhash[16];
1356        int genhash, l3index;
1357
1358        /* sdif set, means packet ingressed via a device
1359         * in an L3 domain and dif is set to the l3mdev
1360         */
1361        l3index = sdif ? dif : 0;
1362
1363        addr = (union tcp_md5_addr *)&iph->saddr;
1364        hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1365        hash_location = tcp_parse_md5sig_option(th);
1366
1367        /* We've parsed the options - do we have a hash? */
1368        if (!hash_expected && !hash_location)
1369                return false;
1370
1371        if (hash_expected && !hash_location) {
1372                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1373                return true;
1374        }
1375
1376        if (!hash_expected && hash_location) {
1377                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1378                return true;
1379        }
1380
1381        /* Okay, so this is hash_expected and hash_location -
1382         * so we need to calculate the checksum.
1383         */
1384        genhash = tcp_v4_md5_hash_skb(newhash,
1385                                      hash_expected,
1386                                      NULL, skb);
1387
1388        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1389                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1390                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1391                                     &iph->saddr, ntohs(th->source),
1392                                     &iph->daddr, ntohs(th->dest),
1393                                     genhash ? " tcp_v4_calc_md5_hash failed"
1394                                     : "", l3index);
1395                return true;
1396        }
1397        return false;
1398#endif
1399        return false;
1400}
1401
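/* Example (userspace sketch, not part of the kernel build): the keys that
 * tcp_md5_do_lookup() consults above are installed per peer with the
 * TCP_MD5SIG socket option. "fd", "peer" and "key" are assumed names, and
 * we assume a libc that exposes struct tcp_md5sig in <netinet/tcp.h>.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>	// struct tcp_md5sig, TCP_MD5SIG
 *
 *	static int install_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const char *key, size_t keylen)
 *	{
 *		struct tcp_md5sig md5;
 *
 *		memset(&md5, 0, sizeof(md5));
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;	// <= TCP_MD5SIG_MAXKEYLEN (80)
 *		memcpy(md5.tcpm_key, key, keylen);
 *		// Call before connect()/listen(); segments from this peer
 *		// without a matching MD5 option are then dropped as above.
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG,
 *				  &md5, sizeof(md5));
 *	}
 */
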
1402static void tcp_v4_init_req(struct request_sock *req,
1403                            const struct sock *sk_listener,
1404                            struct sk_buff *skb)
1405{
1406        struct inet_request_sock *ireq = inet_rsk(req);
1407        struct net *net = sock_net(sk_listener);
1408
1409        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1410        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1411        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1412}
1413
1414static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1415                                          struct flowi *fl,
1416                                          const struct request_sock *req)
1417{
1418        return inet_csk_route_req(sk, &fl->u.ip4, req);
1419}
1420
1421struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1422        .family         =       PF_INET,
1423        .obj_size       =       sizeof(struct tcp_request_sock),
1424        .rtx_syn_ack    =       tcp_rtx_synack,
1425        .send_ack       =       tcp_v4_reqsk_send_ack,
1426        .destructor     =       tcp_v4_reqsk_destructor,
1427        .send_reset     =       tcp_v4_send_reset,
1428        .syn_ack_timeout =      tcp_syn_ack_timeout,
1429};
1430
1431const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1432        .mss_clamp      =       TCP_MSS_DEFAULT,
1433#ifdef CONFIG_TCP_MD5SIG
1434        .req_md5_lookup =       tcp_v4_md5_lookup,
1435        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1436#endif
1437        .init_req       =       tcp_v4_init_req,
1438#ifdef CONFIG_SYN_COOKIES
1439        .cookie_init_seq =      cookie_v4_init_sequence,
1440#endif
1441        .route_req      =       tcp_v4_route_req,
1442        .init_seq       =       tcp_v4_init_seq,
1443        .init_ts_off    =       tcp_v4_init_ts_off,
1444        .send_synack    =       tcp_v4_send_synack,
1445};
1446
1447int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1448{
1449        /* Never answer SYNs sent to broadcast or multicast */
1450        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1451                goto drop;
1452
1453        return tcp_conn_request(&tcp_request_sock_ops,
1454                                &tcp_request_sock_ipv4_ops, sk, skb);
1455
1456drop:
1457        tcp_listendrop(sk);
1458        return 0;
1459}
1460EXPORT_SYMBOL(tcp_v4_conn_request);
1461
1462
1463/*
1464 * The three-way handshake has completed - we received the final valid ACK -
1465 * now create the new socket.
1466 */
1467struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1468                                  struct request_sock *req,
1469                                  struct dst_entry *dst,
1470                                  struct request_sock *req_unhash,
1471                                  bool *own_req)
1472{
1473        struct inet_request_sock *ireq;
1474        struct inet_sock *newinet;
1475        struct tcp_sock *newtp;
1476        struct sock *newsk;
1477#ifdef CONFIG_TCP_MD5SIG
1478        const union tcp_md5_addr *addr;
1479        struct tcp_md5sig_key *key;
1480        int l3index;
1481#endif
1482        struct ip_options_rcu *inet_opt;
1483
1484        if (sk_acceptq_is_full(sk))
1485                goto exit_overflow;
1486
1487        newsk = tcp_create_openreq_child(sk, req, skb);
1488        if (!newsk)
1489                goto exit_nonewsk;
1490
1491        newsk->sk_gso_type = SKB_GSO_TCPV4;
1492        inet_sk_rx_dst_set(newsk, skb);
1493
1494        newtp                 = tcp_sk(newsk);
1495        newinet               = inet_sk(newsk);
1496        ireq                  = inet_rsk(req);
1497        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1498        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1499        newsk->sk_bound_dev_if = ireq->ir_iif;
1500        newinet->inet_saddr   = ireq->ir_loc_addr;
1501        inet_opt              = rcu_dereference(ireq->ireq_opt);
1502        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1503        newinet->mc_index     = inet_iif(skb);
1504        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1505        newinet->rcv_tos      = ip_hdr(skb)->tos;
1506        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1507        if (inet_opt)
1508                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1509        newinet->inet_id = prandom_u32();
1510
1511        if (!dst) {
1512                dst = inet_csk_route_child_sock(sk, newsk, req);
1513                if (!dst)
1514                        goto put_and_exit;
1515        } else {
1516                /* syncookie case: see end of cookie_v4_check() */
1517        }
1518        sk_setup_caps(newsk, dst);
1519
1520        tcp_ca_openreq_child(newsk, dst);
1521
1522        tcp_sync_mss(newsk, dst_mtu(dst));
1523        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1524
1525        tcp_initialize_rcv_mss(newsk);
1526
1527#ifdef CONFIG_TCP_MD5SIG
1528        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1529        /* Copy over the MD5 key from the original socket */
1530        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1531        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1532        if (key) {
1533                /*
1534                 * We're using one, so create a matching key
1535                 * on the newsk structure. If we fail to get
1536                 * memory, then we end up not copying the key
1537                 * across. Shucks.
1538                 */
1539                tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1540                               key->key, key->keylen, GFP_ATOMIC);
1541                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1542        }
1543#endif
1544
1545        if (__inet_inherit_port(sk, newsk) < 0)
1546                goto put_and_exit;
1547        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1548        if (likely(*own_req)) {
1549                tcp_move_syn(newtp, req);
1550                ireq->ireq_opt = NULL;
1551        } else {
1552                newinet->inet_opt = NULL;
1553        }
1554        return newsk;
1555
1556exit_overflow:
1557        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1558exit_nonewsk:
1559        dst_release(dst);
1560exit:
1561        tcp_listendrop(sk);
1562        return NULL;
1563put_and_exit:
1564        newinet->inet_opt = NULL;
1565        inet_csk_prepare_forced_close(newsk);
1566        tcp_done(newsk);
1567        goto exit;
1568}
1569EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
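
/* Example (userspace sketch; "lfd", port 8080 and the tiny backlog are
 * arbitrary): sk_acceptq_is_full() above is what makes a too-small listen()
 * backlog visible. Once the accept queue fills, further handshakes take the
 * exit_overflow path and TcpExt ListenOverflows/ListenDrops increase.
 *
 *	int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in a = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(8080),
 *		.sin_addr.s_addr = htonl(INADDR_ANY),
 *	};
 *
 *	bind(lfd, (struct sockaddr *)&a, sizeof(a));
 *	listen(lfd, 4);		// room for only a handful of sockets
 *	// Deliberately never calling accept(): additional concurrent
 *	// clients now hit the exit_overflow path above.
 */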
1570
1571static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1572{
1573#ifdef CONFIG_SYN_COOKIES
1574        const struct tcphdr *th = tcp_hdr(skb);
1575
1576        if (!th->syn)
1577                sk = cookie_v4_check(sk, skb);
1578#endif
1579        return sk;
1580}
1581
1582u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1583                         struct tcphdr *th, u32 *cookie)
1584{
1585        u16 mss = 0;
1586#ifdef CONFIG_SYN_COOKIES
1587        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1588                                    &tcp_request_sock_ipv4_ops, sk, th);
1589        if (mss) {
1590                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1591                tcp_synq_overflow(sk);
1592        }
1593#endif
1594        return mss;
1595}
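
/* Example (userspace sketch): whether the two syncookie helpers above ever
 * run is governed by net.ipv4.tcp_syncookies (0 = off, 1 = only under SYN
 * queue pressure, 2 = unconditionally); the per-netns default of 1 is set
 * in tcp_sk_init() below.
 *
 *	#include <stdio.h>
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");
 *
 *	if (f) {
 *		fputs("2", f);	// always answer with cookies
 *		fclose(f);
 *	}
 */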
1596
1597/* The socket must have its spinlock held when we get
1598 * here, unless it is a TCP_LISTEN socket.
1599 *
1600 * We have a potential double-lock case here, so even when
1601 * doing backlog processing we use the BH locking scheme.
1602 * This is because we cannot sleep with the original spinlock
1603 * held.
1604 */
1605int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1606{
1607        struct sock *rsk;
1608
1609        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1610                struct dst_entry *dst = sk->sk_rx_dst;
1611
1612                sock_rps_save_rxhash(sk, skb);
1613                sk_mark_napi_id(sk, skb);
1614                if (dst) {
1615                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1616                            !dst->ops->check(dst, 0)) {
1617                                dst_release(dst);
1618                                sk->sk_rx_dst = NULL;
1619                        }
1620                }
1621                tcp_rcv_established(sk, skb);
1622                return 0;
1623        }
1624
1625        if (tcp_checksum_complete(skb))
1626                goto csum_err;
1627
1628        if (sk->sk_state == TCP_LISTEN) {
1629                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1630
1631                if (!nsk)
1632                        goto discard;
1633                if (nsk != sk) {
1634                        if (tcp_child_process(sk, nsk, skb)) {
1635                                rsk = nsk;
1636                                goto reset;
1637                        }
1638                        return 0;
1639                }
1640        } else
1641                sock_rps_save_rxhash(sk, skb);
1642
1643        if (tcp_rcv_state_process(sk, skb)) {
1644                rsk = sk;
1645                goto reset;
1646        }
1647        return 0;
1648
1649reset:
1650        tcp_v4_send_reset(rsk, skb);
1651discard:
1652        kfree_skb(skb);
1653        /* Be careful here. If this function gets more complicated and
1654         * gcc suffers from register pressure on the x86, sk (in %ebx)
1655         * might be destroyed here. This current version compiles correctly,
1656         * but you have been warned.
1657         */
1658        return 0;
1659
1660csum_err:
1661        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1662        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1663        goto discard;
1664}
1665EXPORT_SYMBOL(tcp_v4_do_rcv);
1666
1667int tcp_v4_early_demux(struct sk_buff *skb)
1668{
1669        const struct iphdr *iph;
1670        const struct tcphdr *th;
1671        struct sock *sk;
1672
1673        if (skb->pkt_type != PACKET_HOST)
1674                return 0;
1675
1676        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1677                return 0;
1678
1679        iph = ip_hdr(skb);
1680        th = tcp_hdr(skb);
1681
1682        if (th->doff < sizeof(struct tcphdr) / 4)
1683                return 0;
1684
1685        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1686                                       iph->saddr, th->source,
1687                                       iph->daddr, ntohs(th->dest),
1688                                       skb->skb_iif, inet_sdif(skb));
1689        if (sk) {
1690                skb->sk = sk;
1691                skb->destructor = sock_edemux;
1692                if (sk_fullsock(sk)) {
1693                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1694
1695                        if (dst)
1696                                dst = dst_check(dst, 0);
1697                        if (dst &&
1698                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1699                                skb_dst_set_noref(skb, dst);
1700                }
1701        }
1702        return 0;
1703}
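
/* Example (userspace sketch): this fast path is controlled globally by
 * net.ipv4.tcp_early_demux, which can be disabled when early route/socket
 * binding is undesirable.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/proc/sys/net/ipv4/tcp_early_demux", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "0", 1);	// 0 disables early demux
 *		close(fd);
 *	}
 */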
1704
1705bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1706{
1707        u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1708        struct skb_shared_info *shinfo;
1709        const struct tcphdr *th;
1710        struct tcphdr *thtail;
1711        struct sk_buff *tail;
1712        unsigned int hdrlen;
1713        bool fragstolen;
1714        u32 gso_segs;
1715        int delta;
1716
1717        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1718         * we can fix skb->truesize to its real value to avoid future drops.
1719         * This is valid because skb is not yet charged to the socket.
1720         * It has been noticed that pure SACK packets were sometimes dropped
1721         * (when cooked by drivers without the copybreak feature).
1722         */
1723        skb_condense(skb);
1724
1725        skb_dst_drop(skb);
1726
1727        if (unlikely(tcp_checksum_complete(skb))) {
1728                bh_unlock_sock(sk);
1729                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1730                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1731                return true;
1732        }
1733
1734        /* Attempt coalescing to last skb in backlog, even if we are
1735         * above the limits.
1736         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1737         */
1738        th = (const struct tcphdr *)skb->data;
1739        hdrlen = th->doff * 4;
1740        shinfo = skb_shinfo(skb);
1741
1742        if (!shinfo->gso_size)
1743                shinfo->gso_size = skb->len - hdrlen;
1744
1745        if (!shinfo->gso_segs)
1746                shinfo->gso_segs = 1;
1747
1748        tail = sk->sk_backlog.tail;
1749        if (!tail)
1750                goto no_coalesce;
1751        thtail = (struct tcphdr *)tail->data;
1752
1753        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1754            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1755            ((TCP_SKB_CB(tail)->tcp_flags |
1756              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1757            !((TCP_SKB_CB(tail)->tcp_flags &
1758              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1759            ((TCP_SKB_CB(tail)->tcp_flags ^
1760              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1761#ifdef CONFIG_TLS_DEVICE
1762            tail->decrypted != skb->decrypted ||
1763#endif
1764            thtail->doff != th->doff ||
1765            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1766                goto no_coalesce;
1767
1768        __skb_pull(skb, hdrlen);
1769        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1770                thtail->window = th->window;
1771
1772                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1773
1774                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1775                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1776
1777                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1778                 * thtail->fin, so that the fast path in tcp_rcv_established()
1779                 * is not entered if we append a packet with a FIN.
1780                 * SYN, RST, URG are not present.
1781                 * ACK is set on both packets.
1782                 * PSH : we do not really care in TCP stack,
1783                 *       at least for 'GRO' packets.
1784                 */
1785                thtail->fin |= th->fin;
1786                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1787
1788                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1789                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1790                        tail->tstamp = skb->tstamp;
1791                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1792                }
1793
1794                /* Not as strict as GRO. We only need to carry mss max value */
1795                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1796                                                 skb_shinfo(tail)->gso_size);
1797
1798                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1799                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1800
1801                sk->sk_backlog.len += delta;
1802                __NET_INC_STATS(sock_net(sk),
1803                                LINUX_MIB_TCPBACKLOGCOALESCE);
1804                kfree_skb_partial(skb, fragstolen);
1805                return false;
1806        }
1807        __skb_push(skb, hdrlen);
1808
1809no_coalesce:
1810        /* Only the socket owner can try to collapse/prune its rx queues
1811         * to reduce memory overhead, so add a little headroom here.
1812         * Only a few socket backlogs are likely to be non-empty at once.
1813         */
1814        limit += 64*1024;
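        /* Example: with sk_rcvbuf = sk_sndbuf = 128 KB, the backlog may
         * hold roughly 320 KB before new segments are dropped below
         * (LINUX_MIB_TCPBACKLOGDROP).
         */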
1815
1816        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1817                bh_unlock_sock(sk);
1818                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1819                return true;
1820        }
1821        return false;
1822}
1823EXPORT_SYMBOL(tcp_add_backlog);
1824
1825int tcp_filter(struct sock *sk, struct sk_buff *skb)
1826{
1827        struct tcphdr *th = (struct tcphdr *)skb->data;
1828
1829        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1830}
1831EXPORT_SYMBOL(tcp_filter);
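
/* Example (userspace sketch; "fd" is an assumed TCP socket): the program
 * run by sk_filter_trim_cap() is whatever classic BPF filter the socket
 * owner attached with SO_ATTACH_FILTER. This one-instruction filter
 * accepts every segment in full.
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *
 *	struct sock_filter accept_all[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xFFFFFFFF },	// keep all bytes
 *	};
 *	struct sock_fprog prog = {
 *		.len = 1,
 *		.filter = accept_all,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */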
1832
1833static void tcp_v4_restore_cb(struct sk_buff *skb)
1834{
1835        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1836                sizeof(struct inet_skb_parm));
1837}
1838
1839static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1840                           const struct tcphdr *th)
1841{
1842        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1843         * barrier() makes sure the compiler won't play fool^Waliasing games.
1844         */
1845        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1846                sizeof(struct inet_skb_parm));
1847        barrier();
1848
1849        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1850        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1851                                    skb->len - th->doff * 4);
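        /* SYN and FIN each consume one sequence number: a bare SYN has
         * end_seq == seq + 1, while a segment carrying 100 bytes of
         * payload and no SYN/FIN has end_seq == seq + 100.
         */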
1852        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1853        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1854        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1855        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1856        TCP_SKB_CB(skb)->sacked  = 0;
1857        TCP_SKB_CB(skb)->has_rxtstamp =
1858                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1859}
1860
1861/*
1862 *      From tcp_input.c
1863 */
1864
1865int tcp_v4_rcv(struct sk_buff *skb)
1866{
1867        struct net *net = dev_net(skb->dev);
1868        struct sk_buff *skb_to_free;
1869        int sdif = inet_sdif(skb);
1870        int dif = inet_iif(skb);
1871        const struct iphdr *iph;
1872        const struct tcphdr *th;
1873        bool refcounted;
1874        struct sock *sk;
1875        int ret;
1876
1877        if (skb->pkt_type != PACKET_HOST)
1878                goto discard_it;
1879
1880        /* Count it even if it's bad */
1881        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1882
1883        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1884                goto discard_it;
1885
1886        th = (const struct tcphdr *)skb->data;
1887
1888        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1889                goto bad_packet;
1890        if (!pskb_may_pull(skb, th->doff * 4))
1891                goto discard_it;
1892
1893        /* An explanation is required here, I think.
1894         * Packet length and doff are validated by header prediction,
1895         * provided the case of th->doff == 0 is eliminated.
1896         * So, we defer the checks. */
1897
1898        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1899                goto csum_error;
1900
1901        th = (const struct tcphdr *)skb->data;
1902        iph = ip_hdr(skb);
1903lookup:
1904        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1905                               th->dest, sdif, &refcounted);
1906        if (!sk)
1907                goto no_tcp_socket;
1908
1909process:
1910        if (sk->sk_state == TCP_TIME_WAIT)
1911                goto do_time_wait;
1912
1913        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1914                struct request_sock *req = inet_reqsk(sk);
1915                bool req_stolen = false;
1916                struct sock *nsk;
1917
1918                sk = req->rsk_listener;
1919                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1920                        sk_drops_add(sk, skb);
1921                        reqsk_put(req);
1922                        goto discard_it;
1923                }
1924                if (tcp_checksum_complete(skb)) {
1925                        reqsk_put(req);
1926                        goto csum_error;
1927                }
1928                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1929                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1930                        goto lookup;
1931                }
1932                /* We own a reference on the listener, increase it again
1933                 * as we might lose it too soon.
1934                 */
1935                sock_hold(sk);
1936                refcounted = true;
1937                nsk = NULL;
1938                if (!tcp_filter(sk, skb)) {
1939                        th = (const struct tcphdr *)skb->data;
1940                        iph = ip_hdr(skb);
1941                        tcp_v4_fill_cb(skb, iph, th);
1942                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1943                }
1944                if (!nsk) {
1945                        reqsk_put(req);
1946                        if (req_stolen) {
1947                                /* Another cpu got exclusive access to req
1948                 * and created a full-blown socket.
1949                                 * Try to feed this packet to this socket
1950                                 * instead of discarding it.
1951                                 */
1952                                tcp_v4_restore_cb(skb);
1953                                sock_put(sk);
1954                                goto lookup;
1955                        }
1956                        goto discard_and_relse;
1957                }
1958                if (nsk == sk) {
1959                        reqsk_put(req);
1960                        tcp_v4_restore_cb(skb);
1961                } else if (tcp_child_process(sk, nsk, skb)) {
1962                        tcp_v4_send_reset(nsk, skb);
1963                        goto discard_and_relse;
1964                } else {
1965                        sock_put(sk);
1966                        return 0;
1967                }
1968        }
1969        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1970                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1971                goto discard_and_relse;
1972        }
1973
1974        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1975                goto discard_and_relse;
1976
1977        if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1978                goto discard_and_relse;
1979
1980        nf_reset_ct(skb);
1981
1982        if (tcp_filter(sk, skb))
1983                goto discard_and_relse;
1984        th = (const struct tcphdr *)skb->data;
1985        iph = ip_hdr(skb);
1986        tcp_v4_fill_cb(skb, iph, th);
1987
1988        skb->dev = NULL;
1989
1990        if (sk->sk_state == TCP_LISTEN) {
1991                ret = tcp_v4_do_rcv(sk, skb);
1992                goto put_and_return;
1993        }
1994
1995        sk_incoming_cpu_update(sk);
1996
1997        bh_lock_sock_nested(sk);
1998        tcp_segs_in(tcp_sk(sk), skb);
1999        ret = 0;
2000        if (!sock_owned_by_user(sk)) {
2001                skb_to_free = sk->sk_rx_skb_cache;
2002                sk->sk_rx_skb_cache = NULL;
2003                ret = tcp_v4_do_rcv(sk, skb);
2004        } else {
2005                if (tcp_add_backlog(sk, skb))
2006                        goto discard_and_relse;
2007                skb_to_free = NULL;
2008        }
2009        bh_unlock_sock(sk);
2010        if (skb_to_free)
2011                __kfree_skb(skb_to_free);
2012
2013put_and_return:
2014        if (refcounted)
2015                sock_put(sk);
2016
2017        return ret;
2018
2019no_tcp_socket:
2020        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2021                goto discard_it;
2022
2023        tcp_v4_fill_cb(skb, iph, th);
2024
2025        if (tcp_checksum_complete(skb)) {
2026csum_error:
2027                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2028bad_packet:
2029                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2030        } else {
2031                tcp_v4_send_reset(NULL, skb);
2032        }
2033
2034discard_it:
2035        /* Discard frame. */
2036        kfree_skb(skb);
2037        return 0;
2038
2039discard_and_relse:
2040        sk_drops_add(sk, skb);
2041        if (refcounted)
2042                sock_put(sk);
2043        goto discard_it;
2044
2045do_time_wait:
2046        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2047                inet_twsk_put(inet_twsk(sk));
2048                goto discard_it;
2049        }
2050
2051        tcp_v4_fill_cb(skb, iph, th);
2052
2053        if (tcp_checksum_complete(skb)) {
2054                inet_twsk_put(inet_twsk(sk));
2055                goto csum_error;
2056        }
2057        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2058        case TCP_TW_SYN: {
2059                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2060                                                        &tcp_hashinfo, skb,
2061                                                        __tcp_hdrlen(th),
2062                                                        iph->saddr, th->source,
2063                                                        iph->daddr, th->dest,
2064                                                        inet_iif(skb),
2065                                                        sdif);
2066                if (sk2) {
2067                        inet_twsk_deschedule_put(inet_twsk(sk));
2068                        sk = sk2;
2069                        tcp_v4_restore_cb(skb);
2070                        refcounted = false;
2071                        goto process;
2072                }
2073        }
2074                /* to ACK */
2075                fallthrough;
2076        case TCP_TW_ACK:
2077                tcp_v4_timewait_ack(sk, skb);
2078                break;
2079        case TCP_TW_RST:
2080                tcp_v4_send_reset(sk, skb);
2081                inet_twsk_deschedule_put(inet_twsk(sk));
2082                goto discard_it;
2083        case TCP_TW_SUCCESS:;
2084        }
2085        goto discard_it;
2086}
2087
2088static struct timewait_sock_ops tcp_timewait_sock_ops = {
2089        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2090        .twsk_unique    = tcp_twsk_unique,
2091        .twsk_destructor= tcp_twsk_destructor,
2092};
2093
2094void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2095{
2096        struct dst_entry *dst = skb_dst(skb);
2097
2098        if (dst && dst_hold_safe(dst)) {
2099                sk->sk_rx_dst = dst;
2100                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2101        }
2102}
2103EXPORT_SYMBOL(inet_sk_rx_dst_set);
2104
2105const struct inet_connection_sock_af_ops ipv4_specific = {
2106        .queue_xmit        = ip_queue_xmit,
2107        .send_check        = tcp_v4_send_check,
2108        .rebuild_header    = inet_sk_rebuild_header,
2109        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2110        .conn_request      = tcp_v4_conn_request,
2111        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2112        .net_header_len    = sizeof(struct iphdr),
2113        .setsockopt        = ip_setsockopt,
2114        .getsockopt        = ip_getsockopt,
2115        .addr2sockaddr     = inet_csk_addr2sockaddr,
2116        .sockaddr_len      = sizeof(struct sockaddr_in),
2117#ifdef CONFIG_COMPAT
2118        .compat_setsockopt = compat_ip_setsockopt,
2119        .compat_getsockopt = compat_ip_getsockopt,
2120#endif
2121        .mtu_reduced       = tcp_v4_mtu_reduced,
2122};
2123EXPORT_SYMBOL(ipv4_specific);
2124
2125#ifdef CONFIG_TCP_MD5SIG
2126static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2127        .md5_lookup             = tcp_v4_md5_lookup,
2128        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2129        .md5_parse              = tcp_v4_parse_md5_keys,
2130};
2131#endif
2132
2133/* NOTE: A lot of things are set to zero explicitly by the call to
2134 *       sk_alloc(), so they need not be done here.
2135 */
2136static int tcp_v4_init_sock(struct sock *sk)
2137{
2138        struct inet_connection_sock *icsk = inet_csk(sk);
2139
2140        tcp_init_sock(sk);
2141
2142        icsk->icsk_af_ops = &ipv4_specific;
2143
2144#ifdef CONFIG_TCP_MD5SIG
2145        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2146#endif
2147
2148        return 0;
2149}
2150
2151void tcp_v4_destroy_sock(struct sock *sk)
2152{
2153        struct tcp_sock *tp = tcp_sk(sk);
2154
2155        trace_tcp_destroy_sock(sk);
2156
2157        tcp_clear_xmit_timers(sk);
2158
2159        tcp_cleanup_congestion_control(sk);
2160
2161        tcp_cleanup_ulp(sk);
2162
2163        /* Clean up the write buffer. */
2164        tcp_write_queue_purge(sk);
2165
2166        /* Check if we want to disable active TFO */
2167        tcp_fastopen_active_disable_ofo_check(sk);
2168
2169        /* Cleans up our, hopefully empty, out_of_order_queue. */
2170        skb_rbtree_purge(&tp->out_of_order_queue);
2171
2172#ifdef CONFIG_TCP_MD5SIG
2173        /* Clean up the MD5 key list, if any */
2174        if (tp->md5sig_info) {
2175                tcp_clear_md5_list(sk);
2176                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2177                tp->md5sig_info = NULL;
2178        }
2179#endif
2180
2181        /* Clean up a referenced TCP bind bucket. */
2182        if (inet_csk(sk)->icsk_bind_hash)
2183                inet_put_port(sk);
2184
2185        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2186
2187        /* If socket is aborted during connect operation */
2188        tcp_free_fastopen_req(tp);
2189        tcp_fastopen_destroy_cipher(sk);
2190        tcp_saved_syn_free(tp);
2191
2192        sk_sockets_allocated_dec(sk);
2193}
2194EXPORT_SYMBOL(tcp_v4_destroy_sock);
2195
2196#ifdef CONFIG_PROC_FS
2197/* Proc filesystem TCP sock list dumping. */
2198
2199/*
2200 * Get the next listener socket following cur.  If cur is NULL, get the first socket
2201 * starting from bucket given in st->bucket; when st->bucket is zero the
2202 * very first socket in the hash table is returned.
2203 */
2204static void *listening_get_next(struct seq_file *seq, void *cur)
2205{
2206        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2207        struct tcp_iter_state *st = seq->private;
2208        struct net *net = seq_file_net(seq);
2209        struct inet_listen_hashbucket *ilb;
2210        struct hlist_nulls_node *node;
2211        struct sock *sk = cur;
2212
2213        if (!sk) {
2214get_head:
2215                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2216                spin_lock(&ilb->lock);
2217                sk = sk_nulls_head(&ilb->nulls_head);
2218                st->offset = 0;
2219                goto get_sk;
2220        }
2221        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2222        ++st->num;
2223        ++st->offset;
2224
2225        sk = sk_nulls_next(sk);
2226get_sk:
2227        sk_nulls_for_each_from(sk, node) {
2228                if (!net_eq(sock_net(sk), net))
2229                        continue;
2230                if (sk->sk_family == afinfo->family)
2231                        return sk;
2232        }
2233        spin_unlock(&ilb->lock);
2234        st->offset = 0;
2235        if (++st->bucket < INET_LHTABLE_SIZE)
2236                goto get_head;
2237        return NULL;
2238}
2239
2240static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2241{
2242        struct tcp_iter_state *st = seq->private;
2243        void *rc;
2244
2245        st->bucket = 0;
2246        st->offset = 0;
2247        rc = listening_get_next(seq, NULL);
2248
2249        while (rc && *pos) {
2250                rc = listening_get_next(seq, rc);
2251                --*pos;
2252        }
2253        return rc;
2254}
2255
2256static inline bool empty_bucket(const struct tcp_iter_state *st)
2257{
2258        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2259}
2260
2261/*
2262 * Get first established socket starting from bucket given in st->bucket.
2263 * If st->bucket is zero, the very first socket in the hash is returned.
2264 */
2265static void *established_get_first(struct seq_file *seq)
2266{
2267        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268        struct tcp_iter_state *st = seq->private;
2269        struct net *net = seq_file_net(seq);
2270        void *rc = NULL;
2271
2272        st->offset = 0;
2273        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2274                struct sock *sk;
2275                struct hlist_nulls_node *node;
2276                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2277
2278                /* Lockless fast path for the common case of empty buckets */
2279                if (empty_bucket(st))
2280                        continue;
2281
2282                spin_lock_bh(lock);
2283                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2284                        if (sk->sk_family != afinfo->family ||
2285                            !net_eq(sock_net(sk), net)) {
2286                                continue;
2287                        }
2288                        rc = sk;
2289                        goto out;
2290                }
2291                spin_unlock_bh(lock);
2292        }
2293out:
2294        return rc;
2295}
2296
2297static void *established_get_next(struct seq_file *seq, void *cur)
2298{
2299        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2300        struct sock *sk = cur;
2301        struct hlist_nulls_node *node;
2302        struct tcp_iter_state *st = seq->private;
2303        struct net *net = seq_file_net(seq);
2304
2305        ++st->num;
2306        ++st->offset;
2307
2308        sk = sk_nulls_next(sk);
2309
2310        sk_nulls_for_each_from(sk, node) {
2311                if (sk->sk_family == afinfo->family &&
2312                    net_eq(sock_net(sk), net))
2313                        return sk;
2314        }
2315
2316        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2317        ++st->bucket;
2318        return established_get_first(seq);
2319}
2320
2321static void *established_get_idx(struct seq_file *seq, loff_t pos)
2322{
2323        struct tcp_iter_state *st = seq->private;
2324        void *rc;
2325
2326        st->bucket = 0;
2327        rc = established_get_first(seq);
2328
2329        while (rc && pos) {
2330                rc = established_get_next(seq, rc);
2331                --pos;
2332        }
2333        return rc;
2334}
2335
2336static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2337{
2338        void *rc;
2339        struct tcp_iter_state *st = seq->private;
2340
2341        st->state = TCP_SEQ_STATE_LISTENING;
2342        rc        = listening_get_idx(seq, &pos);
2343
2344        if (!rc) {
2345                st->state = TCP_SEQ_STATE_ESTABLISHED;
2346                rc        = established_get_idx(seq, pos);
2347        }
2348
2349        return rc;
2350}
2351
2352static void *tcp_seek_last_pos(struct seq_file *seq)
2353{
2354        struct tcp_iter_state *st = seq->private;
2355        int offset = st->offset;
2356        int orig_num = st->num;
2357        void *rc = NULL;
2358
2359        switch (st->state) {
2360        case TCP_SEQ_STATE_LISTENING:
2361                if (st->bucket >= INET_LHTABLE_SIZE)
2362                        break;
2363                st->state = TCP_SEQ_STATE_LISTENING;
2364                rc = listening_get_next(seq, NULL);
2365                while (offset-- && rc)
2366                        rc = listening_get_next(seq, rc);
2367                if (rc)
2368                        break;
2369                st->bucket = 0;
2370                st->state = TCP_SEQ_STATE_ESTABLISHED;
2371                fallthrough;
2372        case TCP_SEQ_STATE_ESTABLISHED:
2373                if (st->bucket > tcp_hashinfo.ehash_mask)
2374                        break;
2375                rc = established_get_first(seq);
2376                while (offset-- && rc)
2377                        rc = established_get_next(seq, rc);
2378        }
2379
2380        st->num = orig_num;
2381
2382        return rc;
2383}
2384
2385void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2386{
2387        struct tcp_iter_state *st = seq->private;
2388        void *rc;
2389
2390        if (*pos && *pos == st->last_pos) {
2391                rc = tcp_seek_last_pos(seq);
2392                if (rc)
2393                        goto out;
2394        }
2395
2396        st->state = TCP_SEQ_STATE_LISTENING;
2397        st->num = 0;
2398        st->bucket = 0;
2399        st->offset = 0;
2400        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2401
2402out:
2403        st->last_pos = *pos;
2404        return rc;
2405}
2406EXPORT_SYMBOL(tcp_seq_start);
2407
2408void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2409{
2410        struct tcp_iter_state *st = seq->private;
2411        void *rc = NULL;
2412
2413        if (v == SEQ_START_TOKEN) {
2414                rc = tcp_get_idx(seq, 0);
2415                goto out;
2416        }
2417
2418        switch (st->state) {
2419        case TCP_SEQ_STATE_LISTENING:
2420                rc = listening_get_next(seq, v);
2421                if (!rc) {
2422                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2423                        st->bucket = 0;
2424                        st->offset = 0;
2425                        rc        = established_get_first(seq);
2426                }
2427                break;
2428        case TCP_SEQ_STATE_ESTABLISHED:
2429                rc = established_get_next(seq, v);
2430                break;
2431        }
2432out:
2433        ++*pos;
2434        st->last_pos = *pos;
2435        return rc;
2436}
2437EXPORT_SYMBOL(tcp_seq_next);
2438
2439void tcp_seq_stop(struct seq_file *seq, void *v)
2440{
2441        struct tcp_iter_state *st = seq->private;
2442
2443        switch (st->state) {
2444        case TCP_SEQ_STATE_LISTENING:
2445                if (v != SEQ_START_TOKEN)
2446                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2447                break;
2448        case TCP_SEQ_STATE_ESTABLISHED:
2449                if (v)
2450                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2451                break;
2452        }
2453}
2454EXPORT_SYMBOL(tcp_seq_stop);
2455
2456static void get_openreq4(const struct request_sock *req,
2457                         struct seq_file *f, int i)
2458{
2459        const struct inet_request_sock *ireq = inet_rsk(req);
2460        long delta = req->rsk_timer.expires - jiffies;
2461
2462        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2463                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2464                i,
2465                ireq->ir_loc_addr,
2466                ireq->ir_num,
2467                ireq->ir_rmt_addr,
2468                ntohs(ireq->ir_rmt_port),
2469                TCP_SYN_RECV,
2470                0, 0, /* could print option size, but that is af dependent. */
2471                1,    /* timers active (only the expire timer) */
2472                jiffies_delta_to_clock_t(delta),
2473                req->num_timeout,
2474                from_kuid_munged(seq_user_ns(f),
2475                                 sock_i_uid(req->rsk_listener)),
2476                0,  /* non standard timer */
2477                0, /* open_requests have no inode */
2478                0,
2479                req);
2480}
2481
2482static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2483{
2484        int timer_active;
2485        unsigned long timer_expires;
2486        const struct tcp_sock *tp = tcp_sk(sk);
2487        const struct inet_connection_sock *icsk = inet_csk(sk);
2488        const struct inet_sock *inet = inet_sk(sk);
2489        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2490        __be32 dest = inet->inet_daddr;
2491        __be32 src = inet->inet_rcv_saddr;
2492        __u16 destp = ntohs(inet->inet_dport);
2493        __u16 srcp = ntohs(inet->inet_sport);
2494        int rx_queue;
2495        int state;
2496
2497        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2498            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2499            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2500                timer_active    = 1;
2501                timer_expires   = icsk->icsk_timeout;
2502        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2503                timer_active    = 4;
2504                timer_expires   = icsk->icsk_timeout;
2505        } else if (timer_pending(&sk->sk_timer)) {
2506                timer_active    = 2;
2507                timer_expires   = sk->sk_timer.expires;
2508        } else {
2509                timer_active    = 0;
2510                timer_expires = jiffies;
2511        }
2512
2513        state = inet_sk_state_load(sk);
2514        if (state == TCP_LISTEN)
2515                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2516        else
2517                /* Because we don't lock the socket,
2518                 * we might find a transient negative value.
2519                 */
2520                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2521                                      READ_ONCE(tp->copied_seq), 0);
2522
2523        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2524                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2525                i, src, srcp, dest, destp, state,
2526                READ_ONCE(tp->write_seq) - tp->snd_una,
2527                rx_queue,
2528                timer_active,
2529                jiffies_delta_to_clock_t(timer_expires - jiffies),
2530                icsk->icsk_retransmits,
2531                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2532                icsk->icsk_probes_out,
2533                sock_i_ino(sk),
2534                refcount_read(&sk->sk_refcnt), sk,
2535                jiffies_to_clock_t(icsk->icsk_rto),
2536                jiffies_to_clock_t(icsk->icsk_ack.ato),
2537                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2538                tp->snd_cwnd,
2539                state == TCP_LISTEN ?
2540                    fastopenq->max_qlen :
2541                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2542}
2543
2544static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2545                               struct seq_file *f, int i)
2546{
2547        long delta = tw->tw_timer.expires - jiffies;
2548        __be32 dest, src;
2549        __u16 destp, srcp;
2550
2551        dest  = tw->tw_daddr;
2552        src   = tw->tw_rcv_saddr;
2553        destp = ntohs(tw->tw_dport);
2554        srcp  = ntohs(tw->tw_sport);
2555
2556        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2557                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2558                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2559                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2560                refcount_read(&tw->tw_refcnt), tw);
2561}
2562
2563#define TMPSZ 150
2564
2565static int tcp4_seq_show(struct seq_file *seq, void *v)
2566{
2567        struct tcp_iter_state *st;
2568        struct sock *sk = v;
2569
2570        seq_setwidth(seq, TMPSZ - 1);
2571        if (v == SEQ_START_TOKEN) {
2572                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2573                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2574                           "inode");
2575                goto out;
2576        }
2577        st = seq->private;
2578
2579        if (sk->sk_state == TCP_TIME_WAIT)
2580                get_timewait4_sock(v, seq, st->num);
2581        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2582                get_openreq4(v, seq, st->num);
2583        else
2584                get_tcp4_sock(v, seq, st->num);
2585out:
2586        seq_pad(seq, '\n');
2587        return 0;
2588}
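
/* Example (userspace sketch): consuming the table emitted above. The
 * addresses were printed as raw __be32 values with %08X, so they read
 * byte-swapped on little-endian machines; ports and the state byte were
 * printed from host-order values.
 *
 *	#include <stdio.h>
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *	unsigned int laddr, lport, raddr, rport, state;
 *
 *	fgets(line, sizeof(line), f);		// skip the header row
 *	while (fgets(line, sizeof(line), f)) {
 *		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
 *			   &laddr, &lport, &raddr, &rport, &state) == 5)
 *			printf("%08X:%04X -> %08X:%04X state %02X\n",
 *			       laddr, lport, raddr, rport, state);
 *	}
 *	fclose(f);
 */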
2589
2590static const struct seq_operations tcp4_seq_ops = {
2591        .show           = tcp4_seq_show,
2592        .start          = tcp_seq_start,
2593        .next           = tcp_seq_next,
2594        .stop           = tcp_seq_stop,
2595};
2596
2597static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2598        .family         = AF_INET,
2599};
2600
2601static int __net_init tcp4_proc_init_net(struct net *net)
2602{
2603        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2604                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2605                return -ENOMEM;
2606        return 0;
2607}
2608
2609static void __net_exit tcp4_proc_exit_net(struct net *net)
2610{
2611        remove_proc_entry("tcp", net->proc_net);
2612}
2613
2614static struct pernet_operations tcp4_net_ops = {
2615        .init = tcp4_proc_init_net,
2616        .exit = tcp4_proc_exit_net,
2617};
2618
2619int __init tcp4_proc_init(void)
2620{
2621        return register_pernet_subsys(&tcp4_net_ops);
2622}
2623
2624void tcp4_proc_exit(void)
2625{
2626        unregister_pernet_subsys(&tcp4_net_ops);
2627}
2628#endif /* CONFIG_PROC_FS */
2629
2630struct proto tcp_prot = {
2631        .name                   = "TCP",
2632        .owner                  = THIS_MODULE,
2633        .close                  = tcp_close,
2634        .pre_connect            = tcp_v4_pre_connect,
2635        .connect                = tcp_v4_connect,
2636        .disconnect             = tcp_disconnect,
2637        .accept                 = inet_csk_accept,
2638        .ioctl                  = tcp_ioctl,
2639        .init                   = tcp_v4_init_sock,
2640        .destroy                = tcp_v4_destroy_sock,
2641        .shutdown               = tcp_shutdown,
2642        .setsockopt             = tcp_setsockopt,
2643        .getsockopt             = tcp_getsockopt,
2644        .keepalive              = tcp_set_keepalive,
2645        .recvmsg                = tcp_recvmsg,
2646        .sendmsg                = tcp_sendmsg,
2647        .sendpage               = tcp_sendpage,
2648        .backlog_rcv            = tcp_v4_do_rcv,
2649        .release_cb             = tcp_release_cb,
2650        .hash                   = inet_hash,
2651        .unhash                 = inet_unhash,
2652        .get_port               = inet_csk_get_port,
2653        .enter_memory_pressure  = tcp_enter_memory_pressure,
2654        .leave_memory_pressure  = tcp_leave_memory_pressure,
2655        .stream_memory_free     = tcp_stream_memory_free,
2656        .sockets_allocated      = &tcp_sockets_allocated,
2657        .orphan_count           = &tcp_orphan_count,
2658        .memory_allocated       = &tcp_memory_allocated,
2659        .memory_pressure        = &tcp_memory_pressure,
2660        .sysctl_mem             = sysctl_tcp_mem,
2661        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2662        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2663        .max_header             = MAX_TCP_HEADER,
2664        .obj_size               = sizeof(struct tcp_sock),
2665        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2666        .twsk_prot              = &tcp_timewait_sock_ops,
2667        .rsk_prot               = &tcp_request_sock_ops,
2668        .h.hashinfo             = &tcp_hashinfo,
2669        .no_autobind            = true,
2670#ifdef CONFIG_COMPAT
2671        .compat_setsockopt      = compat_tcp_setsockopt,
2672        .compat_getsockopt      = compat_tcp_getsockopt,
2673#endif
2674        .diag_destroy           = tcp_abort,
2675};
2676EXPORT_SYMBOL(tcp_prot);
2677
2678static void __net_exit tcp_sk_exit(struct net *net)
2679{
2680        int cpu;
2681
2682        if (net->ipv4.tcp_congestion_control)
2683                bpf_module_put(net->ipv4.tcp_congestion_control,
2684                               net->ipv4.tcp_congestion_control->owner);
2685
2686        for_each_possible_cpu(cpu)
2687                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2688        free_percpu(net->ipv4.tcp_sk);
2689}
2690
2691static int __net_init tcp_sk_init(struct net *net)
2692{
2693        int res, cpu, cnt;
2694
2695        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2696        if (!net->ipv4.tcp_sk)
2697                return -ENOMEM;
2698
2699        for_each_possible_cpu(cpu) {
2700                struct sock *sk;
2701
2702                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2703                                           IPPROTO_TCP, net);
2704                if (res)
2705                        goto fail;
2706                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2707
2708                /* Please enforce IP_DF and IPID==0 for RST and
2709                 * ACK sent in SYN-RECV and TIME-WAIT state.
2710                 */
2711                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2712
2713                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2714        }
2715
2716        net->ipv4.sysctl_tcp_ecn = 2;
2717        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2718
2719        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2720        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2721        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2722        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2723        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2724
2725        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2726        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2727        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2728
2729        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2730        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2731        net->ipv4.sysctl_tcp_syncookies = 1;
2732        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2733        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2734        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2735        net->ipv4.sysctl_tcp_orphan_retries = 0;
2736        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2737        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2738        net->ipv4.sysctl_tcp_tw_reuse = 2;
2739        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2740
2741        cnt = tcp_hashinfo.ehash_mask + 1;
2742        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2743        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2744
2745        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2746        net->ipv4.sysctl_tcp_sack = 1;
2747        net->ipv4.sysctl_tcp_window_scaling = 1;
2748        net->ipv4.sysctl_tcp_timestamps = 1;
2749        net->ipv4.sysctl_tcp_early_retrans = 3;
2750        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2751        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2752        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2753        net->ipv4.sysctl_tcp_max_reordering = 300;
2754        net->ipv4.sysctl_tcp_dsack = 1;
2755        net->ipv4.sysctl_tcp_app_win = 31;
2756        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2757        net->ipv4.sysctl_tcp_frto = 2;
2758        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2759        /* This limits the percentage of the congestion window which we
2760         * will allow a single TSO frame to consume.  Building TSO frames
2761         * which are too large can cause TCP streams to be bursty.
2762         */
2763        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2764        /* Default TSQ limit of 16 TSO segments */
2765        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2766        /* rfc5961 challenge ack rate limiting */
2767        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2768        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2769        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2770        net->ipv4.sysctl_tcp_autocorking = 1;
2771        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2772        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2773        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2774        if (net != &init_net) {
2775                memcpy(net->ipv4.sysctl_tcp_rmem,
2776                       init_net.ipv4.sysctl_tcp_rmem,
2777                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2778                memcpy(net->ipv4.sysctl_tcp_wmem,
2779                       init_net.ipv4.sysctl_tcp_wmem,
2780                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2781        }
2782        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2783        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2784        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2785        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2786        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2787        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2788
2789        /* Reno is always built in */
2790        if (!net_eq(net, &init_net) &&
2791            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2792                               init_net.ipv4.tcp_congestion_control->owner))
2793                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2794        else
2795                net->ipv4.tcp_congestion_control = &tcp_reno;
2796
2797        return 0;
2798fail:
2799        tcp_sk_exit(net);
2800
2801        return res;
2802}
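
/* Example (userspace sketch; "fd" is an assumed TCP socket): the
 * per-namespace default congestion control chosen above (the inherited
 * module, or the always-built-in Reno) can be overridden per socket with
 * the TCP_CONGESTION option, handled in the tcp_setsockopt() path.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/tcp.h>	// TCP_CONGESTION
 *
 *	char name[16] = "reno";
 *	socklen_t len = sizeof(name);
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name));
 *	getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len);  // verify
 */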
2803
2804static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2805{
2806        struct net *net;
2807
2808        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2809
2810        list_for_each_entry(net, net_exit_list, exit_list)
2811                tcp_fastopen_ctx_destroy(net);
2812}
2813
2814static struct pernet_operations __net_initdata tcp_sk_ops = {
2815       .init       = tcp_sk_init,
2816       .exit       = tcp_sk_exit,
2817       .exit_batch = tcp_sk_exit_batch,
2818};
2819
2820void __init tcp_v4_init(void)
2821{
2822        if (register_pernet_subsys(&tcp_sk_ops))
2823                panic("Failed to create the TCP control socket.\n");
2824}
2825