linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
  43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45 *                                      a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
  86#ifdef CONFIG_TCP_MD5SIG
  87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89#endif
  90
  91struct inet_hashinfo tcp_hashinfo;
  92EXPORT_SYMBOL(tcp_hashinfo);
  93
  94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  95{
  96        return secure_tcp_seq(ip_hdr(skb)->daddr,
  97                              ip_hdr(skb)->saddr,
  98                              tcp_hdr(skb)->dest,
  99                              tcp_hdr(skb)->source);
 100}
 101
 102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 103{
 104        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 105}
 106
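/*
 * Editorial note (not taken from this file): conceptually, in the spirit of
 * RFC 6528, the initial sequence number is
 *
 *      ISN = M + F(saddr, daddr, sport, dport, secret)
 *
 * where M is a fine-grained clock and F is a keyed hash of the connection
 * 4-tuple and a per-boot secret, so unrelated connections get unrelated
 * sequence spaces while repeated connections on the same 4-tuple still see
 * a monotonically advancing ISN.  secure_tcp_seq() and secure_tcp_ts_off(),
 * declared in <net/secure_seq.h> included above, hide those details behind
 * the two small wrappers here.
 */
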
 107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108{
 109        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 110        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111        struct tcp_sock *tp = tcp_sk(sk);
 112        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 113
 114        if (reuse == 2) {
 115                /* Still does not detect *everything* that goes through
 116                 * lo, since we require a loopback src or dst address
 117                 * or direct binding to 'lo' interface.
 118                 */
 119                bool loopback = false;
 120                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 121                        loopback = true;
 122#if IS_ENABLED(CONFIG_IPV6)
 123                if (tw->tw_family == AF_INET6) {
 124                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 125                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 126                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 128                                loopback = true;
 129                } else
 130#endif
 131                {
 132                        if (ipv4_is_loopback(tw->tw_daddr) ||
 133                            ipv4_is_loopback(tw->tw_rcv_saddr))
 134                                loopback = true;
 135                }
 136                if (!loopback)
 137                        reuse = 0;
 138        }
 139
 140        /* With PAWS, it is safe from the viewpoint
 141           of data integrity. Even without PAWS it is safe provided sequence
  142           spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
  143
  144           Actually, the idea is close to VJ's, only the timestamp cache is
  145           held not per host but per port pair, and the TW bucket is used as
  146           the state holder.
  147
  148           If the TW bucket has already been destroyed we fall back to VJ's
  149           scheme and use the initial timestamp retrieved from the peer table.
 150         */
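        /*
         * Editorial back-of-the-envelope check for the 80 Mbit/sec figure
         * above: the 32-bit sequence space wraps after 2^32 bytes, and at
         * 80 Mbit/sec (10 Mbyte/sec) that takes 2^32 / 10^7 ~= 429 seconds,
         * comfortably more than 2 * MSL (240 seconds with the classical
         * 2-minute MSL), so old duplicates die out before the sequence
         * space can be reused.
         */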
 151        if (tcptw->tw_ts_recent_stamp &&
 152            (!twp || (reuse && time_after32(ktime_get_seconds(),
 153                                            tcptw->tw_ts_recent_stamp)))) {
 154                /* In case of repair and re-using TIME-WAIT sockets we still
 155                 * want to be sure that it is safe as above but honor the
 156                 * sequence numbers and time stamps set as part of the repair
 157                 * process.
 158                 *
 159                 * Without this check re-using a TIME-WAIT socket with TCP
 160                 * repair would accumulate a -1 on the repair assigned
 161                 * sequence number. The first time it is reused the sequence
 162                 * is -1, the second time -2, etc. This fixes that issue
 163                 * without appearing to create any others.
 164                 */
 165                if (likely(!tp->repair)) {
 166                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 167
 168                        if (!seq)
 169                                seq = 1;
 170                        WRITE_ONCE(tp->write_seq, seq);
 171                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 172                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 173                }
 174                sock_hold(sktw);
 175                return 1;
 176        }
 177
 178        return 0;
 179}
 180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
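
/*
 * Editorial note on the sysctl driving the reuse check above (see
 * Documentation/networking/ip-sysctl.rst): net.ipv4.tcp_tw_reuse is
 * 0 (never reuse TIME-WAIT sockets for new outgoing connections),
 * 1 (reuse whenever it is safe per the timestamp check above), or
 * 2 (reuse only for loopback traffic; the current default), e.g.
 *
 *      # sysctl -w net.ipv4.tcp_tw_reuse=2
 */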
 181
 182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 183                              int addr_len)
 184{
 185        /* This check is replicated from tcp_v4_connect() and intended to
 186         * prevent BPF program called below from accessing bytes that are out
 187         * of the bound specified by user in addr_len.
 188         */
 189        if (addr_len < sizeof(struct sockaddr_in))
 190                return -EINVAL;
 191
 192        sock_owned_by_me(sk);
 193
 194        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 195}
 196
 197/* This will initiate an outgoing connection. */
 198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 199{
 200        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 201        struct inet_sock *inet = inet_sk(sk);
 202        struct tcp_sock *tp = tcp_sk(sk);
 203        __be16 orig_sport, orig_dport;
 204        __be32 daddr, nexthop;
 205        struct flowi4 *fl4;
 206        struct rtable *rt;
 207        int err;
 208        struct ip_options_rcu *inet_opt;
 209        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 210
 211        if (addr_len < sizeof(struct sockaddr_in))
 212                return -EINVAL;
 213
 214        if (usin->sin_family != AF_INET)
 215                return -EAFNOSUPPORT;
 216
 217        nexthop = daddr = usin->sin_addr.s_addr;
 218        inet_opt = rcu_dereference_protected(inet->inet_opt,
 219                                             lockdep_sock_is_held(sk));
 220        if (inet_opt && inet_opt->opt.srr) {
 221                if (!daddr)
 222                        return -EINVAL;
 223                nexthop = inet_opt->opt.faddr;
 224        }
 225
 226        orig_sport = inet->inet_sport;
 227        orig_dport = usin->sin_port;
 228        fl4 = &inet->cork.fl.u.ip4;
 229        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 230                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 231                              IPPROTO_TCP,
 232                              orig_sport, orig_dport, sk);
 233        if (IS_ERR(rt)) {
 234                err = PTR_ERR(rt);
 235                if (err == -ENETUNREACH)
 236                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 237                return err;
 238        }
 239
 240        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 241                ip_rt_put(rt);
 242                return -ENETUNREACH;
 243        }
 244
 245        if (!inet_opt || !inet_opt->opt.srr)
 246                daddr = fl4->daddr;
 247
 248        if (!inet->inet_saddr)
 249                inet->inet_saddr = fl4->saddr;
 250        sk_rcv_saddr_set(sk, inet->inet_saddr);
 251
 252        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 253                /* Reset inherited state */
 254                tp->rx_opt.ts_recent       = 0;
 255                tp->rx_opt.ts_recent_stamp = 0;
 256                if (likely(!tp->repair))
 257                        WRITE_ONCE(tp->write_seq, 0);
 258        }
 259
 260        inet->inet_dport = usin->sin_port;
 261        sk_daddr_set(sk, daddr);
 262
 263        inet_csk(sk)->icsk_ext_hdr_len = 0;
 264        if (inet_opt)
 265                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 266
 267        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 268
 269        /* Socket identity is still unknown (sport may be zero).
  270         * However we set state to SYN-SENT and, without releasing the socket
  271         * lock, select a source port, enter ourselves into the hash tables and
 272         * complete initialization after this.
 273         */
 274        tcp_set_state(sk, TCP_SYN_SENT);
 275        err = inet_hash_connect(tcp_death_row, sk);
 276        if (err)
 277                goto failure;
 278
 279        sk_set_txhash(sk);
 280
 281        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 282                               inet->inet_sport, inet->inet_dport, sk);
 283        if (IS_ERR(rt)) {
 284                err = PTR_ERR(rt);
 285                rt = NULL;
 286                goto failure;
 287        }
 288        /* OK, now commit destination to socket.  */
 289        sk->sk_gso_type = SKB_GSO_TCPV4;
 290        sk_setup_caps(sk, &rt->dst);
 291        rt = NULL;
 292
 293        if (likely(!tp->repair)) {
 294                if (!tp->write_seq)
 295                        WRITE_ONCE(tp->write_seq,
 296                                   secure_tcp_seq(inet->inet_saddr,
 297                                                  inet->inet_daddr,
 298                                                  inet->inet_sport,
 299                                                  usin->sin_port));
 300                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 301                                                 inet->inet_saddr,
 302                                                 inet->inet_daddr);
 303        }
 304
 305        inet->inet_id = prandom_u32();
 306
 307        if (tcp_fastopen_defer_connect(sk, &err))
 308                return err;
 309        if (err)
 310                goto failure;
 311
 312        err = tcp_connect(sk);
 313
 314        if (err)
 315                goto failure;
 316
 317        return 0;
 318
 319failure:
 320        /*
 321         * This unhashes the socket and releases the local port,
 322         * if necessary.
 323         */
 324        tcp_set_state(sk, TCP_CLOSE);
 325        ip_rt_put(rt);
 326        sk->sk_route_caps = 0;
 327        inet->inet_dport = 0;
 328        return err;
 329}
 330EXPORT_SYMBOL(tcp_v4_connect);
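
/*
 * Editorial usage sketch (userspace, not part of this file): a plain
 * connect() on an AF_INET stream socket is what ultimately reaches
 * tcp_v4_connect() above, e.g.:
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      struct sockaddr_in dst = {
 *              .sin_family = AF_INET,
 *              .sin_port   = htons(80),
 *      };
 *
 *      inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *      connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */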
 331
 332/*
 333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  334 * It can be called through tcp_release_cb() if the socket was owned by the user
  335 * at the time tcp_v4_err() was called to handle the ICMP message.
 336 */
 337void tcp_v4_mtu_reduced(struct sock *sk)
 338{
 339        struct inet_sock *inet = inet_sk(sk);
 340        struct dst_entry *dst;
 341        u32 mtu;
 342
 343        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 344                return;
 345        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 346        dst = inet_csk_update_pmtu(sk, mtu);
 347        if (!dst)
 348                return;
 349
  350        /* Something is about to go wrong... Remember the soft error
  351         * in case this connection is not able to recover.
 352         */
 353        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 354                sk->sk_err_soft = EMSGSIZE;
 355
 356        mtu = dst_mtu(dst);
 357
 358        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 359            ip_sk_accept_pmtu(sk) &&
 360            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 361                tcp_sync_mss(sk, mtu);
 362
 363                /* Resend the TCP packet because it's
 364                 * clear that the old packet has been
 365                 * dropped. This is the new "fast" path mtu
 366                 * discovery.
 367                 */
 368                tcp_simple_retransmit(sk);
 369        } /* else let the usual retransmit timer handle it */
 370}
 371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 372
 373static void do_redirect(struct sk_buff *skb, struct sock *sk)
 374{
 375        struct dst_entry *dst = __sk_dst_check(sk, 0);
 376
 377        if (dst)
 378                dst->ops->redirect(dst, sk, skb);
 379}
 380
 381
 382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 384{
 385        struct request_sock *req = inet_reqsk(sk);
 386        struct net *net = sock_net(sk);
 387
 388        /* ICMPs are not backlogged, hence we cannot get
 389         * an established socket here.
 390         */
 391        if (seq != tcp_rsk(req)->snt_isn) {
 392                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 393        } else if (abort) {
 394                /*
 395                 * Still in SYN_RECV, just remove it silently.
 396                 * There is no good way to pass the error to the newly
 397                 * created socket, and POSIX does not want network
 398                 * errors returned from accept().
 399                 */
 400                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 401                tcp_listendrop(req->rsk_listener);
 402        }
 403        reqsk_put(req);
 404}
 405EXPORT_SYMBOL(tcp_req_err);
 406
 407/* TCP-LD (RFC 6069) logic */
 408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 409{
 410        struct inet_connection_sock *icsk = inet_csk(sk);
 411        struct tcp_sock *tp = tcp_sk(sk);
 412        struct sk_buff *skb;
 413        s32 remaining;
 414        u32 delta_us;
 415
 416        if (sock_owned_by_user(sk))
 417                return;
 418
 419        if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 420            !icsk->icsk_backoff)
 421                return;
 422
 423        skb = tcp_rtx_queue_head(sk);
 424        if (WARN_ON_ONCE(!skb))
 425                return;
 426
 427        icsk->icsk_backoff--;
 428        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 429        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 430
 431        tcp_mstamp_refresh(tp);
 432        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 433        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 434
 435        if (remaining > 0) {
 436                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 437                                          remaining, TCP_RTO_MAX);
 438        } else {
 439                /* RTO revert clocked out retransmission.
 440                 * Will retransmit now.
 441                 */
 442                tcp_retransmit_timer(sk);
 443        }
 444}
 445EXPORT_SYMBOL(tcp_ld_RTO_revert);
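
/*
 * Editorial sketch of the arithmetic above (an interpretation, not taken
 * from any header): with exponential backoff the pending retransmission
 * timeout is roughly rto << icsk_backoff, so e.g. a 200 ms base RTO backed
 * off three times waits about 1600 ms.  When an ICMP unreachable suggests
 * the losses were due to a temporary routing problem (RFC 6069), one
 * backoff level is undone and the timer is re-armed with whatever would
 * remain of the smaller timeout; if nothing remains, the retransmission
 * fires immediately.
 */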
 446
 447/*
 448 * This routine is called by the ICMP module when it gets some
 449 * sort of error condition.  If err < 0 then the socket should
 450 * be closed and the error returned to the user.  If err > 0
 451 * it's just the icmp type << 8 | icmp code.  After adjustment
 452 * header points to the first 8 bytes of the tcp header.  We need
 453 * to find the appropriate port.
 454 *
 455 * The locking strategy used here is very "optimistic". When
 456 * someone else accesses the socket the ICMP is just dropped
 457 * and for some paths there is no check at all.
 458 * A more general error queue to queue errors for later handling
 459 * is probably better.
 460 *
 461 */
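
/*
 * Editorial example of the encoding mentioned above: an ICMP "destination
 * unreachable, port unreachable" message is type 3, code 3, so the positive
 * error value would be (3 << 8) | 3 == 0x303 before it is mapped to an
 * errno such as ECONNREFUSED via icmp_err_convert[].
 */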
 462
 463int tcp_v4_err(struct sk_buff *skb, u32 info)
 464{
 465        const struct iphdr *iph = (const struct iphdr *)skb->data;
 466        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 467        struct tcp_sock *tp;
 468        struct inet_sock *inet;
 469        const int type = icmp_hdr(skb)->type;
 470        const int code = icmp_hdr(skb)->code;
 471        struct sock *sk;
 472        struct request_sock *fastopen;
 473        u32 seq, snd_una;
 474        int err;
 475        struct net *net = dev_net(skb->dev);
 476
 477        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 478                                       th->dest, iph->saddr, ntohs(th->source),
 479                                       inet_iif(skb), 0);
 480        if (!sk) {
 481                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 482                return -ENOENT;
 483        }
 484        if (sk->sk_state == TCP_TIME_WAIT) {
 485                inet_twsk_put(inet_twsk(sk));
 486                return 0;
 487        }
 488        seq = ntohl(th->seq);
 489        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 490                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 491                                     type == ICMP_TIME_EXCEEDED ||
 492                                     (type == ICMP_DEST_UNREACH &&
 493                                      (code == ICMP_NET_UNREACH ||
 494                                       code == ICMP_HOST_UNREACH)));
 495                return 0;
 496        }
 497
 498        bh_lock_sock(sk);
 499        /* If too many ICMPs get dropped on busy
 500         * servers this needs to be solved differently.
  501         * We do take care of the PMTU discovery (RFC1191) special case:
  502         * we can receive locally generated ICMP messages while the socket is held.
 503         */
 504        if (sock_owned_by_user(sk)) {
 505                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 506                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 507        }
 508        if (sk->sk_state == TCP_CLOSE)
 509                goto out;
 510
 511        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 512                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 513                goto out;
 514        }
 515
 516        tp = tcp_sk(sk);
 517        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 518        fastopen = rcu_dereference(tp->fastopen_rsk);
 519        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 520        if (sk->sk_state != TCP_LISTEN &&
 521            !between(seq, snd_una, tp->snd_nxt)) {
 522                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 523                goto out;
 524        }
 525
 526        switch (type) {
 527        case ICMP_REDIRECT:
 528                if (!sock_owned_by_user(sk))
 529                        do_redirect(skb, sk);
 530                goto out;
 531        case ICMP_SOURCE_QUENCH:
 532                /* Just silently ignore these. */
 533                goto out;
 534        case ICMP_PARAMETERPROB:
 535                err = EPROTO;
 536                break;
 537        case ICMP_DEST_UNREACH:
 538                if (code > NR_ICMP_UNREACH)
 539                        goto out;
 540
 541                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 542                        /* We are not interested in TCP_LISTEN and open_requests
  543                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 544                         * they should go through unfragmented).
 545                         */
 546                        if (sk->sk_state == TCP_LISTEN)
 547                                goto out;
 548
 549                        WRITE_ONCE(tp->mtu_info, info);
 550                        if (!sock_owned_by_user(sk)) {
 551                                tcp_v4_mtu_reduced(sk);
 552                        } else {
 553                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 554                                        sock_hold(sk);
 555                        }
 556                        goto out;
 557                }
 558
 559                err = icmp_err_convert[code].errno;
 560                /* check if this ICMP message allows revert of backoff.
 561                 * (see RFC 6069)
 562                 */
 563                if (!fastopen &&
 564                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 565                        tcp_ld_RTO_revert(sk, seq);
 566                break;
 567        case ICMP_TIME_EXCEEDED:
 568                err = EHOSTUNREACH;
 569                break;
 570        default:
 571                goto out;
 572        }
 573
 574        switch (sk->sk_state) {
 575        case TCP_SYN_SENT:
 576        case TCP_SYN_RECV:
 577                /* Only in fast or simultaneous open. If a fast open socket is
 578                 * already accepted it is treated as a connected one below.
 579                 */
 580                if (fastopen && !fastopen->sk)
 581                        break;
 582
 583                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 584
 585                if (!sock_owned_by_user(sk)) {
 586                        sk->sk_err = err;
 587
 588                        sk_error_report(sk);
 589
 590                        tcp_done(sk);
 591                } else {
 592                        sk->sk_err_soft = err;
 593                }
 594                goto out;
 595        }
 596
 597        /* If we've already connected we will keep trying
 598         * until we time out, or the user gives up.
 599         *
  600         * rfc1122 4.2.3.9 allows us to consider as hard errors
 601         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 602         * but it is obsoleted by pmtu discovery).
 603         *
  604         * Note that in the modern internet, where routing is unreliable
  605         * and broken firewalls sit in every dark corner sending random
  606         * errors as ordered by their masters, even these two messages finally
  607         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
 608         *
 609         * Now we are in compliance with RFCs.
 610         *                                                      --ANK (980905)
 611         */
 612
 613        inet = inet_sk(sk);
 614        if (!sock_owned_by_user(sk) && inet->recverr) {
 615                sk->sk_err = err;
 616                sk_error_report(sk);
 617        } else  { /* Only an error on timeout */
 618                sk->sk_err_soft = err;
 619        }
 620
 621out:
 622        bh_unlock_sock(sk);
 623        sock_put(sk);
 624        return 0;
 625}
 626
 627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 628{
 629        struct tcphdr *th = tcp_hdr(skb);
 630
 631        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 632        skb->csum_start = skb_transport_header(skb) - skb->head;
 633        skb->csum_offset = offsetof(struct tcphdr, check);
 634}
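
/*
 * Editorial note: the partial checksum computed above covers only the
 * pseudo-header (addresses, protocol, length).  csum_start/csum_offset
 * tell the NIC, or the software fallback for CHECKSUM_PARTIAL, where to
 * start folding the rest of the segment and where to store the final
 * 16-bit one's complement result inside the TCP header.
 */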
 635
 636/* This routine computes an IPv4 TCP checksum. */
 637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 638{
 639        const struct inet_sock *inet = inet_sk(sk);
 640
 641        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 642}
 643EXPORT_SYMBOL(tcp_v4_send_check);
 644
 645/*
 646 *      This routine will send an RST to the other tcp.
 647 *
  648 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  649 *                    for the reset?
  650 *      Answer: if a packet caused an RST, it is not for a socket
  651 *              existing in our system; if it is matched to a socket,
  652 *              it is just a duplicate segment or a bug in the other side's TCP.
  653 *              So we build the reply based only on the parameters that
  654 *              arrived with the segment.
 655 *      Exception: precedence violation. We do not implement it in any case.
 656 */
 657
 658#ifdef CONFIG_TCP_MD5SIG
 659#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 660#else
 661#define OPTION_BYTES sizeof(__be32)
 662#endif
 663
 664static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 665{
 666        const struct tcphdr *th = tcp_hdr(skb);
 667        struct {
 668                struct tcphdr th;
 669                __be32 opt[OPTION_BYTES / sizeof(__be32)];
 670        } rep;
 671        struct ip_reply_arg arg;
 672#ifdef CONFIG_TCP_MD5SIG
 673        struct tcp_md5sig_key *key = NULL;
 674        const __u8 *hash_location = NULL;
 675        unsigned char newhash[16];
 676        int genhash;
 677        struct sock *sk1 = NULL;
 678#endif
 679        u64 transmit_time = 0;
 680        struct sock *ctl_sk;
 681        struct net *net;
 682
 683        /* Never send a reset in response to a reset. */
 684        if (th->rst)
 685                return;
 686
  687        /* If sk is not NULL, it means we did a successful lookup and the
  688         * incoming route had to be correct. prequeue might have dropped our dst.
 689         */
 690        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 691                return;
 692
 693        /* Swap the send and the receive. */
 694        memset(&rep, 0, sizeof(rep));
 695        rep.th.dest   = th->source;
 696        rep.th.source = th->dest;
 697        rep.th.doff   = sizeof(struct tcphdr) / 4;
 698        rep.th.rst    = 1;
 699
 700        if (th->ack) {
 701                rep.th.seq = th->ack_seq;
 702        } else {
 703                rep.th.ack = 1;
 704                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 705                                       skb->len - (th->doff << 2));
 706        }
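
        /*
         * Editorial worked example, following RFC 793 reset generation: for
         * a stray SYN with SEG.SEQ = 1000 and no payload, the else branch
         * above yields ack_seq = 1001 (the SYN occupies one sequence
         * number) while rep.th.seq stays 0, i.e. <SEQ=0><ACK=1001><RST,ACK>.
         */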
 707
 708        memset(&arg, 0, sizeof(arg));
 709        arg.iov[0].iov_base = (unsigned char *)&rep;
 710        arg.iov[0].iov_len  = sizeof(rep.th);
 711
 712        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 713#ifdef CONFIG_TCP_MD5SIG
 714        rcu_read_lock();
 715        hash_location = tcp_parse_md5sig_option(th);
 716        if (sk && sk_fullsock(sk)) {
 717                const union tcp_md5_addr *addr;
 718                int l3index;
 719
  720                /* sdif set means the packet ingressed via a device
 721                 * in an L3 domain and inet_iif is set to it.
 722                 */
 723                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 724                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 725                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 726        } else if (hash_location) {
 727                const union tcp_md5_addr *addr;
 728                int sdif = tcp_v4_sdif(skb);
 729                int dif = inet_iif(skb);
 730                int l3index;
 731
 732                /*
  733                 * The active side is lost. Try to find the listening socket
  734                 * through the source port, and then find the md5 key through
  735                 * that listening socket. We do not lose security here:
  736                 * the incoming packet is checked against the md5 hash of the
  737                 * found key, and no RST is generated if the md5 hash doesn't match.
 738                 */
 739                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 740                                             ip_hdr(skb)->saddr,
 741                                             th->source, ip_hdr(skb)->daddr,
 742                                             ntohs(th->source), dif, sdif);
  743                /* don't send an RST if we can't find a key */
 744                if (!sk1)
 745                        goto out;
 746
  747                /* sdif set means the packet ingressed via a device
 748                 * in an L3 domain and dif is set to it.
 749                 */
 750                l3index = sdif ? dif : 0;
 751                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 752                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 753                if (!key)
 754                        goto out;
 755
 756
 757                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 758                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 759                        goto out;
 760
 761        }
 762
 763        if (key) {
 764                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 765                                   (TCPOPT_NOP << 16) |
 766                                   (TCPOPT_MD5SIG << 8) |
 767                                   TCPOLEN_MD5SIG);
 768                /* Update length and the length the header thinks exists */
 769                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 770                rep.th.doff = arg.iov[0].iov_len / 4;
 771
 772                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 773                                     key, ip_hdr(skb)->saddr,
 774                                     ip_hdr(skb)->daddr, &rep.th);
 775        }
 776#endif
 777        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 778        if (rep.opt[0] == 0) {
 779                __be32 mrst = mptcp_reset_option(skb);
 780
 781                if (mrst) {
 782                        rep.opt[0] = mrst;
 783                        arg.iov[0].iov_len += sizeof(mrst);
 784                        rep.th.doff = arg.iov[0].iov_len / 4;
 785                }
 786        }
 787
 788        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 789                                      ip_hdr(skb)->saddr, /* XXX */
 790                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 791        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 792        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 793
  794        /* When the socket is gone, all binding information is lost and
  795         * routing might fail in this case. No choice here: if we chose to force
  796         * the input interface, we would misroute in the case of an asymmetric route.
 797         */
 798        if (sk) {
 799                arg.bound_dev_if = sk->sk_bound_dev_if;
 800                if (sk_fullsock(sk))
 801                        trace_tcp_send_reset(sk, skb);
 802        }
 803
 804        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 805                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 806
 807        arg.tos = ip_hdr(skb)->tos;
 808        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 809        local_bh_disable();
 810        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 811        if (sk) {
 812                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 813                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 814                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 815                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
 816                transmit_time = tcp_transmit_time(sk);
 817        }
 818        ip_send_unicast_reply(ctl_sk,
 819                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 820                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 821                              &arg, arg.iov[0].iov_len,
 822                              transmit_time);
 823
 824        ctl_sk->sk_mark = 0;
 825        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 826        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 827        local_bh_enable();
 828
 829#ifdef CONFIG_TCP_MD5SIG
 830out:
 831        rcu_read_unlock();
 832#endif
 833}
 834
  835/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  836   outside of socket context, is certainly ugly. What can I do?
 837 */
 838
 839static void tcp_v4_send_ack(const struct sock *sk,
 840                            struct sk_buff *skb, u32 seq, u32 ack,
 841                            u32 win, u32 tsval, u32 tsecr, int oif,
 842                            struct tcp_md5sig_key *key,
 843                            int reply_flags, u8 tos)
 844{
 845        const struct tcphdr *th = tcp_hdr(skb);
 846        struct {
 847                struct tcphdr th;
 848                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 849#ifdef CONFIG_TCP_MD5SIG
 850                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 851#endif
 852                        ];
 853        } rep;
 854        struct net *net = sock_net(sk);
 855        struct ip_reply_arg arg;
 856        struct sock *ctl_sk;
 857        u64 transmit_time;
 858
 859        memset(&rep.th, 0, sizeof(struct tcphdr));
 860        memset(&arg, 0, sizeof(arg));
 861
 862        arg.iov[0].iov_base = (unsigned char *)&rep;
 863        arg.iov[0].iov_len  = sizeof(rep.th);
 864        if (tsecr) {
 865                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 866                                   (TCPOPT_TIMESTAMP << 8) |
 867                                   TCPOLEN_TIMESTAMP);
 868                rep.opt[1] = htonl(tsval);
 869                rep.opt[2] = htonl(tsecr);
 870                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 871        }
 872
 873        /* Swap the send and the receive. */
 874        rep.th.dest    = th->source;
 875        rep.th.source  = th->dest;
 876        rep.th.doff    = arg.iov[0].iov_len / 4;
 877        rep.th.seq     = htonl(seq);
 878        rep.th.ack_seq = htonl(ack);
 879        rep.th.ack     = 1;
 880        rep.th.window  = htons(win);
 881
 882#ifdef CONFIG_TCP_MD5SIG
 883        if (key) {
 884                int offset = (tsecr) ? 3 : 0;
 885
 886                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 887                                          (TCPOPT_NOP << 16) |
 888                                          (TCPOPT_MD5SIG << 8) |
 889                                          TCPOLEN_MD5SIG);
 890                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 891                rep.th.doff = arg.iov[0].iov_len/4;
 892
 893                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 894                                    key, ip_hdr(skb)->saddr,
 895                                    ip_hdr(skb)->daddr, &rep.th);
 896        }
 897#endif
 898        arg.flags = reply_flags;
 899        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 900                                      ip_hdr(skb)->saddr, /* XXX */
 901                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 902        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 903        if (oif)
 904                arg.bound_dev_if = oif;
 905        arg.tos = tos;
 906        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 907        local_bh_disable();
 908        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 909        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 910                           inet_twsk(sk)->tw_mark : sk->sk_mark;
 911        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 912                           inet_twsk(sk)->tw_priority : sk->sk_priority;
 913        transmit_time = tcp_transmit_time(sk);
 914        ip_send_unicast_reply(ctl_sk,
 915                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 916                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 917                              &arg, arg.iov[0].iov_len,
 918                              transmit_time);
 919
 920        ctl_sk->sk_mark = 0;
 921        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 922        local_bh_enable();
 923}
 924
 925static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 926{
 927        struct inet_timewait_sock *tw = inet_twsk(sk);
 928        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 929
 930        tcp_v4_send_ack(sk, skb,
 931                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 932                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 933                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 934                        tcptw->tw_ts_recent,
 935                        tw->tw_bound_dev_if,
 936                        tcp_twsk_md5_key(tcptw),
 937                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 938                        tw->tw_tos
 939                        );
 940
 941        inet_twsk_put(tw);
 942}
 943
 944static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 945                                  struct request_sock *req)
 946{
 947        const union tcp_md5_addr *addr;
 948        int l3index;
 949
 950        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 951         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 952         */
 953        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 954                                             tcp_sk(sk)->snd_nxt;
 955
 956        /* RFC 7323 2.3
 957         * The window field (SEG.WND) of every outgoing segment, with the
 958         * exception of <SYN> segments, MUST be right-shifted by
 959         * Rcv.Wind.Shift bits:
 960         */
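        /*
         * Editorial example: with req->rsk_rcv_wnd of 131072 bytes and
         * rcv_wscale = 2, the 16-bit window field below carries
         * 131072 >> 2 = 32768.
         */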
 961        addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 962        l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 963        tcp_v4_send_ack(sk, skb, seq,
 964                        tcp_rsk(req)->rcv_nxt,
 965                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 966                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 967                        req->ts_recent,
 968                        0,
 969                        tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 970                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 971                        ip_hdr(skb)->tos);
 972}
 973
 974/*
 975 *      Send a SYN-ACK after having received a SYN.
 976 *      This still operates on a request_sock only, not on a big
 977 *      socket.
 978 */
 979static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 980                              struct flowi *fl,
 981                              struct request_sock *req,
 982                              struct tcp_fastopen_cookie *foc,
 983                              enum tcp_synack_type synack_type,
 984                              struct sk_buff *syn_skb)
 985{
 986        const struct inet_request_sock *ireq = inet_rsk(req);
 987        struct flowi4 fl4;
 988        int err = -1;
 989        struct sk_buff *skb;
 990        u8 tos;
 991
 992        /* First, grab a route. */
 993        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 994                return -1;
 995
 996        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 997
 998        if (skb) {
 999                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
1001                tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                inet_sk(sk)->tos;
1005
1006                if (!INET_ECN_is_capable(tos) &&
1007                    tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                        tos |= INET_ECN_ECT_0;
1009
1010                rcu_read_lock();
1011                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                            ireq->ir_rmt_addr,
1013                                            rcu_dereference(ireq->ireq_opt),
1014                                            tos);
1015                rcu_read_unlock();
1016                err = net_xmit_eval(err);
1017        }
1018
1019        return err;
1020}
1021
1022/*
1023 *      IPv4 request_sock destructor.
1024 */
1025static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026{
1027        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028}
1029
1030#ifdef CONFIG_TCP_MD5SIG
1031/*
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1035 */
1036
1037DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038EXPORT_SYMBOL(tcp_md5_needed);
1039
1040/* Find the Key structure for an address.  */
1041struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042                                           const union tcp_md5_addr *addr,
1043                                           int family)
1044{
1045        const struct tcp_sock *tp = tcp_sk(sk);
1046        struct tcp_md5sig_key *key;
1047        const struct tcp_md5sig_info *md5sig;
1048        __be32 mask;
1049        struct tcp_md5sig_key *best_match = NULL;
1050        bool match;
1051
1052        /* caller either holds rcu_read_lock() or socket lock */
1053        md5sig = rcu_dereference_check(tp->md5sig_info,
1054                                       lockdep_sock_is_held(sk));
1055        if (!md5sig)
1056                return NULL;
1057
1058        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059                                 lockdep_sock_is_held(sk)) {
1060                if (key->family != family)
1061                        continue;
1062                if (key->l3index && key->l3index != l3index)
1063                        continue;
1064                if (family == AF_INET) {
1065                        mask = inet_make_mask(key->prefixlen);
1066                        match = (key->addr.a4.s_addr & mask) ==
1067                                (addr->a4.s_addr & mask);
1068#if IS_ENABLED(CONFIG_IPV6)
1069                } else if (family == AF_INET6) {
1070                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071                                                  key->prefixlen);
1072#endif
1073                } else {
1074                        match = false;
1075                }
1076
1077                if (match && (!best_match ||
1078                              key->prefixlen > best_match->prefixlen))
1079                        best_match = key;
1080        }
1081        return best_match;
1082}
1083EXPORT_SYMBOL(__tcp_md5_do_lookup);
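
/*
 * Editorial example of the longest-prefix selection above: with keys
 * configured for 10.0.0.0/8 and 10.1.2.0/24, a peer address of 10.1.2.3
 * matches both, and the /24 entry wins because best_match is replaced
 * whenever a matching key has a larger prefixlen.
 */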
1084
1085static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086                                                      const union tcp_md5_addr *addr,
1087                                                      int family, u8 prefixlen,
1088                                                      int l3index)
1089{
1090        const struct tcp_sock *tp = tcp_sk(sk);
1091        struct tcp_md5sig_key *key;
1092        unsigned int size = sizeof(struct in_addr);
1093        const struct tcp_md5sig_info *md5sig;
1094
1095        /* caller either holds rcu_read_lock() or socket lock */
1096        md5sig = rcu_dereference_check(tp->md5sig_info,
1097                                       lockdep_sock_is_held(sk));
1098        if (!md5sig)
1099                return NULL;
1100#if IS_ENABLED(CONFIG_IPV6)
1101        if (family == AF_INET6)
1102                size = sizeof(struct in6_addr);
1103#endif
1104        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105                                 lockdep_sock_is_held(sk)) {
1106                if (key->family != family)
1107                        continue;
1108                if (key->l3index && key->l3index != l3index)
1109                        continue;
1110                if (!memcmp(&key->addr, addr, size) &&
1111                    key->prefixlen == prefixlen)
1112                        return key;
1113        }
1114        return NULL;
1115}
1116
1117struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118                                         const struct sock *addr_sk)
1119{
1120        const union tcp_md5_addr *addr;
1121        int l3index;
1122
1123        l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124                                                 addr_sk->sk_bound_dev_if);
1125        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126        return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127}
1128EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129
1130/* This can be called on a newly created socket, from other files */
1131int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132                   int family, u8 prefixlen, int l3index,
1133                   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134{
1135        /* Add Key to the list */
1136        struct tcp_md5sig_key *key;
1137        struct tcp_sock *tp = tcp_sk(sk);
1138        struct tcp_md5sig_info *md5sig;
1139
1140        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141        if (key) {
1142                /* Pre-existing entry - just update that one.
1143                 * Note that the key might be used concurrently.
 1144                 * data_race() is telling kcsan that we do not care about
 1145                 * key mismatches, since changing the MD5 key on live flows
1146                 * can lead to packet drops.
1147                 */
1148                data_race(memcpy(key->key, newkey, newkeylen));
1149
1150                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151                 * Also note that a reader could catch new key->keylen value
1152                 * but old key->key[], this is the reason we use __GFP_ZERO
1153                 * at sock_kmalloc() time below these lines.
1154                 */
1155                WRITE_ONCE(key->keylen, newkeylen);
1156
1157                return 0;
1158        }
1159
1160        md5sig = rcu_dereference_protected(tp->md5sig_info,
1161                                           lockdep_sock_is_held(sk));
1162        if (!md5sig) {
1163                md5sig = kmalloc(sizeof(*md5sig), gfp);
1164                if (!md5sig)
1165                        return -ENOMEM;
1166
1167                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168                INIT_HLIST_HEAD(&md5sig->head);
1169                rcu_assign_pointer(tp->md5sig_info, md5sig);
1170        }
1171
1172        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173        if (!key)
1174                return -ENOMEM;
1175        if (!tcp_alloc_md5sig_pool()) {
1176                sock_kfree_s(sk, key, sizeof(*key));
1177                return -ENOMEM;
1178        }
1179
1180        memcpy(key->key, newkey, newkeylen);
1181        key->keylen = newkeylen;
1182        key->family = family;
1183        key->prefixlen = prefixlen;
1184        key->l3index = l3index;
1185        memcpy(&key->addr, addr,
1186               (family == AF_INET6) ? sizeof(struct in6_addr) :
1187                                      sizeof(struct in_addr));
1188        hlist_add_head_rcu(&key->node, &md5sig->head);
1189        return 0;
1190}
1191EXPORT_SYMBOL(tcp_md5_do_add);
1192
1193int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194                   u8 prefixlen, int l3index)
1195{
1196        struct tcp_md5sig_key *key;
1197
1198        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1199        if (!key)
1200                return -ENOENT;
1201        hlist_del_rcu(&key->node);
1202        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1203        kfree_rcu(key, rcu);
1204        return 0;
1205}
1206EXPORT_SYMBOL(tcp_md5_do_del);
1207
1208static void tcp_clear_md5_list(struct sock *sk)
1209{
1210        struct tcp_sock *tp = tcp_sk(sk);
1211        struct tcp_md5sig_key *key;
1212        struct hlist_node *n;
1213        struct tcp_md5sig_info *md5sig;
1214
1215        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216
1217        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218                hlist_del_rcu(&key->node);
1219                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220                kfree_rcu(key, rcu);
1221        }
1222}
1223
1224static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225                                 sockptr_t optval, int optlen)
1226{
1227        struct tcp_md5sig cmd;
1228        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229        const union tcp_md5_addr *addr;
1230        u8 prefixlen = 32;
1231        int l3index = 0;
1232
1233        if (optlen < sizeof(cmd))
1234                return -EINVAL;
1235
1236        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237                return -EFAULT;
1238
1239        if (sin->sin_family != AF_INET)
1240                return -EINVAL;
1241
1242        if (optname == TCP_MD5SIG_EXT &&
1243            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244                prefixlen = cmd.tcpm_prefixlen;
1245                if (prefixlen > 32)
1246                        return -EINVAL;
1247        }
1248
1249        if (optname == TCP_MD5SIG_EXT &&
1250            cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251                struct net_device *dev;
1252
1253                rcu_read_lock();
1254                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255                if (dev && netif_is_l3_master(dev))
1256                        l3index = dev->ifindex;
1257
1258                rcu_read_unlock();
1259
 1260                /* It is ok to reference whether dev was set or not outside of rcu;
 1261                 * right now the device MUST be an L3 master.
1262                 */
1263                if (!dev || !l3index)
1264                        return -EINVAL;
1265        }
1266
1267        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
1269        if (!cmd.tcpm_keylen)
1270                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271
1272        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273                return -EINVAL;
1274
1275        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276                              cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277}
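
/*
 * Editorial usage sketch (userspace, not part of this file), mirroring the
 * fields parsed above:
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      sin->sin_family = AF_INET;
 *      inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A tcpm_keylen of zero deletes the key for that address, matching the
 * tcp_md5_do_del() path above.
 */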
1278
1279static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280                                   __be32 daddr, __be32 saddr,
1281                                   const struct tcphdr *th, int nbytes)
1282{
1283        struct tcp4_pseudohdr *bp;
1284        struct scatterlist sg;
1285        struct tcphdr *_th;
1286
1287        bp = hp->scratch;
1288        bp->saddr = saddr;
1289        bp->daddr = daddr;
1290        bp->pad = 0;
1291        bp->protocol = IPPROTO_TCP;
1292        bp->len = cpu_to_be16(nbytes);
1293
1294        _th = (struct tcphdr *)(bp + 1);
1295        memcpy(_th, th, sizeof(*th));
1296        _th->check = 0;
1297
1298        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300                                sizeof(*bp) + sizeof(*th));
1301        return crypto_ahash_update(hp->md5_req);
1302}
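
/*
 * Editorial note, per RFC 2385: the MD5 digest covers, in order, the
 * pseudo-header built above (saddr, daddr, zero pad, protocol, segment
 * length), the TCP header with its checksum zeroed, the segment payload
 * (when present), and finally the key itself; tcp_v4_md5_hash_hdr() and
 * tcp_v4_md5_hash_skb() below feed those pieces to the hash in that order.
 */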
1303
1304static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1305                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306{
1307        struct tcp_md5sig_pool *hp;
1308        struct ahash_request *req;
1309
1310        hp = tcp_get_md5sig_pool();
1311        if (!hp)
1312                goto clear_hash_noput;
1313        req = hp->md5_req;
1314
1315        if (crypto_ahash_init(req))
1316                goto clear_hash;
1317        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318                goto clear_hash;
1319        if (tcp_md5_hash_key(hp, key))
1320                goto clear_hash;
1321        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322        if (crypto_ahash_final(req))
1323                goto clear_hash;
1324
1325        tcp_put_md5sig_pool();
1326        return 0;
1327
1328clear_hash:
1329        tcp_put_md5sig_pool();
1330clear_hash_noput:
1331        memset(md5_hash, 0, 16);
1332        return 1;
1333}
1334
1335int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336                        const struct sock *sk,
1337                        const struct sk_buff *skb)
1338{
1339        struct tcp_md5sig_pool *hp;
1340        struct ahash_request *req;
1341        const struct tcphdr *th = tcp_hdr(skb);
1342        __be32 saddr, daddr;
1343
1344        if (sk) { /* valid for establish/request sockets */
1345                saddr = sk->sk_rcv_saddr;
1346                daddr = sk->sk_daddr;
1347        } else {
1348                const struct iphdr *iph = ip_hdr(skb);
1349                saddr = iph->saddr;
1350                daddr = iph->daddr;
1351        }
1352
1353        hp = tcp_get_md5sig_pool();
1354        if (!hp)
1355                goto clear_hash_noput;
1356        req = hp->md5_req;
1357
1358        if (crypto_ahash_init(req))
1359                goto clear_hash;
1360
1361        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362                goto clear_hash;
1363        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364                goto clear_hash;
1365        if (tcp_md5_hash_key(hp, key))
1366                goto clear_hash;
1367        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368        if (crypto_ahash_final(req))
1369                goto clear_hash;
1370
1371        tcp_put_md5sig_pool();
1372        return 0;
1373
1374clear_hash:
1375        tcp_put_md5sig_pool();
1376clear_hash_noput:
1377        memset(md5_hash, 0, 16);
1378        return 1;
1379}
1380EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381
1382#endif
1383
1384/* Called with rcu_read_lock() */
1385static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386                                    const struct sk_buff *skb,
1387                                    int dif, int sdif)
1388{
1389#ifdef CONFIG_TCP_MD5SIG
1390        /*
1391         * This gets called for each TCP segment that arrives
1392         * so we want to be efficient.
1393         * We have 3 drop cases:
1394         * o No MD5 hash and one expected.
1395         * o MD5 hash and we're not expecting one.
 1396         * o MD5 hash and it's wrong.
1397         */
1398        const __u8 *hash_location = NULL;
1399        struct tcp_md5sig_key *hash_expected;
1400        const struct iphdr *iph = ip_hdr(skb);
1401        const struct tcphdr *th = tcp_hdr(skb);
1402        const union tcp_md5_addr *addr;
1403        unsigned char newhash[16];
1404        int genhash, l3index;
1405
1406        /* If sdif is set, the packet ingressed via a device
1407         * in an L3 domain and dif is set to the l3mdev index.
1408         */
1409        l3index = sdif ? dif : 0;
1410
1411        addr = (union tcp_md5_addr *)&iph->saddr;
1412        hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413        hash_location = tcp_parse_md5sig_option(th);
1414
1415        /* We've parsed the options - do we have a hash? */
1416        if (!hash_expected && !hash_location)
1417                return false;
1418
1419        if (hash_expected && !hash_location) {
1420                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421                return true;
1422        }
1423
1424        if (!hash_expected && hash_location) {
1425                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426                return true;
1427        }
1428
1429        /* Okay, so both hash_expected and hash_location are present -
1430         * we need to calculate the MD5 hash.
1431         */
1432        genhash = tcp_v4_md5_hash_skb(newhash,
1433                                      hash_expected,
1434                                      NULL, skb);
1435
1436        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439                                     &iph->saddr, ntohs(th->source),
1440                                     &iph->daddr, ntohs(th->dest),
1441                                     genhash ? " tcp_v4_calc_md5_hash failed"
1442                                     : "", l3index);
1443                return true;
1444        }
1445        return false;
1446#endif
1447        return false;
1448}
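
/*
 * Example (illustrative sketch): the key that tcp_md5_do_lookup() finds above
 * only exists if the application installed one with the TCP_MD5SIG socket
 * option.  A minimal user-space sketch, assuming <netinet/tcp.h> exposes
 * TCP_MD5SIG and struct tcp_md5sig (true on reasonably recent headers):
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Install an MD5 key for segments exchanged with @peer on socket @fd. */
static int install_tcp_md5_key(int fd, const struct sockaddr_in *peer,
                               const void *key, unsigned int keylen)
{
        struct tcp_md5sig md5 = {};

        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_keylen = keylen;       /* at most TCP_MD5SIG_MAXKEYLEN (80) */
        memcpy(md5.tcpm_key, key, keylen);

        /* Both ends must install the same key, or every segment hits one of
         * the three drop cases checked in tcp_v4_inbound_md5_hash().
         */
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif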
1449
1450static void tcp_v4_init_req(struct request_sock *req,
1451                            const struct sock *sk_listener,
1452                            struct sk_buff *skb)
1453{
1454        struct inet_request_sock *ireq = inet_rsk(req);
1455        struct net *net = sock_net(sk_listener);
1456
1457        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460}
1461
1462static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463                                          struct sk_buff *skb,
1464                                          struct flowi *fl,
1465                                          struct request_sock *req)
1466{
1467        tcp_v4_init_req(req, sk, skb);
1468
1469        if (security_inet_conn_request(sk, skb, req))
1470                return NULL;
1471
1472        return inet_csk_route_req(sk, &fl->u.ip4, req);
1473}
1474
1475struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1476        .family         =       PF_INET,
1477        .obj_size       =       sizeof(struct tcp_request_sock),
1478        .rtx_syn_ack    =       tcp_rtx_synack,
1479        .send_ack       =       tcp_v4_reqsk_send_ack,
1480        .destructor     =       tcp_v4_reqsk_destructor,
1481        .send_reset     =       tcp_v4_send_reset,
1482        .syn_ack_timeout =      tcp_syn_ack_timeout,
1483};
1484
1485const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1486        .mss_clamp      =       TCP_MSS_DEFAULT,
1487#ifdef CONFIG_TCP_MD5SIG
1488        .req_md5_lookup =       tcp_v4_md5_lookup,
1489        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1490#endif
1491#ifdef CONFIG_SYN_COOKIES
1492        .cookie_init_seq =      cookie_v4_init_sequence,
1493#endif
1494        .route_req      =       tcp_v4_route_req,
1495        .init_seq       =       tcp_v4_init_seq,
1496        .init_ts_off    =       tcp_v4_init_ts_off,
1497        .send_synack    =       tcp_v4_send_synack,
1498};
1499
1500int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501{
1502        /* Never answer SYNs sent to broadcast or multicast addresses */
1503        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1504                goto drop;
1505
1506        return tcp_conn_request(&tcp_request_sock_ops,
1507                                &tcp_request_sock_ipv4_ops, sk, skb);
1508
1509drop:
1510        tcp_listendrop(sk);
1511        return 0;
1512}
1513EXPORT_SYMBOL(tcp_v4_conn_request);
1514
1515
1516/*
1517 * The three-way handshake has completed - we got a valid ACK for our SYN-ACK -
1518 * now create the new socket.
1519 */
1520struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521                                  struct request_sock *req,
1522                                  struct dst_entry *dst,
1523                                  struct request_sock *req_unhash,
1524                                  bool *own_req)
1525{
1526        struct inet_request_sock *ireq;
1527        bool found_dup_sk = false;
1528        struct inet_sock *newinet;
1529        struct tcp_sock *newtp;
1530        struct sock *newsk;
1531#ifdef CONFIG_TCP_MD5SIG
1532        const union tcp_md5_addr *addr;
1533        struct tcp_md5sig_key *key;
1534        int l3index;
1535#endif
1536        struct ip_options_rcu *inet_opt;
1537
1538        if (sk_acceptq_is_full(sk))
1539                goto exit_overflow;
1540
1541        newsk = tcp_create_openreq_child(sk, req, skb);
1542        if (!newsk)
1543                goto exit_nonewsk;
1544
1545        newsk->sk_gso_type = SKB_GSO_TCPV4;
1546        inet_sk_rx_dst_set(newsk, skb);
1547
1548        newtp                 = tcp_sk(newsk);
1549        newinet               = inet_sk(newsk);
1550        ireq                  = inet_rsk(req);
1551        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553        newsk->sk_bound_dev_if = ireq->ir_iif;
1554        newinet->inet_saddr   = ireq->ir_loc_addr;
1555        inet_opt              = rcu_dereference(ireq->ireq_opt);
1556        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557        newinet->mc_index     = inet_iif(skb);
1558        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1559        newinet->rcv_tos      = ip_hdr(skb)->tos;
1560        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561        if (inet_opt)
1562                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563        newinet->inet_id = prandom_u32();
1564
1565        /* Set ToS of the new socket based upon the value of the incoming SYN.
1566         * ECT bits are set later in tcp_init_transfer().
1567         */
1568        if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570
1571        if (!dst) {
1572                dst = inet_csk_route_child_sock(sk, newsk, req);
1573                if (!dst)
1574                        goto put_and_exit;
1575        } else {
1576                /* syncookie case : see end of cookie_v4_check() */
1577        }
1578        sk_setup_caps(newsk, dst);
1579
1580        tcp_ca_openreq_child(newsk, dst);
1581
1582        tcp_sync_mss(newsk, dst_mtu(dst));
1583        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584
1585        tcp_initialize_rcv_mss(newsk);
1586
1587#ifdef CONFIG_TCP_MD5SIG
1588        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589        /* Copy over the MD5 key from the original socket */
1590        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592        if (key) {
1593                /*
1594                 * We're using one, so create a matching key
1595                 * on the newsk structure. If we fail to get
1596                 * memory, then we end up not copying the key
1597                 * across. Shucks.
1598                 */
1599                tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600                               key->key, key->keylen, GFP_ATOMIC);
1601                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602        }
1603#endif
1604
1605        if (__inet_inherit_port(sk, newsk) < 0)
1606                goto put_and_exit;
1607        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608                                       &found_dup_sk);
1609        if (likely(*own_req)) {
1610                tcp_move_syn(newtp, req);
1611                ireq->ireq_opt = NULL;
1612        } else {
1613                newinet->inet_opt = NULL;
1614
1615                if (!req_unhash && found_dup_sk) {
1616                        /* This code path should only be executed in the
1617                         * syncookie case
1618                         */
1619                        bh_unlock_sock(newsk);
1620                        sock_put(newsk);
1621                        newsk = NULL;
1622                }
1623        }
1624        return newsk;
1625
1626exit_overflow:
1627        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628exit_nonewsk:
1629        dst_release(dst);
1630exit:
1631        tcp_listendrop(sk);
1632        return NULL;
1633put_and_exit:
1634        newinet->inet_opt = NULL;
1635        inet_csk_prepare_forced_close(newsk);
1636        tcp_done(newsk);
1637        goto exit;
1638}
1639EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
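
/*
 * Example (illustrative sketch): the sk_acceptq_is_full() check above is
 * bounded by the backlog the application passed to listen(), capped by
 * net.core.somaxconn; when the accept queue is full the child is dropped
 * and LINUX_MIB_LISTENOVERFLOWS ("ListenOverflows" in /proc/net/netstat)
 * is bumped.  A minimal user-space listener sizing that queue:
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int make_listener(uint16_t port, int backlog)
{
        struct sockaddr_in addr = {
                .sin_family     = AF_INET,
                .sin_port       = htons(port),
                .sin_addr       = { .s_addr = htonl(INADDR_ANY) },
        };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* Error handling omitted for brevity. */
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));

        /* @backlog bounds the accept queue that sk_acceptq_is_full() tests. */
        listen(fd, backlog);
        return fd;
}
#endif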
1640
1641static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1642{
1643#ifdef CONFIG_SYN_COOKIES
1644        const struct tcphdr *th = tcp_hdr(skb);
1645
1646        if (!th->syn)
1647                sk = cookie_v4_check(sk, skb);
1648#endif
1649        return sk;
1650}
1651
1652u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653                         struct tcphdr *th, u32 *cookie)
1654{
1655        u16 mss = 0;
1656#ifdef CONFIG_SYN_COOKIES
1657        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658                                    &tcp_request_sock_ipv4_ops, sk, th);
1659        if (mss) {
1660                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661                tcp_synq_overflow(sk);
1662        }
1663#endif
1664        return mss;
1665}
1666
1667INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668                                                           u32));
1669/* The socket must have its spinlock held when we get
1670 * here, unless it is a TCP_LISTEN socket.
1671 *
1672 * We have a potential double-lock case here, so even when
1673 * doing backlog processing we use the BH locking scheme.
1674 * This is because we cannot sleep with the original spinlock
1675 * held.
1676 */
1677int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678{
1679        struct sock *rsk;
1680
1681        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1682                struct dst_entry *dst = sk->sk_rx_dst;
1683
1684                sock_rps_save_rxhash(sk, skb);
1685                sk_mark_napi_id(sk, skb);
1686                if (dst) {
1687                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1688                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689                                             dst, 0)) {
1690                                dst_release(dst);
1691                                sk->sk_rx_dst = NULL;
1692                        }
1693                }
1694                tcp_rcv_established(sk, skb);
1695                return 0;
1696        }
1697
1698        if (tcp_checksum_complete(skb))
1699                goto csum_err;
1700
1701        if (sk->sk_state == TCP_LISTEN) {
1702                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703
1704                if (!nsk)
1705                        goto discard;
1706                if (nsk != sk) {
1707                        if (tcp_child_process(sk, nsk, skb)) {
1708                                rsk = nsk;
1709                                goto reset;
1710                        }
1711                        return 0;
1712                }
1713        } else
1714                sock_rps_save_rxhash(sk, skb);
1715
1716        if (tcp_rcv_state_process(sk, skb)) {
1717                rsk = sk;
1718                goto reset;
1719        }
1720        return 0;
1721
1722reset:
1723        tcp_v4_send_reset(rsk, skb);
1724discard:
1725        kfree_skb(skb);
1726        /* Be careful here. If this function gets more complicated and
1727         * gcc suffers from register pressure on the x86, sk (in %ebx)
1728         * might be destroyed here. This current version compiles correctly,
1729         * but you have been warned.
1730         */
1731        return 0;
1732
1733csum_err:
1734        trace_tcp_bad_csum(skb);
1735        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1736        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1737        goto discard;
1738}
1739EXPORT_SYMBOL(tcp_v4_do_rcv);
1740
1741int tcp_v4_early_demux(struct sk_buff *skb)
1742{
1743        const struct iphdr *iph;
1744        const struct tcphdr *th;
1745        struct sock *sk;
1746
1747        if (skb->pkt_type != PACKET_HOST)
1748                return 0;
1749
1750        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1751                return 0;
1752
1753        iph = ip_hdr(skb);
1754        th = tcp_hdr(skb);
1755
1756        if (th->doff < sizeof(struct tcphdr) / 4)
1757                return 0;
1758
1759        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1760                                       iph->saddr, th->source,
1761                                       iph->daddr, ntohs(th->dest),
1762                                       skb->skb_iif, inet_sdif(skb));
1763        if (sk) {
1764                skb->sk = sk;
1765                skb->destructor = sock_edemux;
1766                if (sk_fullsock(sk)) {
1767                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1768
1769                        if (dst)
1770                                dst = dst_check(dst, 0);
1771                        if (dst &&
1772                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1773                                skb_dst_set_noref(skb, dst);
1774                }
1775        }
1776        return 0;
1777}
1778
1779bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1780{
1781        u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1782        u32 tail_gso_size, tail_gso_segs;
1783        struct skb_shared_info *shinfo;
1784        const struct tcphdr *th;
1785        struct tcphdr *thtail;
1786        struct sk_buff *tail;
1787        unsigned int hdrlen;
1788        bool fragstolen;
1789        u32 gso_segs;
1790        u32 gso_size;
1791        int delta;
1792
1793        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1794         * we can fix skb->truesize to its real value to avoid future drops.
1795         * This is valid because skb is not yet charged to the socket.
1796         * It has been noticed that pure SACK packets were sometimes dropped
1797         * (if cooked by drivers without the copybreak feature).
1798         */
1799        skb_condense(skb);
1800
1801        skb_dst_drop(skb);
1802
1803        if (unlikely(tcp_checksum_complete(skb))) {
1804                bh_unlock_sock(sk);
1805                trace_tcp_bad_csum(skb);
1806                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1807                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1808                return true;
1809        }
1810
1811        /* Attempt coalescing to last skb in backlog, even if we are
1812         * above the limits.
1813         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1814         */
1815        th = (const struct tcphdr *)skb->data;
1816        hdrlen = th->doff * 4;
1817
1818        tail = sk->sk_backlog.tail;
1819        if (!tail)
1820                goto no_coalesce;
1821        thtail = (struct tcphdr *)tail->data;
1822
1823        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1824            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1825            ((TCP_SKB_CB(tail)->tcp_flags |
1826              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1827            !((TCP_SKB_CB(tail)->tcp_flags &
1828              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1829            ((TCP_SKB_CB(tail)->tcp_flags ^
1830              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1831#ifdef CONFIG_TLS_DEVICE
1832            tail->decrypted != skb->decrypted ||
1833#endif
1834            thtail->doff != th->doff ||
1835            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1836                goto no_coalesce;
1837
1838        __skb_pull(skb, hdrlen);
1839
1840        shinfo = skb_shinfo(skb);
1841        gso_size = shinfo->gso_size ?: skb->len;
1842        gso_segs = shinfo->gso_segs ?: 1;
1843
1844        shinfo = skb_shinfo(tail);
1845        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1846        tail_gso_segs = shinfo->gso_segs ?: 1;
1847
1848        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1849                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1850
1851                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1852                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1853                        thtail->window = th->window;
1854                }
1855
1856                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1857                 * thtail->fin, so that the fast path in tcp_rcv_established()
1858                 * is not entered if we append a packet with a FIN.
1859                 * SYN, RST, URG are not present.
1860                 * ACK is set on both packets.
1861                 * PSH : we do not really care in the TCP stack,
1862                 *       at least for 'GRO' packets.
1863                 */
1864                thtail->fin |= th->fin;
1865                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1866
1867                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1868                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1869                        tail->tstamp = skb->tstamp;
1870                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1871                }
1872
1873                /* Not as strict as GRO. We only need to carry the max mss value */
1874                shinfo->gso_size = max(gso_size, tail_gso_size);
1875                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1876
1877                sk->sk_backlog.len += delta;
1878                __NET_INC_STATS(sock_net(sk),
1879                                LINUX_MIB_TCPBACKLOGCOALESCE);
1880                kfree_skb_partial(skb, fragstolen);
1881                return false;
1882        }
1883        __skb_push(skb, hdrlen);
1884
1885no_coalesce:
1886        /* Only the socket owner can try to collapse/prune rx queues
1887         * to reduce memory overhead, so add a little headroom here.
1888         * Only a few socket backlogs are likely to be non-empty at once.
1889         */
1890        limit += 64*1024;
1891
1892        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1893                bh_unlock_sock(sk);
1894                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895                return true;
1896        }
1897        return false;
1898}
1899EXPORT_SYMBOL(tcp_add_backlog);
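
/*
 * Example (illustrative sketch): the @limit used above is simply
 * sk_rcvbuf + sk_sndbuf plus 64KB of headroom.  Both buffer sizes can be
 * read back from user space with getsockopt(), so an application can
 * estimate how much data may queue in the backlog while it owns the socket
 * lock.  The helper name below is made up for the example:
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <sys/socket.h>

static long estimate_backlog_limit(int fd)
{
        int rcvbuf = 0, sndbuf = 0;
        socklen_t len = sizeof(rcvbuf);

        /* Returned values already include the kernel's internal doubling. */
        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len);
        len = sizeof(sndbuf);
        getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &len);

        return (long)rcvbuf + sndbuf + 64 * 1024; /* mirrors tcp_add_backlog() */
}
#endif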
1900
1901int tcp_filter(struct sock *sk, struct sk_buff *skb)
1902{
1903        struct tcphdr *th = (struct tcphdr *)skb->data;
1904
1905        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1906}
1907EXPORT_SYMBOL(tcp_filter);
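
/*
 * Example (illustrative sketch): sk_filter_trim_cap() above runs whatever
 * classic BPF program the owner attached with SO_ATTACH_FILTER, and never
 * trims the skb below the TCP header length passed as the cap.  A minimal
 * user-space sketch attaching an "accept everything" filter; a program
 * returning 0 would instead make tcp_filter() report the segment as dropped:
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <linux/filter.h>
#include <sys/socket.h>

static int attach_accept_all_filter(int fd)
{
        /* One instruction: return 0xFFFF, i.e. keep the whole packet. */
        struct sock_filter code[] = {
                { BPF_RET | BPF_K, 0, 0, 0xFFFF },
        };
        struct sock_fprog prog = {
                .len    = 1,
                .filter = code,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
#endif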
1908
1909static void tcp_v4_restore_cb(struct sk_buff *skb)
1910{
1911        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1912                sizeof(struct inet_skb_parm));
1913}
1914
1915static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1916                           const struct tcphdr *th)
1917{
1918        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1919         * barrier() makes sure the compiler won't play fool^Waliasing games.
1920         */
1921        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1922                sizeof(struct inet_skb_parm));
1923        barrier();
1924
1925        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1926        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1927                                    skb->len - th->doff * 4);
1928        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1929        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1930        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1931        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1932        TCP_SKB_CB(skb)->sacked  = 0;
1933        TCP_SKB_CB(skb)->has_rxtstamp =
1934                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935}
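
/*
 * Example (illustrative sketch): tcp_v4_fill_cb() sets end_seq to
 * seq + syn + fin + payload length because SYN and FIN each consume one
 * sequence number even though they carry no data: a bare SYN with seq 1000
 * gets end_seq 1001, and a 100-byte segment with FIN and seq 2000 gets
 * end_seq 2101.  A stand-alone helper mirroring that arithmetic:
 */
#if 0	/* illustration only, never compiled with this file */
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin, uint32_t payload_len)
{
        /* Matches TCP_SKB_CB(skb)->end_seq computed above. */
        return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload_len;
}
#endif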
1936
1937/*
1938 *      From tcp_input.c
1939 */
1940
1941int tcp_v4_rcv(struct sk_buff *skb)
1942{
1943        struct net *net = dev_net(skb->dev);
1944        struct sk_buff *skb_to_free;
1945        int sdif = inet_sdif(skb);
1946        int dif = inet_iif(skb);
1947        const struct iphdr *iph;
1948        const struct tcphdr *th;
1949        bool refcounted;
1950        struct sock *sk;
1951        int ret;
1952
1953        if (skb->pkt_type != PACKET_HOST)
1954                goto discard_it;
1955
1956        /* Count it even if it's bad */
1957        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1958
1959        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1960                goto discard_it;
1961
1962        th = (const struct tcphdr *)skb->data;
1963
1964        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1965                goto bad_packet;
1966        if (!pskb_may_pull(skb, th->doff * 4))
1967                goto discard_it;
1968
1969        /* An explanation is required here, I think.
1970         * Packet length and doff are validated by header prediction,
1971         * provided the case of th->doff == 0 is eliminated.
1972         * So, we defer the checks. */
1973
1974        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1975                goto csum_error;
1976
1977        th = (const struct tcphdr *)skb->data;
1978        iph = ip_hdr(skb);
1979lookup:
1980        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1981                               th->dest, sdif, &refcounted);
1982        if (!sk)
1983                goto no_tcp_socket;
1984
1985process:
1986        if (sk->sk_state == TCP_TIME_WAIT)
1987                goto do_time_wait;
1988
1989        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990                struct request_sock *req = inet_reqsk(sk);
1991                bool req_stolen = false;
1992                struct sock *nsk;
1993
1994                sk = req->rsk_listener;
1995                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1996                        sk_drops_add(sk, skb);
1997                        reqsk_put(req);
1998                        goto discard_it;
1999                }
2000                if (tcp_checksum_complete(skb)) {
2001                        reqsk_put(req);
2002                        goto csum_error;
2003                }
2004                if (unlikely(sk->sk_state != TCP_LISTEN)) {
2005                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2006                        if (!nsk) {
2007                                inet_csk_reqsk_queue_drop_and_put(sk, req);
2008                                goto lookup;
2009                        }
2010                        sk = nsk;
2011                        /* reuseport_migrate_sock() has already taken one sk_refcnt
2012                         * reference before returning.
2013                         */
2014                } else {
2015                        /* We own a reference on the listener; increase it again
2016                         * as we might lose it too soon.
2017                         */
2018                        sock_hold(sk);
2019                }
2020                refcounted = true;
2021                nsk = NULL;
2022                if (!tcp_filter(sk, skb)) {
2023                        th = (const struct tcphdr *)skb->data;
2024                        iph = ip_hdr(skb);
2025                        tcp_v4_fill_cb(skb, iph, th);
2026                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2027                }
2028                if (!nsk) {
2029                        reqsk_put(req);
2030                        if (req_stolen) {
2031                                /* Another cpu got exclusive access to req
2032                                 * and created a full blown socket.
2033                                 * Try to feed this packet to this socket
2034                                 * instead of discarding it.
2035                                 */
2036                                tcp_v4_restore_cb(skb);
2037                                sock_put(sk);
2038                                goto lookup;
2039                        }
2040                        goto discard_and_relse;
2041                }
2042                if (nsk == sk) {
2043                        reqsk_put(req);
2044                        tcp_v4_restore_cb(skb);
2045                } else if (tcp_child_process(sk, nsk, skb)) {
2046                        tcp_v4_send_reset(nsk, skb);
2047                        goto discard_and_relse;
2048                } else {
2049                        sock_put(sk);
2050                        return 0;
2051                }
2052        }
2053        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2054                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2055                goto discard_and_relse;
2056        }
2057
2058        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2059                goto discard_and_relse;
2060
2061        if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2062                goto discard_and_relse;
2063
2064        nf_reset_ct(skb);
2065
2066        if (tcp_filter(sk, skb))
2067                goto discard_and_relse;
2068        th = (const struct tcphdr *)skb->data;
2069        iph = ip_hdr(skb);
2070        tcp_v4_fill_cb(skb, iph, th);
2071
2072        skb->dev = NULL;
2073
2074        if (sk->sk_state == TCP_LISTEN) {
2075                ret = tcp_v4_do_rcv(sk, skb);
2076                goto put_and_return;
2077        }
2078
2079        sk_incoming_cpu_update(sk);
2080
2081        bh_lock_sock_nested(sk);
2082        tcp_segs_in(tcp_sk(sk), skb);
2083        ret = 0;
2084        if (!sock_owned_by_user(sk)) {
2085                skb_to_free = sk->sk_rx_skb_cache;
2086                sk->sk_rx_skb_cache = NULL;
2087                ret = tcp_v4_do_rcv(sk, skb);
2088        } else {
2089                if (tcp_add_backlog(sk, skb))
2090                        goto discard_and_relse;
2091                skb_to_free = NULL;
2092        }
2093        bh_unlock_sock(sk);
2094        if (skb_to_free)
2095                __kfree_skb(skb_to_free);
2096
2097put_and_return:
2098        if (refcounted)
2099                sock_put(sk);
2100
2101        return ret;
2102
2103no_tcp_socket:
2104        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2105                goto discard_it;
2106
2107        tcp_v4_fill_cb(skb, iph, th);
2108
2109        if (tcp_checksum_complete(skb)) {
2110csum_error:
2111                trace_tcp_bad_csum(skb);
2112                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2113bad_packet:
2114                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2115        } else {
2116                tcp_v4_send_reset(NULL, skb);
2117        }
2118
2119discard_it:
2120        /* Discard frame. */
2121        kfree_skb(skb);
2122        return 0;
2123
2124discard_and_relse:
2125        sk_drops_add(sk, skb);
2126        if (refcounted)
2127                sock_put(sk);
2128        goto discard_it;
2129
2130do_time_wait:
2131        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2132                inet_twsk_put(inet_twsk(sk));
2133                goto discard_it;
2134        }
2135
2136        tcp_v4_fill_cb(skb, iph, th);
2137
2138        if (tcp_checksum_complete(skb)) {
2139                inet_twsk_put(inet_twsk(sk));
2140                goto csum_error;
2141        }
2142        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2143        case TCP_TW_SYN: {
2144                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2145                                                        &tcp_hashinfo, skb,
2146                                                        __tcp_hdrlen(th),
2147                                                        iph->saddr, th->source,
2148                                                        iph->daddr, th->dest,
2149                                                        inet_iif(skb),
2150                                                        sdif);
2151                if (sk2) {
2152                        inet_twsk_deschedule_put(inet_twsk(sk));
2153                        sk = sk2;
2154                        tcp_v4_restore_cb(skb);
2155                        refcounted = false;
2156                        goto process;
2157                }
2158        }
2159                /* to ACK */
2160                fallthrough;
2161        case TCP_TW_ACK:
2162                tcp_v4_timewait_ack(sk, skb);
2163                break;
2164        case TCP_TW_RST:
2165                tcp_v4_send_reset(sk, skb);
2166                inet_twsk_deschedule_put(inet_twsk(sk));
2167                goto discard_it;
2168        case TCP_TW_SUCCESS:;
2169        }
2170        goto discard_it;
2171}
2172
2173static struct timewait_sock_ops tcp_timewait_sock_ops = {
2174        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2175        .twsk_unique    = tcp_twsk_unique,
2176        .twsk_destructor= tcp_twsk_destructor,
2177};
2178
2179void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2180{
2181        struct dst_entry *dst = skb_dst(skb);
2182
2183        if (dst && dst_hold_safe(dst)) {
2184                sk->sk_rx_dst = dst;
2185                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2186        }
2187}
2188EXPORT_SYMBOL(inet_sk_rx_dst_set);
2189
2190const struct inet_connection_sock_af_ops ipv4_specific = {
2191        .queue_xmit        = ip_queue_xmit,
2192        .send_check        = tcp_v4_send_check,
2193        .rebuild_header    = inet_sk_rebuild_header,
2194        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2195        .conn_request      = tcp_v4_conn_request,
2196        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2197        .net_header_len    = sizeof(struct iphdr),
2198        .setsockopt        = ip_setsockopt,
2199        .getsockopt        = ip_getsockopt,
2200        .addr2sockaddr     = inet_csk_addr2sockaddr,
2201        .sockaddr_len      = sizeof(struct sockaddr_in),
2202        .mtu_reduced       = tcp_v4_mtu_reduced,
2203};
2204EXPORT_SYMBOL(ipv4_specific);
2205
2206#ifdef CONFIG_TCP_MD5SIG
2207static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2208        .md5_lookup             = tcp_v4_md5_lookup,
2209        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2210        .md5_parse              = tcp_v4_parse_md5_keys,
2211};
2212#endif
2213
2214/* NOTE: A lot of things are set to zero explicitly by the call to
2215 *       sk_alloc(), so they need not be done here.
2216 */
2217static int tcp_v4_init_sock(struct sock *sk)
2218{
2219        struct inet_connection_sock *icsk = inet_csk(sk);
2220
2221        tcp_init_sock(sk);
2222
2223        icsk->icsk_af_ops = &ipv4_specific;
2224
2225#ifdef CONFIG_TCP_MD5SIG
2226        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2227#endif
2228
2229        return 0;
2230}
2231
2232void tcp_v4_destroy_sock(struct sock *sk)
2233{
2234        struct tcp_sock *tp = tcp_sk(sk);
2235
2236        trace_tcp_destroy_sock(sk);
2237
2238        tcp_clear_xmit_timers(sk);
2239
2240        tcp_cleanup_congestion_control(sk);
2241
2242        tcp_cleanup_ulp(sk);
2243
2244        /* Clean up the write buffer. */
2245        tcp_write_queue_purge(sk);
2246
2247        /* Check if we want to disable active TFO */
2248        tcp_fastopen_active_disable_ofo_check(sk);
2249
2250        /* Cleans up our, hopefully empty, out_of_order_queue. */
2251        skb_rbtree_purge(&tp->out_of_order_queue);
2252
2253#ifdef CONFIG_TCP_MD5SIG
2254        /* Clean up the MD5 key list, if any */
2255        if (tp->md5sig_info) {
2256                tcp_clear_md5_list(sk);
2257                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2258                tp->md5sig_info = NULL;
2259        }
2260#endif
2261
2262        /* Clean up a referenced TCP bind bucket. */
2263        if (inet_csk(sk)->icsk_bind_hash)
2264                inet_put_port(sk);
2265
2266        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2267
2268        /* If the socket was aborted during the connect operation */
2269        tcp_free_fastopen_req(tp);
2270        tcp_fastopen_destroy_cipher(sk);
2271        tcp_saved_syn_free(tp);
2272
2273        sk_sockets_allocated_dec(sk);
2274}
2275EXPORT_SYMBOL(tcp_v4_destroy_sock);
2276
2277#ifdef CONFIG_PROC_FS
2278/* Proc filesystem TCP sock list dumping. */
2279
2280/*
2281 * Get the next listener socket following cur.  If cur is NULL, get the first
2282 * socket starting from the bucket given in st->bucket; when st->bucket is zero,
2283 * the very first socket in the hash table is returned.
2284 */
2285static void *listening_get_next(struct seq_file *seq, void *cur)
2286{
2287        struct tcp_seq_afinfo *afinfo;
2288        struct tcp_iter_state *st = seq->private;
2289        struct net *net = seq_file_net(seq);
2290        struct inet_listen_hashbucket *ilb;
2291        struct hlist_nulls_node *node;
2292        struct sock *sk = cur;
2293
2294        if (st->bpf_seq_afinfo)
2295                afinfo = st->bpf_seq_afinfo;
2296        else
2297                afinfo = PDE_DATA(file_inode(seq->file));
2298
2299        if (!sk) {
2300get_head:
2301                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2302                spin_lock(&ilb->lock);
2303                sk = sk_nulls_head(&ilb->nulls_head);
2304                st->offset = 0;
2305                goto get_sk;
2306        }
2307        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2308        ++st->num;
2309        ++st->offset;
2310
2311        sk = sk_nulls_next(sk);
2312get_sk:
2313        sk_nulls_for_each_from(sk, node) {
2314                if (!net_eq(sock_net(sk), net))
2315                        continue;
2316                if (afinfo->family == AF_UNSPEC ||
2317                    sk->sk_family == afinfo->family)
2318                        return sk;
2319        }
2320        spin_unlock(&ilb->lock);
2321        st->offset = 0;
2322        if (++st->bucket < INET_LHTABLE_SIZE)
2323                goto get_head;
2324        return NULL;
2325}
2326
2327static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2328{
2329        struct tcp_iter_state *st = seq->private;
2330        void *rc;
2331
2332        st->bucket = 0;
2333        st->offset = 0;
2334        rc = listening_get_next(seq, NULL);
2335
2336        while (rc && *pos) {
2337                rc = listening_get_next(seq, rc);
2338                --*pos;
2339        }
2340        return rc;
2341}
2342
2343static inline bool empty_bucket(const struct tcp_iter_state *st)
2344{
2345        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2346}
2347
2348/*
2349 * Get the first established socket, starting from the bucket given in st->bucket.
2350 * If st->bucket is zero, the very first socket in the hash is returned.
2351 */
2352static void *established_get_first(struct seq_file *seq)
2353{
2354        struct tcp_seq_afinfo *afinfo;
2355        struct tcp_iter_state *st = seq->private;
2356        struct net *net = seq_file_net(seq);
2357        void *rc = NULL;
2358
2359        if (st->bpf_seq_afinfo)
2360                afinfo = st->bpf_seq_afinfo;
2361        else
2362                afinfo = PDE_DATA(file_inode(seq->file));
2363
2364        st->offset = 0;
2365        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2366                struct sock *sk;
2367                struct hlist_nulls_node *node;
2368                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2369
2370                /* Lockless fast path for the common case of empty buckets */
2371                if (empty_bucket(st))
2372                        continue;
2373
2374                spin_lock_bh(lock);
2375                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2376                        if ((afinfo->family != AF_UNSPEC &&
2377                             sk->sk_family != afinfo->family) ||
2378                            !net_eq(sock_net(sk), net)) {
2379                                continue;
2380                        }
2381                        rc = sk;
2382                        goto out;
2383                }
2384                spin_unlock_bh(lock);
2385        }
2386out:
2387        return rc;
2388}
2389
2390static void *established_get_next(struct seq_file *seq, void *cur)
2391{
2392        struct tcp_seq_afinfo *afinfo;
2393        struct sock *sk = cur;
2394        struct hlist_nulls_node *node;
2395        struct tcp_iter_state *st = seq->private;
2396        struct net *net = seq_file_net(seq);
2397
2398        if (st->bpf_seq_afinfo)
2399                afinfo = st->bpf_seq_afinfo;
2400        else
2401                afinfo = PDE_DATA(file_inode(seq->file));
2402
2403        ++st->num;
2404        ++st->offset;
2405
2406        sk = sk_nulls_next(sk);
2407
2408        sk_nulls_for_each_from(sk, node) {
2409                if ((afinfo->family == AF_UNSPEC ||
2410                     sk->sk_family == afinfo->family) &&
2411                    net_eq(sock_net(sk), net))
2412                        return sk;
2413        }
2414
2415        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2416        ++st->bucket;
2417        return established_get_first(seq);
2418}
2419
2420static void *established_get_idx(struct seq_file *seq, loff_t pos)
2421{
2422        struct tcp_iter_state *st = seq->private;
2423        void *rc;
2424
2425        st->bucket = 0;
2426        rc = established_get_first(seq);
2427
2428        while (rc && pos) {
2429                rc = established_get_next(seq, rc);
2430                --pos;
2431        }
2432        return rc;
2433}
2434
2435static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2436{
2437        void *rc;
2438        struct tcp_iter_state *st = seq->private;
2439
2440        st->state = TCP_SEQ_STATE_LISTENING;
2441        rc        = listening_get_idx(seq, &pos);
2442
2443        if (!rc) {
2444                st->state = TCP_SEQ_STATE_ESTABLISHED;
2445                rc        = established_get_idx(seq, pos);
2446        }
2447
2448        return rc;
2449}
2450
2451static void *tcp_seek_last_pos(struct seq_file *seq)
2452{
2453        struct tcp_iter_state *st = seq->private;
2454        int offset = st->offset;
2455        int orig_num = st->num;
2456        void *rc = NULL;
2457
2458        switch (st->state) {
2459        case TCP_SEQ_STATE_LISTENING:
2460                if (st->bucket >= INET_LHTABLE_SIZE)
2461                        break;
2462                st->state = TCP_SEQ_STATE_LISTENING;
2463                rc = listening_get_next(seq, NULL);
2464                while (offset-- && rc)
2465                        rc = listening_get_next(seq, rc);
2466                if (rc)
2467                        break;
2468                st->bucket = 0;
2469                st->state = TCP_SEQ_STATE_ESTABLISHED;
2470                fallthrough;
2471        case TCP_SEQ_STATE_ESTABLISHED:
2472                if (st->bucket > tcp_hashinfo.ehash_mask)
2473                        break;
2474                rc = established_get_first(seq);
2475                while (offset-- && rc)
2476                        rc = established_get_next(seq, rc);
2477        }
2478
2479        st->num = orig_num;
2480
2481        return rc;
2482}
2483
2484void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2485{
2486        struct tcp_iter_state *st = seq->private;
2487        void *rc;
2488
2489        if (*pos && *pos == st->last_pos) {
2490                rc = tcp_seek_last_pos(seq);
2491                if (rc)
2492                        goto out;
2493        }
2494
2495        st->state = TCP_SEQ_STATE_LISTENING;
2496        st->num = 0;
2497        st->bucket = 0;
2498        st->offset = 0;
2499        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2500
2501out:
2502        st->last_pos = *pos;
2503        return rc;
2504}
2505EXPORT_SYMBOL(tcp_seq_start);
2506
2507void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2508{
2509        struct tcp_iter_state *st = seq->private;
2510        void *rc = NULL;
2511
2512        if (v == SEQ_START_TOKEN) {
2513                rc = tcp_get_idx(seq, 0);
2514                goto out;
2515        }
2516
2517        switch (st->state) {
2518        case TCP_SEQ_STATE_LISTENING:
2519                rc = listening_get_next(seq, v);
2520                if (!rc) {
2521                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2522                        st->bucket = 0;
2523                        st->offset = 0;
2524                        rc        = established_get_first(seq);
2525                }
2526                break;
2527        case TCP_SEQ_STATE_ESTABLISHED:
2528                rc = established_get_next(seq, v);
2529                break;
2530        }
2531out:
2532        ++*pos;
2533        st->last_pos = *pos;
2534        return rc;
2535}
2536EXPORT_SYMBOL(tcp_seq_next);
2537
2538void tcp_seq_stop(struct seq_file *seq, void *v)
2539{
2540        struct tcp_iter_state *st = seq->private;
2541
2542        switch (st->state) {
2543        case TCP_SEQ_STATE_LISTENING:
2544                if (v != SEQ_START_TOKEN)
2545                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2546                break;
2547        case TCP_SEQ_STATE_ESTABLISHED:
2548                if (v)
2549                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2550                break;
2551        }
2552}
2553EXPORT_SYMBOL(tcp_seq_stop);
2554
2555static void get_openreq4(const struct request_sock *req,
2556                         struct seq_file *f, int i)
2557{
2558        const struct inet_request_sock *ireq = inet_rsk(req);
2559        long delta = req->rsk_timer.expires - jiffies;
2560
2561        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2562                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2563                i,
2564                ireq->ir_loc_addr,
2565                ireq->ir_num,
2566                ireq->ir_rmt_addr,
2567                ntohs(ireq->ir_rmt_port),
2568                TCP_SYN_RECV,
2569                0, 0, /* could print option size, but that is af dependent. */
2570                1,    /* timers active (only the expire timer) */
2571                jiffies_delta_to_clock_t(delta),
2572                req->num_timeout,
2573                from_kuid_munged(seq_user_ns(f),
2574                                 sock_i_uid(req->rsk_listener)),
2575                0,  /* non standard timer */
2576                0, /* open_requests have no inode */
2577                0,
2578                req);
2579}
2580
2581static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2582{
2583        int timer_active;
2584        unsigned long timer_expires;
2585        const struct tcp_sock *tp = tcp_sk(sk);
2586        const struct inet_connection_sock *icsk = inet_csk(sk);
2587        const struct inet_sock *inet = inet_sk(sk);
2588        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2589        __be32 dest = inet->inet_daddr;
2590        __be32 src = inet->inet_rcv_saddr;
2591        __u16 destp = ntohs(inet->inet_dport);
2592        __u16 srcp = ntohs(inet->inet_sport);
2593        int rx_queue;
2594        int state;
2595
2596        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2597            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2598            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2599                timer_active    = 1;
2600                timer_expires   = icsk->icsk_timeout;
2601        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2602                timer_active    = 4;
2603                timer_expires   = icsk->icsk_timeout;
2604        } else if (timer_pending(&sk->sk_timer)) {
2605                timer_active    = 2;
2606                timer_expires   = sk->sk_timer.expires;
2607        } else {
2608                timer_active    = 0;
2609                timer_expires = jiffies;
2610        }
2611
2612        state = inet_sk_state_load(sk);
2613        if (state == TCP_LISTEN)
2614                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2615        else
2616                /* Because we don't lock the socket,
2617                 * we might find a transient negative value.
2618                 */
2619                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2620                                      READ_ONCE(tp->copied_seq), 0);
2621
2622        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2623                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2624                i, src, srcp, dest, destp, state,
2625                READ_ONCE(tp->write_seq) - tp->snd_una,
2626                rx_queue,
2627                timer_active,
2628                jiffies_delta_to_clock_t(timer_expires - jiffies),
2629                icsk->icsk_retransmits,
2630                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2631                icsk->icsk_probes_out,
2632                sock_i_ino(sk),
2633                refcount_read(&sk->sk_refcnt), sk,
2634                jiffies_to_clock_t(icsk->icsk_rto),
2635                jiffies_to_clock_t(icsk->icsk_ack.ato),
2636                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2637                tp->snd_cwnd,
2638                state == TCP_LISTEN ?
2639                    fastopenq->max_qlen :
2640                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2641}
2642
2643static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2644                               struct seq_file *f, int i)
2645{
2646        long delta = tw->tw_timer.expires - jiffies;
2647        __be32 dest, src;
2648        __u16 destp, srcp;
2649
2650        dest  = tw->tw_daddr;
2651        src   = tw->tw_rcv_saddr;
2652        destp = ntohs(tw->tw_dport);
2653        srcp  = ntohs(tw->tw_sport);
2654
2655        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2656                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2657                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2658                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2659                refcount_read(&tw->tw_refcnt), tw);
2660}
2661
2662#define TMPSZ 150
2663
2664static int tcp4_seq_show(struct seq_file *seq, void *v)
2665{
2666        struct tcp_iter_state *st;
2667        struct sock *sk = v;
2668
2669        seq_setwidth(seq, TMPSZ - 1);
2670        if (v == SEQ_START_TOKEN) {
2671                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2672                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2673                           "inode");
2674                goto out;
2675        }
2676        st = seq->private;
2677
2678        if (sk->sk_state == TCP_TIME_WAIT)
2679                get_timewait4_sock(v, seq, st->num);
2680        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2681                get_openreq4(v, seq, st->num);
2682        else
2683                get_tcp4_sock(v, seq, st->num);
2684out:
2685        seq_pad(seq, '\n');
2686        return 0;
2687}
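
/*
 * Example (illustrative sketch): parsing one line of the /proc/net/tcp
 * format produced above.  Addresses are the raw __be32 printed with %08X as
 * a native integer, so on little-endian hosts 127.0.0.1 appears as
 * "0100007F"; ports and the state are plain hex.  Hypothetical helper:
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <stdio.h>
#include <arpa/inet.h>

static int parse_tcp4_line(const char *line)
{
        unsigned int slot, local_ip, local_port, rem_ip, rem_port, state;
        char buf[INET_ADDRSTRLEN];
        struct in_addr a;

        /* The header line fails this scan and should be skipped by the caller. */
        if (sscanf(line, "%u: %X:%X %X:%X %X",
                   &slot, &local_ip, &local_port, &rem_ip, &rem_port, &state) != 6)
                return -1;

        a.s_addr = local_ip;    /* same byte order as the kernel printed it */
        printf("local %s:%u state %#x\n",
               inet_ntop(AF_INET, &a, buf, sizeof(buf)), local_port, state);
        return 0;
}
#endif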
2688
2689#ifdef CONFIG_BPF_SYSCALL
2690struct bpf_iter__tcp {
2691        __bpf_md_ptr(struct bpf_iter_meta *, meta);
2692        __bpf_md_ptr(struct sock_common *, sk_common);
2693        uid_t uid __aligned(8);
2694};
2695
2696static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2697                             struct sock_common *sk_common, uid_t uid)
2698{
2699        struct bpf_iter__tcp ctx;
2700
2701        meta->seq_num--;  /* skip SEQ_START_TOKEN */
2702        ctx.meta = meta;
2703        ctx.sk_common = sk_common;
2704        ctx.uid = uid;
2705        return bpf_iter_run_prog(prog, &ctx);
2706}
2707
2708static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2709{
2710        struct bpf_iter_meta meta;
2711        struct bpf_prog *prog;
2712        struct sock *sk = v;
2713        uid_t uid;
2714
2715        if (v == SEQ_START_TOKEN)
2716                return 0;
2717
2718        if (sk->sk_state == TCP_TIME_WAIT) {
2719                uid = 0;
2720        } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2721                const struct request_sock *req = v;
2722
2723                uid = from_kuid_munged(seq_user_ns(seq),
2724                                       sock_i_uid(req->rsk_listener));
2725        } else {
2726                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2727        }
2728
2729        meta.seq = seq;
2730        prog = bpf_iter_get_info(&meta, false);
2731        return tcp_prog_seq_show(prog, &meta, v, uid);
2732}
2733
2734static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2735{
2736        struct bpf_iter_meta meta;
2737        struct bpf_prog *prog;
2738
2739        if (!v) {
2740                meta.seq = seq;
2741                prog = bpf_iter_get_info(&meta, true);
2742                if (prog)
2743                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
2744        }
2745
2746        tcp_seq_stop(seq, v);
2747}
2748
2749static const struct seq_operations bpf_iter_tcp_seq_ops = {
2750        .show           = bpf_iter_tcp_seq_show,
2751        .start          = tcp_seq_start,
2752        .next           = tcp_seq_next,
2753        .stop           = bpf_iter_tcp_seq_stop,
2754};
2755#endif
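
/*
 * Example (illustrative sketch): struct bpf_iter__tcp above is the context a
 * BPF "iter/tcp" program receives for every socket this iterator visits.  A
 * minimal BPF-side sketch, assuming a libbpf-style build where vmlinux.h,
 * <bpf/bpf_helpers.h> and <bpf/bpf_tracing.h> (BPF_SEQ_PRINTF) are available;
 * AF_INET is defined by hand because vmlinux.h carries no macros:
 */
#if 0	/* BPF program illustration, never compiled with this file */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define AF_INET 2

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp4(struct bpf_iter__tcp *ctx)
{
        struct sock_common *sk_common = ctx->sk_common;

        if (!sk_common)
                return 0;
        if (sk_common->skc_family != AF_INET)
                return 0;

        /* One line per socket: raw destination address and owning uid. */
        BPF_SEQ_PRINTF(ctx->meta->seq, "daddr %x uid %u\n",
                       sk_common->skc_daddr, ctx->uid);
        return 0;
}
#endif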
2756
2757static const struct seq_operations tcp4_seq_ops = {
2758        .show           = tcp4_seq_show,
2759        .start          = tcp_seq_start,
2760        .next           = tcp_seq_next,
2761        .stop           = tcp_seq_stop,
2762};
2763
2764static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2765        .family         = AF_INET,
2766};
2767
2768static int __net_init tcp4_proc_init_net(struct net *net)
2769{
2770        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2771                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2772                return -ENOMEM;
2773        return 0;
2774}
2775
2776static void __net_exit tcp4_proc_exit_net(struct net *net)
2777{
2778        remove_proc_entry("tcp", net->proc_net);
2779}
2780
2781static struct pernet_operations tcp4_net_ops = {
2782        .init = tcp4_proc_init_net,
2783        .exit = tcp4_proc_exit_net,
2784};
2785
2786int __init tcp4_proc_init(void)
2787{
2788        return register_pernet_subsys(&tcp4_net_ops);
2789}
2790
2791void tcp4_proc_exit(void)
2792{
2793        unregister_pernet_subsys(&tcp4_net_ops);
2794}
2795#endif /* CONFIG_PROC_FS */
2796
2797/* @wake is one when sk_stream_write_space() calls us.
2798 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
2799 * This mimics the strategy used in sock_def_write_space().
2800 */
2801bool tcp_stream_memory_free(const struct sock *sk, int wake)
2802{
2803        const struct tcp_sock *tp = tcp_sk(sk);
2804        u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2805                            READ_ONCE(tp->snd_nxt);
2806
2807        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2808}
2809EXPORT_SYMBOL(tcp_stream_memory_free);
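
/*
 * Example (illustrative sketch): tcp_notsent_lowat() above returns either the
 * per-socket TCP_NOTSENT_LOWAT value or the net.ipv4.tcp_notsent_lowat
 * sysctl.  Lowering it makes poll()/epoll report EPOLLOUT only while little
 * unsent data is queued, which bounds write-queue memory per connection:
 */
#if 0	/* user-space illustration, never compiled with this file */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int limit_unsent_bytes(int fd)
{
        int lowat = 128 * 1024; /* allow roughly 128KB of not-yet-sent data */

        /* When called with wake == 1, tcp_stream_memory_free() then signals
         * writable space only once unsent bytes drop below half this value.
         */
        return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
}
#endif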
2810
2811struct proto tcp_prot = {
2812        .name                   = "TCP",
2813        .owner                  = THIS_MODULE,
2814        .close                  = tcp_close,
2815        .pre_connect            = tcp_v4_pre_connect,
2816        .connect                = tcp_v4_connect,
2817        .disconnect             = tcp_disconnect,
2818        .accept                 = inet_csk_accept,
2819        .ioctl                  = tcp_ioctl,
2820        .init                   = tcp_v4_init_sock,
2821        .destroy                = tcp_v4_destroy_sock,
2822        .shutdown               = tcp_shutdown,
2823        .setsockopt             = tcp_setsockopt,
2824        .getsockopt             = tcp_getsockopt,
2825        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
2826        .keepalive              = tcp_set_keepalive,
2827        .recvmsg                = tcp_recvmsg,
2828        .sendmsg                = tcp_sendmsg,
2829        .sendpage               = tcp_sendpage,
2830        .backlog_rcv            = tcp_v4_do_rcv,
2831        .release_cb             = tcp_release_cb,
2832        .hash                   = inet_hash,
2833        .unhash                 = inet_unhash,
2834        .get_port               = inet_csk_get_port,
2835#ifdef CONFIG_BPF_SYSCALL
2836        .psock_update_sk_prot   = tcp_bpf_update_proto,
2837#endif
2838        .enter_memory_pressure  = tcp_enter_memory_pressure,
2839        .leave_memory_pressure  = tcp_leave_memory_pressure,
2840        .stream_memory_free     = tcp_stream_memory_free,
2841        .sockets_allocated      = &tcp_sockets_allocated,
2842        .orphan_count           = &tcp_orphan_count,
2843        .memory_allocated       = &tcp_memory_allocated,
2844        .memory_pressure        = &tcp_memory_pressure,
2845        .sysctl_mem             = sysctl_tcp_mem,
2846        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2847        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2848        .max_header             = MAX_TCP_HEADER,
2849        .obj_size               = sizeof(struct tcp_sock),
2850        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2851        .twsk_prot              = &tcp_timewait_sock_ops,
2852        .rsk_prot               = &tcp_request_sock_ops,
2853        .h.hashinfo             = &tcp_hashinfo,
2854        .no_autobind            = true,
2855        .diag_destroy           = tcp_abort,
2856};
2857EXPORT_SYMBOL(tcp_prot);
2858
2859static void __net_exit tcp_sk_exit(struct net *net)
2860{
2861        int cpu;
2862
2863        if (net->ipv4.tcp_congestion_control)
2864                bpf_module_put(net->ipv4.tcp_congestion_control,
2865                               net->ipv4.tcp_congestion_control->owner);
2866
2867        for_each_possible_cpu(cpu)
2868                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2869        free_percpu(net->ipv4.tcp_sk);
2870}
2871
2872static int __net_init tcp_sk_init(struct net *net)
2873{
2874        int res, cpu, cnt;
2875
2876        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2877        if (!net->ipv4.tcp_sk)
2878                return -ENOMEM;
2879
2880        for_each_possible_cpu(cpu) {
2881                struct sock *sk;
2882
2883                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2884                                           IPPROTO_TCP, net);
2885                if (res)
2886                        goto fail;
2887                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2888
2889                /* Please enforce IP_DF and IPID==0 for RSTs and
2890                 * ACKs sent in the SYN-RECV and TIME-WAIT states.
2891                 */
2892                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2893
2894                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2895        }
2896
2897        net->ipv4.sysctl_tcp_ecn = 2;
2898        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2899
2900        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2901        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2902        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2903        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2904        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2905
2906        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2907        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2908        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2909
2910        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2911        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
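            /* SYN cookies are enabled by default but are only sent once a
             * listener's SYN backlog overflows.
             */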
2912        net->ipv4.sysctl_tcp_syncookies = 1;
2913        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2914        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2915        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2916        net->ipv4.sysctl_tcp_orphan_retries = 0;
2917        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2918        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
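            /* tcp_tw_reuse == 2: reuse TIME-WAIT sockets for new outgoing
             * connections for loopback traffic only.
             */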
2919        net->ipv4.sysctl_tcp_tw_reuse = 2;
2920        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2921
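            /* Scale a few defaults with the size of the established hash
             * table: allow at most half as many TIME-WAIT buckets as ehash
             * buckets, and size the SYN backlog from it with a floor of 128.
             */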
2922        cnt = tcp_hashinfo.ehash_mask + 1;
2923        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2924        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2925
2926        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2927        net->ipv4.sysctl_tcp_sack = 1;
2928        net->ipv4.sysctl_tcp_window_scaling = 1;
2929        net->ipv4.sysctl_tcp_timestamps = 1;
2930        net->ipv4.sysctl_tcp_early_retrans = 3;
2931        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2932        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2933        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2934        net->ipv4.sysctl_tcp_max_reordering = 300;
2935        net->ipv4.sysctl_tcp_dsack = 1;
2936        net->ipv4.sysctl_tcp_app_win = 31;
2937        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2938        net->ipv4.sysctl_tcp_frto = 2;
2939        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2940        /* This limits the percentage of the congestion window which we
2941         * will allow a single TSO frame to consume.  Building TSO frames
2942         * which are too large can cause TCP streams to be bursty.
2943         */
2944        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2945        /* Default TSQ limit of 16 TSO segments */
2946        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2947        /* RFC 5961 challenge ACK rate limiting (ACKs per second) */
2948        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2949        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2950        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2951        net->ipv4.sysctl_tcp_autocorking = 1;
2952        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2953        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2954        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2955        if (net != &init_net) {
2956                memcpy(net->ipv4.sysctl_tcp_rmem,
2957                       init_net.ipv4.sysctl_tcp_rmem,
2958                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2959                memcpy(net->ipv4.sysctl_tcp_wmem,
2960                       init_net.ipv4.sysctl_tcp_wmem,
2961                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2962        }
2963        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2964        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2965        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
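            /* TCP Fast Open defaults to client support only; servers must
             * opt in with the TCP_FASTOPEN socket option or by setting the
             * server bit in net.ipv4.tcp_fastopen.
             */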
2966        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2967        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2968        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2969        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2970
2971        /* Prefer init_net's congestion control for the new namespace when
             * a module/bpf reference can be taken; otherwise fall back to
             * Reno, which is always built in.
             */
2972        if (!net_eq(net, &init_net) &&
2973            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2974                               init_net.ipv4.tcp_congestion_control->owner))
2975                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2976        else
2977                net->ipv4.tcp_congestion_control = &tcp_reno;
2978
2979        return 0;
2980fail:
2981        tcp_sk_exit(net);
2982
2983        return res;
2984}
2985
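    /* Batched exit: flush any remaining TIME-WAIT sockets that belong to
     * the exiting namespaces, then destroy each namespace's TCP Fast Open
     * key context.
     */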
2986static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2987{
2988        struct net *net;
2989
2990        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2991
2992        list_for_each_entry(net, net_exit_list, exit_list)
2993                tcp_fastopen_ctx_destroy(net);
2994}
2995
2996static struct pernet_operations __net_initdata tcp_sk_ops = {
2997        .init       = tcp_sk_init,
2998        .exit       = tcp_sk_exit,
2999        .exit_batch = tcp_sk_exit_batch,
3000};
3001
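    /* BPF iterator for TCP sockets.  This registers a "tcp" bpf_iter
     * target so that BPF programs attached to it are invoked for each
     * TCP socket, reusing the seq_file walk that backs the /proc/net
     * TCP listings.
     */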
3002#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3003DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3004                     struct sock_common *sk_common, uid_t uid)
3005
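    /* The iterator's private afinfo uses AF_UNSPEC so that both IPv4 and
     * IPv6 sockets are visited; it is freed in bpf_iter_fini_tcp() or on
     * failure below.
     */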
3006static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3007{
3008        struct tcp_iter_state *st = priv_data;
3009        struct tcp_seq_afinfo *afinfo;
3010        int ret;
3011
3012        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3013        if (!afinfo)
3014                return -ENOMEM;
3015
3016        afinfo->family = AF_UNSPEC;
3017        st->bpf_seq_afinfo = afinfo;
3018        ret = bpf_iter_init_seq_net(priv_data, aux);
3019        if (ret)
3020                kfree(afinfo);
3021        return ret;
3022}
3023
3024static void bpf_iter_fini_tcp(void *priv_data)
3025{
3026        struct tcp_iter_state *st = priv_data;
3027
3028        kfree(st->bpf_seq_afinfo);
3029        bpf_iter_fini_seq_net(priv_data);
3030}
3031
3032static const struct bpf_iter_seq_info tcp_seq_info = {
3033        .seq_ops                = &bpf_iter_tcp_seq_ops,
3034        .init_seq_private       = bpf_iter_init_tcp,
3035        .fini_seq_private       = bpf_iter_fini_tcp,
3036        .seq_priv_size          = sizeof(struct tcp_iter_state),
3037};
3038
3039static struct bpf_iter_reg tcp_reg_info = {
3040        .target                 = "tcp",
3041        .ctx_arg_info_size      = 1,
3042        .ctx_arg_info           = {
3043                { offsetof(struct bpf_iter__tcp, sk_common),
3044                  PTR_TO_BTF_ID_OR_NULL },
3045        },
3046        .seq_info               = &tcp_seq_info,
3047};
3048
3049static void __init bpf_iter_register(void)
3050{
3051        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3052        if (bpf_iter_reg_target(&tcp_reg_info))
3053                pr_warn("could not register bpf iterator tcp\n");
3054}
3055
3056#endif
3057
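    /* Boot-time initialization, called from inet_init().  Registering the
     * pernet operations runs tcp_sk_init() for every network namespace;
     * failure for the initial namespace is fatal.
     */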
3058void __init tcp_v4_init(void)
3059{
3060        if (register_pernet_subsys(&tcp_sk_ops))
3061                panic("Failed to create the TCP control socket.\n");
3062
3063#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3064        bpf_iter_register();
3065#endif
3066}
3067