linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
  43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   44 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
  45 *                                      a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
  86#ifdef CONFIG_TCP_MD5SIG
  87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89#endif
  90
  91struct inet_hashinfo tcp_hashinfo;
  92EXPORT_SYMBOL(tcp_hashinfo);
  93
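/* Per-cpu control socket used to send RSTs and out-of-context ACKs. */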
  94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  95
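/* Initial sequence number for a passively opened connection, derived from
 * the incoming segment's address/port 4-tuple.
 */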
  96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  97{
  98        return secure_tcp_seq(ip_hdr(skb)->daddr,
  99                              ip_hdr(skb)->saddr,
 100                              tcp_hdr(skb)->dest,
 101                              tcp_hdr(skb)->source);
 102}
 103
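/* Per-connection timestamp offset, derived from the addresses of the
 * incoming segment.
 */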
 104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 105{
 106        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 107}
 108
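/* Decide whether a TIME-WAIT socket can be reused for a new outgoing
 * connection to the same peer.  Returns 1 (and takes a reference on sktw)
 * if the caller may take over the port, 0 otherwise.
 */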
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111        int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 112        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 113        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114        struct tcp_sock *tp = tcp_sk(sk);
 115
 116        if (reuse == 2) {
 117                /* Still does not detect *everything* that goes through
 118                 * lo, since we require a loopback src or dst address
 119                 * or direct binding to 'lo' interface.
 120                 */
 121                bool loopback = false;
 122                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 123                        loopback = true;
 124#if IS_ENABLED(CONFIG_IPV6)
 125                if (tw->tw_family == AF_INET6) {
 126                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 127                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 128                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 129                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 130                                loopback = true;
 131                } else
 132#endif
 133                {
 134                        if (ipv4_is_loopback(tw->tw_daddr) ||
 135                            ipv4_is_loopback(tw->tw_rcv_saddr))
 136                                loopback = true;
 137                }
 138                if (!loopback)
 139                        reuse = 0;
 140        }
 141
 142        /* With PAWS, it is safe from the viewpoint
 143           of data integrity. Even without PAWS it is safe provided sequence
 144           spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 145
 146           Actually, the idea is close to VJ's one, only timestamp cache is
 147           held not per host, but per port pair and TW bucket is used as state
 148           holder.
 149
 150           If TW bucket has been already destroyed we fall back to VJ's scheme
 151           and use initial timestamp retrieved from peer table.
 152         */
 153        if (tcptw->tw_ts_recent_stamp &&
 154            (!twp || (reuse && time_after32(ktime_get_seconds(),
 155                                            tcptw->tw_ts_recent_stamp)))) {
 156                /* In case of repair and re-using TIME-WAIT sockets we still
 157                 * want to be sure that it is safe as above but honor the
 158                 * sequence numbers and time stamps set as part of the repair
 159                 * process.
 160                 *
 161                 * Without this check re-using a TIME-WAIT socket with TCP
 162                 * repair would accumulate a -1 on the repair assigned
 163                 * sequence number. The first time it is reused the sequence
 164                 * is -1, the second time -2, etc. This fixes that issue
 165                 * without appearing to create any others.
 166                 */
 167                if (likely(!tp->repair)) {
 168                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 169
 170                        if (!seq)
 171                                seq = 1;
 172                        WRITE_ONCE(tp->write_seq, seq);
 173                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 174                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 175                }
 176                sock_hold(sktw);
 177                return 1;
 178        }
 179
 180        return 0;
 181}
 182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 183
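/* Run any attached BPF cgroup connect4 programs against the destination
 * address before tcp_v4_connect() proceeds.
 */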
 184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 185                              int addr_len)
 186{
 187        /* This check is replicated from tcp_v4_connect() and intended to
  188         * prevent the BPF program called below from accessing bytes outside
  189         * the bounds specified by the user in addr_len.
 190         */
 191        if (addr_len < sizeof(struct sockaddr_in))
 192                return -EINVAL;
 193
 194        sock_owned_by_me(sk);
 195
 196        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 197}
 198
 199/* This will initiate an outgoing connection. */
 200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 201{
 202        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 203        struct inet_sock *inet = inet_sk(sk);
 204        struct tcp_sock *tp = tcp_sk(sk);
 205        __be16 orig_sport, orig_dport;
 206        __be32 daddr, nexthop;
 207        struct flowi4 *fl4;
 208        struct rtable *rt;
 209        int err;
 210        struct ip_options_rcu *inet_opt;
 211        struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
 212
 213        if (addr_len < sizeof(struct sockaddr_in))
 214                return -EINVAL;
 215
 216        if (usin->sin_family != AF_INET)
 217                return -EAFNOSUPPORT;
 218
 219        nexthop = daddr = usin->sin_addr.s_addr;
 220        inet_opt = rcu_dereference_protected(inet->inet_opt,
 221                                             lockdep_sock_is_held(sk));
 222        if (inet_opt && inet_opt->opt.srr) {
 223                if (!daddr)
 224                        return -EINVAL;
 225                nexthop = inet_opt->opt.faddr;
 226        }
 227
 228        orig_sport = inet->inet_sport;
 229        orig_dport = usin->sin_port;
 230        fl4 = &inet->cork.fl.u.ip4;
 231        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 232                              sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 233                              orig_dport, sk);
 234        if (IS_ERR(rt)) {
 235                err = PTR_ERR(rt);
 236                if (err == -ENETUNREACH)
 237                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 238                return err;
 239        }
 240
 241        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 242                ip_rt_put(rt);
 243                return -ENETUNREACH;
 244        }
 245
 246        if (!inet_opt || !inet_opt->opt.srr)
 247                daddr = fl4->daddr;
 248
 249        if (!inet->inet_saddr)
 250                inet->inet_saddr = fl4->saddr;
 251        sk_rcv_saddr_set(sk, inet->inet_saddr);
 252
 253        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 254                /* Reset inherited state */
 255                tp->rx_opt.ts_recent       = 0;
 256                tp->rx_opt.ts_recent_stamp = 0;
 257                if (likely(!tp->repair))
 258                        WRITE_ONCE(tp->write_seq, 0);
 259        }
 260
 261        inet->inet_dport = usin->sin_port;
 262        sk_daddr_set(sk, daddr);
 263
 264        inet_csk(sk)->icsk_ext_hdr_len = 0;
 265        if (inet_opt)
 266                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 267
 268        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 269
 270        /* Socket identity is still unknown (sport may be zero).
  271         * However, we set the state to SYN-SENT and, without releasing the socket
  272         * lock, select a source port, enter ourselves into the hash tables and
 273         * complete initialization after this.
 274         */
 275        tcp_set_state(sk, TCP_SYN_SENT);
 276        err = inet_hash_connect(tcp_death_row, sk);
 277        if (err)
 278                goto failure;
 279
 280        sk_set_txhash(sk);
 281
 282        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 283                               inet->inet_sport, inet->inet_dport, sk);
 284        if (IS_ERR(rt)) {
 285                err = PTR_ERR(rt);
 286                rt = NULL;
 287                goto failure;
 288        }
 289        /* OK, now commit destination to socket.  */
 290        sk->sk_gso_type = SKB_GSO_TCPV4;
 291        sk_setup_caps(sk, &rt->dst);
 292        rt = NULL;
 293
 294        if (likely(!tp->repair)) {
 295                if (!tp->write_seq)
 296                        WRITE_ONCE(tp->write_seq,
 297                                   secure_tcp_seq(inet->inet_saddr,
 298                                                  inet->inet_daddr,
 299                                                  inet->inet_sport,
 300                                                  usin->sin_port));
 301                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 302                                                 inet->inet_saddr,
 303                                                 inet->inet_daddr);
 304        }
 305
 306        inet->inet_id = prandom_u32();
 307
 308        if (tcp_fastopen_defer_connect(sk, &err))
 309                return err;
 310        if (err)
 311                goto failure;
 312
 313        err = tcp_connect(sk);
 314
 315        if (err)
 316                goto failure;
 317
 318        return 0;
 319
 320failure:
 321        /*
 322         * This unhashes the socket and releases the local port,
 323         * if necessary.
 324         */
 325        tcp_set_state(sk, TCP_CLOSE);
 326        ip_rt_put(rt);
 327        sk->sk_route_caps = 0;
 328        inet->inet_dport = 0;
 329        return err;
 330}
 331EXPORT_SYMBOL(tcp_v4_connect);
 332
 333/*
 334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 335 * It can be called through tcp_release_cb() if socket was owned by user
 336 * at the time tcp_v4_err() was called to handle ICMP message.
 337 */
 338void tcp_v4_mtu_reduced(struct sock *sk)
 339{
 340        struct inet_sock *inet = inet_sk(sk);
 341        struct dst_entry *dst;
 342        u32 mtu;
 343
 344        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 345                return;
 346        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 347        dst = inet_csk_update_pmtu(sk, mtu);
 348        if (!dst)
 349                return;
 350
  351        /* Something is about to go wrong... Remember the soft error
  352         * in case this connection is not able to recover.
 353         */
 354        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 355                sk->sk_err_soft = EMSGSIZE;
 356
 357        mtu = dst_mtu(dst);
 358
 359        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 360            ip_sk_accept_pmtu(sk) &&
 361            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 362                tcp_sync_mss(sk, mtu);
 363
 364                /* Resend the TCP packet because it's
 365                 * clear that the old packet has been
 366                 * dropped. This is the new "fast" path mtu
 367                 * discovery.
 368                 */
 369                tcp_simple_retransmit(sk);
 370        } /* else let the usual retransmit timer handle it */
 371}
 372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 373
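/* Handle an ICMP redirect by letting the cached route update itself. */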
 374static void do_redirect(struct sk_buff *skb, struct sock *sk)
 375{
 376        struct dst_entry *dst = __sk_dst_check(sk, 0);
 377
 378        if (dst)
 379                dst->ops->redirect(dst, sk, skb);
 380}
 381
 382
 383/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 385{
 386        struct request_sock *req = inet_reqsk(sk);
 387        struct net *net = sock_net(sk);
 388
 389        /* ICMPs are not backlogged, hence we cannot get
 390         * an established socket here.
 391         */
 392        if (seq != tcp_rsk(req)->snt_isn) {
 393                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 394        } else if (abort) {
 395                /*
 396                 * Still in SYN_RECV, just remove it silently.
 397                 * There is no good way to pass the error to the newly
 398                 * created socket, and POSIX does not want network
 399                 * errors returned from accept().
 400                 */
 401                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 402                tcp_listendrop(req->rsk_listener);
 403        }
 404        reqsk_put(req);
 405}
 406EXPORT_SYMBOL(tcp_req_err);
 407
 408/* TCP-LD (RFC 6069) logic */
 409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 410{
 411        struct inet_connection_sock *icsk = inet_csk(sk);
 412        struct tcp_sock *tp = tcp_sk(sk);
 413        struct sk_buff *skb;
 414        s32 remaining;
 415        u32 delta_us;
 416
 417        if (sock_owned_by_user(sk))
 418                return;
 419
 420        if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 421            !icsk->icsk_backoff)
 422                return;
 423
 424        skb = tcp_rtx_queue_head(sk);
 425        if (WARN_ON_ONCE(!skb))
 426                return;
 427
 428        icsk->icsk_backoff--;
 429        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 430        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 431
 432        tcp_mstamp_refresh(tp);
 433        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 434        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 435
 436        if (remaining > 0) {
 437                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 438                                          remaining, TCP_RTO_MAX);
 439        } else {
 440                /* RTO revert clocked out retransmission.
 441                 * Will retransmit now.
 442                 */
 443                tcp_retransmit_timer(sk);
 444        }
 445}
 446EXPORT_SYMBOL(tcp_ld_RTO_revert);
 447
 448/*
 449 * This routine is called by the ICMP module when it gets some
 450 * sort of error condition.  If err < 0 then the socket should
 451 * be closed and the error returned to the user.  If err > 0
 452 * it's just the icmp type << 8 | icmp code.  After adjustment
 453 * header points to the first 8 bytes of the tcp header.  We need
 454 * to find the appropriate port.
 455 *
 456 * The locking strategy used here is very "optimistic". When
 457 * someone else accesses the socket the ICMP is just dropped
 458 * and for some paths there is no check at all.
 459 * A more general error queue to queue errors for later handling
 460 * is probably better.
 461 *
 462 */
 463
 464int tcp_v4_err(struct sk_buff *skb, u32 info)
 465{
 466        const struct iphdr *iph = (const struct iphdr *)skb->data;
 467        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 468        struct tcp_sock *tp;
 469        struct inet_sock *inet;
 470        const int type = icmp_hdr(skb)->type;
 471        const int code = icmp_hdr(skb)->code;
 472        struct sock *sk;
 473        struct request_sock *fastopen;
 474        u32 seq, snd_una;
 475        int err;
 476        struct net *net = dev_net(skb->dev);
 477
 478        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 479                                       th->dest, iph->saddr, ntohs(th->source),
 480                                       inet_iif(skb), 0);
 481        if (!sk) {
 482                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 483                return -ENOENT;
 484        }
 485        if (sk->sk_state == TCP_TIME_WAIT) {
 486                inet_twsk_put(inet_twsk(sk));
 487                return 0;
 488        }
 489        seq = ntohl(th->seq);
 490        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 491                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 492                                     type == ICMP_TIME_EXCEEDED ||
 493                                     (type == ICMP_DEST_UNREACH &&
 494                                      (code == ICMP_NET_UNREACH ||
 495                                       code == ICMP_HOST_UNREACH)));
 496                return 0;
 497        }
 498
 499        bh_lock_sock(sk);
 500        /* If too many ICMPs get dropped on busy
 501         * servers this needs to be solved differently.
  502         * We do take care of the PMTU discovery (RFC1191) special case:
  503         * we can receive locally generated ICMP messages while the socket is held.
 504         */
 505        if (sock_owned_by_user(sk)) {
 506                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 507                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 508        }
 509        if (sk->sk_state == TCP_CLOSE)
 510                goto out;
 511
 512        if (static_branch_unlikely(&ip4_min_ttl)) {
 513                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
 514                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
 515                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 516                        goto out;
 517                }
 518        }
 519
 520        tp = tcp_sk(sk);
 521        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 522        fastopen = rcu_dereference(tp->fastopen_rsk);
 523        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 524        if (sk->sk_state != TCP_LISTEN &&
 525            !between(seq, snd_una, tp->snd_nxt)) {
 526                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 527                goto out;
 528        }
 529
 530        switch (type) {
 531        case ICMP_REDIRECT:
 532                if (!sock_owned_by_user(sk))
 533                        do_redirect(skb, sk);
 534                goto out;
 535        case ICMP_SOURCE_QUENCH:
 536                /* Just silently ignore these. */
 537                goto out;
 538        case ICMP_PARAMETERPROB:
 539                err = EPROTO;
 540                break;
 541        case ICMP_DEST_UNREACH:
 542                if (code > NR_ICMP_UNREACH)
 543                        goto out;
 544
 545                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 546                        /* We are not interested in TCP_LISTEN and open_requests
  547                         * (SYN-ACKs sent out by Linux are always < 576 bytes, so
  548                         * they should go through unfragmented).
 549                         */
 550                        if (sk->sk_state == TCP_LISTEN)
 551                                goto out;
 552
 553                        WRITE_ONCE(tp->mtu_info, info);
 554                        if (!sock_owned_by_user(sk)) {
 555                                tcp_v4_mtu_reduced(sk);
 556                        } else {
 557                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 558                                        sock_hold(sk);
 559                        }
 560                        goto out;
 561                }
 562
 563                err = icmp_err_convert[code].errno;
 564                /* check if this ICMP message allows revert of backoff.
 565                 * (see RFC 6069)
 566                 */
 567                if (!fastopen &&
 568                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 569                        tcp_ld_RTO_revert(sk, seq);
 570                break;
 571        case ICMP_TIME_EXCEEDED:
 572                err = EHOSTUNREACH;
 573                break;
 574        default:
 575                goto out;
 576        }
 577
 578        switch (sk->sk_state) {
 579        case TCP_SYN_SENT:
 580        case TCP_SYN_RECV:
 581                /* Only in fast or simultaneous open. If a fast open socket is
 582                 * already accepted it is treated as a connected one below.
 583                 */
 584                if (fastopen && !fastopen->sk)
 585                        break;
 586
 587                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 588
 589                if (!sock_owned_by_user(sk)) {
 590                        sk->sk_err = err;
 591
 592                        sk_error_report(sk);
 593
 594                        tcp_done(sk);
 595                } else {
 596                        sk->sk_err_soft = err;
 597                }
 598                goto out;
 599        }
 600
 601        /* If we've already connected we will keep trying
 602         * until we time out, or the user gives up.
 603         *
  604         * rfc1122 4.2.3.9 allows us to consider as hard errors
  605         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  606         * but it is obsoleted by pmtu discovery).
  607         *
  608         * Note that in the modern internet, where routing is unreliable
  609         * and broken firewalls sit in every dark corner, sending random
  610         * errors ordered by their masters, even these two messages finally lose
  611         * their original sense (even Linux sends invalid PORT_UNREACHs).
 612         *
 613         * Now we are in compliance with RFCs.
 614         *                                                      --ANK (980905)
 615         */
 616
 617        inet = inet_sk(sk);
 618        if (!sock_owned_by_user(sk) && inet->recverr) {
 619                sk->sk_err = err;
 620                sk_error_report(sk);
 621        } else  { /* Only an error on timeout */
 622                sk->sk_err_soft = err;
 623        }
 624
 625out:
 626        bh_unlock_sock(sk);
 627        sock_put(sk);
 628        return 0;
 629}
 630
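/* Set up checksum offload: store the pseudo-header checksum in th->check
 * and record where the device (or software fallback) must complete it.
 */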
 631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 632{
 633        struct tcphdr *th = tcp_hdr(skb);
 634
 635        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 636        skb->csum_start = skb_transport_header(skb) - skb->head;
 637        skb->csum_offset = offsetof(struct tcphdr, check);
 638}
 639
 640/* This routine computes an IPv4 TCP checksum. */
 641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 642{
 643        const struct inet_sock *inet = inet_sk(sk);
 644
 645        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 646}
 647EXPORT_SYMBOL(tcp_v4_send_check);
 648
 649/*
 650 *      This routine will send an RST to the other tcp.
 651 *
  652 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
  653 *                    for the reset?
  654 *      Answer: if a packet caused an RST, it is not for a socket
  655 *              existing in our system; if it is matched to a socket,
  656 *              it is just a duplicate segment or a bug in the other side's TCP.
  657 *              So we build the reply based only on the parameters
  658 *              that arrived with the segment.
 659 *      Exception: precedence violation. We do not implement it in any case.
 660 */
 661
 662#ifdef CONFIG_TCP_MD5SIG
 663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 664#else
 665#define OPTION_BYTES sizeof(__be32)
 666#endif
 667
 668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 669{
 670        const struct tcphdr *th = tcp_hdr(skb);
 671        struct {
 672                struct tcphdr th;
 673                __be32 opt[OPTION_BYTES / sizeof(__be32)];
 674        } rep;
 675        struct ip_reply_arg arg;
 676#ifdef CONFIG_TCP_MD5SIG
 677        struct tcp_md5sig_key *key = NULL;
 678        const __u8 *hash_location = NULL;
 679        unsigned char newhash[16];
 680        int genhash;
 681        struct sock *sk1 = NULL;
 682#endif
 683        u64 transmit_time = 0;
 684        struct sock *ctl_sk;
 685        struct net *net;
 686
 687        /* Never send a reset in response to a reset. */
 688        if (th->rst)
 689                return;
 690
  691        /* If sk is not NULL, it means we did a successful lookup and the incoming
  692         * route had to be correct. prequeue might have dropped our dst.
 693         */
 694        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 695                return;
 696
 697        /* Swap the send and the receive. */
 698        memset(&rep, 0, sizeof(rep));
 699        rep.th.dest   = th->source;
 700        rep.th.source = th->dest;
 701        rep.th.doff   = sizeof(struct tcphdr) / 4;
 702        rep.th.rst    = 1;
 703
 704        if (th->ack) {
 705                rep.th.seq = th->ack_seq;
 706        } else {
 707                rep.th.ack = 1;
 708                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 709                                       skb->len - (th->doff << 2));
 710        }
 711
 712        memset(&arg, 0, sizeof(arg));
 713        arg.iov[0].iov_base = (unsigned char *)&rep;
 714        arg.iov[0].iov_len  = sizeof(rep.th);
 715
 716        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 717#ifdef CONFIG_TCP_MD5SIG
 718        rcu_read_lock();
 719        hash_location = tcp_parse_md5sig_option(th);
 720        if (sk && sk_fullsock(sk)) {
 721                const union tcp_md5_addr *addr;
 722                int l3index;
 723
  724                /* If sdif is set, the packet ingressed via a device
  725                 * in an L3 domain and inet_iif is set to it.
 726                 */
 727                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 728                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 729                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 730        } else if (hash_location) {
 731                const union tcp_md5_addr *addr;
 732                int sdif = tcp_v4_sdif(skb);
 733                int dif = inet_iif(skb);
 734                int l3index;
 735
  736                /*
  737                 * The active side is lost. Try to find the listening socket through
  738                 * the source port, and then find the md5 key through the listening socket.
  739                 * We are not loosening security here:
  740                 * the incoming packet is checked against the md5 hash of the found key;
  741                 * no RST is generated if the md5 hash doesn't match.
  742                 */
 743                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 744                                             ip_hdr(skb)->saddr,
 745                                             th->source, ip_hdr(skb)->daddr,
 746                                             ntohs(th->source), dif, sdif);
 747                /* don't send rst if it can't find key */
 748                if (!sk1)
 749                        goto out;
 750
  751                /* If sdif is set, the packet ingressed via a device
  752                 * in an L3 domain and dif is set to it.
 753                 */
 754                l3index = sdif ? dif : 0;
 755                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 756                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 757                if (!key)
 758                        goto out;
 759
 760
 761                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 762                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 763                        goto out;
 764
 765        }
 766
 767        if (key) {
 768                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 769                                   (TCPOPT_NOP << 16) |
 770                                   (TCPOPT_MD5SIG << 8) |
 771                                   TCPOLEN_MD5SIG);
 772                /* Update length and the length the header thinks exists */
 773                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 774                rep.th.doff = arg.iov[0].iov_len / 4;
 775
 776                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 777                                     key, ip_hdr(skb)->saddr,
 778                                     ip_hdr(skb)->daddr, &rep.th);
 779        }
 780#endif
 781        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 782        if (rep.opt[0] == 0) {
 783                __be32 mrst = mptcp_reset_option(skb);
 784
 785                if (mrst) {
 786                        rep.opt[0] = mrst;
 787                        arg.iov[0].iov_len += sizeof(mrst);
 788                        rep.th.doff = arg.iov[0].iov_len / 4;
 789                }
 790        }
 791
 792        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 793                                      ip_hdr(skb)->saddr, /* XXX */
 794                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 795        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 796        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 797
  798        /* When the socket is gone, all binding information is lost, and
  799         * routing might fail in this case. No choice here: if we choose to force
  800         * the input interface, we will misroute in case of an asymmetric route.
 801         */
 802        if (sk) {
 803                arg.bound_dev_if = sk->sk_bound_dev_if;
 804                if (sk_fullsock(sk))
 805                        trace_tcp_send_reset(sk, skb);
 806        }
 807
 808        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 809                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 810
 811        arg.tos = ip_hdr(skb)->tos;
 812        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 813        local_bh_disable();
 814        ctl_sk = this_cpu_read(ipv4_tcp_sk);
 815        sock_net_set(ctl_sk, net);
 816        if (sk) {
 817                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 818                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 819                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 820                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
 821                transmit_time = tcp_transmit_time(sk);
 822        }
 823        ip_send_unicast_reply(ctl_sk,
 824                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 825                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 826                              &arg, arg.iov[0].iov_len,
 827                              transmit_time);
 828
 829        ctl_sk->sk_mark = 0;
 830        sock_net_set(ctl_sk, &init_net);
 831        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 832        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 833        local_bh_enable();
 834
 835#ifdef CONFIG_TCP_MD5SIG
 836out:
 837        rcu_read_unlock();
 838#endif
 839}
 840
  841/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  842   outside of socket context, is certainly ugly. What can I do?
 843 */
 844
 845static void tcp_v4_send_ack(const struct sock *sk,
 846                            struct sk_buff *skb, u32 seq, u32 ack,
 847                            u32 win, u32 tsval, u32 tsecr, int oif,
 848                            struct tcp_md5sig_key *key,
 849                            int reply_flags, u8 tos)
 850{
 851        const struct tcphdr *th = tcp_hdr(skb);
 852        struct {
 853                struct tcphdr th;
 854                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 855#ifdef CONFIG_TCP_MD5SIG
 856                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 857#endif
 858                        ];
 859        } rep;
 860        struct net *net = sock_net(sk);
 861        struct ip_reply_arg arg;
 862        struct sock *ctl_sk;
 863        u64 transmit_time;
 864
 865        memset(&rep.th, 0, sizeof(struct tcphdr));
 866        memset(&arg, 0, sizeof(arg));
 867
 868        arg.iov[0].iov_base = (unsigned char *)&rep;
 869        arg.iov[0].iov_len  = sizeof(rep.th);
 870        if (tsecr) {
 871                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 872                                   (TCPOPT_TIMESTAMP << 8) |
 873                                   TCPOLEN_TIMESTAMP);
 874                rep.opt[1] = htonl(tsval);
 875                rep.opt[2] = htonl(tsecr);
 876                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 877        }
 878
 879        /* Swap the send and the receive. */
 880        rep.th.dest    = th->source;
 881        rep.th.source  = th->dest;
 882        rep.th.doff    = arg.iov[0].iov_len / 4;
 883        rep.th.seq     = htonl(seq);
 884        rep.th.ack_seq = htonl(ack);
 885        rep.th.ack     = 1;
 886        rep.th.window  = htons(win);
 887
 888#ifdef CONFIG_TCP_MD5SIG
 889        if (key) {
 890                int offset = (tsecr) ? 3 : 0;
 891
 892                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 893                                          (TCPOPT_NOP << 16) |
 894                                          (TCPOPT_MD5SIG << 8) |
 895                                          TCPOLEN_MD5SIG);
 896                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 897                rep.th.doff = arg.iov[0].iov_len/4;
 898
 899                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 900                                    key, ip_hdr(skb)->saddr,
 901                                    ip_hdr(skb)->daddr, &rep.th);
 902        }
 903#endif
 904        arg.flags = reply_flags;
 905        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 906                                      ip_hdr(skb)->saddr, /* XXX */
 907                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 908        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 909        if (oif)
 910                arg.bound_dev_if = oif;
 911        arg.tos = tos;
 912        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 913        local_bh_disable();
 914        ctl_sk = this_cpu_read(ipv4_tcp_sk);
 915        sock_net_set(ctl_sk, net);
 916        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 917                           inet_twsk(sk)->tw_mark : sk->sk_mark;
 918        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 919                           inet_twsk(sk)->tw_priority : sk->sk_priority;
 920        transmit_time = tcp_transmit_time(sk);
 921        ip_send_unicast_reply(ctl_sk,
 922                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 923                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 924                              &arg, arg.iov[0].iov_len,
 925                              transmit_time);
 926
 927        ctl_sk->sk_mark = 0;
 928        sock_net_set(ctl_sk, &init_net);
 929        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 930        local_bh_enable();
 931}
 932
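/* Send an ACK on behalf of a TIME-WAIT socket, then drop its reference. */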
 933static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 934{
 935        struct inet_timewait_sock *tw = inet_twsk(sk);
 936        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 937
 938        tcp_v4_send_ack(sk, skb,
 939                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 940                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 941                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 942                        tcptw->tw_ts_recent,
 943                        tw->tw_bound_dev_if,
 944                        tcp_twsk_md5_key(tcptw),
 945                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 946                        tw->tw_tos
 947                        );
 948
 949        inet_twsk_put(tw);
 950}
 951
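/* Send an ACK on behalf of a request socket (SYN-RECV), using the request's
 * sequence and window state and any MD5 key known for the peer.
 */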
 952static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 953                                  struct request_sock *req)
 954{
 955        const union tcp_md5_addr *addr;
 956        int l3index;
 957
 958        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 959         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 960         */
 961        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 962                                             tcp_sk(sk)->snd_nxt;
 963
 964        /* RFC 7323 2.3
 965         * The window field (SEG.WND) of every outgoing segment, with the
 966         * exception of <SYN> segments, MUST be right-shifted by
 967         * Rcv.Wind.Shift bits:
 968         */
 969        addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 970        l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 971        tcp_v4_send_ack(sk, skb, seq,
 972                        tcp_rsk(req)->rcv_nxt,
 973                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 974                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 975                        req->ts_recent,
 976                        0,
 977                        tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 978                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 979                        ip_hdr(skb)->tos);
 980}
 981
 982/*
 983 *      Send a SYN-ACK after having received a SYN.
 984 *      This still operates on a request_sock only, not on a big
 985 *      socket.
 986 */
 987static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 988                              struct flowi *fl,
 989                              struct request_sock *req,
 990                              struct tcp_fastopen_cookie *foc,
 991                              enum tcp_synack_type synack_type,
 992                              struct sk_buff *syn_skb)
 993{
 994        const struct inet_request_sock *ireq = inet_rsk(req);
 995        struct flowi4 fl4;
 996        int err = -1;
 997        struct sk_buff *skb;
 998        u8 tos;
 999
1000        /* First, grab a route. */
1001        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002                return -1;
1003
1004        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006        if (skb) {
1007                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009                tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1010                                (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011                                (inet_sk(sk)->tos & INET_ECN_MASK) :
1012                                inet_sk(sk)->tos;
1013
1014                if (!INET_ECN_is_capable(tos) &&
1015                    tcp_bpf_ca_needs_ecn((struct sock *)req))
1016                        tos |= INET_ECN_ECT_0;
1017
1018                rcu_read_lock();
1019                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020                                            ireq->ir_rmt_addr,
1021                                            rcu_dereference(ireq->ireq_opt),
1022                                            tos);
1023                rcu_read_unlock();
1024                err = net_xmit_eval(err);
1025        }
1026
1027        return err;
1028}
1029
1030/*
1031 *      IPv4 request_sock destructor.
1032 */
1033static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034{
1035        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036}
1037
1038#ifdef CONFIG_TCP_MD5SIG
1039/*
1040 * RFC2385 MD5 checksumming requires a mapping of
1041 * IP address->MD5 Key.
1042 * We need to maintain these in the sk structure.
1043 */
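/* Illustrative user-space sketch (an assumption for clarity, not part of
 * this file): an application installs a key for a peer with the TCP_MD5SIG
 * socket option, which is handled by tcp_v4_parse_md5_keys() below; "fd"
 * stands for any IPv4 TCP socket.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */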
1044
1045DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046EXPORT_SYMBOL(tcp_md5_needed);
1047
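/* A key bound to an L3 domain is preferred over one that is not; between
 * otherwise equal candidates, the longer prefix wins.
 */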
1048static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049{
1050        if (!old)
1051                return true;
1052
1053        /* l3index always overrides non-l3index */
1054        if (old->l3index && new->l3index == 0)
1055                return false;
1056        if (old->l3index == 0 && new->l3index)
1057                return true;
1058
1059        return old->prefixlen < new->prefixlen;
1060}
1061
1062/* Find the Key structure for an address.  */
1063struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064                                           const union tcp_md5_addr *addr,
1065                                           int family)
1066{
1067        const struct tcp_sock *tp = tcp_sk(sk);
1068        struct tcp_md5sig_key *key;
1069        const struct tcp_md5sig_info *md5sig;
1070        __be32 mask;
1071        struct tcp_md5sig_key *best_match = NULL;
1072        bool match;
1073
1074        /* caller either holds rcu_read_lock() or socket lock */
1075        md5sig = rcu_dereference_check(tp->md5sig_info,
1076                                       lockdep_sock_is_held(sk));
1077        if (!md5sig)
1078                return NULL;
1079
1080        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081                                 lockdep_sock_is_held(sk)) {
1082                if (key->family != family)
1083                        continue;
1084                if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085                        continue;
1086                if (family == AF_INET) {
1087                        mask = inet_make_mask(key->prefixlen);
1088                        match = (key->addr.a4.s_addr & mask) ==
1089                                (addr->a4.s_addr & mask);
1090#if IS_ENABLED(CONFIG_IPV6)
1091                } else if (family == AF_INET6) {
1092                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093                                                  key->prefixlen);
1094#endif
1095                } else {
1096                        match = false;
1097                }
1098
1099                if (match && better_md5_match(best_match, key))
1100                        best_match = key;
1101        }
1102        return best_match;
1103}
1104EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
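/* Exact-match lookup on address, prefix length, ifindex flag and l3index;
 * used by the key add/delete paths below.
 */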
1106static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107                                                      const union tcp_md5_addr *addr,
1108                                                      int family, u8 prefixlen,
1109                                                      int l3index, u8 flags)
1110{
1111        const struct tcp_sock *tp = tcp_sk(sk);
1112        struct tcp_md5sig_key *key;
1113        unsigned int size = sizeof(struct in_addr);
1114        const struct tcp_md5sig_info *md5sig;
1115
1116        /* caller either holds rcu_read_lock() or socket lock */
1117        md5sig = rcu_dereference_check(tp->md5sig_info,
1118                                       lockdep_sock_is_held(sk));
1119        if (!md5sig)
1120                return NULL;
1121#if IS_ENABLED(CONFIG_IPV6)
1122        if (family == AF_INET6)
1123                size = sizeof(struct in6_addr);
1124#endif
1125        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126                                 lockdep_sock_is_held(sk)) {
1127                if (key->family != family)
1128                        continue;
1129                if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130                        continue;
1131                if (key->l3index != l3index)
1132                        continue;
1133                if (!memcmp(&key->addr, addr, size) &&
1134                    key->prefixlen == prefixlen)
1135                        return key;
1136        }
1137        return NULL;
1138}
1139
1140struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141                                         const struct sock *addr_sk)
1142{
1143        const union tcp_md5_addr *addr;
1144        int l3index;
1145
1146        l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147                                                 addr_sk->sk_bound_dev_if);
1148        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149        return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150}
1151EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153/* This can be called on a newly created socket, from other files */
1154int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155                   int family, u8 prefixlen, int l3index, u8 flags,
1156                   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157{
1158        /* Add Key to the list */
1159        struct tcp_md5sig_key *key;
1160        struct tcp_sock *tp = tcp_sk(sk);
1161        struct tcp_md5sig_info *md5sig;
1162
1163        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164        if (key) {
1165                /* Pre-existing entry - just update that one.
1166                 * Note that the key might be used concurrently.
 1167                 * data_race() is telling KCSAN that we do not care about
 1168                 * key mismatches, since changing the MD5 key on live flows
1169                 * can lead to packet drops.
1170                 */
1171                data_race(memcpy(key->key, newkey, newkeylen));
1172
1173                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174                 * Also note that a reader could catch new key->keylen value
1175                 * but old key->key[], this is the reason we use __GFP_ZERO
1176                 * at sock_kmalloc() time below these lines.
1177                 */
1178                WRITE_ONCE(key->keylen, newkeylen);
1179
1180                return 0;
1181        }
1182
1183        md5sig = rcu_dereference_protected(tp->md5sig_info,
1184                                           lockdep_sock_is_held(sk));
1185        if (!md5sig) {
1186                md5sig = kmalloc(sizeof(*md5sig), gfp);
1187                if (!md5sig)
1188                        return -ENOMEM;
1189
1190                sk_gso_disable(sk);
1191                INIT_HLIST_HEAD(&md5sig->head);
1192                rcu_assign_pointer(tp->md5sig_info, md5sig);
1193        }
1194
1195        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196        if (!key)
1197                return -ENOMEM;
1198        if (!tcp_alloc_md5sig_pool()) {
1199                sock_kfree_s(sk, key, sizeof(*key));
1200                return -ENOMEM;
1201        }
1202
1203        memcpy(key->key, newkey, newkeylen);
1204        key->keylen = newkeylen;
1205        key->family = family;
1206        key->prefixlen = prefixlen;
1207        key->l3index = l3index;
1208        key->flags = flags;
1209        memcpy(&key->addr, addr,
1210               (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211                                                                 sizeof(struct in_addr));
1212        hlist_add_head_rcu(&key->node, &md5sig->head);
1213        return 0;
1214}
1215EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218                   u8 prefixlen, int l3index, u8 flags)
1219{
1220        struct tcp_md5sig_key *key;
1221
1222        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223        if (!key)
1224                return -ENOENT;
1225        hlist_del_rcu(&key->node);
1226        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227        kfree_rcu(key, rcu);
1228        return 0;
1229}
1230EXPORT_SYMBOL(tcp_md5_do_del);
1231
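/* Unlink and free every MD5 key attached to the socket. */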
1232static void tcp_clear_md5_list(struct sock *sk)
1233{
1234        struct tcp_sock *tp = tcp_sk(sk);
1235        struct tcp_md5sig_key *key;
1236        struct hlist_node *n;
1237        struct tcp_md5sig_info *md5sig;
1238
1239        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242                hlist_del_rcu(&key->node);
1243                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244                kfree_rcu(key, rcu);
1245        }
1246}
1247
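/* Handle the TCP_MD5SIG and TCP_MD5SIG_EXT socket options for IPv4 sockets. */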
1248static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249                                 sockptr_t optval, int optlen)
1250{
1251        struct tcp_md5sig cmd;
1252        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253        const union tcp_md5_addr *addr;
1254        u8 prefixlen = 32;
1255        int l3index = 0;
1256        u8 flags;
1257
1258        if (optlen < sizeof(cmd))
1259                return -EINVAL;
1260
1261        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262                return -EFAULT;
1263
1264        if (sin->sin_family != AF_INET)
1265                return -EINVAL;
1266
1267        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269        if (optname == TCP_MD5SIG_EXT &&
1270            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271                prefixlen = cmd.tcpm_prefixlen;
1272                if (prefixlen > 32)
1273                        return -EINVAL;
1274        }
1275
1276        if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277            cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278                struct net_device *dev;
1279
1280                rcu_read_lock();
1281                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282                if (dev && netif_is_l3_master(dev))
1283                        l3index = dev->ifindex;
1284
1285                rcu_read_unlock();
1286
1287                /* ok to reference set/not set outside of rcu;
1288                 * right now device MUST be an L3 master
1289                 */
1290                if (!dev || !l3index)
1291                        return -EINVAL;
1292        }
1293
1294        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296        if (!cmd.tcpm_keylen)
1297                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300                return -EINVAL;
1301
1302        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303                              cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304}
1305
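/* Feed the IPv4 pseudo-header and the TCP header (with its checksum field
 * zeroed) into the MD5 hash request.
 */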
1306static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307                                   __be32 daddr, __be32 saddr,
1308                                   const struct tcphdr *th, int nbytes)
1309{
1310        struct tcp4_pseudohdr *bp;
1311        struct scatterlist sg;
1312        struct tcphdr *_th;
1313
1314        bp = hp->scratch;
1315        bp->saddr = saddr;
1316        bp->daddr = daddr;
1317        bp->pad = 0;
1318        bp->protocol = IPPROTO_TCP;
1319        bp->len = cpu_to_be16(nbytes);
1320
1321        _th = (struct tcphdr *)(bp + 1);
1322        memcpy(_th, th, sizeof(*th));
1323        _th->check = 0;
1324
1325        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327                                sizeof(*bp) + sizeof(*th));
1328        return crypto_ahash_update(hp->md5_req);
1329}
1330
1331static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333{
1334        struct tcp_md5sig_pool *hp;
1335        struct ahash_request *req;
1336
1337        hp = tcp_get_md5sig_pool();
1338        if (!hp)
1339                goto clear_hash_noput;
1340        req = hp->md5_req;
1341
1342        if (crypto_ahash_init(req))
1343                goto clear_hash;
1344        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345                goto clear_hash;
1346        if (tcp_md5_hash_key(hp, key))
1347                goto clear_hash;
1348        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349        if (crypto_ahash_final(req))
1350                goto clear_hash;
1351
1352        tcp_put_md5sig_pool();
1353        return 0;
1354
1355clear_hash:
1356        tcp_put_md5sig_pool();
1357clear_hash_noput:
1358        memset(md5_hash, 0, 16);
1359        return 1;
1360}
1361
1362int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363                        const struct sock *sk,
1364                        const struct sk_buff *skb)
1365{
1366        struct tcp_md5sig_pool *hp;
1367        struct ahash_request *req;
1368        const struct tcphdr *th = tcp_hdr(skb);
1369        __be32 saddr, daddr;
1370
1371        if (sk) { /* valid for establish/request sockets */
1372                saddr = sk->sk_rcv_saddr;
1373                daddr = sk->sk_daddr;
1374        } else {
1375                const struct iphdr *iph = ip_hdr(skb);
1376                saddr = iph->saddr;
1377                daddr = iph->daddr;
1378        }
1379
1380        hp = tcp_get_md5sig_pool();
1381        if (!hp)
1382                goto clear_hash_noput;
1383        req = hp->md5_req;
1384
1385        if (crypto_ahash_init(req))
1386                goto clear_hash;
1387
1388        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389                goto clear_hash;
1390        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391                goto clear_hash;
1392        if (tcp_md5_hash_key(hp, key))
1393                goto clear_hash;
1394        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395        if (crypto_ahash_final(req))
1396                goto clear_hash;
1397
1398        tcp_put_md5sig_pool();
1399        return 0;
1400
1401clear_hash:
1402        tcp_put_md5sig_pool();
1403clear_hash_noput:
1404        memset(md5_hash, 0, 16);
1405        return 1;
1406}
1407EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
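/*
 * Note: the digest computed by the helpers above follows RFC 2385: it covers,
 * in order, the TCP pseudo-header (saddr, daddr, zero pad, IPPROTO_TCP,
 * segment length), the base TCP header excluding options with its checksum
 * field zeroed, the TCP payload, and finally the connection key itself.
 */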
1408
1409#endif
1410
1411static void tcp_v4_init_req(struct request_sock *req,
1412                            const struct sock *sk_listener,
1413                            struct sk_buff *skb)
1414{
1415        struct inet_request_sock *ireq = inet_rsk(req);
1416        struct net *net = sock_net(sk_listener);
1417
1418        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421}
1422
1423static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424                                          struct sk_buff *skb,
1425                                          struct flowi *fl,
1426                                          struct request_sock *req)
1427{
1428        tcp_v4_init_req(req, sk, skb);
1429
1430        if (security_inet_conn_request(sk, skb, req))
1431                return NULL;
1432
1433        return inet_csk_route_req(sk, &fl->u.ip4, req);
1434}
1435
1436struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437        .family         =       PF_INET,
1438        .obj_size       =       sizeof(struct tcp_request_sock),
1439        .rtx_syn_ack    =       tcp_rtx_synack,
1440        .send_ack       =       tcp_v4_reqsk_send_ack,
1441        .destructor     =       tcp_v4_reqsk_destructor,
1442        .send_reset     =       tcp_v4_send_reset,
1443        .syn_ack_timeout =      tcp_syn_ack_timeout,
1444};
1445
1446const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447        .mss_clamp      =       TCP_MSS_DEFAULT,
1448#ifdef CONFIG_TCP_MD5SIG
1449        .req_md5_lookup =       tcp_v4_md5_lookup,
1450        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1451#endif
1452#ifdef CONFIG_SYN_COOKIES
1453        .cookie_init_seq =      cookie_v4_init_sequence,
1454#endif
1455        .route_req      =       tcp_v4_route_req,
1456        .init_seq       =       tcp_v4_init_seq,
1457        .init_ts_off    =       tcp_v4_init_ts_off,
1458        .send_synack    =       tcp_v4_send_synack,
1459};
1460
1461int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462{
1463        /* Never answer SYNs sent to broadcast or multicast addresses */
1464        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465                goto drop;
1466
1467        return tcp_conn_request(&tcp_request_sock_ops,
1468                                &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470drop:
1471        tcp_listendrop(sk);
1472        return 0;
1473}
1474EXPORT_SYMBOL(tcp_v4_conn_request);
1475
1476
1477/*
1478 * The three-way handshake has completed - we got a valid ACK -
1479 * now create the new socket.
1480 */
1481struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482                                  struct request_sock *req,
1483                                  struct dst_entry *dst,
1484                                  struct request_sock *req_unhash,
1485                                  bool *own_req)
1486{
1487        struct inet_request_sock *ireq;
1488        bool found_dup_sk = false;
1489        struct inet_sock *newinet;
1490        struct tcp_sock *newtp;
1491        struct sock *newsk;
1492#ifdef CONFIG_TCP_MD5SIG
1493        const union tcp_md5_addr *addr;
1494        struct tcp_md5sig_key *key;
1495        int l3index;
1496#endif
1497        struct ip_options_rcu *inet_opt;
1498
1499        if (sk_acceptq_is_full(sk))
1500                goto exit_overflow;
1501
1502        newsk = tcp_create_openreq_child(sk, req, skb);
1503        if (!newsk)
1504                goto exit_nonewsk;
1505
1506        newsk->sk_gso_type = SKB_GSO_TCPV4;
1507        inet_sk_rx_dst_set(newsk, skb);
1508
1509        newtp                 = tcp_sk(newsk);
1510        newinet               = inet_sk(newsk);
1511        ireq                  = inet_rsk(req);
1512        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514        newsk->sk_bound_dev_if = ireq->ir_iif;
1515        newinet->inet_saddr   = ireq->ir_loc_addr;
1516        inet_opt              = rcu_dereference(ireq->ireq_opt);
1517        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518        newinet->mc_index     = inet_iif(skb);
1519        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1520        newinet->rcv_tos      = ip_hdr(skb)->tos;
1521        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522        if (inet_opt)
1523                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524        newinet->inet_id = prandom_u32();
1525
1526        /* Set ToS of the new socket based upon the value of incoming SYN.
1527         * ECT bits are set later in tcp_init_transfer().
1528         */
1529        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1530                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532        if (!dst) {
1533                dst = inet_csk_route_child_sock(sk, newsk, req);
1534                if (!dst)
1535                        goto put_and_exit;
1536        } else {
1537                /* syncookie case: see end of cookie_v4_check() */
1538        }
1539        sk_setup_caps(newsk, dst);
1540
1541        tcp_ca_openreq_child(newsk, dst);
1542
1543        tcp_sync_mss(newsk, dst_mtu(dst));
1544        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546        tcp_initialize_rcv_mss(newsk);
1547
1548#ifdef CONFIG_TCP_MD5SIG
1549        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550        /* Copy over the MD5 key from the original socket */
1551        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553        if (key) {
1554                /*
1555                 * We're using one, so create a matching key
1556                 * on the newsk structure. If we fail to get
1557                 * memory, then we end up not copying the key
1558                 * across. Shucks.
1559                 */
1560                tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561                               key->key, key->keylen, GFP_ATOMIC);
1562                sk_gso_disable(newsk);
1563        }
1564#endif
1565
1566        if (__inet_inherit_port(sk, newsk) < 0)
1567                goto put_and_exit;
1568        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569                                       &found_dup_sk);
1570        if (likely(*own_req)) {
1571                tcp_move_syn(newtp, req);
1572                ireq->ireq_opt = NULL;
1573        } else {
1574                newinet->inet_opt = NULL;
1575
1576                if (!req_unhash && found_dup_sk) {
1577                        /* This code path should only be executed in the
1578                         * syncookie case
1579                         */
1580                        bh_unlock_sock(newsk);
1581                        sock_put(newsk);
1582                        newsk = NULL;
1583                }
1584        }
1585        return newsk;
1586
1587exit_overflow:
1588        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589exit_nonewsk:
1590        dst_release(dst);
1591exit:
1592        tcp_listendrop(sk);
1593        return NULL;
1594put_and_exit:
1595        newinet->inet_opt = NULL;
1596        inet_csk_prepare_forced_close(newsk);
1597        tcp_done(newsk);
1598        goto exit;
1599}
1600EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
1602static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603{
1604#ifdef CONFIG_SYN_COOKIES
1605        const struct tcphdr *th = tcp_hdr(skb);
1606
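        /* Cookies can only be validated against the bare ACK that completes
         * the handshake: if syncookies were used, the SYN was answered
         * statelessly and there is no request sock to match the ACK against.
         */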
1607        if (!th->syn)
1608                sk = cookie_v4_check(sk, skb);
1609#endif
1610        return sk;
1611}
1612
1613u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614                         struct tcphdr *th, u32 *cookie)
1615{
1616        u16 mss = 0;
1617#ifdef CONFIG_SYN_COOKIES
1618        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619                                    &tcp_request_sock_ipv4_ops, sk, th);
1620        if (mss) {
1621                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622                tcp_synq_overflow(sk);
1623        }
1624#endif
1625        return mss;
1626}
1627
1628INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629                                                           u32));
1630/* The socket must have its spinlock held when we get
1631 * here, unless it is a TCP_LISTEN socket.
1632 *
1633 * We have a potential double-lock case here, so even when
1634 * doing backlog processing we use the BH locking scheme.
1635 * This is because we cannot sleep with the original spinlock
1636 * held.
1637 */
1638int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639{
1640        enum skb_drop_reason reason;
1641        struct sock *rsk;
1642
1643        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644                struct dst_entry *dst;
1645
1646                dst = rcu_dereference_protected(sk->sk_rx_dst,
1647                                                lockdep_sock_is_held(sk));
1648
1649                sock_rps_save_rxhash(sk, skb);
1650                sk_mark_napi_id(sk, skb);
1651                if (dst) {
1652                        if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654                                             dst, 0)) {
1655                                RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656                                dst_release(dst);
1657                        }
1658                }
1659                tcp_rcv_established(sk, skb);
1660                return 0;
1661        }
1662
1663        reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664        if (tcp_checksum_complete(skb))
1665                goto csum_err;
1666
1667        if (sk->sk_state == TCP_LISTEN) {
1668                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670                if (!nsk)
1671                        goto discard;
1672                if (nsk != sk) {
1673                        if (tcp_child_process(sk, nsk, skb)) {
1674                                rsk = nsk;
1675                                goto reset;
1676                        }
1677                        return 0;
1678                }
1679        } else
1680                sock_rps_save_rxhash(sk, skb);
1681
1682        if (tcp_rcv_state_process(sk, skb)) {
1683                rsk = sk;
1684                goto reset;
1685        }
1686        return 0;
1687
1688reset:
1689        tcp_v4_send_reset(rsk, skb);
1690discard:
1691        kfree_skb_reason(skb, reason);
1692        /* Be careful here. If this function gets more complicated and
1693         * gcc suffers from register pressure on the x86, sk (in %ebx)
1694         * might be destroyed here. This current version compiles correctly,
1695         * but you have been warned.
1696         */
1697        return 0;
1698
1699csum_err:
1700        reason = SKB_DROP_REASON_TCP_CSUM;
1701        trace_tcp_bad_csum(skb);
1702        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704        goto discard;
1705}
1706EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
1708int tcp_v4_early_demux(struct sk_buff *skb)
1709{
1710        const struct iphdr *iph;
1711        const struct tcphdr *th;
1712        struct sock *sk;
1713
1714        if (skb->pkt_type != PACKET_HOST)
1715                return 0;
1716
1717        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718                return 0;
1719
1720        iph = ip_hdr(skb);
1721        th = tcp_hdr(skb);
1722
1723        if (th->doff < sizeof(struct tcphdr) / 4)
1724                return 0;
1725
1726        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727                                       iph->saddr, th->source,
1728                                       iph->daddr, ntohs(th->dest),
1729                                       skb->skb_iif, inet_sdif(skb));
1730        if (sk) {
1731                skb->sk = sk;
1732                skb->destructor = sock_edemux;
1733                if (sk_fullsock(sk)) {
1734                        struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736                        if (dst)
1737                                dst = dst_check(dst, 0);
1738                        if (dst &&
1739                            sk->sk_rx_dst_ifindex == skb->skb_iif)
1740                                skb_dst_set_noref(skb, dst);
1741                }
1742        }
1743        return 0;
1744}
1745
1746bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747                     enum skb_drop_reason *reason)
1748{
1749        u32 limit, tail_gso_size, tail_gso_segs;
1750        struct skb_shared_info *shinfo;
1751        const struct tcphdr *th;
1752        struct tcphdr *thtail;
1753        struct sk_buff *tail;
1754        unsigned int hdrlen;
1755        bool fragstolen;
1756        u32 gso_segs;
1757        u32 gso_size;
1758        int delta;
1759
1760        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761         * we can fix skb->truesize to its real value to avoid future drops.
1762         * This is valid because skb is not yet charged to the socket.
1763         * It has been noticed that pure SACK packets were sometimes dropped
1764         * (if cooked by drivers without the copybreak feature).
1765         */
1766        skb_condense(skb);
1767
1768        skb_dst_drop(skb);
1769
1770        if (unlikely(tcp_checksum_complete(skb))) {
1771                bh_unlock_sock(sk);
1772                trace_tcp_bad_csum(skb);
1773                *reason = SKB_DROP_REASON_TCP_CSUM;
1774                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776                return true;
1777        }
1778
1779        /* Attempt coalescing to last skb in backlog, even if we are
1780         * above the limits.
1781         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782         */
1783        th = (const struct tcphdr *)skb->data;
1784        hdrlen = th->doff * 4;
1785
1786        tail = sk->sk_backlog.tail;
1787        if (!tail)
1788                goto no_coalesce;
1789        thtail = (struct tcphdr *)tail->data;
1790
1791        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793            ((TCP_SKB_CB(tail)->tcp_flags |
1794              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795            !((TCP_SKB_CB(tail)->tcp_flags &
1796              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797            ((TCP_SKB_CB(tail)->tcp_flags ^
1798              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799#ifdef CONFIG_TLS_DEVICE
1800            tail->decrypted != skb->decrypted ||
1801#endif
1802            thtail->doff != th->doff ||
1803            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804                goto no_coalesce;
1805
1806        __skb_pull(skb, hdrlen);
1807
1808        shinfo = skb_shinfo(skb);
1809        gso_size = shinfo->gso_size ?: skb->len;
1810        gso_segs = shinfo->gso_segs ?: 1;
1811
1812        shinfo = skb_shinfo(tail);
1813        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814        tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821                        thtail->window = th->window;
1822                }
1823
1824                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825                 * thtail->fin, so that the fast path in tcp_rcv_established()
1826                 * is not entered if we append a packet with a FIN.
1827                 * SYN, RST, URG are not present.
1828                 * ACK is set on both packets.
1829                 * PSH : we do not really care in TCP stack,
1830                 *       at least for 'GRO' packets.
1831                 */
1832                thtail->fin |= th->fin;
1833                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1837                        tail->tstamp = skb->tstamp;
1838                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839                }
1840
1841                /* Not as strict as GRO. We only need to carry mss max value */
1842                shinfo->gso_size = max(gso_size, tail_gso_size);
1843                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845                sk->sk_backlog.len += delta;
1846                __NET_INC_STATS(sock_net(sk),
1847                                LINUX_MIB_TCPBACKLOGCOALESCE);
1848                kfree_skb_partial(skb, fragstolen);
1849                return false;
1850        }
1851        __skb_push(skb, hdrlen);
1852
1853no_coalesce:
1854        /* Only the socket owner can try to collapse/prune rx queues
1855         * to reduce memory overhead, so add a little headroom here.
1856         * Only a few socket backlogs are likely to be non-empty at any time.
1857         */
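        /* E.g. with sk_rcvbuf = 131072 and sk_sndbuf = 16384 (typical initial
         * values), the backlog may grow to roughly 208 KB before segments
         * start being dropped.
         */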
1858        limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861                bh_unlock_sock(sk);
1862                *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864                return true;
1865        }
1866        return false;
1867}
1868EXPORT_SYMBOL(tcp_add_backlog);
1869
1870int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871{
1872        struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875}
1876EXPORT_SYMBOL(tcp_filter);
1877
1878static void tcp_v4_restore_cb(struct sk_buff *skb)
1879{
1880        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881                sizeof(struct inet_skb_parm));
1882}
1883
1884static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885                           const struct tcphdr *th)
1886{
1887        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1888         * barrier() makes sure the compiler won't play fool^Waliasing games.
1889         */
1890        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891                sizeof(struct inet_skb_parm));
1892        barrier();
1893
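        /* end_seq accounts for the payload plus one unit of sequence space
         * for the SYN flag and one for the FIN flag, since each of them
         * consumes a sequence number of its own.
         */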
1894        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896                                    skb->len - th->doff * 4);
1897        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901        TCP_SKB_CB(skb)->sacked  = 0;
1902        TCP_SKB_CB(skb)->has_rxtstamp =
1903                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904}
1905
1906/*
1907 *      From tcp_input.c
1908 */
1909
1910int tcp_v4_rcv(struct sk_buff *skb)
1911{
1912        struct net *net = dev_net(skb->dev);
1913        enum skb_drop_reason drop_reason;
1914        int sdif = inet_sdif(skb);
1915        int dif = inet_iif(skb);
1916        const struct iphdr *iph;
1917        const struct tcphdr *th;
1918        bool refcounted;
1919        struct sock *sk;
1920        int ret;
1921
1922        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923        if (skb->pkt_type != PACKET_HOST)
1924                goto discard_it;
1925
1926        /* Count it even if it's bad */
1927        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930                goto discard_it;
1931
1932        th = (const struct tcphdr *)skb->data;
1933
1934        if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936                goto bad_packet;
1937        }
1938        if (!pskb_may_pull(skb, th->doff * 4))
1939                goto discard_it;
1940
1941        /* An explanation is required here, I think.
1942         * Packet length and doff are validated by header prediction,
1943         * provided the case of th->doff == 0 is eliminated.
1944         * So, we defer the checks. */
1945
1946        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947                goto csum_error;
1948
1949        th = (const struct tcphdr *)skb->data;
1950        iph = ip_hdr(skb);
1951lookup:
1952        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953                               th->dest, sdif, &refcounted);
1954        if (!sk)
1955                goto no_tcp_socket;
1956
1957process:
1958        if (sk->sk_state == TCP_TIME_WAIT)
1959                goto do_time_wait;
1960
1961        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962                struct request_sock *req = inet_reqsk(sk);
1963                bool req_stolen = false;
1964                struct sock *nsk;
1965
1966                sk = req->rsk_listener;
1967                if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1968                        drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1969                else
1970                        drop_reason = tcp_inbound_md5_hash(sk, skb,
1971                                                   &iph->saddr, &iph->daddr,
1972                                                   AF_INET, dif, sdif);
1973                if (unlikely(drop_reason)) {
1974                        sk_drops_add(sk, skb);
1975                        reqsk_put(req);
1976                        goto discard_it;
1977                }
1978                if (tcp_checksum_complete(skb)) {
1979                        reqsk_put(req);
1980                        goto csum_error;
1981                }
1982                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1983                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1984                        if (!nsk) {
1985                                inet_csk_reqsk_queue_drop_and_put(sk, req);
1986                                goto lookup;
1987                        }
1988                        sk = nsk;
1989                        /* reuseport_migrate_sock() has already held one sk_refcnt
1990                         * before returning.
1991                         */
1992                } else {
1993                        /* We own a reference on the listener, increase it again
1994                         * as we might lose it too soon.
1995                         */
1996                        sock_hold(sk);
1997                }
1998                refcounted = true;
1999                nsk = NULL;
2000                if (!tcp_filter(sk, skb)) {
2001                        th = (const struct tcphdr *)skb->data;
2002                        iph = ip_hdr(skb);
2003                        tcp_v4_fill_cb(skb, iph, th);
2004                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2005                } else {
2006                        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2007                }
2008                if (!nsk) {
2009                        reqsk_put(req);
2010                        if (req_stolen) {
2011                                /* Another cpu got exclusive access to req
2012                                 * and created a full blown socket.
2013                                 * Try to feed this packet to this socket
2014                                 * instead of discarding it.
2015                                 */
2016                                tcp_v4_restore_cb(skb);
2017                                sock_put(sk);
2018                                goto lookup;
2019                        }
2020                        goto discard_and_relse;
2021                }
2022                nf_reset_ct(skb);
2023                if (nsk == sk) {
2024                        reqsk_put(req);
2025                        tcp_v4_restore_cb(skb);
2026                } else if (tcp_child_process(sk, nsk, skb)) {
2027                        tcp_v4_send_reset(nsk, skb);
2028                        goto discard_and_relse;
2029                } else {
2030                        sock_put(sk);
2031                        return 0;
2032                }
2033        }
2034
2035        if (static_branch_unlikely(&ip4_min_ttl)) {
2036                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2037                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2038                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2039                        goto discard_and_relse;
2040                }
2041        }
2042
2043        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2044                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2045                goto discard_and_relse;
2046        }
2047
2048        drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2049                                           &iph->daddr, AF_INET, dif, sdif);
2050        if (drop_reason)
2051                goto discard_and_relse;
2052
2053        nf_reset_ct(skb);
2054
2055        if (tcp_filter(sk, skb)) {
2056                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2057                goto discard_and_relse;
2058        }
2059        th = (const struct tcphdr *)skb->data;
2060        iph = ip_hdr(skb);
2061        tcp_v4_fill_cb(skb, iph, th);
2062
2063        skb->dev = NULL;
2064
2065        if (sk->sk_state == TCP_LISTEN) {
2066                ret = tcp_v4_do_rcv(sk, skb);
2067                goto put_and_return;
2068        }
2069
2070        sk_incoming_cpu_update(sk);
2071
2072        bh_lock_sock_nested(sk);
2073        tcp_segs_in(tcp_sk(sk), skb);
2074        ret = 0;
2075        if (!sock_owned_by_user(sk)) {
2076                ret = tcp_v4_do_rcv(sk, skb);
2077        } else {
2078                if (tcp_add_backlog(sk, skb, &drop_reason))
2079                        goto discard_and_relse;
2080        }
2081        bh_unlock_sock(sk);
2082
2083put_and_return:
2084        if (refcounted)
2085                sock_put(sk);
2086
2087        return ret;
2088
2089no_tcp_socket:
2090        drop_reason = SKB_DROP_REASON_NO_SOCKET;
2091        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092                goto discard_it;
2093
2094        tcp_v4_fill_cb(skb, iph, th);
2095
2096        if (tcp_checksum_complete(skb)) {
2097csum_error:
2098                drop_reason = SKB_DROP_REASON_TCP_CSUM;
2099                trace_tcp_bad_csum(skb);
2100                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101bad_packet:
2102                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2103        } else {
2104                tcp_v4_send_reset(NULL, skb);
2105        }
2106
2107discard_it:
2108        SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2109        /* Discard frame. */
2110        kfree_skb_reason(skb, drop_reason);
2111        return 0;
2112
2113discard_and_relse:
2114        sk_drops_add(sk, skb);
2115        if (refcounted)
2116                sock_put(sk);
2117        goto discard_it;
2118
2119do_time_wait:
2120        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2122                inet_twsk_put(inet_twsk(sk));
2123                goto discard_it;
2124        }
2125
2126        tcp_v4_fill_cb(skb, iph, th);
2127
2128        if (tcp_checksum_complete(skb)) {
2129                inet_twsk_put(inet_twsk(sk));
2130                goto csum_error;
2131        }
2132        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2133        case TCP_TW_SYN: {
2134                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2135                                                        &tcp_hashinfo, skb,
2136                                                        __tcp_hdrlen(th),
2137                                                        iph->saddr, th->source,
2138                                                        iph->daddr, th->dest,
2139                                                        inet_iif(skb),
2140                                                        sdif);
2141                if (sk2) {
2142                        inet_twsk_deschedule_put(inet_twsk(sk));
2143                        sk = sk2;
2144                        tcp_v4_restore_cb(skb);
2145                        refcounted = false;
2146                        goto process;
2147                }
2148        }
2149                /* to ACK */
2150                fallthrough;
2151        case TCP_TW_ACK:
2152                tcp_v4_timewait_ack(sk, skb);
2153                break;
2154        case TCP_TW_RST:
2155                tcp_v4_send_reset(sk, skb);
2156                inet_twsk_deschedule_put(inet_twsk(sk));
2157                goto discard_it;
2158        case TCP_TW_SUCCESS:;
2159        }
2160        goto discard_it;
2161}
2162
2163static struct timewait_sock_ops tcp_timewait_sock_ops = {
2164        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2165        .twsk_unique    = tcp_twsk_unique,
2166        .twsk_destructor= tcp_twsk_destructor,
2167};
2168
2169void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2170{
2171        struct dst_entry *dst = skb_dst(skb);
2172
2173        if (dst && dst_hold_safe(dst)) {
2174                rcu_assign_pointer(sk->sk_rx_dst, dst);
2175                sk->sk_rx_dst_ifindex = skb->skb_iif;
2176        }
2177}
2178EXPORT_SYMBOL(inet_sk_rx_dst_set);
2179
2180const struct inet_connection_sock_af_ops ipv4_specific = {
2181        .queue_xmit        = ip_queue_xmit,
2182        .send_check        = tcp_v4_send_check,
2183        .rebuild_header    = inet_sk_rebuild_header,
2184        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2185        .conn_request      = tcp_v4_conn_request,
2186        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2187        .net_header_len    = sizeof(struct iphdr),
2188        .setsockopt        = ip_setsockopt,
2189        .getsockopt        = ip_getsockopt,
2190        .addr2sockaddr     = inet_csk_addr2sockaddr,
2191        .sockaddr_len      = sizeof(struct sockaddr_in),
2192        .mtu_reduced       = tcp_v4_mtu_reduced,
2193};
2194EXPORT_SYMBOL(ipv4_specific);
2195
2196#ifdef CONFIG_TCP_MD5SIG
2197static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2198        .md5_lookup             = tcp_v4_md5_lookup,
2199        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2200        .md5_parse              = tcp_v4_parse_md5_keys,
2201};
2202#endif
2203
2204/* NOTE: A lot of things are set to zero explicitly by the call to
2205 *       sk_alloc(), so they need not be done here.
2206 */
2207static int tcp_v4_init_sock(struct sock *sk)
2208{
2209        struct inet_connection_sock *icsk = inet_csk(sk);
2210
2211        tcp_init_sock(sk);
2212
2213        icsk->icsk_af_ops = &ipv4_specific;
2214
2215#ifdef CONFIG_TCP_MD5SIG
2216        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2217#endif
2218
2219        return 0;
2220}
2221
2222void tcp_v4_destroy_sock(struct sock *sk)
2223{
2224        struct tcp_sock *tp = tcp_sk(sk);
2225
2226        trace_tcp_destroy_sock(sk);
2227
2228        tcp_clear_xmit_timers(sk);
2229
2230        tcp_cleanup_congestion_control(sk);
2231
2232        tcp_cleanup_ulp(sk);
2233
2234        /* Clean up the write buffer. */
2235        tcp_write_queue_purge(sk);
2236
2237        /* Check if we want to disable active TFO */
2238        tcp_fastopen_active_disable_ofo_check(sk);
2239
2240        /* Cleans up our, hopefully empty, out_of_order_queue. */
2241        skb_rbtree_purge(&tp->out_of_order_queue);
2242
2243#ifdef CONFIG_TCP_MD5SIG
2244        /* Clean up the MD5 key list, if any */
2245        if (tp->md5sig_info) {
2246                tcp_clear_md5_list(sk);
2247                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2248                tp->md5sig_info = NULL;
2249        }
2250#endif
2251
2252        /* Clean up a referenced TCP bind bucket. */
2253        if (inet_csk(sk)->icsk_bind_hash)
2254                inet_put_port(sk);
2255
2256        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2257
2258        /* If socket is aborted during connect operation */
2259        tcp_free_fastopen_req(tp);
2260        tcp_fastopen_destroy_cipher(sk);
2261        tcp_saved_syn_free(tp);
2262
2263        sk_sockets_allocated_dec(sk);
2264}
2265EXPORT_SYMBOL(tcp_v4_destroy_sock);
2266
2267#ifdef CONFIG_PROC_FS
2268/* Proc filesystem TCP sock list dumping. */
2269
2270static unsigned short seq_file_family(const struct seq_file *seq);
2271
2272static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2273{
2274        unsigned short family = seq_file_family(seq);
2275
2276        /* AF_UNSPEC is used as a match all */
2277        return ((family == AF_UNSPEC || family == sk->sk_family) &&
2278                net_eq(sock_net(sk), seq_file_net(seq)));
2279}
2280
2281/* Find a non-empty bucket (starting from st->bucket)
2282 * and return the first sk from it.
2283 */
2284static void *listening_get_first(struct seq_file *seq)
2285{
2286        struct tcp_iter_state *st = seq->private;
2287
2288        st->offset = 0;
2289        for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2290                struct inet_listen_hashbucket *ilb2;
2291                struct hlist_nulls_node *node;
2292                struct sock *sk;
2293
2294                ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2295                if (hlist_nulls_empty(&ilb2->nulls_head))
2296                        continue;
2297
2298                spin_lock(&ilb2->lock);
2299                sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2300                        if (seq_sk_match(seq, sk))
2301                                return sk;
2302                }
2303                spin_unlock(&ilb2->lock);
2304        }
2305
2306        return NULL;
2307}
2308
2309/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2310 * If "cur" is the last one in st->bucket,
2311 * call listening_get_first() to return the first sk of the next
2312 * non-empty bucket.
2313 */
2314static void *listening_get_next(struct seq_file *seq, void *cur)
2315{
2316        struct tcp_iter_state *st = seq->private;
2317        struct inet_listen_hashbucket *ilb2;
2318        struct hlist_nulls_node *node;
2319        struct sock *sk = cur;
2320
2321        ++st->num;
2322        ++st->offset;
2323
2324        sk = sk_nulls_next(sk);
2325        sk_nulls_for_each_from(sk, node) {
2326                if (seq_sk_match(seq, sk))
2327                        return sk;
2328        }
2329
2330        ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2331        spin_unlock(&ilb2->lock);
2332        ++st->bucket;
2333        return listening_get_first(seq);
2334}
2335
2336static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337{
2338        struct tcp_iter_state *st = seq->private;
2339        void *rc;
2340
2341        st->bucket = 0;
2342        st->offset = 0;
2343        rc = listening_get_first(seq);
2344
2345        while (rc && *pos) {
2346                rc = listening_get_next(seq, rc);
2347                --*pos;
2348        }
2349        return rc;
2350}
2351
2352static inline bool empty_bucket(const struct tcp_iter_state *st)
2353{
2354        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355}
2356
2357/*
2358 * Get first established socket starting from bucket given in st->bucket.
2359 * If st->bucket is zero, the very first socket in the hash is returned.
2360 */
2361static void *established_get_first(struct seq_file *seq)
2362{
2363        struct tcp_iter_state *st = seq->private;
2364
2365        st->offset = 0;
2366        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367                struct sock *sk;
2368                struct hlist_nulls_node *node;
2369                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370
2371                /* Lockless fast path for the common case of empty buckets */
2372                if (empty_bucket(st))
2373                        continue;
2374
2375                spin_lock_bh(lock);
2376                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2377                        if (seq_sk_match(seq, sk))
2378                                return sk;
2379                }
2380                spin_unlock_bh(lock);
2381        }
2382
2383        return NULL;
2384}
2385
2386static void *established_get_next(struct seq_file *seq, void *cur)
2387{
2388        struct sock *sk = cur;
2389        struct hlist_nulls_node *node;
2390        struct tcp_iter_state *st = seq->private;
2391
2392        ++st->num;
2393        ++st->offset;
2394
2395        sk = sk_nulls_next(sk);
2396
2397        sk_nulls_for_each_from(sk, node) {
2398                if (seq_sk_match(seq, sk))
2399                        return sk;
2400        }
2401
2402        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403        ++st->bucket;
2404        return established_get_first(seq);
2405}
2406
2407static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408{
2409        struct tcp_iter_state *st = seq->private;
2410        void *rc;
2411
2412        st->bucket = 0;
2413        rc = established_get_first(seq);
2414
2415        while (rc && pos) {
2416                rc = established_get_next(seq, rc);
2417                --pos;
2418        }
2419        return rc;
2420}
2421
2422static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2423{
2424        void *rc;
2425        struct tcp_iter_state *st = seq->private;
2426
2427        st->state = TCP_SEQ_STATE_LISTENING;
2428        rc        = listening_get_idx(seq, &pos);
2429
2430        if (!rc) {
2431                st->state = TCP_SEQ_STATE_ESTABLISHED;
2432                rc        = established_get_idx(seq, pos);
2433        }
2434
2435        return rc;
2436}
2437
2438static void *tcp_seek_last_pos(struct seq_file *seq)
2439{
2440        struct tcp_iter_state *st = seq->private;
2441        int bucket = st->bucket;
2442        int offset = st->offset;
2443        int orig_num = st->num;
2444        void *rc = NULL;
2445
2446        switch (st->state) {
2447        case TCP_SEQ_STATE_LISTENING:
2448                if (st->bucket > tcp_hashinfo.lhash2_mask)
2449                        break;
2450                st->state = TCP_SEQ_STATE_LISTENING;
2451                rc = listening_get_first(seq);
2452                while (offset-- && rc && bucket == st->bucket)
2453                        rc = listening_get_next(seq, rc);
2454                if (rc)
2455                        break;
2456                st->bucket = 0;
2457                st->state = TCP_SEQ_STATE_ESTABLISHED;
2458                fallthrough;
2459        case TCP_SEQ_STATE_ESTABLISHED:
2460                if (st->bucket > tcp_hashinfo.ehash_mask)
2461                        break;
2462                rc = established_get_first(seq);
2463                while (offset-- && rc && bucket == st->bucket)
2464                        rc = established_get_next(seq, rc);
2465        }
2466
2467        st->num = orig_num;
2468
2469        return rc;
2470}
2471
2472void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473{
2474        struct tcp_iter_state *st = seq->private;
2475        void *rc;
2476
2477        if (*pos && *pos == st->last_pos) {
2478                rc = tcp_seek_last_pos(seq);
2479                if (rc)
2480                        goto out;
2481        }
2482
2483        st->state = TCP_SEQ_STATE_LISTENING;
2484        st->num = 0;
2485        st->bucket = 0;
2486        st->offset = 0;
2487        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2488
2489out:
2490        st->last_pos = *pos;
2491        return rc;
2492}
2493EXPORT_SYMBOL(tcp_seq_start);
2494
2495void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496{
2497        struct tcp_iter_state *st = seq->private;
2498        void *rc = NULL;
2499
2500        if (v == SEQ_START_TOKEN) {
2501                rc = tcp_get_idx(seq, 0);
2502                goto out;
2503        }
2504
2505        switch (st->state) {
2506        case TCP_SEQ_STATE_LISTENING:
2507                rc = listening_get_next(seq, v);
2508                if (!rc) {
2509                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2510                        st->bucket = 0;
2511                        st->offset = 0;
2512                        rc        = established_get_first(seq);
2513                }
2514                break;
2515        case TCP_SEQ_STATE_ESTABLISHED:
2516                rc = established_get_next(seq, v);
2517                break;
2518        }
2519out:
2520        ++*pos;
2521        st->last_pos = *pos;
2522        return rc;
2523}
2524EXPORT_SYMBOL(tcp_seq_next);
2525
2526void tcp_seq_stop(struct seq_file *seq, void *v)
2527{
2528        struct tcp_iter_state *st = seq->private;
2529
2530        switch (st->state) {
2531        case TCP_SEQ_STATE_LISTENING:
2532                if (v != SEQ_START_TOKEN)
2533                        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534                break;
2535        case TCP_SEQ_STATE_ESTABLISHED:
2536                if (v)
2537                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2538                break;
2539        }
2540}
2541EXPORT_SYMBOL(tcp_seq_stop);
2542
2543static void get_openreq4(const struct request_sock *req,
2544                         struct seq_file *f, int i)
2545{
2546        const struct inet_request_sock *ireq = inet_rsk(req);
2547        long delta = req->rsk_timer.expires - jiffies;
2548
2549        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2550                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2551                i,
2552                ireq->ir_loc_addr,
2553                ireq->ir_num,
2554                ireq->ir_rmt_addr,
2555                ntohs(ireq->ir_rmt_port),
2556                TCP_SYN_RECV,
2557                0, 0, /* could print option size, but that is af dependent. */
2558                1,    /* timers active (only the expire timer) */
2559                jiffies_delta_to_clock_t(delta),
2560                req->num_timeout,
2561                from_kuid_munged(seq_user_ns(f),
2562                                 sock_i_uid(req->rsk_listener)),
2563                0,  /* non standard timer */
2564                0, /* open_requests have no inode */
2565                0,
2566                req);
2567}
2568
2569static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2570{
2571        int timer_active;
2572        unsigned long timer_expires;
2573        const struct tcp_sock *tp = tcp_sk(sk);
2574        const struct inet_connection_sock *icsk = inet_csk(sk);
2575        const struct inet_sock *inet = inet_sk(sk);
2576        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2577        __be32 dest = inet->inet_daddr;
2578        __be32 src = inet->inet_rcv_saddr;
2579        __u16 destp = ntohs(inet->inet_dport);
2580        __u16 srcp = ntohs(inet->inet_sport);
2581        int rx_queue;
2582        int state;
2583
2584        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2585            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2586            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587                timer_active    = 1;
2588                timer_expires   = icsk->icsk_timeout;
2589        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590                timer_active    = 4;
2591                timer_expires   = icsk->icsk_timeout;
2592        } else if (timer_pending(&sk->sk_timer)) {
2593                timer_active    = 2;
2594                timer_expires   = sk->sk_timer.expires;
2595        } else {
2596                timer_active    = 0;
2597                timer_expires = jiffies;
2598        }
2599
2600        state = inet_sk_state_load(sk);
2601        if (state == TCP_LISTEN)
2602                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603        else
2604                /* Because we don't lock the socket,
2605                 * we might find a transient negative value.
2606                 */
2607                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2608                                      READ_ONCE(tp->copied_seq), 0);
2609
2610        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2611                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2612                i, src, srcp, dest, destp, state,
2613                READ_ONCE(tp->write_seq) - tp->snd_una,
2614                rx_queue,
2615                timer_active,
2616                jiffies_delta_to_clock_t(timer_expires - jiffies),
2617                icsk->icsk_retransmits,
2618                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2619                icsk->icsk_probes_out,
2620                sock_i_ino(sk),
2621                refcount_read(&sk->sk_refcnt), sk,
2622                jiffies_to_clock_t(icsk->icsk_rto),
2623                jiffies_to_clock_t(icsk->icsk_ack.ato),
2624                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625                tcp_snd_cwnd(tp),
2626                state == TCP_LISTEN ?
2627                    fastopenq->max_qlen :
2628                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2629}
2630
2631static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2632                               struct seq_file *f, int i)
2633{
2634        long delta = tw->tw_timer.expires - jiffies;
2635        __be32 dest, src;
2636        __u16 destp, srcp;
2637
2638        dest  = tw->tw_daddr;
2639        src   = tw->tw_rcv_saddr;
2640        destp = ntohs(tw->tw_dport);
2641        srcp  = ntohs(tw->tw_sport);
2642
2643        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2644                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2645                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2646                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2647                refcount_read(&tw->tw_refcnt), tw);
2648}
2649
2650#define TMPSZ 150
2651
2652static int tcp4_seq_show(struct seq_file *seq, void *v)
2653{
2654        struct tcp_iter_state *st;
2655        struct sock *sk = v;
2656
2657        seq_setwidth(seq, TMPSZ - 1);
2658        if (v == SEQ_START_TOKEN) {
2659                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2660                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2661                           "inode");
2662                goto out;
2663        }
2664        st = seq->private;
2665
2666        if (sk->sk_state == TCP_TIME_WAIT)
2667                get_timewait4_sock(v, seq, st->num);
2668        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2669                get_openreq4(v, seq, st->num);
2670        else
2671                get_tcp4_sock(v, seq, st->num);
2672out:
2673        seq_pad(seq, '\n');
2674        return 0;
2675}
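/*
 * For illustration, a line emitted above looks roughly like (made-up values,
 * as seen on a little-endian machine where 127.0.0.1 prints as 0100007F):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 ...
 *
 * i.e. a LISTEN (0x0A) socket bound to 127.0.0.1:22, owned by uid 1000, with
 * empty tx/rx queues and no timer pending.
 */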
2676
2677#ifdef CONFIG_BPF_SYSCALL
2678struct bpf_tcp_iter_state {
2679        struct tcp_iter_state state;
2680        unsigned int cur_sk;
2681        unsigned int end_sk;
2682        unsigned int max_sk;
2683        struct sock **batch;
2684        bool st_bucket_done;
2685};
2686
2687struct bpf_iter__tcp {
2688        __bpf_md_ptr(struct bpf_iter_meta *, meta);
2689        __bpf_md_ptr(struct sock_common *, sk_common);
2690        uid_t uid __aligned(8);
2691};
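/*
 * A minimal sketch of a BPF iterator program consuming this context (it lives
 * in a separate BPF object, not here; names loosely follow the bpf selftests
 * and are illustrative):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */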
2692
2693static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694                             struct sock_common *sk_common, uid_t uid)
2695{
2696        struct bpf_iter__tcp ctx;
2697
2698        meta->seq_num--;  /* skip SEQ_START_TOKEN */
2699        ctx.meta = meta;
2700        ctx.sk_common = sk_common;
2701        ctx.uid = uid;
2702        return bpf_iter_run_prog(prog, &ctx);
2703}
2704
2705static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706{
2707        while (iter->cur_sk < iter->end_sk)
2708                sock_put(iter->batch[iter->cur_sk++]);
2709}
2710
2711static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2712                                      unsigned int new_batch_sz)
2713{
2714        struct sock **new_batch;
2715
2716        new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2717                             GFP_USER | __GFP_NOWARN);
2718        if (!new_batch)
2719                return -ENOMEM;
2720
2721        bpf_iter_tcp_put_batch(iter);
2722        kvfree(iter->batch);
2723        iter->batch = new_batch;
2724        iter->max_sk = new_batch_sz;
2725
2726        return 0;
2727}
2728
2729static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2730                                                 struct sock *start_sk)
2731{
2732        struct bpf_tcp_iter_state *iter = seq->private;
2733        struct tcp_iter_state *st = &iter->state;
2734        struct hlist_nulls_node *node;
2735        unsigned int expected = 1;
2736        struct sock *sk;
2737
2738        sock_hold(start_sk);
2739        iter->batch[iter->end_sk++] = start_sk;
2740
2741        sk = sk_nulls_next(start_sk);
2742        sk_nulls_for_each_from(sk, node) {
2743                if (seq_sk_match(seq, sk)) {
2744                        if (iter->end_sk < iter->max_sk) {
2745                                sock_hold(sk);
2746                                iter->batch[iter->end_sk++] = sk;
2747                        }
2748                        expected++;
2749                }
2750        }
2751        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2752
2753        return expected;
2754}
2755
2756static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757                                                   struct sock *start_sk)
2758{
2759        struct bpf_tcp_iter_state *iter = seq->private;
2760        struct tcp_iter_state *st = &iter->state;
2761        struct hlist_nulls_node *node;
2762        unsigned int expected = 1;
2763        struct sock *sk;
2764
2765        sock_hold(start_sk);
2766        iter->batch[iter->end_sk++] = start_sk;
2767
2768        sk = sk_nulls_next(start_sk);
2769        sk_nulls_for_each_from(sk, node) {
2770                if (seq_sk_match(seq, sk)) {
2771                        if (iter->end_sk < iter->max_sk) {
2772                                sock_hold(sk);
2773                                iter->batch[iter->end_sk++] = sk;
2774                        }
2775                        expected++;
2776                }
2777        }
2778        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2779
2780        return expected;
2781}
2782
2783static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2784{
2785        struct bpf_tcp_iter_state *iter = seq->private;
2786        struct tcp_iter_state *st = &iter->state;
2787        unsigned int expected;
2788        bool resized = false;
2789        struct sock *sk;
2790
2791        /* The st->bucket is done.  Directly advance to the next
2792         * bucket instead of having tcp_seek_last_pos() skip
2793         * through the current bucket one by one and eventually find out
2794         * it has to advance to the next bucket.
2795         */
2796        if (iter->st_bucket_done) {
2797                st->offset = 0;
2798                st->bucket++;
2799                if (st->state == TCP_SEQ_STATE_LISTENING &&
2800                    st->bucket > tcp_hashinfo.lhash2_mask) {
2801                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2802                        st->bucket = 0;
2803                }
2804        }
2805
2806again:
2807        /* Get a new batch */
2808        iter->cur_sk = 0;
2809        iter->end_sk = 0;
2810        iter->st_bucket_done = false;
2811
2812        sk = tcp_seek_last_pos(seq);
2813        if (!sk)
2814                return NULL; /* Done */
2815
2816        if (st->state == TCP_SEQ_STATE_LISTENING)
2817                expected = bpf_iter_tcp_listening_batch(seq, sk);
2818        else
2819                expected = bpf_iter_tcp_established_batch(seq, sk);
2820
2821        if (iter->end_sk == expected) {
2822                iter->st_bucket_done = true;
2823                return sk;
2824        }
2825
2826        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2827                resized = true;
2828                goto again;
2829        }
2830
2831        return sk;
2832}
2833
2834static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2835{
2836        /* bpf iter does not support lseek, so it always
2837         * continues from where it was stop()-ped.
2838         */
2839        if (*pos)
2840                return bpf_iter_tcp_batch(seq);
2841
2842        return SEQ_START_TOKEN;
2843}
2844
2845static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846{
2847        struct bpf_tcp_iter_state *iter = seq->private;
2848        struct tcp_iter_state *st = &iter->state;
2849        struct sock *sk;
2850
2851        /* Whenever seq_next() is called, the iter->cur_sk is
2852         * done with seq_show(), so advance to the next sk in
2853         * the batch.
2854         */
2855        if (iter->cur_sk < iter->end_sk) {
2856                /* Keeping st->num consistent in tcp_iter_state.
2857                 * bpf_iter_tcp does not use st->num.
2858                 * meta.seq_num is used instead.
2859                 */
2860                st->num++;
2861                /* Move st->offset to the next sk in the bucket such that
2862                 * the future start() will resume at st->offset in
2863                 * st->bucket.  See tcp_seek_last_pos().
2864                 */
2865                st->offset++;
2866                sock_put(iter->batch[iter->cur_sk++]);
2867        }
2868
2869        if (iter->cur_sk < iter->end_sk)
2870                sk = iter->batch[iter->cur_sk];
2871        else
2872                sk = bpf_iter_tcp_batch(seq);
2873
2874        ++*pos;
2875        /* Keeping st->last_pos consistent in tcp_iter_state.
2876         * bpf iter does not do lseek, so st->last_pos always equals *pos.
2877         */
2878        st->last_pos = *pos;
2879        return sk;
2880}
2881
2882static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2883{
2884        struct bpf_iter_meta meta;
2885        struct bpf_prog *prog;
2886        struct sock *sk = v;
2887        bool slow;
2888        uid_t uid;
2889        int ret;
2890
2891        if (v == SEQ_START_TOKEN)
2892                return 0;
2893
2894        if (sk_fullsock(sk))
2895                slow = lock_sock_fast(sk);
2896
2897        if (unlikely(sk_unhashed(sk))) {
2898                ret = SEQ_SKIP;
2899                goto unlock;
2900        }
2901
2902        if (sk->sk_state == TCP_TIME_WAIT) {
2903                uid = 0;
2904        } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905                const struct request_sock *req = v;
2906
2907                uid = from_kuid_munged(seq_user_ns(seq),
2908                                       sock_i_uid(req->rsk_listener));
2909        } else {
2910                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2911        }
2912
2913        meta.seq = seq;
2914        prog = bpf_iter_get_info(&meta, false);
2915        ret = tcp_prog_seq_show(prog, &meta, v, uid);
2916
2917unlock:
2918        if (sk_fullsock(sk))
2919                unlock_sock_fast(sk, slow);
2920        return ret;
2922}
2923
2924static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2925{
2926        struct bpf_tcp_iter_state *iter = seq->private;
2927        struct bpf_iter_meta meta;
2928        struct bpf_prog *prog;
2929
2930        if (!v) {
2931                meta.seq = seq;
2932                prog = bpf_iter_get_info(&meta, true);
2933                if (prog)
2934                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
2935        }
2936
2937        if (iter->cur_sk < iter->end_sk) {
2938                bpf_iter_tcp_put_batch(iter);
2939                iter->st_bucket_done = false;
2940        }
2941}
2942
2943static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944        .show           = bpf_iter_tcp_seq_show,
2945        .start          = bpf_iter_tcp_seq_start,
2946        .next           = bpf_iter_tcp_seq_next,
2947        .stop           = bpf_iter_tcp_seq_stop,
2948};
2949#endif
2950static unsigned short seq_file_family(const struct seq_file *seq)
2951{
2952        const struct tcp_seq_afinfo *afinfo;
2953
2954#ifdef CONFIG_BPF_SYSCALL
2955        /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2956        if (seq->op == &bpf_iter_tcp_seq_ops)
2957                return AF_UNSPEC;
2958#endif
2959
2960        /* Iterated from proc fs */
2961        afinfo = pde_data(file_inode(seq->file));
2962        return afinfo->family;
2963}
2964
2965static const struct seq_operations tcp4_seq_ops = {
2966        .show           = tcp4_seq_show,
2967        .start          = tcp_seq_start,
2968        .next           = tcp_seq_next,
2969        .stop           = tcp_seq_stop,
2970};
2971
2972static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2973        .family         = AF_INET,
2974};
2975
2976static int __net_init tcp4_proc_init_net(struct net *net)
2977{
2978        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2980                return -ENOMEM;
2981        return 0;
2982}
2983
2984static void __net_exit tcp4_proc_exit_net(struct net *net)
2985{
2986        remove_proc_entry("tcp", net->proc_net);
2987}
2988
2989static struct pernet_operations tcp4_net_ops = {
2990        .init = tcp4_proc_init_net,
2991        .exit = tcp4_proc_exit_net,
2992};
2993
2994int __init tcp4_proc_init(void)
2995{
2996        return register_pernet_subsys(&tcp4_net_ops);
2997}
2998
2999void tcp4_proc_exit(void)
3000{
3001        unregister_pernet_subsys(&tcp4_net_ops);
3002}
3003#endif /* CONFIG_PROC_FS */
3004
3005/* @wake is one when sk_stream_write_space() calls us.
3006 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3007 * This mimics the strategy used in sock_def_write_space().
3008 */
3009bool tcp_stream_memory_free(const struct sock *sk, int wake)
3010{
3011        const struct tcp_sock *tp = tcp_sk(sk);
3012        u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013                            READ_ONCE(tp->snd_nxt);
3014
3015        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3016}
3017EXPORT_SYMBOL(tcp_stream_memory_free);
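    /* Worked example (illustrative numbers, not from this file): with
     * tcp_notsent_lowat(tp) == 128 KB and wake == 1, the test becomes
     * (notsent_bytes << 1) < 128 KB, so sk_stream_write_space() signals
     * EPOLLOUT only once fewer than 64 KB remain unsent.
     */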
3018
3019struct proto tcp_prot = {
3020        .name                   = "TCP",
3021        .owner                  = THIS_MODULE,
3022        .close                  = tcp_close,
3023        .pre_connect            = tcp_v4_pre_connect,
3024        .connect                = tcp_v4_connect,
3025        .disconnect             = tcp_disconnect,
3026        .accept                 = inet_csk_accept,
3027        .ioctl                  = tcp_ioctl,
3028        .init                   = tcp_v4_init_sock,
3029        .destroy                = tcp_v4_destroy_sock,
3030        .shutdown               = tcp_shutdown,
3031        .setsockopt             = tcp_setsockopt,
3032        .getsockopt             = tcp_getsockopt,
3033        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3034        .keepalive              = tcp_set_keepalive,
3035        .recvmsg                = tcp_recvmsg,
3036        .sendmsg                = tcp_sendmsg,
3037        .sendpage               = tcp_sendpage,
3038        .backlog_rcv            = tcp_v4_do_rcv,
3039        .release_cb             = tcp_release_cb,
3040        .hash                   = inet_hash,
3041        .unhash                 = inet_unhash,
3042        .get_port               = inet_csk_get_port,
3043        .put_port               = inet_put_port,
3044#ifdef CONFIG_BPF_SYSCALL
3045        .psock_update_sk_prot   = tcp_bpf_update_proto,
3046#endif
3047        .enter_memory_pressure  = tcp_enter_memory_pressure,
3048        .leave_memory_pressure  = tcp_leave_memory_pressure,
3049        .stream_memory_free     = tcp_stream_memory_free,
3050        .sockets_allocated      = &tcp_sockets_allocated,
3051        .orphan_count           = &tcp_orphan_count,
3052        .memory_allocated       = &tcp_memory_allocated,
3053        .memory_pressure        = &tcp_memory_pressure,
3054        .sysctl_mem             = sysctl_tcp_mem,
3055        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057        .max_header             = MAX_TCP_HEADER,
3058        .obj_size               = sizeof(struct tcp_sock),
3059        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3060        .twsk_prot              = &tcp_timewait_sock_ops,
3061        .rsk_prot               = &tcp_request_sock_ops,
3062        .h.hashinfo             = &tcp_hashinfo,
3063        .no_autobind            = true,
3064        .diag_destroy           = tcp_abort,
3065};
3066EXPORT_SYMBOL(tcp_prot);
3067
3068static void __net_exit tcp_sk_exit(struct net *net)
3069{
3070        struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3071
3072        if (net->ipv4.tcp_congestion_control)
3073                bpf_module_put(net->ipv4.tcp_congestion_control,
3074                               net->ipv4.tcp_congestion_control->owner);
3075        if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076                kfree(tcp_death_row);
3077}
3078
3079static int __net_init tcp_sk_init(struct net *net)
3080{
3081        int cnt;
3082
3083        net->ipv4.sysctl_tcp_ecn = 2;
3084        net->ipv4.sysctl_tcp_ecn_fallback = 1;
3085
3086        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3091
3092        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3095
3096        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098        net->ipv4.sysctl_tcp_syncookies = 1;
3099        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102        net->ipv4.sysctl_tcp_orphan_retries = 0;
3103        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
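            /* 2: allow TIME-WAIT socket reuse for loopback traffic only
             * (see tcp_tw_reuse in Documentation/networking/ip-sysctl.rst).
             */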
3105        net->ipv4.sysctl_tcp_tw_reuse = 2;
3106        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3107
3108        net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109        if (!net->ipv4.tcp_death_row)
3110                return -ENOMEM;
3111        refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112        cnt = tcp_hashinfo.ehash_mask + 1;
3113        net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114        net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3115
3116        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117        net->ipv4.sysctl_tcp_sack = 1;
3118        net->ipv4.sysctl_tcp_window_scaling = 1;
3119        net->ipv4.sysctl_tcp_timestamps = 1;
3120        net->ipv4.sysctl_tcp_early_retrans = 3;
3121        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC 2861 behavior. */
3123        net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124        net->ipv4.sysctl_tcp_max_reordering = 300;
3125        net->ipv4.sysctl_tcp_dsack = 1;
3126        net->ipv4.sysctl_tcp_app_win = 31;
3127        net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128        net->ipv4.sysctl_tcp_frto = 2;
3129        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130        /* This limits the percentage of the congestion window which we
3131         * will allow a single TSO frame to consume.  Building TSO frames
3132         * which are too large can cause TCP streams to be bursty.
3133         */
3134        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
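            /* e.g. with the default divisor of 3, a single TSO frame may
             * consume roughly one third of the congestion window
             * (the divisor is applied in tcp_tso_should_defer()).
             */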
3135        /* Default TSQ limit of 16 TSO segments: 16 * 64 KB = 1 MB */
3136        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137        /* RFC 5961 challenge ACK rate limiting */
3138        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139        net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140        net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3141        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142        net->ipv4.sysctl_tcp_autocorking = 1;
3143        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3144        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146        if (net != &init_net) {
3147                memcpy(net->ipv4.sysctl_tcp_rmem,
3148                       init_net.ipv4.sysctl_tcp_rmem,
3149                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150                memcpy(net->ipv4.sysctl_tcp_wmem,
3151                       init_net.ipv4.sysctl_tcp_wmem,
3152                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3153        }
3154        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
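            /* Enable TCP Fast Open for outgoing (client side) connections
             * only by default.
             */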
3157        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3160
3161        /* Reno is always built in */
3162        if (!net_eq(net, &init_net) &&
3163            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164                               init_net.ipv4.tcp_congestion_control->owner))
3165                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3166        else
3167                net->ipv4.tcp_congestion_control = &tcp_reno;
3168
3169        return 0;
3170}
3171
3172static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3173{
3174        struct net *net;
3175
3176        inet_twsk_purge(&tcp_hashinfo, AF_INET);
3177
3178        list_for_each_entry(net, net_exit_list, exit_list)
3179                tcp_fastopen_ctx_destroy(net);
3180}
3181
3182static struct pernet_operations __net_initdata tcp_sk_ops = {
3183       .init       = tcp_sk_init,
3184       .exit       = tcp_sk_exit,
3185       .exit_batch = tcp_sk_exit_batch,
3186};
3187
3188#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3189DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3190                     struct sock_common *sk_common, uid_t uid)
3191
3192#define INIT_BATCH_SZ 16
3193
3194static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3195{
3196        struct bpf_tcp_iter_state *iter = priv_data;
3197        int err;
3198
3199        err = bpf_iter_init_seq_net(priv_data, aux);
3200        if (err)
3201                return err;
3202
3203        err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3204        if (err) {
3205                bpf_iter_fini_seq_net(priv_data);
3206                return err;
3207        }
3208
3209        return 0;
3210}
3211
3212static void bpf_iter_fini_tcp(void *priv_data)
3213{
3214        struct bpf_tcp_iter_state *iter = priv_data;
3215
3216        bpf_iter_fini_seq_net(priv_data);
3217        kvfree(iter->batch);
3218}
3219
3220static const struct bpf_iter_seq_info tcp_seq_info = {
3221        .seq_ops                = &bpf_iter_tcp_seq_ops,
3222        .init_seq_private       = bpf_iter_init_tcp,
3223        .fini_seq_private       = bpf_iter_fini_tcp,
3224        .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3225};
3226
3227static const struct bpf_func_proto *
3228bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3229                            const struct bpf_prog *prog)
3230{
3231        switch (func_id) {
3232        case BPF_FUNC_setsockopt:
3233                return &bpf_sk_setsockopt_proto;
3234        case BPF_FUNC_getsockopt:
3235                return &bpf_sk_getsockopt_proto;
3236        default:
3237                return NULL;
3238        }
3239}
3240
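    /* Illustrative BPF-side sketch (assumed example program, not part of
     * the kernel tree) of why setsockopt()/getsockopt() are exposed to
     * iter/tcp programs, e.g. to retune sockets while walking them:
     *
     *   SEC("iter/tcp")
     *   int change_cc(struct bpf_iter__tcp *ctx)
     *   {
     *           struct sock_common *sk_common = ctx->sk_common;
     *           struct tcp_sock *tp;
     *
     *           if (!sk_common)
     *                   return 0;
     *           tp = bpf_skc_to_tcp_sock(sk_common);
     *           if (tp)
     *                   bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION,
     *                                  "cubic", sizeof("cubic"));
     *           return 0;
     *   }
     */
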
3241static struct bpf_iter_reg tcp_reg_info = {
3242        .target                 = "tcp",
3243        .ctx_arg_info_size      = 1,
3244        .ctx_arg_info           = {
3245                { offsetof(struct bpf_iter__tcp, sk_common),
3246                  PTR_TO_BTF_ID_OR_NULL },
3247        },
3248        .get_func_proto         = bpf_iter_tcp_get_func_proto,
3249        .seq_info               = &tcp_seq_info,
3250};
3251
3252static void __init bpf_iter_register(void)
3253{
3254        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3255        if (bpf_iter_reg_target(&tcp_reg_info))
3256                pr_warn("Warning: could not register bpf iterator tcp\n");
3257}
3258
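    /* Illustrative user-space usage (assumed tooling and paths, not part
     * of this file): once the "tcp" target is registered, an iter/tcp
     * program can be pinned and read like a seq_file, e.g.:
     *
     *   bpftool iter pin bpf_iter_tcp.bpf.o /sys/fs/bpf/tcp_iter
     *   cat /sys/fs/bpf/tcp_iter
     *
     * Each read of the pinned file walks the sockets batched by
     * bpf_iter_tcp_seq_ops above.
     */
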
3259#endif
3260
3261void __init tcp_v4_init(void)
3262{
3263        int cpu, res;
3264
3265        for_each_possible_cpu(cpu) {
3266                struct sock *sk;
3267
3268                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3269                                           IPPROTO_TCP, &init_net);
3270                if (res)
3271                        panic("Failed to create the TCP control socket.\n");
3272                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3273
3274                /* Enforce IP_DF and IPID==0 for RST and ACK packets
3275                 * sent in SYN-RECV and TIME-WAIT states.
3276                 */
3277                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3278
3279                per_cpu(ipv4_tcp_sk, cpu) = sk;
3280        }
3281        if (register_pernet_subsys(&tcp_sk_ops))
3282                panic("Failed to register the TCP pernet subsystem.\n");
3283
3284#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3285        bpf_iter_register();
3286#endif
3287}
3288