linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
   43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
   44 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
   45 *                                      to a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
  86#ifdef CONFIG_TCP_MD5SIG
  87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89#endif
  90
  91struct inet_hashinfo tcp_hashinfo;
  92EXPORT_SYMBOL(tcp_hashinfo);
  93
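/* Editor's note (added): the two helpers below derive the initial sequence
 * number and the per-connection timestamp offset for an incoming SYN from
 * the addresses and ports of its 4-tuple, using the keyed hashes in
 * secure_tcp_seq()/secure_tcp_ts_off().
 */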
  94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  95{
  96        return secure_tcp_seq(ip_hdr(skb)->daddr,
  97                              ip_hdr(skb)->saddr,
  98                              tcp_hdr(skb)->dest,
  99                              tcp_hdr(skb)->source);
 100}
 101
 102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 103{
 104        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 105}
 106
 107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108{
 109        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 110        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111        struct tcp_sock *tp = tcp_sk(sk);
 112        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 113
 114        if (reuse == 2) {
 115                /* Still does not detect *everything* that goes through
 116                 * lo, since we require a loopback src or dst address
 117                 * or direct binding to 'lo' interface.
 118                 */
 119                bool loopback = false;
 120                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 121                        loopback = true;
 122#if IS_ENABLED(CONFIG_IPV6)
 123                if (tw->tw_family == AF_INET6) {
 124                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 125                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 126                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 128                                loopback = true;
 129                } else
 130#endif
 131                {
 132                        if (ipv4_is_loopback(tw->tw_daddr) ||
 133                            ipv4_is_loopback(tw->tw_rcv_saddr))
 134                                loopback = true;
 135                }
 136                if (!loopback)
 137                        reuse = 0;
 138        }
 139
  140        /* With PAWS, reuse is safe from the viewpoint
  141           of data integrity. Even without PAWS it is safe, provided the sequence
  142           spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
  143
  144           Actually, the idea is close to VJ's: only the timestamp cache is
  145           held not per host but per port pair, and the TW bucket is used as the
  146           state holder.
  147
  148           If the TW bucket has already been destroyed, we fall back to VJ's
  149           scheme and use the initial timestamp retrieved from the peer table.
  150         */
 151        if (tcptw->tw_ts_recent_stamp &&
 152            (!twp || (reuse && time_after32(ktime_get_seconds(),
 153                                            tcptw->tw_ts_recent_stamp)))) {
 154                /* In case of repair and re-using TIME-WAIT sockets we still
 155                 * want to be sure that it is safe as above but honor the
 156                 * sequence numbers and time stamps set as part of the repair
 157                 * process.
 158                 *
 159                 * Without this check re-using a TIME-WAIT socket with TCP
 160                 * repair would accumulate a -1 on the repair assigned
 161                 * sequence number. The first time it is reused the sequence
 162                 * is -1, the second time -2, etc. This fixes that issue
 163                 * without appearing to create any others.
 164                 */
 165                if (likely(!tp->repair)) {
 166                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 167
 168                        if (!seq)
 169                                seq = 1;
 170                        WRITE_ONCE(tp->write_seq, seq);
 171                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 172                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 173                }
 174                sock_hold(sktw);
 175                return 1;
 176        }
 177
 178        return 0;
 179}
 180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
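/* Editor's note (added): the sysctl consulted above is net.ipv4.tcp_tw_reuse;
 * 0 disables reuse, 1 allows an outgoing connection to reuse a TIME-WAIT
 * socket once the timestamp check above passes, and 2 enables that only for
 * loopback traffic, which is what the reuse == 2 branch implements.
 */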
 181
 182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 183                              int addr_len)
 184{
  185        /* This check is replicated from tcp_v4_connect() and intended to
  186         * prevent the BPF program called below from accessing bytes that are
  187         * outside of the bounds specified by the user in addr_len.
  188         */
 189        if (addr_len < sizeof(struct sockaddr_in))
 190                return -EINVAL;
 191
 192        sock_owned_by_me(sk);
 193
 194        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 195}
 196
 197/* This will initiate an outgoing connection. */
 198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 199{
 200        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 201        struct inet_sock *inet = inet_sk(sk);
 202        struct tcp_sock *tp = tcp_sk(sk);
 203        __be16 orig_sport, orig_dport;
 204        __be32 daddr, nexthop;
 205        struct flowi4 *fl4;
 206        struct rtable *rt;
 207        int err;
 208        struct ip_options_rcu *inet_opt;
 209        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 210
 211        if (addr_len < sizeof(struct sockaddr_in))
 212                return -EINVAL;
 213
 214        if (usin->sin_family != AF_INET)
 215                return -EAFNOSUPPORT;
 216
 217        nexthop = daddr = usin->sin_addr.s_addr;
 218        inet_opt = rcu_dereference_protected(inet->inet_opt,
 219                                             lockdep_sock_is_held(sk));
 220        if (inet_opt && inet_opt->opt.srr) {
 221                if (!daddr)
 222                        return -EINVAL;
 223                nexthop = inet_opt->opt.faddr;
 224        }
 225
 226        orig_sport = inet->inet_sport;
 227        orig_dport = usin->sin_port;
 228        fl4 = &inet->cork.fl.u.ip4;
 229        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 230                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 231                              IPPROTO_TCP,
 232                              orig_sport, orig_dport, sk);
 233        if (IS_ERR(rt)) {
 234                err = PTR_ERR(rt);
 235                if (err == -ENETUNREACH)
 236                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 237                return err;
 238        }
 239
 240        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 241                ip_rt_put(rt);
 242                return -ENETUNREACH;
 243        }
 244
 245        if (!inet_opt || !inet_opt->opt.srr)
 246                daddr = fl4->daddr;
 247
 248        if (!inet->inet_saddr)
 249                inet->inet_saddr = fl4->saddr;
 250        sk_rcv_saddr_set(sk, inet->inet_saddr);
 251
 252        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 253                /* Reset inherited state */
 254                tp->rx_opt.ts_recent       = 0;
 255                tp->rx_opt.ts_recent_stamp = 0;
 256                if (likely(!tp->repair))
 257                        WRITE_ONCE(tp->write_seq, 0);
 258        }
 259
 260        inet->inet_dport = usin->sin_port;
 261        sk_daddr_set(sk, daddr);
 262
 263        inet_csk(sk)->icsk_ext_hdr_len = 0;
 264        if (inet_opt)
 265                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 266
 267        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 268
  269        /* Socket identity is still unknown (sport may be zero).
  270         * However, we set the state to SYN-SENT and, without releasing the
  271         * socket lock, select a source port, enter ourselves into the hash
  272         * tables and complete the initialization after this.
  273         */
 274        tcp_set_state(sk, TCP_SYN_SENT);
 275        err = inet_hash_connect(tcp_death_row, sk);
 276        if (err)
 277                goto failure;
 278
 279        sk_set_txhash(sk);
 280
 281        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 282                               inet->inet_sport, inet->inet_dport, sk);
 283        if (IS_ERR(rt)) {
 284                err = PTR_ERR(rt);
 285                rt = NULL;
 286                goto failure;
 287        }
 288        /* OK, now commit destination to socket.  */
 289        sk->sk_gso_type = SKB_GSO_TCPV4;
 290        sk_setup_caps(sk, &rt->dst);
 291        rt = NULL;
 292
 293        if (likely(!tp->repair)) {
 294                if (!tp->write_seq)
 295                        WRITE_ONCE(tp->write_seq,
 296                                   secure_tcp_seq(inet->inet_saddr,
 297                                                  inet->inet_daddr,
 298                                                  inet->inet_sport,
 299                                                  usin->sin_port));
 300                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 301                                                 inet->inet_saddr,
 302                                                 inet->inet_daddr);
 303        }
 304
 305        inet->inet_id = prandom_u32();
 306
 307        if (tcp_fastopen_defer_connect(sk, &err))
 308                return err;
 309        if (err)
 310                goto failure;
 311
 312        err = tcp_connect(sk);
 313
 314        if (err)
 315                goto failure;
 316
 317        return 0;
 318
 319failure:
 320        /*
 321         * This unhashes the socket and releases the local port,
 322         * if necessary.
 323         */
 324        tcp_set_state(sk, TCP_CLOSE);
 325        ip_rt_put(rt);
 326        sk->sk_route_caps = 0;
 327        inet->inet_dport = 0;
 328        return err;
 329}
 330EXPORT_SYMBOL(tcp_v4_connect);
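/* Editor's note (added): illustrative user-space sketch only, not part of
 * this file.  A connect(2) on an AF_INET stream socket is what reaches
 * tcp_v4_pre_connect()/tcp_v4_connect() above; the peer 192.0.2.1:80 is a
 * placeholder address (TEST-NET-1).
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int example_connect(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	/* Kernel side: route lookup, source port selection via
	 * inet_hash_connect(), then tcp_connect() sends the SYN.
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif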
 331
  332/*
  333 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
  334 * It can be called through tcp_release_cb() if the socket was owned by the user
  335 * at the time tcp_v4_err() was called to handle the ICMP message.
  336 */
 337void tcp_v4_mtu_reduced(struct sock *sk)
 338{
 339        struct inet_sock *inet = inet_sk(sk);
 340        struct dst_entry *dst;
 341        u32 mtu;
 342
 343        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 344                return;
 345        mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 346        dst = inet_csk_update_pmtu(sk, mtu);
 347        if (!dst)
 348                return;
 349
  350        /* Something is about to go wrong... Remember the soft error
  351         * in case this connection is not able to recover.
  352         */
 353        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 354                sk->sk_err_soft = EMSGSIZE;
 355
 356        mtu = dst_mtu(dst);
 357
 358        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 359            ip_sk_accept_pmtu(sk) &&
 360            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 361                tcp_sync_mss(sk, mtu);
 362
 363                /* Resend the TCP packet because it's
 364                 * clear that the old packet has been
 365                 * dropped. This is the new "fast" path mtu
 366                 * discovery.
 367                 */
 368                tcp_simple_retransmit(sk);
 369        } /* else let the usual retransmit timer handle it */
 370}
 371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 372
 373static void do_redirect(struct sk_buff *skb, struct sock *sk)
 374{
 375        struct dst_entry *dst = __sk_dst_check(sk, 0);
 376
 377        if (dst)
 378                dst->ops->redirect(dst, sk, skb);
 379}
 380
 381
 382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 384{
 385        struct request_sock *req = inet_reqsk(sk);
 386        struct net *net = sock_net(sk);
 387
 388        /* ICMPs are not backlogged, hence we cannot get
 389         * an established socket here.
 390         */
 391        if (seq != tcp_rsk(req)->snt_isn) {
 392                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 393        } else if (abort) {
 394                /*
 395                 * Still in SYN_RECV, just remove it silently.
 396                 * There is no good way to pass the error to the newly
 397                 * created socket, and POSIX does not want network
 398                 * errors returned from accept().
 399                 */
 400                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 401                tcp_listendrop(req->rsk_listener);
 402        }
 403        reqsk_put(req);
 404}
 405EXPORT_SYMBOL(tcp_req_err);
 406
 407/* TCP-LD (RFC 6069) logic */
 408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 409{
 410        struct inet_connection_sock *icsk = inet_csk(sk);
 411        struct tcp_sock *tp = tcp_sk(sk);
 412        struct sk_buff *skb;
 413        s32 remaining;
 414        u32 delta_us;
 415
 416        if (sock_owned_by_user(sk))
 417                return;
 418
 419        if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 420            !icsk->icsk_backoff)
 421                return;
 422
 423        skb = tcp_rtx_queue_head(sk);
 424        if (WARN_ON_ONCE(!skb))
 425                return;
 426
 427        icsk->icsk_backoff--;
 428        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 429        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 430
 431        tcp_mstamp_refresh(tp);
 432        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 433        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 434
 435        if (remaining > 0) {
 436                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 437                                          remaining, TCP_RTO_MAX);
 438        } else {
 439                /* RTO revert clocked out retransmission.
 440                 * Will retransmit now.
 441                 */
 442                tcp_retransmit_timer(sk);
 443        }
 444}
 445EXPORT_SYMBOL(tcp_ld_RTO_revert);
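/* Editor's note (added): worked example, assuming a 1 second base RTO.
 * After two timeouts icsk_backoff is 2 and the pending timer is 4 s.  If an
 * ICMP net/host-unreachable for snd_una arrives 1.5 s after the last
 * retransmission, the code above drops icsk_backoff to 1, recomputes the
 * RTO as 2 s and re-arms the retransmit timer for the remaining 0.5 s
 * (or retransmits immediately if that remainder has already elapsed).
 */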
 446
 447/*
 448 * This routine is called by the ICMP module when it gets some
 449 * sort of error condition.  If err < 0 then the socket should
 450 * be closed and the error returned to the user.  If err > 0
 451 * it's just the icmp type << 8 | icmp code.  After adjustment
 452 * header points to the first 8 bytes of the tcp header.  We need
 453 * to find the appropriate port.
 454 *
 455 * The locking strategy used here is very "optimistic". When
 456 * someone else accesses the socket the ICMP is just dropped
 457 * and for some paths there is no check at all.
 458 * A more general error queue to queue errors for later handling
 459 * is probably better.
 460 *
 461 */
 462
 463int tcp_v4_err(struct sk_buff *skb, u32 info)
 464{
 465        const struct iphdr *iph = (const struct iphdr *)skb->data;
 466        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 467        struct tcp_sock *tp;
 468        struct inet_sock *inet;
 469        const int type = icmp_hdr(skb)->type;
 470        const int code = icmp_hdr(skb)->code;
 471        struct sock *sk;
 472        struct request_sock *fastopen;
 473        u32 seq, snd_una;
 474        int err;
 475        struct net *net = dev_net(skb->dev);
 476
 477        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 478                                       th->dest, iph->saddr, ntohs(th->source),
 479                                       inet_iif(skb), 0);
 480        if (!sk) {
 481                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 482                return -ENOENT;
 483        }
 484        if (sk->sk_state == TCP_TIME_WAIT) {
 485                inet_twsk_put(inet_twsk(sk));
 486                return 0;
 487        }
 488        seq = ntohl(th->seq);
 489        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 490                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 491                                     type == ICMP_TIME_EXCEEDED ||
 492                                     (type == ICMP_DEST_UNREACH &&
 493                                      (code == ICMP_NET_UNREACH ||
 494                                       code == ICMP_HOST_UNREACH)));
 495                return 0;
 496        }
 497
 498        bh_lock_sock(sk);
  499        /* If too many ICMPs get dropped on busy
  500         * servers this needs to be solved differently.
  501         * We do take care of the PMTU discovery (RFC1191) special case:
  502         * we can receive locally generated ICMP messages while the socket is held.
  503         */
 504        if (sock_owned_by_user(sk)) {
 505                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 506                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 507        }
 508        if (sk->sk_state == TCP_CLOSE)
 509                goto out;
 510
 511        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 512                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 513                goto out;
 514        }
 515
 516        tp = tcp_sk(sk);
  517        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 518        fastopen = rcu_dereference(tp->fastopen_rsk);
 519        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 520        if (sk->sk_state != TCP_LISTEN &&
 521            !between(seq, snd_una, tp->snd_nxt)) {
 522                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 523                goto out;
 524        }
 525
 526        switch (type) {
 527        case ICMP_REDIRECT:
 528                if (!sock_owned_by_user(sk))
 529                        do_redirect(skb, sk);
 530                goto out;
 531        case ICMP_SOURCE_QUENCH:
 532                /* Just silently ignore these. */
 533                goto out;
 534        case ICMP_PARAMETERPROB:
 535                err = EPROTO;
 536                break;
 537        case ICMP_DEST_UNREACH:
 538                if (code > NR_ICMP_UNREACH)
 539                        goto out;
 540
 541                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
  542                        /* We are not interested in TCP_LISTEN and open_requests
  543                         * (SYN-ACKs sent out by Linux are always < 576 bytes, so
  544                         * they should go through unfragmented).
  545                         */
 546                        if (sk->sk_state == TCP_LISTEN)
 547                                goto out;
 548
 549                        WRITE_ONCE(tp->mtu_info, info);
 550                        if (!sock_owned_by_user(sk)) {
 551                                tcp_v4_mtu_reduced(sk);
 552                        } else {
 553                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 554                                        sock_hold(sk);
 555                        }
 556                        goto out;
 557                }
 558
 559                err = icmp_err_convert[code].errno;
 560                /* check if this ICMP message allows revert of backoff.
 561                 * (see RFC 6069)
 562                 */
 563                if (!fastopen &&
 564                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 565                        tcp_ld_RTO_revert(sk, seq);
 566                break;
 567        case ICMP_TIME_EXCEEDED:
 568                err = EHOSTUNREACH;
 569                break;
 570        default:
 571                goto out;
 572        }
 573
 574        switch (sk->sk_state) {
 575        case TCP_SYN_SENT:
 576        case TCP_SYN_RECV:
 577                /* Only in fast or simultaneous open. If a fast open socket is
 578                 * already accepted it is treated as a connected one below.
 579                 */
 580                if (fastopen && !fastopen->sk)
 581                        break;
 582
 583                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 584
 585                if (!sock_owned_by_user(sk)) {
 586                        sk->sk_err = err;
 587
 588                        sk_error_report(sk);
 589
 590                        tcp_done(sk);
 591                } else {
 592                        sk->sk_err_soft = err;
 593                }
 594                goto out;
 595        }
 596
  597        /* If we've already connected we will keep trying
  598         * until we time out, or the user gives up.
  599         *
  600         * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
  601         * considered hard errors (well, FRAG_FAILED too,
  602         * but it has been obsoleted by PMTU discovery).
  603         *
  604         * Note that in the modern internet, where routing is unreliable
  605         * and broken firewalls sit in every dark corner sending random
  606         * errors ordered by their masters, even these two messages have lost
  607         * their original sense (even Linux sends invalid PORT_UNREACHs).
  608         *
  609         * Now we are in compliance with the RFCs.
  610         *                                                      --ANK (980905)
  611         */
 612
 613        inet = inet_sk(sk);
 614        if (!sock_owned_by_user(sk) && inet->recverr) {
 615                sk->sk_err = err;
 616                sk_error_report(sk);
 617        } else  { /* Only an error on timeout */
 618                sk->sk_err_soft = err;
 619        }
 620
 621out:
 622        bh_unlock_sock(sk);
 623        sock_put(sk);
 624        return 0;
 625}
 626
 627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 628{
 629        struct tcphdr *th = tcp_hdr(skb);
 630
 631        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 632        skb->csum_start = skb_transport_header(skb) - skb->head;
 633        skb->csum_offset = offsetof(struct tcphdr, check);
 634}
 635
 636/* This routine computes an IPv4 TCP checksum. */
 637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 638{
 639        const struct inet_sock *inet = inet_sk(sk);
 640
 641        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 642}
 643EXPORT_SYMBOL(tcp_v4_send_check);
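/* Editor's note (added): illustrative sketch, not part of this file.  The
 * helpers above only seed th->check with the folded pseudo-header sum and
 * leave CHECKSUM_PARTIAL state for the device (or software fallback) to
 * finish.  A fully software version of the TCP/IPv4 checksum looks like
 * this; the names are the editor's own.
 */
#if 0
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>

/* One's-complement sum of big-endian 16-bit words (RFC 1071). */
static uint32_t csum_add(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* @saddr/@daddr are the 4-byte addresses as they appear in the IP header,
 * @seg is the TCP header plus payload with its checksum field zeroed, and
 * @len its length.  The caller stores the result into th->check with htons().
 */
static uint16_t tcp_v4_full_csum(const void *saddr, const void *daddr,
				 const void *seg, size_t len)
{
	uint32_t sum = 0;

	sum = csum_add(sum, saddr, 4);	/* pseudo-header: source      */
	sum = csum_add(sum, daddr, 4);	/* pseudo-header: destination */
	sum += IPPROTO_TCP;		/* zero byte + protocol       */
	sum += (uint32_t)len;		/* TCP length                 */
	sum = csum_add(sum, seg, len);	/* TCP header + payload       */
	while (sum >> 16)		/* fold the carries           */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* one's complement           */
}
#endif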
 644
  645/*
  646 *      This routine will send an RST to the other tcp.
  647 *
  648 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
  649 *                    for the reset?
  650 *      Answer: if a packet caused the RST, it is not for a socket
  651 *              existing in our system; if it is matched to a socket,
  652 *              it is just a duplicate segment or a bug in the other side's TCP.
  653 *              So we build the reply based only on the parameters that
  654 *              arrived with the segment.
  655 *      Exception: precedence violation. We do not implement it in any case.
  656 */
 657
 658#ifdef CONFIG_TCP_MD5SIG
 659#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 660#else
 661#define OPTION_BYTES sizeof(__be32)
 662#endif
 663
 664static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 665{
 666        const struct tcphdr *th = tcp_hdr(skb);
 667        struct {
 668                struct tcphdr th;
 669                __be32 opt[OPTION_BYTES / sizeof(__be32)];
 670        } rep;
 671        struct ip_reply_arg arg;
 672#ifdef CONFIG_TCP_MD5SIG
 673        struct tcp_md5sig_key *key = NULL;
 674        const __u8 *hash_location = NULL;
 675        unsigned char newhash[16];
 676        int genhash;
 677        struct sock *sk1 = NULL;
 678#endif
 679        u64 transmit_time = 0;
 680        struct sock *ctl_sk;
 681        struct net *net;
 682
 683        /* Never send a reset in response to a reset. */
 684        if (th->rst)
 685                return;
 686
  687        /* If sk is not NULL, it means we did a successful lookup and the
  688         * incoming route had to be correct. prequeue might have dropped our dst.
  689         */
 690        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 691                return;
 692
 693        /* Swap the send and the receive. */
 694        memset(&rep, 0, sizeof(rep));
 695        rep.th.dest   = th->source;
 696        rep.th.source = th->dest;
 697        rep.th.doff   = sizeof(struct tcphdr) / 4;
 698        rep.th.rst    = 1;
 699
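        /* Editor's note (added): if the offending segment carried an ACK, the
         * RST reuses its ack_seq as our SEQ; otherwise, per RFC 793 reset
         * generation, we ACK everything the segment occupied in sequence
         * space: its SEQ plus the payload length (skb->len minus the TCP
         * header) plus one each for SYN and FIN.
         */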
 700        if (th->ack) {
 701                rep.th.seq = th->ack_seq;
 702        } else {
 703                rep.th.ack = 1;
 704                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 705                                       skb->len - (th->doff << 2));
 706        }
 707
 708        memset(&arg, 0, sizeof(arg));
 709        arg.iov[0].iov_base = (unsigned char *)&rep;
 710        arg.iov[0].iov_len  = sizeof(rep.th);
 711
 712        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 713#ifdef CONFIG_TCP_MD5SIG
 714        rcu_read_lock();
 715        hash_location = tcp_parse_md5sig_option(th);
 716        if (sk && sk_fullsock(sk)) {
 717                const union tcp_md5_addr *addr;
 718                int l3index;
 719
  720                /* If sdif is set, the packet ingressed via a device
  721                 * in an L3 domain and inet_iif is set to it.
  722                 */
 723                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 724                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 725                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 726        } else if (hash_location) {
 727                const union tcp_md5_addr *addr;
 728                int sdif = tcp_v4_sdif(skb);
 729                int dif = inet_iif(skb);
 730                int l3index;
 731
  732                /*
  733                 * The active side is lost. Try to find the listening socket via
  734                 * the source port, and then find the MD5 key via that socket.
  735                 * We do not lose any security here:
  736                 * the incoming packet is checked against the MD5 hash of the key
  737                 * we found; no RST is generated if the hash doesn't match.
  738                 */
 739                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 740                                             ip_hdr(skb)->saddr,
 741                                             th->source, ip_hdr(skb)->daddr,
 742                                             ntohs(th->source), dif, sdif);
  743                /* don't send an RST if we can't find a key */
 744                if (!sk1)
 745                        goto out;
 746
  747                /* If sdif is set, the packet ingressed via a device
  748                 * in an L3 domain and dif is set to it.
  749                 */
 750                l3index = sdif ? dif : 0;
 751                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 752                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 753                if (!key)
 754                        goto out;
 755
 756
 757                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 758                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 759                        goto out;
 760
 761        }
 762
 763        if (key) {
 764                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 765                                   (TCPOPT_NOP << 16) |
 766                                   (TCPOPT_MD5SIG << 8) |
 767                                   TCPOLEN_MD5SIG);
 768                /* Update length and the length the header thinks exists */
 769                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 770                rep.th.doff = arg.iov[0].iov_len / 4;
 771
 772                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 773                                     key, ip_hdr(skb)->saddr,
 774                                     ip_hdr(skb)->daddr, &rep.th);
 775        }
 776#endif
 777        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 778        if (rep.opt[0] == 0) {
 779                __be32 mrst = mptcp_reset_option(skb);
 780
 781                if (mrst) {
 782                        rep.opt[0] = mrst;
 783                        arg.iov[0].iov_len += sizeof(mrst);
 784                        rep.th.doff = arg.iov[0].iov_len / 4;
 785                }
 786        }
 787
 788        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 789                                      ip_hdr(skb)->saddr, /* XXX */
 790                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 791        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 792        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 793
  794        /* When the socket is gone, all binding information is lost and
  795         * routing might fail in this case. No choice here: if we choose to force
  796         * the input interface, we will misroute in case of an asymmetric route.
  797         */
 798        if (sk) {
 799                arg.bound_dev_if = sk->sk_bound_dev_if;
 800                if (sk_fullsock(sk))
 801                        trace_tcp_send_reset(sk, skb);
 802        }
 803
 804        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 805                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 806
 807        arg.tos = ip_hdr(skb)->tos;
 808        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 809        local_bh_disable();
 810        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 811        if (sk) {
 812                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 813                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 814                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 815                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
 816                transmit_time = tcp_transmit_time(sk);
 817        }
 818        ip_send_unicast_reply(ctl_sk,
 819                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 820                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 821                              &arg, arg.iov[0].iov_len,
 822                              transmit_time);
 823
 824        ctl_sk->sk_mark = 0;
 825        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 826        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 827        local_bh_enable();
 828
 829#ifdef CONFIG_TCP_MD5SIG
 830out:
 831        rcu_read_unlock();
 832#endif
 833}
 834
  835/* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
  836   outside of socket context, is certainly ugly. What can I do?
  837 */
 838
 839static void tcp_v4_send_ack(const struct sock *sk,
 840                            struct sk_buff *skb, u32 seq, u32 ack,
 841                            u32 win, u32 tsval, u32 tsecr, int oif,
 842                            struct tcp_md5sig_key *key,
 843                            int reply_flags, u8 tos)
 844{
 845        const struct tcphdr *th = tcp_hdr(skb);
 846        struct {
 847                struct tcphdr th;
 848                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 849#ifdef CONFIG_TCP_MD5SIG
 850                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 851#endif
 852                        ];
 853        } rep;
 854        struct net *net = sock_net(sk);
 855        struct ip_reply_arg arg;
 856        struct sock *ctl_sk;
 857        u64 transmit_time;
 858
 859        memset(&rep.th, 0, sizeof(struct tcphdr));
 860        memset(&arg, 0, sizeof(arg));
 861
 862        arg.iov[0].iov_base = (unsigned char *)&rep;
 863        arg.iov[0].iov_len  = sizeof(rep.th);
 864        if (tsecr) {
 865                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 866                                   (TCPOPT_TIMESTAMP << 8) |
 867                                   TCPOLEN_TIMESTAMP);
 868                rep.opt[1] = htonl(tsval);
 869                rep.opt[2] = htonl(tsecr);
 870                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 871        }
 872
 873        /* Swap the send and the receive. */
 874        rep.th.dest    = th->source;
 875        rep.th.source  = th->dest;
 876        rep.th.doff    = arg.iov[0].iov_len / 4;
 877        rep.th.seq     = htonl(seq);
 878        rep.th.ack_seq = htonl(ack);
 879        rep.th.ack     = 1;
 880        rep.th.window  = htons(win);
 881
 882#ifdef CONFIG_TCP_MD5SIG
 883        if (key) {
 884                int offset = (tsecr) ? 3 : 0;
 885
 886                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 887                                          (TCPOPT_NOP << 16) |
 888                                          (TCPOPT_MD5SIG << 8) |
 889                                          TCPOLEN_MD5SIG);
 890                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 891                rep.th.doff = arg.iov[0].iov_len/4;
 892
 893                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 894                                    key, ip_hdr(skb)->saddr,
 895                                    ip_hdr(skb)->daddr, &rep.th);
 896        }
 897#endif
 898        arg.flags = reply_flags;
 899        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 900                                      ip_hdr(skb)->saddr, /* XXX */
 901                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 902        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 903        if (oif)
 904                arg.bound_dev_if = oif;
 905        arg.tos = tos;
 906        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 907        local_bh_disable();
 908        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 909        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 910                           inet_twsk(sk)->tw_mark : sk->sk_mark;
 911        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 912                           inet_twsk(sk)->tw_priority : sk->sk_priority;
 913        transmit_time = tcp_transmit_time(sk);
 914        ip_send_unicast_reply(ctl_sk,
 915                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 916                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 917                              &arg, arg.iov[0].iov_len,
 918                              transmit_time);
 919
 920        ctl_sk->sk_mark = 0;
 921        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 922        local_bh_enable();
 923}
 924
 925static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 926{
 927        struct inet_timewait_sock *tw = inet_twsk(sk);
 928        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 929
 930        tcp_v4_send_ack(sk, skb,
 931                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 932                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 933                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 934                        tcptw->tw_ts_recent,
 935                        tw->tw_bound_dev_if,
 936                        tcp_twsk_md5_key(tcptw),
 937                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 938                        tw->tw_tos
 939                        );
 940
 941        inet_twsk_put(tw);
 942}
 943
 944static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 945                                  struct request_sock *req)
 946{
 947        const union tcp_md5_addr *addr;
 948        int l3index;
 949
 950        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 951         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 952         */
 953        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 954                                             tcp_sk(sk)->snd_nxt;
 955
 956        /* RFC 7323 2.3
 957         * The window field (SEG.WND) of every outgoing segment, with the
 958         * exception of <SYN> segments, MUST be right-shifted by
 959         * Rcv.Wind.Shift bits:
 960         */
 961        addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 962        l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 963        tcp_v4_send_ack(sk, skb, seq,
 964                        tcp_rsk(req)->rcv_nxt,
 965                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 966                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 967                        req->ts_recent,
 968                        0,
 969                        tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 970                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 971                        ip_hdr(skb)->tos);
 972}
 973
 974/*
 975 *      Send a SYN-ACK after having received a SYN.
 976 *      This still operates on a request_sock only, not on a big
 977 *      socket.
 978 */
 979static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 980                              struct flowi *fl,
 981                              struct request_sock *req,
 982                              struct tcp_fastopen_cookie *foc,
 983                              enum tcp_synack_type synack_type,
 984                              struct sk_buff *syn_skb)
 985{
 986        const struct inet_request_sock *ireq = inet_rsk(req);
 987        struct flowi4 fl4;
 988        int err = -1;
 989        struct sk_buff *skb;
 990        u8 tos;
 991
 992        /* First, grab a route. */
 993        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 994                return -1;
 995
 996        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 997
 998        if (skb) {
 999                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
1001                tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002                                (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003                                (inet_sk(sk)->tos & INET_ECN_MASK) :
1004                                inet_sk(sk)->tos;
1005
1006                if (!INET_ECN_is_capable(tos) &&
1007                    tcp_bpf_ca_needs_ecn((struct sock *)req))
1008                        tos |= INET_ECN_ECT_0;
1009
1010                rcu_read_lock();
1011                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012                                            ireq->ir_rmt_addr,
1013                                            rcu_dereference(ireq->ireq_opt),
1014                                            tos);
1015                rcu_read_unlock();
1016                err = net_xmit_eval(err);
1017        }
1018
1019        return err;
1020}
1021
1022/*
1023 *      IPv4 request_sock destructor.
1024 */
1025static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026{
1027        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028}
1029
1030#ifdef CONFIG_TCP_MD5SIG
1031/*
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1035 */
1036
1037DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038EXPORT_SYMBOL(tcp_md5_needed);
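/* Editor's note (added): tcp_md5_needed is a static branch, so the
 * per-segment MD5 key lookups stay patched out on systems that never
 * configure a TCP-MD5 key.
 */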
1039
1040static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1041{
1042        if (!old)
1043                return true;
1044
1045        /* l3index always overrides non-l3index */
1046        if (old->l3index && new->l3index == 0)
1047                return false;
1048        if (old->l3index == 0 && new->l3index)
1049                return true;
1050
1051        return old->prefixlen < new->prefixlen;
1052}
1053
1054/* Find the Key structure for an address.  */
1055struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056                                           const union tcp_md5_addr *addr,
1057                                           int family)
1058{
1059        const struct tcp_sock *tp = tcp_sk(sk);
1060        struct tcp_md5sig_key *key;
1061        const struct tcp_md5sig_info *md5sig;
1062        __be32 mask;
1063        struct tcp_md5sig_key *best_match = NULL;
1064        bool match;
1065
1066        /* caller either holds rcu_read_lock() or socket lock */
1067        md5sig = rcu_dereference_check(tp->md5sig_info,
1068                                       lockdep_sock_is_held(sk));
1069        if (!md5sig)
1070                return NULL;
1071
1072        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073                                 lockdep_sock_is_held(sk)) {
1074                if (key->family != family)
1075                        continue;
1076                if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1077                        continue;
1078                if (family == AF_INET) {
1079                        mask = inet_make_mask(key->prefixlen);
1080                        match = (key->addr.a4.s_addr & mask) ==
1081                                (addr->a4.s_addr & mask);
1082#if IS_ENABLED(CONFIG_IPV6)
1083                } else if (family == AF_INET6) {
1084                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1085                                                  key->prefixlen);
1086#endif
1087                } else {
1088                        match = false;
1089                }
1090
1091                if (match && better_md5_match(best_match, key))
1092                        best_match = key;
1093        }
1094        return best_match;
1095}
1096EXPORT_SYMBOL(__tcp_md5_do_lookup);
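/* Editor's note (added): illustrative sketch, not part of this file.  A
 * user-space equivalent of the AF_INET prefix test used above; the names
 * are the editor's own.  With keys for 10.0.0.0/8 and 10.1.0.0/16
 * configured, a peer of 10.1.2.3 matches both and better_md5_match()
 * prefers the /16; a key bound to an L3 device always beats an unbound one.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Does @addr fall inside @key_addr/@prefixlen?  Both addresses are in
 * network byte order, as in key->addr.a4.s_addr.
 */
static bool md5_key_prefix_match(uint32_t key_addr, uint32_t addr,
				 unsigned int prefixlen)
{
	uint32_t mask = prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;

	return (key_addr & mask) == (addr & mask);
}
#endif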
1097
1098static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099                                                      const union tcp_md5_addr *addr,
1100                                                      int family, u8 prefixlen,
1101                                                      int l3index, u8 flags)
1102{
1103        const struct tcp_sock *tp = tcp_sk(sk);
1104        struct tcp_md5sig_key *key;
1105        unsigned int size = sizeof(struct in_addr);
1106        const struct tcp_md5sig_info *md5sig;
1107
1108        /* caller either holds rcu_read_lock() or socket lock */
1109        md5sig = rcu_dereference_check(tp->md5sig_info,
1110                                       lockdep_sock_is_held(sk));
1111        if (!md5sig)
1112                return NULL;
1113#if IS_ENABLED(CONFIG_IPV6)
1114        if (family == AF_INET6)
1115                size = sizeof(struct in6_addr);
1116#endif
1117        hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118                                 lockdep_sock_is_held(sk)) {
1119                if (key->family != family)
1120                        continue;
1121                if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1122                        continue;
1123                if (key->l3index != l3index)
1124                        continue;
1125                if (!memcmp(&key->addr, addr, size) &&
1126                    key->prefixlen == prefixlen)
1127                        return key;
1128        }
1129        return NULL;
1130}
1131
1132struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1133                                         const struct sock *addr_sk)
1134{
1135        const union tcp_md5_addr *addr;
1136        int l3index;
1137
1138        l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139                                                 addr_sk->sk_bound_dev_if);
1140        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1141        return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142}
1143EXPORT_SYMBOL(tcp_v4_md5_lookup);
1144
1145/* This can be called on a newly created socket, from other files */
1146int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1147                   int family, u8 prefixlen, int l3index, u8 flags,
1148                   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1149{
1150        /* Add Key to the list */
1151        struct tcp_md5sig_key *key;
1152        struct tcp_sock *tp = tcp_sk(sk);
1153        struct tcp_md5sig_info *md5sig;
1154
1155        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1156        if (key) {
1157                /* Pre-existing entry - just update that one.
1158                 * Note that the key might be used concurrently.
 1159                 * data_race() is telling kcsan that we do not care about
 1160                 * key mismatches, since changing the MD5 key on live flows
 1161                 * can lead to packet drops.
1162                 */
1163                data_race(memcpy(key->key, newkey, newkeylen));
1164
1165                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
 1166                 * Also note that a reader could catch the new key->keylen value
 1167                 * but the old key->key[]; this is the reason we use __GFP_ZERO
 1168                 * at sock_kmalloc() time below these lines.
1169                 */
1170                WRITE_ONCE(key->keylen, newkeylen);
1171
1172                return 0;
1173        }
1174
1175        md5sig = rcu_dereference_protected(tp->md5sig_info,
1176                                           lockdep_sock_is_held(sk));
1177        if (!md5sig) {
1178                md5sig = kmalloc(sizeof(*md5sig), gfp);
1179                if (!md5sig)
1180                        return -ENOMEM;
1181
1182                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183                INIT_HLIST_HEAD(&md5sig->head);
1184                rcu_assign_pointer(tp->md5sig_info, md5sig);
1185        }
1186
1187        key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1188        if (!key)
1189                return -ENOMEM;
1190        if (!tcp_alloc_md5sig_pool()) {
1191                sock_kfree_s(sk, key, sizeof(*key));
1192                return -ENOMEM;
1193        }
1194
1195        memcpy(key->key, newkey, newkeylen);
1196        key->keylen = newkeylen;
1197        key->family = family;
1198        key->prefixlen = prefixlen;
1199        key->l3index = l3index;
1200        key->flags = flags;
1201        memcpy(&key->addr, addr,
1202               (family == AF_INET6) ? sizeof(struct in6_addr) :
1203                                      sizeof(struct in_addr));
1204        hlist_add_head_rcu(&key->node, &md5sig->head);
1205        return 0;
1206}
1207EXPORT_SYMBOL(tcp_md5_do_add);
1208
1209int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1210                   u8 prefixlen, int l3index, u8 flags)
1211{
1212        struct tcp_md5sig_key *key;
1213
1214        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1215        if (!key)
1216                return -ENOENT;
1217        hlist_del_rcu(&key->node);
1218        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1219        kfree_rcu(key, rcu);
1220        return 0;
1221}
1222EXPORT_SYMBOL(tcp_md5_do_del);
1223
1224static void tcp_clear_md5_list(struct sock *sk)
1225{
1226        struct tcp_sock *tp = tcp_sk(sk);
1227        struct tcp_md5sig_key *key;
1228        struct hlist_node *n;
1229        struct tcp_md5sig_info *md5sig;
1230
1231        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1232
1233        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1234                hlist_del_rcu(&key->node);
1235                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1236                kfree_rcu(key, rcu);
1237        }
1238}
1239
1240static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1241                                 sockptr_t optval, int optlen)
1242{
1243        struct tcp_md5sig cmd;
1244        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1245        const union tcp_md5_addr *addr;
1246        u8 prefixlen = 32;
1247        int l3index = 0;
1248        u8 flags;
1249
1250        if (optlen < sizeof(cmd))
1251                return -EINVAL;
1252
1253        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1254                return -EFAULT;
1255
1256        if (sin->sin_family != AF_INET)
1257                return -EINVAL;
1258
1259        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1260
1261        if (optname == TCP_MD5SIG_EXT &&
1262            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1263                prefixlen = cmd.tcpm_prefixlen;
1264                if (prefixlen > 32)
1265                        return -EINVAL;
1266        }
1267
1268        if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1269            cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1270                struct net_device *dev;
1271
1272                rcu_read_lock();
1273                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1274                if (dev && netif_is_l3_master(dev))
1275                        l3index = dev->ifindex;
1276
1277                rcu_read_unlock();
1278
1279                /* ok to reference set/not set outside of rcu;
1280                 * right now device MUST be an L3 master
1281                 */
1282                if (!dev || !l3index)
1283                        return -EINVAL;
1284        }
1285
1286        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1287
1288        if (!cmd.tcpm_keylen)
1289                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1290
1291        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1292                return -EINVAL;
1293
1294        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1295                              cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1296}
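/* Editor's note (added): illustrative user-space sketch of the socket
 * option parsed above, not part of this file.  The peer address and key
 * are placeholders; both ends of the connection must configure the same
 * key before connecting/listening.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/tcp.h>

static int example_set_md5_key(int fd)
{
	struct tcp_md5sig md5 = {};
	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
	static const char secret[] = "example-shared-secret";

	peer->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
	md5.tcpm_keylen = sizeof(secret) - 1;	/* <= TCP_MD5SIG_MAXKEYLEN */
	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);

	/* Ends up in tcp_v4_parse_md5_keys() -> tcp_md5_do_add(). */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif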
1297
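/* Editor's note (added): per RFC 2385 the digest below is computed over the
 * IPv4 pseudo-header, the TCP header with its checksum field zeroed, the
 * segment payload (for the skb variant) and finally the key itself.
 */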
1298static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1299                                   __be32 daddr, __be32 saddr,
1300                                   const struct tcphdr *th, int nbytes)
1301{
1302        struct tcp4_pseudohdr *bp;
1303        struct scatterlist sg;
1304        struct tcphdr *_th;
1305
1306        bp = hp->scratch;
1307        bp->saddr = saddr;
1308        bp->daddr = daddr;
1309        bp->pad = 0;
1310        bp->protocol = IPPROTO_TCP;
1311        bp->len = cpu_to_be16(nbytes);
1312
1313        _th = (struct tcphdr *)(bp + 1);
1314        memcpy(_th, th, sizeof(*th));
1315        _th->check = 0;
1316
1317        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1318        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1319                                sizeof(*bp) + sizeof(*th));
1320        return crypto_ahash_update(hp->md5_req);
1321}
1322
1323static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1324                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1325{
1326        struct tcp_md5sig_pool *hp;
1327        struct ahash_request *req;
1328
1329        hp = tcp_get_md5sig_pool();
1330        if (!hp)
1331                goto clear_hash_noput;
1332        req = hp->md5_req;
1333
1334        if (crypto_ahash_init(req))
1335                goto clear_hash;
1336        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1337                goto clear_hash;
1338        if (tcp_md5_hash_key(hp, key))
1339                goto clear_hash;
1340        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341        if (crypto_ahash_final(req))
1342                goto clear_hash;
1343
1344        tcp_put_md5sig_pool();
1345        return 0;
1346
1347clear_hash:
1348        tcp_put_md5sig_pool();
1349clear_hash_noput:
1350        memset(md5_hash, 0, 16);
1351        return 1;
1352}
1353
1354int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1355                        const struct sock *sk,
1356                        const struct sk_buff *skb)
1357{
1358        struct tcp_md5sig_pool *hp;
1359        struct ahash_request *req;
1360        const struct tcphdr *th = tcp_hdr(skb);
1361        __be32 saddr, daddr;
1362
1363        if (sk) { /* valid for establish/request sockets */
1364                saddr = sk->sk_rcv_saddr;
1365                daddr = sk->sk_daddr;
1366        } else {
1367                const struct iphdr *iph = ip_hdr(skb);
1368                saddr = iph->saddr;
1369                daddr = iph->daddr;
1370        }
1371
1372        hp = tcp_get_md5sig_pool();
1373        if (!hp)
1374                goto clear_hash_noput;
1375        req = hp->md5_req;
1376
1377        if (crypto_ahash_init(req))
1378                goto clear_hash;
1379
1380        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1381                goto clear_hash;
1382        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1383                goto clear_hash;
1384        if (tcp_md5_hash_key(hp, key))
1385                goto clear_hash;
1386        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1387        if (crypto_ahash_final(req))
1388                goto clear_hash;
1389
1390        tcp_put_md5sig_pool();
1391        return 0;
1392
1393clear_hash:
1394        tcp_put_md5sig_pool();
1395clear_hash_noput:
1396        memset(md5_hash, 0, 16);
1397        return 1;
1398}
1399EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1400
1401#endif
1402
1403/* Called with rcu_read_lock() */
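/* Returns true if the segment must be dropped. */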
1404static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1405                                    const struct sk_buff *skb,
1406                                    int dif, int sdif)
1407{
1408#ifdef CONFIG_TCP_MD5SIG
1409        /*
1410         * This gets called for each TCP segment that arrives
1411         * so we want to be efficient.
1412         * We have 3 drop cases:
1413         * o No MD5 hash and one expected.
1414         * o MD5 hash and we're not expecting one.
1415         * o MD5 hash and it's wrong.
1416         */
1417        const __u8 *hash_location = NULL;
1418        struct tcp_md5sig_key *hash_expected;
1419        const struct iphdr *iph = ip_hdr(skb);
1420        const struct tcphdr *th = tcp_hdr(skb);
1421        const union tcp_md5_addr *addr;
1422        unsigned char newhash[16];
1423        int genhash, l3index;
1424
1425        /* If sdif is set, the packet ingressed via a device in an
1426         * L3 domain, and dif has been set to the l3mdev ifindex.
1427         */
1428        l3index = sdif ? dif : 0;
1429
1430        addr = (union tcp_md5_addr *)&iph->saddr;
1431        hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1432        hash_location = tcp_parse_md5sig_option(th);
1433
1434        /* We've parsed the options - do we have a hash? */
1435        if (!hash_expected && !hash_location)
1436                return false;
1437
1438        if (hash_expected && !hash_location) {
1439                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1440                return true;
1441        }
1442
1443        if (!hash_expected && hash_location) {
1444                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1445                return true;
1446        }
1447
1448        /* Okay, so both hash_expected and hash_location are present -
1449         * we need to calculate the MD5 hash and compare it.
1450         */
1451        genhash = tcp_v4_md5_hash_skb(newhash,
1452                                      hash_expected,
1453                                      NULL, skb);
1454
1455        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1456                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1457                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1458                                     &iph->saddr, ntohs(th->source),
1459                                     &iph->daddr, ntohs(th->dest),
1460                                     genhash ? " tcp_v4_calc_md5_hash failed"
1461                                     : "", l3index);
1462                return true;
1463        }
1464        return false;
1465#endif
1466        return false;
1467}
1468
1469static void tcp_v4_init_req(struct request_sock *req,
1470                            const struct sock *sk_listener,
1471                            struct sk_buff *skb)
1472{
1473        struct inet_request_sock *ireq = inet_rsk(req);
1474        struct net *net = sock_net(sk_listener);
1475
1476        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1477        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1478        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1479}
1480
1481static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1482                                          struct sk_buff *skb,
1483                                          struct flowi *fl,
1484                                          struct request_sock *req)
1485{
1486        tcp_v4_init_req(req, sk, skb);
1487
1488        if (security_inet_conn_request(sk, skb, req))
1489                return NULL;
1490
1491        return inet_csk_route_req(sk, &fl->u.ip4, req);
1492}
1493
1494struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1495        .family         =       PF_INET,
1496        .obj_size       =       sizeof(struct tcp_request_sock),
1497        .rtx_syn_ack    =       tcp_rtx_synack,
1498        .send_ack       =       tcp_v4_reqsk_send_ack,
1499        .destructor     =       tcp_v4_reqsk_destructor,
1500        .send_reset     =       tcp_v4_send_reset,
1501        .syn_ack_timeout =      tcp_syn_ack_timeout,
1502};
1503
1504const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1505        .mss_clamp      =       TCP_MSS_DEFAULT,
1506#ifdef CONFIG_TCP_MD5SIG
1507        .req_md5_lookup =       tcp_v4_md5_lookup,
1508        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1509#endif
1510#ifdef CONFIG_SYN_COOKIES
1511        .cookie_init_seq =      cookie_v4_init_sequence,
1512#endif
1513        .route_req      =       tcp_v4_route_req,
1514        .init_seq       =       tcp_v4_init_seq,
1515        .init_ts_off    =       tcp_v4_init_ts_off,
1516        .send_synack    =       tcp_v4_send_synack,
1517};
1518
1519int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1520{
1521        /* Never answer SYNs sent to broadcast or multicast addresses */
1522        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1523                goto drop;
1524
1525        return tcp_conn_request(&tcp_request_sock_ops,
1526                                &tcp_request_sock_ipv4_ops, sk, skb);
1527
1528drop:
1529        tcp_listendrop(sk);
1530        return 0;
1531}
1532EXPORT_SYMBOL(tcp_v4_conn_request);
1533
1534
1535/*
1536 * The three way handshake has completed - we received the final ACK
1537 * (or a valid TCP Fast Open SYN) - now create the new socket.
1538 */
1539struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1540                                  struct request_sock *req,
1541                                  struct dst_entry *dst,
1542                                  struct request_sock *req_unhash,
1543                                  bool *own_req)
1544{
1545        struct inet_request_sock *ireq;
1546        bool found_dup_sk = false;
1547        struct inet_sock *newinet;
1548        struct tcp_sock *newtp;
1549        struct sock *newsk;
1550#ifdef CONFIG_TCP_MD5SIG
1551        const union tcp_md5_addr *addr;
1552        struct tcp_md5sig_key *key;
1553        int l3index;
1554#endif
1555        struct ip_options_rcu *inet_opt;
1556
1557        if (sk_acceptq_is_full(sk))
1558                goto exit_overflow;
1559
1560        newsk = tcp_create_openreq_child(sk, req, skb);
1561        if (!newsk)
1562                goto exit_nonewsk;
1563
1564        newsk->sk_gso_type = SKB_GSO_TCPV4;
1565        inet_sk_rx_dst_set(newsk, skb);
1566
1567        newtp                 = tcp_sk(newsk);
1568        newinet               = inet_sk(newsk);
1569        ireq                  = inet_rsk(req);
1570        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1571        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1572        newsk->sk_bound_dev_if = ireq->ir_iif;
1573        newinet->inet_saddr   = ireq->ir_loc_addr;
1574        inet_opt              = rcu_dereference(ireq->ireq_opt);
1575        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1576        newinet->mc_index     = inet_iif(skb);
1577        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1578        newinet->rcv_tos      = ip_hdr(skb)->tos;
1579        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1580        if (inet_opt)
1581                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1582        newinet->inet_id = prandom_u32();
1583
1584        /* Set ToS of the new socket based upon the value of incoming SYN.
1585         * ECT bits are set later in tcp_init_transfer().
1586         */
1587        if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1588                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1589
1590        if (!dst) {
1591                dst = inet_csk_route_child_sock(sk, newsk, req);
1592                if (!dst)
1593                        goto put_and_exit;
1594        } else {
1595                /* syncookie case : see end of cookie_v4_check() */
1596        }
1597        sk_setup_caps(newsk, dst);
1598
1599        tcp_ca_openreq_child(newsk, dst);
1600
1601        tcp_sync_mss(newsk, dst_mtu(dst));
1602        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1603
1604        tcp_initialize_rcv_mss(newsk);
1605
1606#ifdef CONFIG_TCP_MD5SIG
1607        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1608        /* Copy over the MD5 key from the original socket */
1609        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1610        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1611        if (key) {
1612                /*
1613                 * We're using one, so create a matching key
1614                 * on the newsk structure. If we fail to get
1615                 * memory, then we end up not copying the key
1616                 * across. Shucks.
1617                 */
1618                tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1619                               key->key, key->keylen, GFP_ATOMIC);
1620                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1621        }
1622#endif
1623
1624        if (__inet_inherit_port(sk, newsk) < 0)
1625                goto put_and_exit;
1626        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1627                                       &found_dup_sk);
1628        if (likely(*own_req)) {
1629                tcp_move_syn(newtp, req);
1630                ireq->ireq_opt = NULL;
1631        } else {
1632                newinet->inet_opt = NULL;
1633
1634                if (!req_unhash && found_dup_sk) {
1635                        /* This code path should be executed only in
1636                         * the syncookie case
1637                         */
1638                        bh_unlock_sock(newsk);
1639                        sock_put(newsk);
1640                        newsk = NULL;
1641                }
1642        }
1643        return newsk;
1644
1645exit_overflow:
1646        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1647exit_nonewsk:
1648        dst_release(dst);
1649exit:
1650        tcp_listendrop(sk);
1651        return NULL;
1652put_and_exit:
1653        newinet->inet_opt = NULL;
1654        inet_csk_prepare_forced_close(newsk);
1655        tcp_done(newsk);
1656        goto exit;
1657}
1658EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1659
1660static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1661{
1662#ifdef CONFIG_SYN_COOKIES
1663        const struct tcphdr *th = tcp_hdr(skb);
1664
1665        if (!th->syn)
1666                sk = cookie_v4_check(sk, skb);
1667#endif
1668        return sk;
1669}
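
/* Note: cookie_v4_check() above runs on the bare ACK that completes a
 * syncookie handshake; an incoming SYN is handled via tcp_v4_conn_request()
 * instead, hence the !th->syn test.
 */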
1670
1671u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1672                         struct tcphdr *th, u32 *cookie)
1673{
1674        u16 mss = 0;
1675#ifdef CONFIG_SYN_COOKIES
1676        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1677                                    &tcp_request_sock_ipv4_ops, sk, th);
1678        if (mss) {
1679                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1680                tcp_synq_overflow(sk);
1681        }
1682#endif
1683        return mss;
1684}
1685
1686INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1687                                                           u32));
1688/* The socket must have its spinlock held when we get
1689 * here, unless it is a TCP_LISTEN socket.
1690 *
1691 * We have a potential double-lock case here, so even when
1692 * doing backlog processing we use the BH locking scheme.
1693 * This is because we cannot sleep with the original spinlock
1694 * held.
1695 */
1696int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1697{
1698        struct sock *rsk;
1699
1700        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1701                struct dst_entry *dst = sk->sk_rx_dst;
1702
1703                sock_rps_save_rxhash(sk, skb);
1704                sk_mark_napi_id(sk, skb);
1705                if (dst) {
1706                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1707                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1708                                             dst, 0)) {
1709                                dst_release(dst);
1710                                sk->sk_rx_dst = NULL;
1711                        }
1712                }
1713                tcp_rcv_established(sk, skb);
1714                return 0;
1715        }
1716
1717        if (tcp_checksum_complete(skb))
1718                goto csum_err;
1719
1720        if (sk->sk_state == TCP_LISTEN) {
1721                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1722
1723                if (!nsk)
1724                        goto discard;
1725                if (nsk != sk) {
1726                        if (tcp_child_process(sk, nsk, skb)) {
1727                                rsk = nsk;
1728                                goto reset;
1729                        }
1730                        return 0;
1731                }
1732        } else
1733                sock_rps_save_rxhash(sk, skb);
1734
1735        if (tcp_rcv_state_process(sk, skb)) {
1736                rsk = sk;
1737                goto reset;
1738        }
1739        return 0;
1740
1741reset:
1742        tcp_v4_send_reset(rsk, skb);
1743discard:
1744        kfree_skb(skb);
1745        /* Be careful here. If this function gets more complicated and
1746         * gcc suffers from register pressure on the x86, sk (in %ebx)
1747         * might be destroyed here. This current version compiles correctly,
1748         * but you have been warned.
1749         */
1750        return 0;
1751
1752csum_err:
1753        trace_tcp_bad_csum(skb);
1754        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1755        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1756        goto discard;
1757}
1758EXPORT_SYMBOL(tcp_v4_do_rcv);
1759
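/* Early demux: look up an established socket straight from the driver RX
 * path and, when the cached rx dst is still valid for this device, attach
 * it to the skb so the main receive path can skip the routing lookup.
 */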
1760int tcp_v4_early_demux(struct sk_buff *skb)
1761{
1762        const struct iphdr *iph;
1763        const struct tcphdr *th;
1764        struct sock *sk;
1765
1766        if (skb->pkt_type != PACKET_HOST)
1767                return 0;
1768
1769        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1770                return 0;
1771
1772        iph = ip_hdr(skb);
1773        th = tcp_hdr(skb);
1774
1775        if (th->doff < sizeof(struct tcphdr) / 4)
1776                return 0;
1777
1778        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1779                                       iph->saddr, th->source,
1780                                       iph->daddr, ntohs(th->dest),
1781                                       skb->skb_iif, inet_sdif(skb));
1782        if (sk) {
1783                skb->sk = sk;
1784                skb->destructor = sock_edemux;
1785                if (sk_fullsock(sk)) {
1786                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1787
1788                        if (dst)
1789                                dst = dst_check(dst, 0);
1790                        if (dst &&
1791                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1792                                skb_dst_set_noref(skb, dst);
1793                }
1794        }
1795        return 0;
1796}
1797
1798bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1799{
1800        u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1801        u32 tail_gso_size, tail_gso_segs;
1802        struct skb_shared_info *shinfo;
1803        const struct tcphdr *th;
1804        struct tcphdr *thtail;
1805        struct sk_buff *tail;
1806        unsigned int hdrlen;
1807        bool fragstolen;
1808        u32 gso_segs;
1809        u32 gso_size;
1810        int delta;
1811
1812        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1813         * we can fix skb->truesize to its real value to avoid future drops.
1814         * This is valid because skb is not yet charged to the socket.
1815         * It has been noticed that pure SACK packets were sometimes dropped
1816         * (if cooked by drivers without the copybreak feature).
1817         */
1818        skb_condense(skb);
1819
1820        skb_dst_drop(skb);
1821
1822        if (unlikely(tcp_checksum_complete(skb))) {
1823                bh_unlock_sock(sk);
1824                trace_tcp_bad_csum(skb);
1825                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1826                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1827                return true;
1828        }
1829
1830        /* Attempt coalescing to last skb in backlog, even if we are
1831         * above the limits.
1832         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1833         */
1834        th = (const struct tcphdr *)skb->data;
1835        hdrlen = th->doff * 4;
1836
1837        tail = sk->sk_backlog.tail;
1838        if (!tail)
1839                goto no_coalesce;
1840        thtail = (struct tcphdr *)tail->data;
1841
1842        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1843            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1844            ((TCP_SKB_CB(tail)->tcp_flags |
1845              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1846            !((TCP_SKB_CB(tail)->tcp_flags &
1847              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1848            ((TCP_SKB_CB(tail)->tcp_flags ^
1849              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1850#ifdef CONFIG_TLS_DEVICE
1851            tail->decrypted != skb->decrypted ||
1852#endif
1853            thtail->doff != th->doff ||
1854            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1855                goto no_coalesce;
1856
1857        __skb_pull(skb, hdrlen);
1858
1859        shinfo = skb_shinfo(skb);
1860        gso_size = shinfo->gso_size ?: skb->len;
1861        gso_segs = shinfo->gso_segs ?: 1;
1862
1863        shinfo = skb_shinfo(tail);
1864        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1865        tail_gso_segs = shinfo->gso_segs ?: 1;
1866
1867        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1868                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1869
1870                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1871                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1872                        thtail->window = th->window;
1873                }
1874
1875                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1876                 * thtail->fin, so that the fast path in tcp_rcv_established()
1877                 * is not entered if we append a packet with a FIN.
1878                 * SYN, RST, URG are not present.
1879                 * ACK is set on both packets.
1880                 * PSH : we do not really care in TCP stack,
1881                 *       at least for 'GRO' packets.
1882                 */
1883                thtail->fin |= th->fin;
1884                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1885
1886                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1887                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1888                        tail->tstamp = skb->tstamp;
1889                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1890                }
1891
1892                /* Not as strict as GRO. We only need to carry mss max value */
1893                shinfo->gso_size = max(gso_size, tail_gso_size);
1894                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1895
1896                sk->sk_backlog.len += delta;
1897                __NET_INC_STATS(sock_net(sk),
1898                                LINUX_MIB_TCPBACKLOGCOALESCE);
1899                kfree_skb_partial(skb, fragstolen);
1900                return false;
1901        }
1902        __skb_push(skb, hdrlen);
1903
1904no_coalesce:
1905        /* Only the socket owner can try to collapse/prune rx queues
1906         * to reduce memory overhead, so add a little headroom here.
1907         * Only a few socket backlogs are likely to be non-empty at once.
1908         */
1909        limit += 64*1024;
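	/* For instance, assuming tcp_rmem[1]/tcp_wmem[1] are at their usual
	 * defaults (128 KB / 16 KB), the limit starts out around
	 * 128K + 16K + 64K = 208 KB per socket, and grows later as receive
	 * buffer autotuning raises sk_rcvbuf.
	 */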
1910
1911        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1912                bh_unlock_sock(sk);
1913                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1914                return true;
1915        }
1916        return false;
1917}
1918EXPORT_SYMBOL(tcp_add_backlog);
1919
1920int tcp_filter(struct sock *sk, struct sk_buff *skb)
1921{
1922        struct tcphdr *th = (struct tcphdr *)skb->data;
1923
1924        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1925}
1926EXPORT_SYMBOL(tcp_filter);
1927
1928static void tcp_v4_restore_cb(struct sk_buff *skb)
1929{
1930        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1931                sizeof(struct inet_skb_parm));
1932}
1933
1934static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1935                           const struct tcphdr *th)
1936{
1937        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1938         * barrier() makes sure the compiler won't play fool^Waliasing games.
1939         */
1940        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1941                sizeof(struct inet_skb_parm));
1942        barrier();
1943
1944        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1945        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1946                                    skb->len - th->doff * 4);
1947        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1948        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1949        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1950        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1951        TCP_SKB_CB(skb)->sacked  = 0;
1952        TCP_SKB_CB(skb)->has_rxtstamp =
1953                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1954}
1955
1956/*
1957 *      From tcp_input.c
1958 */
1959
1960int tcp_v4_rcv(struct sk_buff *skb)
1961{
1962        struct net *net = dev_net(skb->dev);
1963        struct sk_buff *skb_to_free;
1964        int sdif = inet_sdif(skb);
1965        int dif = inet_iif(skb);
1966        const struct iphdr *iph;
1967        const struct tcphdr *th;
1968        bool refcounted;
1969        struct sock *sk;
1970        int ret;
1971
1972        if (skb->pkt_type != PACKET_HOST)
1973                goto discard_it;
1974
1975        /* Count it even if it's bad */
1976        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1977
1978        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1979                goto discard_it;
1980
1981        th = (const struct tcphdr *)skb->data;
1982
1983        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1984                goto bad_packet;
1985        if (!pskb_may_pull(skb, th->doff * 4))
1986                goto discard_it;
1987
1988        /* An explanation is required here, I think.
1989         * Packet length and doff are validated by header prediction,
1990         * provided the case of th->doff == 0 is eliminated.
1991         * So, we defer the checks. */
1992
1993        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1994                goto csum_error;
1995
1996        th = (const struct tcphdr *)skb->data;
1997        iph = ip_hdr(skb);
1998lookup:
1999        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
2000                               th->dest, sdif, &refcounted);
2001        if (!sk)
2002                goto no_tcp_socket;
2003
2004process:
2005        if (sk->sk_state == TCP_TIME_WAIT)
2006                goto do_time_wait;
2007
2008        if (sk->sk_state == TCP_NEW_SYN_RECV) {
2009                struct request_sock *req = inet_reqsk(sk);
2010                bool req_stolen = false;
2011                struct sock *nsk;
2012
2013                sk = req->rsk_listener;
2014                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2015                        sk_drops_add(sk, skb);
2016                        reqsk_put(req);
2017                        goto discard_it;
2018                }
2019                if (tcp_checksum_complete(skb)) {
2020                        reqsk_put(req);
2021                        goto csum_error;
2022                }
2023                if (unlikely(sk->sk_state != TCP_LISTEN)) {
2024                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2025                        if (!nsk) {
2026                                inet_csk_reqsk_queue_drop_and_put(sk, req);
2027                                goto lookup;
2028                        }
2029                        sk = nsk;
2030                        /* reuseport_migrate_sock() has already taken one sk_refcnt
2031                         * before returning.
2032                         */
2033                } else {
2034                        /* We own a reference on the listener, increase it again
2035                         * as we might lose it too soon.
2036                         */
2037                        sock_hold(sk);
2038                }
2039                refcounted = true;
2040                nsk = NULL;
2041                if (!tcp_filter(sk, skb)) {
2042                        th = (const struct tcphdr *)skb->data;
2043                        iph = ip_hdr(skb);
2044                        tcp_v4_fill_cb(skb, iph, th);
2045                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2046                }
2047                if (!nsk) {
2048                        reqsk_put(req);
2049                        if (req_stolen) {
2050                                /* Another cpu got exclusive access to req
2051                                 * and created a full blown socket.
2052                                 * Try to feed this packet to this socket
2053                                 * instead of discarding it.
2054                                 */
2055                                tcp_v4_restore_cb(skb);
2056                                sock_put(sk);
2057                                goto lookup;
2058                        }
2059                        goto discard_and_relse;
2060                }
2061                if (nsk == sk) {
2062                        reqsk_put(req);
2063                        tcp_v4_restore_cb(skb);
2064                } else if (tcp_child_process(sk, nsk, skb)) {
2065                        tcp_v4_send_reset(nsk, skb);
2066                        goto discard_and_relse;
2067                } else {
2068                        sock_put(sk);
2069                        return 0;
2070                }
2071        }
2072        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2073                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2074                goto discard_and_relse;
2075        }
2076
2077        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2078                goto discard_and_relse;
2079
2080        if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2081                goto discard_and_relse;
2082
2083        nf_reset_ct(skb);
2084
2085        if (tcp_filter(sk, skb))
2086                goto discard_and_relse;
2087        th = (const struct tcphdr *)skb->data;
2088        iph = ip_hdr(skb);
2089        tcp_v4_fill_cb(skb, iph, th);
2090
2091        skb->dev = NULL;
2092
2093        if (sk->sk_state == TCP_LISTEN) {
2094                ret = tcp_v4_do_rcv(sk, skb);
2095                goto put_and_return;
2096        }
2097
2098        sk_incoming_cpu_update(sk);
2099
2100        bh_lock_sock_nested(sk);
2101        tcp_segs_in(tcp_sk(sk), skb);
2102        ret = 0;
2103        if (!sock_owned_by_user(sk)) {
2104                skb_to_free = sk->sk_rx_skb_cache;
2105                sk->sk_rx_skb_cache = NULL;
2106                ret = tcp_v4_do_rcv(sk, skb);
2107        } else {
2108                if (tcp_add_backlog(sk, skb))
2109                        goto discard_and_relse;
2110                skb_to_free = NULL;
2111        }
2112        bh_unlock_sock(sk);
2113        if (skb_to_free)
2114                __kfree_skb(skb_to_free);
2115
2116put_and_return:
2117        if (refcounted)
2118                sock_put(sk);
2119
2120        return ret;
2121
2122no_tcp_socket:
2123        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2124                goto discard_it;
2125
2126        tcp_v4_fill_cb(skb, iph, th);
2127
2128        if (tcp_checksum_complete(skb)) {
2129csum_error:
2130                trace_tcp_bad_csum(skb);
2131                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2132bad_packet:
2133                __TCP_INC_STATS(net, TCP_MIB_INERRS);
2134        } else {
2135                tcp_v4_send_reset(NULL, skb);
2136        }
2137
2138discard_it:
2139        /* Discard frame. */
2140        kfree_skb(skb);
2141        return 0;
2142
2143discard_and_relse:
2144        sk_drops_add(sk, skb);
2145        if (refcounted)
2146                sock_put(sk);
2147        goto discard_it;
2148
2149do_time_wait:
2150        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2151                inet_twsk_put(inet_twsk(sk));
2152                goto discard_it;
2153        }
2154
2155        tcp_v4_fill_cb(skb, iph, th);
2156
2157        if (tcp_checksum_complete(skb)) {
2158                inet_twsk_put(inet_twsk(sk));
2159                goto csum_error;
2160        }
2161        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2162        case TCP_TW_SYN: {
2163                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2164                                                        &tcp_hashinfo, skb,
2165                                                        __tcp_hdrlen(th),
2166                                                        iph->saddr, th->source,
2167                                                        iph->daddr, th->dest,
2168                                                        inet_iif(skb),
2169                                                        sdif);
2170                if (sk2) {
2171                        inet_twsk_deschedule_put(inet_twsk(sk));
2172                        sk = sk2;
2173                        tcp_v4_restore_cb(skb);
2174                        refcounted = false;
2175                        goto process;
2176                }
2177        }
2178                /* to ACK */
2179                fallthrough;
2180        case TCP_TW_ACK:
2181                tcp_v4_timewait_ack(sk, skb);
2182                break;
2183        case TCP_TW_RST:
2184                tcp_v4_send_reset(sk, skb);
2185                inet_twsk_deschedule_put(inet_twsk(sk));
2186                goto discard_it;
2187        case TCP_TW_SUCCESS:;
2188        }
2189        goto discard_it;
2190}
2191
2192static struct timewait_sock_ops tcp_timewait_sock_ops = {
2193        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2194        .twsk_unique    = tcp_twsk_unique,
2195        .twsk_destructor= tcp_twsk_destructor,
2196};
2197
2198void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2199{
2200        struct dst_entry *dst = skb_dst(skb);
2201
2202        if (dst && dst_hold_safe(dst)) {
2203                sk->sk_rx_dst = dst;
2204                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2205        }
2206}
2207EXPORT_SYMBOL(inet_sk_rx_dst_set);
2208
2209const struct inet_connection_sock_af_ops ipv4_specific = {
2210        .queue_xmit        = ip_queue_xmit,
2211        .send_check        = tcp_v4_send_check,
2212        .rebuild_header    = inet_sk_rebuild_header,
2213        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2214        .conn_request      = tcp_v4_conn_request,
2215        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2216        .net_header_len    = sizeof(struct iphdr),
2217        .setsockopt        = ip_setsockopt,
2218        .getsockopt        = ip_getsockopt,
2219        .addr2sockaddr     = inet_csk_addr2sockaddr,
2220        .sockaddr_len      = sizeof(struct sockaddr_in),
2221        .mtu_reduced       = tcp_v4_mtu_reduced,
2222};
2223EXPORT_SYMBOL(ipv4_specific);
2224
2225#ifdef CONFIG_TCP_MD5SIG
2226static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2227        .md5_lookup             = tcp_v4_md5_lookup,
2228        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2229        .md5_parse              = tcp_v4_parse_md5_keys,
2230};
2231#endif
2232
2233/* NOTE: A lot of things are set to zero explicitly by the call to
2234 *       sk_alloc(), so they need not be done here.
2235 */
2236static int tcp_v4_init_sock(struct sock *sk)
2237{
2238        struct inet_connection_sock *icsk = inet_csk(sk);
2239
2240        tcp_init_sock(sk);
2241
2242        icsk->icsk_af_ops = &ipv4_specific;
2243
2244#ifdef CONFIG_TCP_MD5SIG
2245        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2246#endif
2247
2248        return 0;
2249}
2250
2251void tcp_v4_destroy_sock(struct sock *sk)
2252{
2253        struct tcp_sock *tp = tcp_sk(sk);
2254
2255        trace_tcp_destroy_sock(sk);
2256
2257        tcp_clear_xmit_timers(sk);
2258
2259        tcp_cleanup_congestion_control(sk);
2260
2261        tcp_cleanup_ulp(sk);
2262
2263        /* Cleanup up the write buffer. */
2264        tcp_write_queue_purge(sk);
2265
2266        /* Check if we want to disable active TFO */
2267        tcp_fastopen_active_disable_ofo_check(sk);
2268
2269        /* Cleans up our, hopefully empty, out_of_order_queue. */
2270        skb_rbtree_purge(&tp->out_of_order_queue);
2271
2272#ifdef CONFIG_TCP_MD5SIG
2273        /* Clean up the MD5 key list, if any */
2274        if (tp->md5sig_info) {
2275                tcp_clear_md5_list(sk);
2276                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2277                tp->md5sig_info = NULL;
2278        }
2279#endif
2280
2281        /* Clean up a referenced TCP bind bucket. */
2282        if (inet_csk(sk)->icsk_bind_hash)
2283                inet_put_port(sk);
2284
2285        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2286
2287        /* If socket is aborted during connect operation */
2288        tcp_free_fastopen_req(tp);
2289        tcp_fastopen_destroy_cipher(sk);
2290        tcp_saved_syn_free(tp);
2291
2292        sk_sockets_allocated_dec(sk);
2293}
2294EXPORT_SYMBOL(tcp_v4_destroy_sock);
2295
2296#ifdef CONFIG_PROC_FS
2297/* Proc filesystem TCP sock list dumping. */
2298
2299static unsigned short seq_file_family(const struct seq_file *seq);
2300
2301static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2302{
2303        unsigned short family = seq_file_family(seq);
2304
2305        /* AF_UNSPEC is used as a match all */
2306        return ((family == AF_UNSPEC || family == sk->sk_family) &&
2307                net_eq(sock_net(sk), seq_file_net(seq)));
2308}
2309
2310/* Find a non empty bucket (starting from st->bucket)
2311 * and return the first sk from it.
2312 */
2313static void *listening_get_first(struct seq_file *seq)
2314{
2315        struct tcp_iter_state *st = seq->private;
2316
2317        st->offset = 0;
2318        for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2319                struct inet_listen_hashbucket *ilb2;
2320                struct inet_connection_sock *icsk;
2321                struct sock *sk;
2322
2323                ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2324                if (hlist_empty(&ilb2->head))
2325                        continue;
2326
2327                spin_lock(&ilb2->lock);
2328                inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2329                        sk = (struct sock *)icsk;
2330                        if (seq_sk_match(seq, sk))
2331                                return sk;
2332                }
2333                spin_unlock(&ilb2->lock);
2334        }
2335
2336        return NULL;
2337}
2338
2339/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2340 * If "cur" is the last one in the st->bucket,
2341 * call listening_get_first() to return the first sk of the next
2342 * non empty bucket.
2343 */
2344static void *listening_get_next(struct seq_file *seq, void *cur)
2345{
2346        struct tcp_iter_state *st = seq->private;
2347        struct inet_listen_hashbucket *ilb2;
2348        struct inet_connection_sock *icsk;
2349        struct sock *sk = cur;
2350
2351        ++st->num;
2352        ++st->offset;
2353
2354        icsk = inet_csk(sk);
2355        inet_lhash2_for_each_icsk_continue(icsk) {
2356                sk = (struct sock *)icsk;
2357                if (seq_sk_match(seq, sk))
2358                        return sk;
2359        }
2360
2361        ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2362        spin_unlock(&ilb2->lock);
2363        ++st->bucket;
2364        return listening_get_first(seq);
2365}
2366
2367static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2368{
2369        struct tcp_iter_state *st = seq->private;
2370        void *rc;
2371
2372        st->bucket = 0;
2373        st->offset = 0;
2374        rc = listening_get_first(seq);
2375
2376        while (rc && *pos) {
2377                rc = listening_get_next(seq, rc);
2378                --*pos;
2379        }
2380        return rc;
2381}
2382
2383static inline bool empty_bucket(const struct tcp_iter_state *st)
2384{
2385        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2386}
2387
2388/*
2389 * Get first established socket starting from bucket given in st->bucket.
2390 * If st->bucket is zero, the very first socket in the hash is returned.
2391 */
2392static void *established_get_first(struct seq_file *seq)
2393{
2394        struct tcp_iter_state *st = seq->private;
2395
2396        st->offset = 0;
2397        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2398                struct sock *sk;
2399                struct hlist_nulls_node *node;
2400                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2401
2402                /* Lockless fast path for the common case of empty buckets */
2403                if (empty_bucket(st))
2404                        continue;
2405
2406                spin_lock_bh(lock);
2407                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2408                        if (seq_sk_match(seq, sk))
2409                                return sk;
2410                }
2411                spin_unlock_bh(lock);
2412        }
2413
2414        return NULL;
2415}
2416
2417static void *established_get_next(struct seq_file *seq, void *cur)
2418{
2419        struct sock *sk = cur;
2420        struct hlist_nulls_node *node;
2421        struct tcp_iter_state *st = seq->private;
2422
2423        ++st->num;
2424        ++st->offset;
2425
2426        sk = sk_nulls_next(sk);
2427
2428        sk_nulls_for_each_from(sk, node) {
2429                if (seq_sk_match(seq, sk))
2430                        return sk;
2431        }
2432
2433        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2434        ++st->bucket;
2435        return established_get_first(seq);
2436}
2437
2438static void *established_get_idx(struct seq_file *seq, loff_t pos)
2439{
2440        struct tcp_iter_state *st = seq->private;
2441        void *rc;
2442
2443        st->bucket = 0;
2444        rc = established_get_first(seq);
2445
2446        while (rc && pos) {
2447                rc = established_get_next(seq, rc);
2448                --pos;
2449        }
2450        return rc;
2451}
2452
2453static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2454{
2455        void *rc;
2456        struct tcp_iter_state *st = seq->private;
2457
2458        st->state = TCP_SEQ_STATE_LISTENING;
2459        rc        = listening_get_idx(seq, &pos);
2460
2461        if (!rc) {
2462                st->state = TCP_SEQ_STATE_ESTABLISHED;
2463                rc        = established_get_idx(seq, pos);
2464        }
2465
2466        return rc;
2467}
2468
2469static void *tcp_seek_last_pos(struct seq_file *seq)
2470{
2471        struct tcp_iter_state *st = seq->private;
2472        int bucket = st->bucket;
2473        int offset = st->offset;
2474        int orig_num = st->num;
2475        void *rc = NULL;
2476
2477        switch (st->state) {
2478        case TCP_SEQ_STATE_LISTENING:
2479                if (st->bucket > tcp_hashinfo.lhash2_mask)
2480                        break;
2481                st->state = TCP_SEQ_STATE_LISTENING;
2482                rc = listening_get_first(seq);
2483                while (offset-- && rc && bucket == st->bucket)
2484                        rc = listening_get_next(seq, rc);
2485                if (rc)
2486                        break;
2487                st->bucket = 0;
2488                st->state = TCP_SEQ_STATE_ESTABLISHED;
2489                fallthrough;
2490        case TCP_SEQ_STATE_ESTABLISHED:
2491                if (st->bucket > tcp_hashinfo.ehash_mask)
2492                        break;
2493                rc = established_get_first(seq);
2494                while (offset-- && rc && bucket == st->bucket)
2495                        rc = established_get_next(seq, rc);
2496        }
2497
2498        st->num = orig_num;
2499
2500        return rc;
2501}
2502
2503void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2504{
2505        struct tcp_iter_state *st = seq->private;
2506        void *rc;
2507
2508        if (*pos && *pos == st->last_pos) {
2509                rc = tcp_seek_last_pos(seq);
2510                if (rc)
2511                        goto out;
2512        }
2513
2514        st->state = TCP_SEQ_STATE_LISTENING;
2515        st->num = 0;
2516        st->bucket = 0;
2517        st->offset = 0;
2518        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2519
2520out:
2521        st->last_pos = *pos;
2522        return rc;
2523}
2524EXPORT_SYMBOL(tcp_seq_start);
2525
2526void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527{
2528        struct tcp_iter_state *st = seq->private;
2529        void *rc = NULL;
2530
2531        if (v == SEQ_START_TOKEN) {
2532                rc = tcp_get_idx(seq, 0);
2533                goto out;
2534        }
2535
2536        switch (st->state) {
2537        case TCP_SEQ_STATE_LISTENING:
2538                rc = listening_get_next(seq, v);
2539                if (!rc) {
2540                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2541                        st->bucket = 0;
2542                        st->offset = 0;
2543                        rc        = established_get_first(seq);
2544                }
2545                break;
2546        case TCP_SEQ_STATE_ESTABLISHED:
2547                rc = established_get_next(seq, v);
2548                break;
2549        }
2550out:
2551        ++*pos;
2552        st->last_pos = *pos;
2553        return rc;
2554}
2555EXPORT_SYMBOL(tcp_seq_next);
2556
2557void tcp_seq_stop(struct seq_file *seq, void *v)
2558{
2559        struct tcp_iter_state *st = seq->private;
2560
2561        switch (st->state) {
2562        case TCP_SEQ_STATE_LISTENING:
2563                if (v != SEQ_START_TOKEN)
2564                        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2565                break;
2566        case TCP_SEQ_STATE_ESTABLISHED:
2567                if (v)
2568                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2569                break;
2570        }
2571}
2572EXPORT_SYMBOL(tcp_seq_stop);
2573
2574static void get_openreq4(const struct request_sock *req,
2575                         struct seq_file *f, int i)
2576{
2577        const struct inet_request_sock *ireq = inet_rsk(req);
2578        long delta = req->rsk_timer.expires - jiffies;
2579
2580        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2581                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2582                i,
2583                ireq->ir_loc_addr,
2584                ireq->ir_num,
2585                ireq->ir_rmt_addr,
2586                ntohs(ireq->ir_rmt_port),
2587                TCP_SYN_RECV,
2588                0, 0, /* could print option size, but that is af dependent. */
2589                1,    /* timers active (only the expire timer) */
2590                jiffies_delta_to_clock_t(delta),
2591                req->num_timeout,
2592                from_kuid_munged(seq_user_ns(f),
2593                                 sock_i_uid(req->rsk_listener)),
2594                0,  /* non standard timer */
2595                0, /* open_requests have no inode */
2596                0,
2597                req);
2598}
2599
2600static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2601{
2602        int timer_active;
2603        unsigned long timer_expires;
2604        const struct tcp_sock *tp = tcp_sk(sk);
2605        const struct inet_connection_sock *icsk = inet_csk(sk);
2606        const struct inet_sock *inet = inet_sk(sk);
2607        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2608        __be32 dest = inet->inet_daddr;
2609        __be32 src = inet->inet_rcv_saddr;
2610        __u16 destp = ntohs(inet->inet_dport);
2611        __u16 srcp = ntohs(inet->inet_sport);
2612        int rx_queue;
2613        int state;
2614
2615        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2616            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2617            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2618                timer_active    = 1;
2619                timer_expires   = icsk->icsk_timeout;
2620        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2621                timer_active    = 4;
2622                timer_expires   = icsk->icsk_timeout;
2623        } else if (timer_pending(&sk->sk_timer)) {
2624                timer_active    = 2;
2625                timer_expires   = sk->sk_timer.expires;
2626        } else {
2627                timer_active    = 0;
2628                timer_expires = jiffies;
2629        }
2630
2631        state = inet_sk_state_load(sk);
2632        if (state == TCP_LISTEN)
2633                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2634        else
2635                /* Because we don't lock the socket,
2636                 * we might find a transient negative value.
2637                 */
2638                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2639                                      READ_ONCE(tp->copied_seq), 0);
2640
2641        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2642                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2643                i, src, srcp, dest, destp, state,
2644                READ_ONCE(tp->write_seq) - tp->snd_una,
2645                rx_queue,
2646                timer_active,
2647                jiffies_delta_to_clock_t(timer_expires - jiffies),
2648                icsk->icsk_retransmits,
2649                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2650                icsk->icsk_probes_out,
2651                sock_i_ino(sk),
2652                refcount_read(&sk->sk_refcnt), sk,
2653                jiffies_to_clock_t(icsk->icsk_rto),
2654                jiffies_to_clock_t(icsk->icsk_ack.ato),
2655                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2656                tp->snd_cwnd,
2657                state == TCP_LISTEN ?
2658                    fastopenq->max_qlen :
2659                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2660}
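
/* For illustration, a socket listening on 127.0.0.1:3306 (uid 999) would be
 * rendered by the format above roughly as (remaining fields elided, values
 * made up):
 *
 *	12: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * Addresses are raw __be32 values printed with %08X (hence byte-swapped on
 * little-endian hosts), ports are host-order hex, and 0A is TCP_LISTEN.
 */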
2661
2662static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2663                               struct seq_file *f, int i)
2664{
2665        long delta = tw->tw_timer.expires - jiffies;
2666        __be32 dest, src;
2667        __u16 destp, srcp;
2668
2669        dest  = tw->tw_daddr;
2670        src   = tw->tw_rcv_saddr;
2671        destp = ntohs(tw->tw_dport);
2672        srcp  = ntohs(tw->tw_sport);
2673
2674        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2675                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2676                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2677                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2678                refcount_read(&tw->tw_refcnt), tw);
2679}
2680
2681#define TMPSZ 150
2682
2683static int tcp4_seq_show(struct seq_file *seq, void *v)
2684{
2685        struct tcp_iter_state *st;
2686        struct sock *sk = v;
2687
2688        seq_setwidth(seq, TMPSZ - 1);
2689        if (v == SEQ_START_TOKEN) {
2690                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2691                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2692                           "inode");
2693                goto out;
2694        }
2695        st = seq->private;
2696
2697        if (sk->sk_state == TCP_TIME_WAIT)
2698                get_timewait4_sock(v, seq, st->num);
2699        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2700                get_openreq4(v, seq, st->num);
2701        else
2702                get_tcp4_sock(v, seq, st->num);
2703out:
2704        seq_pad(seq, '\n');
2705        return 0;
2706}
2707
2708#ifdef CONFIG_BPF_SYSCALL
2709struct bpf_tcp_iter_state {
2710        struct tcp_iter_state state;
2711        unsigned int cur_sk;
2712        unsigned int end_sk;
2713        unsigned int max_sk;
2714        struct sock **batch;
2715        bool st_bucket_done;
2716};
2717
2718struct bpf_iter__tcp {
2719        __bpf_md_ptr(struct bpf_iter_meta *, meta);
2720        __bpf_md_ptr(struct sock_common *, sk_common);
2721        uid_t uid __aligned(8);
2722};
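
/*
 * A minimal BPF iterator program attaching to this context could look like
 * the sketch below (built against vmlinux.h with libbpf; error handling and
 * the user-space loader are omitted):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp4(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc || skc->skc_family != AF_INET)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%08x:%04x\n",
 *			       skc->skc_rcv_saddr, skc->skc_num);
 *		return 0;
 *	}
 */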
2723
2724static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2725                             struct sock_common *sk_common, uid_t uid)
2726{
2727        struct bpf_iter__tcp ctx;
2728
2729        meta->seq_num--;  /* skip SEQ_START_TOKEN */
2730        ctx.meta = meta;
2731        ctx.sk_common = sk_common;
2732        ctx.uid = uid;
2733        return bpf_iter_run_prog(prog, &ctx);
2734}
2735
2736static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2737{
2738        while (iter->cur_sk < iter->end_sk)
2739                sock_put(iter->batch[iter->cur_sk++]);
2740}
2741
2742static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2743                                      unsigned int new_batch_sz)
2744{
2745        struct sock **new_batch;
2746
2747        new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2748                             GFP_USER | __GFP_NOWARN);
2749        if (!new_batch)
2750                return -ENOMEM;
2751
2752        bpf_iter_tcp_put_batch(iter);
2753        kvfree(iter->batch);
2754        iter->batch = new_batch;
2755        iter->max_sk = new_batch_sz;
2756
2757        return 0;
2758}
2759
2760static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2761                                                 struct sock *start_sk)
2762{
2763        struct bpf_tcp_iter_state *iter = seq->private;
2764        struct tcp_iter_state *st = &iter->state;
2765        struct inet_connection_sock *icsk;
2766        unsigned int expected = 1;
2767        struct sock *sk;
2768
2769        sock_hold(start_sk);
2770        iter->batch[iter->end_sk++] = start_sk;
2771
2772        icsk = inet_csk(start_sk);
2773        inet_lhash2_for_each_icsk_continue(icsk) {
2774                sk = (struct sock *)icsk;
2775                if (seq_sk_match(seq, sk)) {
2776                        if (iter->end_sk < iter->max_sk) {
2777                                sock_hold(sk);
2778                                iter->batch[iter->end_sk++] = sk;
2779                        }
2780                        expected++;
2781                }
2782        }
2783        spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2784
2785        return expected;
2786}
2787
2788static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2789                                                   struct sock *start_sk)
2790{
2791        struct bpf_tcp_iter_state *iter = seq->private;
2792        struct tcp_iter_state *st = &iter->state;
2793        struct hlist_nulls_node *node;
2794        unsigned int expected = 1;
2795        struct sock *sk;
2796
2797        sock_hold(start_sk);
2798        iter->batch[iter->end_sk++] = start_sk;
2799
2800        sk = sk_nulls_next(start_sk);
2801        sk_nulls_for_each_from(sk, node) {
2802                if (seq_sk_match(seq, sk)) {
2803                        if (iter->end_sk < iter->max_sk) {
2804                                sock_hold(sk);
2805                                iter->batch[iter->end_sk++] = sk;
2806                        }
2807                        expected++;
2808                }
2809        }
2810        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2811
2812        return expected;
2813}
2814
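/* Batch all matching sockets of the bucket found by tcp_seek_last_pos().
 * The two helpers above keep counting matching sockets ("expected") even
 * once the batch array is full, so that the caller below can grow the
 * array to 3/2 * expected and retry the same bucket once.
 */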
2815static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2816{
2817        struct bpf_tcp_iter_state *iter = seq->private;
2818        struct tcp_iter_state *st = &iter->state;
2819        unsigned int expected;
2820        bool resized = false;
2821        struct sock *sk;
2822
2823        /* The st->bucket is done.  Directly advance to the next
2824         * bucket instead of having tcp_seek_last_pos() skip through
2825         * the current bucket one by one, only to find out that it
2826         * has to advance to the next bucket anyway.
2827         */
2828        if (iter->st_bucket_done) {
2829                st->offset = 0;
2830                st->bucket++;
2831                if (st->state == TCP_SEQ_STATE_LISTENING &&
2832                    st->bucket > tcp_hashinfo.lhash2_mask) {
2833                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2834                        st->bucket = 0;
2835                }
2836        }
2837
2838again:
2839        /* Get a new batch */
2840        iter->cur_sk = 0;
2841        iter->end_sk = 0;
2842        iter->st_bucket_done = false;
2843
2844        sk = tcp_seek_last_pos(seq);
2845        if (!sk)
2846                return NULL; /* Done */
2847
2848        if (st->state == TCP_SEQ_STATE_LISTENING)
2849                expected = bpf_iter_tcp_listening_batch(seq, sk);
2850        else
2851                expected = bpf_iter_tcp_established_batch(seq, sk);
2852
2853        if (iter->end_sk == expected) {
2854                iter->st_bucket_done = true;
2855                return sk;
2856        }
2857
2858        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2859                resized = true;
2860                goto again;
2861        }
2862
2863        return sk;
2864}
2865
2866static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2867{
2868        /* bpf iter does not support lseek, so it always
2869         * continues from where it was stop()-ped.
2870         */
2871        if (*pos)
2872                return bpf_iter_tcp_batch(seq);
2873
2874        return SEQ_START_TOKEN;
2875}
2876
2877static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2878{
2879        struct bpf_tcp_iter_state *iter = seq->private;
2880        struct tcp_iter_state *st = &iter->state;
2881        struct sock *sk;
2882
2883        /* Whenever seq_next() is called, seq_show() is done with
2884         * the sk at iter->cur_sk, so advance to the next sk in
2885         * the batch.
2886         */
2887        if (iter->cur_sk < iter->end_sk) {
2888                /* Keeping st->num consistent in tcp_iter_state.
2889                 * bpf_iter_tcp does not use st->num.
2890                 * meta.seq_num is used instead.
2891                 */
2892                st->num++;
2893                /* Move st->offset to the next sk in the bucket such that
2894                 * the future start() will resume at st->offset in
2895                 * st->bucket.  See tcp_seek_last_pos().
2896                 */
2897                st->offset++;
2898                sock_put(iter->batch[iter->cur_sk++]);
2899        }
2900
2901        if (iter->cur_sk < iter->end_sk)
2902                sk = iter->batch[iter->cur_sk];
2903        else
2904                sk = bpf_iter_tcp_batch(seq);
2905
2906        ++*pos;
2907        /* Keeping st->last_pos consistent in tcp_iter_state.
2908         * bpf iter does not do lseek, so st->last_pos always equals *pos.
2909         */
2910        st->last_pos = *pos;
2911        return sk;
2912}
2913
2914static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2915{
2916        struct bpf_iter_meta meta;
2917        struct bpf_prog *prog;
2918        struct sock *sk = v;
2919        bool slow;
2920        uid_t uid;
2921        int ret;
2922
2923        if (v == SEQ_START_TOKEN)
2924                return 0;
2925
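        /* Only full sockets have a socket lock that lock_sock_fast() can
         * take; TCP_NEW_SYN_RECV and TCP_TIME_WAIT entries are shown
         * without locking, relying on the reference the batch holds.
         */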
2926        if (sk_fullsock(sk))
2927                slow = lock_sock_fast(sk);
2928
2929        if (unlikely(sk_unhashed(sk))) {
2930                ret = SEQ_SKIP;
2931                goto unlock;
2932        }
2933
2934        if (sk->sk_state == TCP_TIME_WAIT) {
2935                uid = 0;
2936        } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2937                const struct request_sock *req = v;
2938
2939                uid = from_kuid_munged(seq_user_ns(seq),
2940                                       sock_i_uid(req->rsk_listener));
2941        } else {
2942                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2943        }
2944
2945        meta.seq = seq;
2946        prog = bpf_iter_get_info(&meta, false);
2947        ret = tcp_prog_seq_show(prog, &meta, v, uid);
2948
2949unlock:
2950        if (sk_fullsock(sk))
2951                unlock_sock_fast(sk, slow);
2952        return ret;
2953
2954}
2955
2956static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2957{
2958        struct bpf_tcp_iter_state *iter = seq->private;
2959        struct bpf_iter_meta meta;
2960        struct bpf_prog *prog;
2961
2962        if (!v) {
2963                meta.seq = seq;
2964                prog = bpf_iter_get_info(&meta, true);
2965                if (prog)
2966                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
2967        }
2968
2969        if (iter->cur_sk < iter->end_sk) {
2970                bpf_iter_tcp_put_batch(iter);
2971                iter->st_bucket_done = false;
2972        }
2973}
2974
2975static const struct seq_operations bpf_iter_tcp_seq_ops = {
2976        .show           = bpf_iter_tcp_seq_show,
2977        .start          = bpf_iter_tcp_seq_start,
2978        .next           = bpf_iter_tcp_seq_next,
2979        .stop           = bpf_iter_tcp_seq_stop,
2980};
2981#endif
2982static unsigned short seq_file_family(const struct seq_file *seq)
2983{
2984        const struct tcp_seq_afinfo *afinfo;
2985
2986#ifdef CONFIG_BPF_SYSCALL
2987        /* Iterated from bpf_iter.  Let the bpf prog do the filtering instead. */
2988        if (seq->op == &bpf_iter_tcp_seq_ops)
2989                return AF_UNSPEC;
2990#endif
2991
2992        /* Iterated from proc fs */
2993        afinfo = PDE_DATA(file_inode(seq->file));
2994        return afinfo->family;
2995}
2996
2997static const struct seq_operations tcp4_seq_ops = {
2998        .show           = tcp4_seq_show,
2999        .start          = tcp_seq_start,
3000        .next           = tcp_seq_next,
3001        .stop           = tcp_seq_stop,
3002};
3003
3004static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3005        .family         = AF_INET,
3006};
3007
3008static int __net_init tcp4_proc_init_net(struct net *net)
3009{
3010        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3011                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3012                return -ENOMEM;
3013        return 0;
3014}
3015
3016static void __net_exit tcp4_proc_exit_net(struct net *net)
3017{
3018        remove_proc_entry("tcp", net->proc_net);
3019}
3020
3021static struct pernet_operations tcp4_net_ops = {
3022        .init = tcp4_proc_init_net,
3023        .exit = tcp4_proc_exit_net,
3024};
3025
3026int __init tcp4_proc_init(void)
3027{
3028        return register_pernet_subsys(&tcp4_net_ops);
3029}
3030
3031void tcp4_proc_exit(void)
3032{
3033        unregister_pernet_subsys(&tcp4_net_ops);
3034}
3035#endif /* CONFIG_PROC_FS */
3036
3037/* @wake is one when sk_stream_write_space() calls us.
3038 * EPOLLOUT is then sent only once notsent_bytes drops below half of
3039 * tcp_notsent_lowat(), mimicking the strategy of sock_def_write_space().
3040 */
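/* A worked example (the 128 KB figure is illustrative): with
 * tcp_notsent_lowat set to 128 KB, a plain poll() caller (wake == 0) sees
 * the socket writable while less than 128 KB is queued but unsent, whereas
 * sk_stream_write_space() (wake == 1) raises EPOLLOUT only once the unsent
 * backlog drops below 64 KB, since (notsent_bytes << 1) must stay under
 * the limit.
 */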
3041bool tcp_stream_memory_free(const struct sock *sk, int wake)
3042{
3043        const struct tcp_sock *tp = tcp_sk(sk);
3044        u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3045                            READ_ONCE(tp->snd_nxt);
3046
3047        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3048}
3049EXPORT_SYMBOL(tcp_stream_memory_free);
3050
3051struct proto tcp_prot = {
3052        .name                   = "TCP",
3053        .owner                  = THIS_MODULE,
3054        .close                  = tcp_close,
3055        .pre_connect            = tcp_v4_pre_connect,
3056        .connect                = tcp_v4_connect,
3057        .disconnect             = tcp_disconnect,
3058        .accept                 = inet_csk_accept,
3059        .ioctl                  = tcp_ioctl,
3060        .init                   = tcp_v4_init_sock,
3061        .destroy                = tcp_v4_destroy_sock,
3062        .shutdown               = tcp_shutdown,
3063        .setsockopt             = tcp_setsockopt,
3064        .getsockopt             = tcp_getsockopt,
3065        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3066        .keepalive              = tcp_set_keepalive,
3067        .recvmsg                = tcp_recvmsg,
3068        .sendmsg                = tcp_sendmsg,
3069        .sendpage               = tcp_sendpage,
3070        .backlog_rcv            = tcp_v4_do_rcv,
3071        .release_cb             = tcp_release_cb,
3072        .hash                   = inet_hash,
3073        .unhash                 = inet_unhash,
3074        .get_port               = inet_csk_get_port,
3075#ifdef CONFIG_BPF_SYSCALL
3076        .psock_update_sk_prot   = tcp_bpf_update_proto,
3077#endif
3078        .enter_memory_pressure  = tcp_enter_memory_pressure,
3079        .leave_memory_pressure  = tcp_leave_memory_pressure,
3080        .stream_memory_free     = tcp_stream_memory_free,
3081        .sockets_allocated      = &tcp_sockets_allocated,
3082        .orphan_count           = &tcp_orphan_count,
3083        .memory_allocated       = &tcp_memory_allocated,
3084        .memory_pressure        = &tcp_memory_pressure,
3085        .sysctl_mem             = sysctl_tcp_mem,
3086        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3087        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3088        .max_header             = MAX_TCP_HEADER,
3089        .obj_size               = sizeof(struct tcp_sock),
3090        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3091        .twsk_prot              = &tcp_timewait_sock_ops,
3092        .rsk_prot               = &tcp_request_sock_ops,
3093        .h.hashinfo             = &tcp_hashinfo,
3094        .no_autobind            = true,
3095        .diag_destroy           = tcp_abort,
3096};
3097EXPORT_SYMBOL(tcp_prot);
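/* tcp_prot is wired up to SOCK_STREAM/IPPROTO_TCP sockets through the
 * inetsw table in af_inet.c; every IPv4 TCP socket dispatches through the
 * callbacks above.
 */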
3098
3099static void __net_exit tcp_sk_exit(struct net *net)
3100{
3101        int cpu;
3102
3103        if (net->ipv4.tcp_congestion_control)
3104                bpf_module_put(net->ipv4.tcp_congestion_control,
3105                               net->ipv4.tcp_congestion_control->owner);
3106
3107        for_each_possible_cpu(cpu)
3108                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3109        free_percpu(net->ipv4.tcp_sk);
3110}
3111
3112static int __net_init tcp_sk_init(struct net *net)
3113{
3114        int res, cpu, cnt;
3115
3116        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3117        if (!net->ipv4.tcp_sk)
3118                return -ENOMEM;
3119
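        /* One kernel control socket per possible CPU; tcp_v4_send_reset()
         * and tcp_v4_send_ack() transmit RSTs and non-socket ACKs from
         * these.
         */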
3120        for_each_possible_cpu(cpu) {
3121                struct sock *sk;
3122
3123                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3124                                           IPPROTO_TCP, net);
3125                if (res)
3126                        goto fail;
3127                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3128
3129                /* Enforce IP_DF and IPID==0 for the RST and ACK
3130                 * packets sent in SYN-RECV and TIME-WAIT state.
3131                 */
3132                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3133
3134                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3135        }
3136
3137        net->ipv4.sysctl_tcp_ecn = 2;
3138        net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139
3140        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3141        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3142        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3143        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3144        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3145
3146        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3147        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3148        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3149
3150        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3151        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3152        net->ipv4.sysctl_tcp_syncookies = 1;
3153        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3154        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3155        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3156        net->ipv4.sysctl_tcp_orphan_retries = 0;
3157        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3158        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3159        net->ipv4.sysctl_tcp_tw_reuse = 2;
3160        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3161
3162        cnt = tcp_hashinfo.ehash_mask + 1;
3163        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3164        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3165
3166        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3167        net->ipv4.sysctl_tcp_sack = 1;
3168        net->ipv4.sysctl_tcp_window_scaling = 1;
3169        net->ipv4.sysctl_tcp_timestamps = 1;
3170        net->ipv4.sysctl_tcp_early_retrans = 3;
3171        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3172        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3173        net->ipv4.sysctl_tcp_retrans_collapse = 1;
3174        net->ipv4.sysctl_tcp_max_reordering = 300;
3175        net->ipv4.sysctl_tcp_dsack = 1;
3176        net->ipv4.sysctl_tcp_app_win = 31;
3177        net->ipv4.sysctl_tcp_adv_win_scale = 1;
3178        net->ipv4.sysctl_tcp_frto = 2;
3179        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3180        /* This limits the percentage of the congestion window which we
3181         * will allow a single TSO frame to consume.  Building TSO frames
3182         * which are too large can cause TCP streams to be bursty.
3183         */
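        /* e.g. with the default divisor of 3 and a congestion window of
         * 30 segments, tcp_tso_should_defer() lets a single burst consume
         * roughly 10 segments before it starts deferring.
         */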
3184        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3185        /* Default TSQ limit of 16 TSO segments */
3186        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3187        /* RFC 5961 challenge ACK rate limiting */
3188        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3189        net->ipv4.sysctl_tcp_min_tso_segs = 2;
3190        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3191        net->ipv4.sysctl_tcp_autocorking = 1;
3192        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3193        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3194        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3195        if (net != &init_net) {
3196                memcpy(net->ipv4.sysctl_tcp_rmem,
3197                       init_net.ipv4.sysctl_tcp_rmem,
3198                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3199                memcpy(net->ipv4.sysctl_tcp_wmem,
3200                       init_net.ipv4.sysctl_tcp_wmem,
3201                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202        }
3203        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3204        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3205        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3206        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3207        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3208        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3209
3210        /* Reno is always built in, so the fallback below cannot fail */
3211        if (!net_eq(net, &init_net) &&
3212            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3213                               init_net.ipv4.tcp_congestion_control->owner))
3214                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215        else
3216                net->ipv4.tcp_congestion_control = &tcp_reno;
3217
3218        return 0;
3219fail:
3220        tcp_sk_exit(net);
3221
3222        return res;
3223}
3224
3225static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3226{
3227        struct net *net;
3228
3229        inet_twsk_purge(&tcp_hashinfo, AF_INET);
3230
3231        list_for_each_entry(net, net_exit_list, exit_list)
3232                tcp_fastopen_ctx_destroy(net);
3233}
3234
3235static struct pernet_operations __net_initdata tcp_sk_ops = {
3236       .init       = tcp_sk_init,
3237       .exit       = tcp_sk_exit,
3238       .exit_batch = tcp_sk_exit_batch,
3239};
3240
3241#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3242DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3243                     struct sock_common *sk_common, uid_t uid)
3244
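/* Initial capacity of iter->batch; bpf_iter_tcp_batch() grows it on demand
 * when a bucket holds more sockets than this.
 */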
3245#define INIT_BATCH_SZ 16
3246
3247static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3248{
3249        struct bpf_tcp_iter_state *iter = priv_data;
3250        int err;
3251
3252        err = bpf_iter_init_seq_net(priv_data, aux);
3253        if (err)
3254                return err;
3255
3256        err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3257        if (err) {
3258                bpf_iter_fini_seq_net(priv_data);
3259                return err;
3260        }
3261
3262        return 0;
3263}
3264
3265static void bpf_iter_fini_tcp(void *priv_data)
3266{
3267        struct bpf_tcp_iter_state *iter = priv_data;
3268
3269        bpf_iter_fini_seq_net(priv_data);
3270        kvfree(iter->batch);
3271}
3272
3273static const struct bpf_iter_seq_info tcp_seq_info = {
3274        .seq_ops                = &bpf_iter_tcp_seq_ops,
3275        .init_seq_private       = bpf_iter_init_tcp,
3276        .fini_seq_private       = bpf_iter_fini_tcp,
3277        .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3278};
3279
3280static const struct bpf_func_proto *
3281bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3282                            const struct bpf_prog *prog)
3283{
3284        switch (func_id) {
3285        case BPF_FUNC_setsockopt:
3286                return &bpf_sk_setsockopt_proto;
3287        case BPF_FUNC_getsockopt:
3288                return &bpf_sk_getsockopt_proto;
3289        default:
3290                return NULL;
3291        }
3292}
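/* With these protos an iter/tcp program may retune every socket it visits,
 * e.g. (a sketch; "tp" obtained via bpf_skc_to_tcp_sock(ctx->sk_common)):
 *
 *        bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, "cubic", sizeof("cubic"));
 */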
3293
3294static struct bpf_iter_reg tcp_reg_info = {
3295        .target                 = "tcp",
3296        .ctx_arg_info_size      = 1,
3297        .ctx_arg_info           = {
3298                { offsetof(struct bpf_iter__tcp, sk_common),
3299                  PTR_TO_BTF_ID_OR_NULL },
3300        },
3301        .get_func_proto         = bpf_iter_tcp_get_func_proto,
3302        .seq_info               = &tcp_seq_info,
3303};
3304
3305static void __init bpf_iter_register(void)
3306{
3307        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3308        if (bpf_iter_reg_target(&tcp_reg_info))
3309                pr_warn("Warning: could not register bpf iterator tcp\n");
3310}
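/* A minimal bpf iterator program for the "tcp" target registered here
 * might look as follows (a sketch modelled on the selftests; the program
 * name "dump_tcp" and the printed fields are illustrative only):
 *
 *        SEC("iter/tcp")
 *        int dump_tcp(struct bpf_iter__tcp *ctx)
 *        {
 *                struct sock_common *skc = ctx->sk_common;
 *                struct seq_file *seq = ctx->meta->seq;
 *
 *                if (!skc)
 *                        return 0;
 *                BPF_SEQ_PRINTF(seq, "family %u state %u uid %u\n",
 *                               skc->skc_family, skc->skc_state, ctx->uid);
 *                return 0;
 *        }
 *
 * Pinning the attached link in bpffs and reading the pinned file then
 * walks every TCP socket through the batching seq_ops above.
 */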
3311
3312#endif
3313
3314void __init tcp_v4_init(void)
3315{
3316        if (register_pernet_subsys(&tcp_sk_ops))
3317                panic("Failed to create the TCP control socket.\n");
3318
3319#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3320        bpf_iter_register();
3321#endif
3322}
3323