linux/net/ipv4/tcp_ipv4.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *              IPv4 specific functions
  10 *
  11 *              code split from:
  12 *              linux/ipv4/tcp.c
  13 *              linux/ipv4/tcp_input.c
  14 *              linux/ipv4/tcp_output.c
  15 *
  16 *              See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *              David S. Miller :       New socket lookup architecture.
  22 *                                      This code is dedicated to John Dyson.
  23 *              David S. Miller :       Change semantics of established hash,
  24 *                                      half is devoted to TIME_WAIT sockets
  25 *                                      and the rest go in the other half.
  26 *              Andi Kleen :            Add support for syncookies and fixed
  27 *                                      some bugs: ip options weren't passed to
  28 *                                      the TCP layer, missed a check for an
  29 *                                      ACK bit.
  30 *              Andi Kleen :            Implemented fast path mtu discovery.
  31 *                                      Fixed many serious bugs in the
  32 *                                      request_sock handling and moved
  33 *                                      most of it into the af independent code.
  34 *                                      Added tail drop and some other bugfixes.
  35 *                                      Added new listen semantics.
  36 *              Mike McLagan    :       Routing by source
  37 *      Juan Jose Ciarlante:            ip_dynaddr bits
  38 *              Andi Kleen:             various fixes.
  39 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  40 *                                      coma.
  41 *      Andi Kleen              :       Fix new listen.
  42 *      Andi Kleen              :       Fix accept error reporting.
  43 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  44 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  45 *                                      a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79
  80#include <crypto/hash.h>
  81#include <linux/scatterlist.h>
  82
  83#include <trace/events/tcp.h>
  84
  85#ifdef CONFIG_TCP_MD5SIG
  86static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88#endif
  89
  90struct inet_hashinfo tcp_hashinfo;
  91EXPORT_SYMBOL(tcp_hashinfo);
  92
  93static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94{
  95        return secure_tcp_seq(ip_hdr(skb)->daddr,
  96                              ip_hdr(skb)->saddr,
  97                              tcp_hdr(skb)->dest,
  98                              tcp_hdr(skb)->source);
  99}
 100
 101static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102{
 103        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104}
 105
 106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107{
 108        const struct inet_timewait_sock *tw = inet_twsk(sktw);
 109        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110        struct tcp_sock *tp = tcp_sk(sk);
 111        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 112
 113        if (reuse == 2) {
 114                /* Still does not detect *everything* that goes through
 115                 * lo, since we require a loopback src or dst address
 116                 * or direct binding to 'lo' interface.
 117                 */
 118                bool loopback = false;
 119                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120                        loopback = true;
 121#if IS_ENABLED(CONFIG_IPV6)
 122                if (tw->tw_family == AF_INET6) {
 123                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129                                loopback = true;
 130                } else
 131#endif
 132                {
 133                        if (ipv4_is_loopback(tw->tw_daddr) ||
 134                            ipv4_is_loopback(tw->tw_rcv_saddr))
 135                                loopback = true;
 136                }
 137                if (!loopback)
 138                        reuse = 0;
 139        }
 140
  141        /* With PAWS, it is safe from the viewpoint
  142           of data integrity. Even without PAWS it is safe provided sequence
  143           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
  144
  145           Actually, the idea is close to VJ's: only the timestamp cache is
  146           held not per host but per port pair, and the TW bucket is used as
  147           the state holder.
  148
  149           If the TW bucket has already been destroyed, we fall back to VJ's
  150           scheme and use the initial timestamp retrieved from the peer table.
  151         */
 152        if (tcptw->tw_ts_recent_stamp &&
 153            (!twp || (reuse && time_after32(ktime_get_seconds(),
 154                                            tcptw->tw_ts_recent_stamp)))) {
 155                /* In case of repair and re-using TIME-WAIT sockets we still
 156                 * want to be sure that it is safe as above but honor the
 157                 * sequence numbers and time stamps set as part of the repair
 158                 * process.
 159                 *
 160                 * Without this check re-using a TIME-WAIT socket with TCP
 161                 * repair would accumulate a -1 on the repair assigned
 162                 * sequence number. The first time it is reused the sequence
 163                 * is -1, the second time -2, etc. This fixes that issue
 164                 * without appearing to create any others.
 165                 */
 166                if (likely(!tp->repair)) {
 167                        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168                        if (tp->write_seq == 0)
 169                                tp->write_seq = 1;
 170                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 171                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172                }
 173                sock_hold(sktw);
 174                return 1;
 175        }
 176
 177        return 0;
 178}
 179EXPORT_SYMBOL_GPL(tcp_twsk_unique);
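/* Editor's note (illustrative, not part of the upstream file): the reuse == 2
 * branch above corresponds to net.ipv4.tcp_tw_reuse = 2, i.e. TIME-WAIT reuse
 * only when the source or destination is a loopback address; 1 enables it for
 * all outgoing connections and 0 disables it. A minimal userspace sketch for
 * reading the knob through procfs (path is the standard sysctl file, error
 * handling trimmed):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char val[8] = "";
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "r");
 *
 *		if (f && fgets(val, sizeof(val), f))
 *			printf("tcp_tw_reuse = %s", val);	// 0, 1 or 2
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */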
 180
 181static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 182                              int addr_len)
 183{
  184        /* This check is replicated from tcp_v4_connect() and intended to
  185         * prevent the BPF program called below from accessing bytes that are
  186         * out of the bounds specified by the user in addr_len.
  187         */
 188        if (addr_len < sizeof(struct sockaddr_in))
 189                return -EINVAL;
 190
 191        sock_owned_by_me(sk);
 192
 193        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 194}
 195
 196/* This will initiate an outgoing connection. */
 197int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 198{
 199        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 200        struct inet_sock *inet = inet_sk(sk);
 201        struct tcp_sock *tp = tcp_sk(sk);
 202        __be16 orig_sport, orig_dport;
 203        __be32 daddr, nexthop;
 204        struct flowi4 *fl4;
 205        struct rtable *rt;
 206        int err;
 207        struct ip_options_rcu *inet_opt;
 208        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 209
 210        if (addr_len < sizeof(struct sockaddr_in))
 211                return -EINVAL;
 212
 213        if (usin->sin_family != AF_INET)
 214                return -EAFNOSUPPORT;
 215
 216        nexthop = daddr = usin->sin_addr.s_addr;
 217        inet_opt = rcu_dereference_protected(inet->inet_opt,
 218                                             lockdep_sock_is_held(sk));
 219        if (inet_opt && inet_opt->opt.srr) {
 220                if (!daddr)
 221                        return -EINVAL;
 222                nexthop = inet_opt->opt.faddr;
 223        }
 224
 225        orig_sport = inet->inet_sport;
 226        orig_dport = usin->sin_port;
 227        fl4 = &inet->cork.fl.u.ip4;
 228        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 229                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 230                              IPPROTO_TCP,
 231                              orig_sport, orig_dport, sk);
 232        if (IS_ERR(rt)) {
 233                err = PTR_ERR(rt);
 234                if (err == -ENETUNREACH)
 235                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 236                return err;
 237        }
 238
 239        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 240                ip_rt_put(rt);
 241                return -ENETUNREACH;
 242        }
 243
 244        if (!inet_opt || !inet_opt->opt.srr)
 245                daddr = fl4->daddr;
 246
 247        if (!inet->inet_saddr)
 248                inet->inet_saddr = fl4->saddr;
 249        sk_rcv_saddr_set(sk, inet->inet_saddr);
 250
 251        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 252                /* Reset inherited state */
 253                tp->rx_opt.ts_recent       = 0;
 254                tp->rx_opt.ts_recent_stamp = 0;
 255                if (likely(!tp->repair))
 256                        tp->write_seq      = 0;
 257        }
 258
 259        inet->inet_dport = usin->sin_port;
 260        sk_daddr_set(sk, daddr);
 261
 262        inet_csk(sk)->icsk_ext_hdr_len = 0;
 263        if (inet_opt)
 264                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 265
 266        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 267
  268        /* Socket identity is still unknown (sport may be zero).
  269         * However we set the state to SYN-SENT and, without releasing the
  270         * socket lock, select a source port, enter ourselves into the hash
  271         * tables and complete initialization after this.
  272         */
 273        tcp_set_state(sk, TCP_SYN_SENT);
 274        err = inet_hash_connect(tcp_death_row, sk);
 275        if (err)
 276                goto failure;
 277
 278        sk_set_txhash(sk);
 279
 280        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 281                               inet->inet_sport, inet->inet_dport, sk);
 282        if (IS_ERR(rt)) {
 283                err = PTR_ERR(rt);
 284                rt = NULL;
 285                goto failure;
 286        }
 287        /* OK, now commit destination to socket.  */
 288        sk->sk_gso_type = SKB_GSO_TCPV4;
 289        sk_setup_caps(sk, &rt->dst);
 290        rt = NULL;
 291
 292        if (likely(!tp->repair)) {
 293                if (!tp->write_seq)
 294                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 295                                                       inet->inet_daddr,
 296                                                       inet->inet_sport,
 297                                                       usin->sin_port);
 298                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 299                                                 inet->inet_saddr,
 300                                                 inet->inet_daddr);
 301        }
 302
 303        inet->inet_id = tp->write_seq ^ jiffies;
 304
 305        if (tcp_fastopen_defer_connect(sk, &err))
 306                return err;
 307        if (err)
 308                goto failure;
 309
 310        err = tcp_connect(sk);
 311
 312        if (err)
 313                goto failure;
 314
 315        return 0;
 316
 317failure:
 318        /*
 319         * This unhashes the socket and releases the local port,
 320         * if necessary.
 321         */
 322        tcp_set_state(sk, TCP_CLOSE);
 323        ip_rt_put(rt);
 324        sk->sk_route_caps = 0;
 325        inet->inet_dport = 0;
 326        return err;
 327}
 328EXPORT_SYMBOL(tcp_v4_connect);
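/* Editor's note (illustrative, not part of the upstream file): this path is
 * entered from userspace via connect(2) on an AF_INET stream socket, through
 * sk->sk_prot->connect. A minimal caller; the destination address and port
 * below are arbitrary documentation values:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in dst = {
 *			.sin_family = AF_INET,
 *			.sin_port   = htons(80),
 *		};
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *		connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *		close(fd);
 *		return 0;
 *	}
 */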
 329
  330/*
  331 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
  332 * It can be called through tcp_release_cb() if the socket was owned by the user
  333 * at the time tcp_v4_err() was called to handle the ICMP message.
  334 */
 335void tcp_v4_mtu_reduced(struct sock *sk)
 336{
 337        struct inet_sock *inet = inet_sk(sk);
 338        struct dst_entry *dst;
 339        u32 mtu;
 340
 341        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 342                return;
 343        mtu = tcp_sk(sk)->mtu_info;
 344        dst = inet_csk_update_pmtu(sk, mtu);
 345        if (!dst)
 346                return;
 347
  348        /* Something is about to go wrong... Remember the soft error
  349         * for the case that this connection is not able to recover.
  350         */
 351        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 352                sk->sk_err_soft = EMSGSIZE;
 353
 354        mtu = dst_mtu(dst);
 355
 356        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 357            ip_sk_accept_pmtu(sk) &&
 358            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 359                tcp_sync_mss(sk, mtu);
 360
 361                /* Resend the TCP packet because it's
 362                 * clear that the old packet has been
 363                 * dropped. This is the new "fast" path mtu
 364                 * discovery.
 365                 */
 366                tcp_simple_retransmit(sk);
 367        } /* else let the usual retransmit timer handle it */
 368}
 369EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 370
 371static void do_redirect(struct sk_buff *skb, struct sock *sk)
 372{
 373        struct dst_entry *dst = __sk_dst_check(sk, 0);
 374
 375        if (dst)
 376                dst->ops->redirect(dst, sk, skb);
 377}
 378
 379
 380/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 381void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 382{
 383        struct request_sock *req = inet_reqsk(sk);
 384        struct net *net = sock_net(sk);
 385
 386        /* ICMPs are not backlogged, hence we cannot get
 387         * an established socket here.
 388         */
 389        if (seq != tcp_rsk(req)->snt_isn) {
 390                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 391        } else if (abort) {
 392                /*
 393                 * Still in SYN_RECV, just remove it silently.
 394                 * There is no good way to pass the error to the newly
 395                 * created socket, and POSIX does not want network
 396                 * errors returned from accept().
 397                 */
 398                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 399                tcp_listendrop(req->rsk_listener);
 400        }
 401        reqsk_put(req);
 402}
 403EXPORT_SYMBOL(tcp_req_err);
 404
 405/*
 406 * This routine is called by the ICMP module when it gets some
 407 * sort of error condition.  If err < 0 then the socket should
 408 * be closed and the error returned to the user.  If err > 0
 409 * it's just the icmp type << 8 | icmp code.  After adjustment
 410 * header points to the first 8 bytes of the tcp header.  We need
 411 * to find the appropriate port.
 412 *
 413 * The locking strategy used here is very "optimistic". When
 414 * someone else accesses the socket the ICMP is just dropped
 415 * and for some paths there is no check at all.
 416 * A more general error queue to queue errors for later handling
 417 * is probably better.
 418 *
 419 */
 420
 421int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 422{
 423        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 424        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 425        struct inet_connection_sock *icsk;
 426        struct tcp_sock *tp;
 427        struct inet_sock *inet;
 428        const int type = icmp_hdr(icmp_skb)->type;
 429        const int code = icmp_hdr(icmp_skb)->code;
 430        struct sock *sk;
 431        struct sk_buff *skb;
 432        struct request_sock *fastopen;
 433        u32 seq, snd_una;
 434        s32 remaining;
 435        u32 delta_us;
 436        int err;
 437        struct net *net = dev_net(icmp_skb->dev);
 438
 439        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 440                                       th->dest, iph->saddr, ntohs(th->source),
 441                                       inet_iif(icmp_skb), 0);
 442        if (!sk) {
 443                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 444                return -ENOENT;
 445        }
 446        if (sk->sk_state == TCP_TIME_WAIT) {
 447                inet_twsk_put(inet_twsk(sk));
 448                return 0;
 449        }
 450        seq = ntohl(th->seq);
 451        if (sk->sk_state == TCP_NEW_SYN_RECV) {
 452                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 453                                     type == ICMP_TIME_EXCEEDED ||
 454                                     (type == ICMP_DEST_UNREACH &&
 455                                      (code == ICMP_NET_UNREACH ||
 456                                       code == ICMP_HOST_UNREACH)));
 457                return 0;
 458        }
 459
 460        bh_lock_sock(sk);
 461        /* If too many ICMPs get dropped on busy
 462         * servers this needs to be solved differently.
 463         * We do take care of PMTU discovery (RFC1191) special case :
 464         * we can receive locally generated ICMP messages while socket is held.
 465         */
 466        if (sock_owned_by_user(sk)) {
 467                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 468                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 469        }
 470        if (sk->sk_state == TCP_CLOSE)
 471                goto out;
 472
 473        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 474                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 475                goto out;
 476        }
 477
 478        icsk = inet_csk(sk);
 479        tp = tcp_sk(sk);
  480        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
 481        fastopen = tp->fastopen_rsk;
 482        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 483        if (sk->sk_state != TCP_LISTEN &&
 484            !between(seq, snd_una, tp->snd_nxt)) {
 485                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 486                goto out;
 487        }
 488
 489        switch (type) {
 490        case ICMP_REDIRECT:
 491                if (!sock_owned_by_user(sk))
 492                        do_redirect(icmp_skb, sk);
 493                goto out;
 494        case ICMP_SOURCE_QUENCH:
 495                /* Just silently ignore these. */
 496                goto out;
 497        case ICMP_PARAMETERPROB:
 498                err = EPROTO;
 499                break;
 500        case ICMP_DEST_UNREACH:
 501                if (code > NR_ICMP_UNREACH)
 502                        goto out;
 503
 504                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 505                        /* We are not interested in TCP_LISTEN and open_requests
  506                         * (SYN-ACKs sent out by Linux are always < 576 bytes so
 507                         * they should go through unfragmented).
 508                         */
 509                        if (sk->sk_state == TCP_LISTEN)
 510                                goto out;
 511
 512                        tp->mtu_info = info;
 513                        if (!sock_owned_by_user(sk)) {
 514                                tcp_v4_mtu_reduced(sk);
 515                        } else {
 516                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 517                                        sock_hold(sk);
 518                        }
 519                        goto out;
 520                }
 521
 522                err = icmp_err_convert[code].errno;
 523                /* check if icmp_skb allows revert of backoff
 524                 * (see draft-zimmermann-tcp-lcd) */
 525                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 526                        break;
 527                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 528                    !icsk->icsk_backoff || fastopen)
 529                        break;
 530
 531                if (sock_owned_by_user(sk))
 532                        break;
 533
 534                skb = tcp_rtx_queue_head(sk);
 535                if (WARN_ON_ONCE(!skb))
 536                        break;
 537
 538                icsk->icsk_backoff--;
 539                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540                                               TCP_TIMEOUT_INIT;
 541                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 542
 543
 544                tcp_mstamp_refresh(tp);
 545                delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 546                remaining = icsk->icsk_rto -
 547                            usecs_to_jiffies(delta_us);
 548
 549                if (remaining > 0) {
 550                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 551                                                  remaining, TCP_RTO_MAX);
 552                } else {
 553                        /* RTO revert clocked out retransmission.
 554                         * Will retransmit now */
 555                        tcp_retransmit_timer(sk);
 556                }
 557
 558                break;
 559        case ICMP_TIME_EXCEEDED:
 560                err = EHOSTUNREACH;
 561                break;
 562        default:
 563                goto out;
 564        }
 565
 566        switch (sk->sk_state) {
 567        case TCP_SYN_SENT:
 568        case TCP_SYN_RECV:
  569                /* Only in fast or simultaneous open. If a fast open socket
  570                 * is already accepted it is treated as a connected one below.
  571                 */
 572                if (fastopen && !fastopen->sk)
 573                        break;
 574
 575                if (!sock_owned_by_user(sk)) {
 576                        sk->sk_err = err;
 577
 578                        sk->sk_error_report(sk);
 579
 580                        tcp_done(sk);
 581                } else {
 582                        sk->sk_err_soft = err;
 583                }
 584                goto out;
 585        }
 586
 587        /* If we've already connected we will keep trying
 588         * until we time out, or the user gives up.
 589         *
  590         * RFC 1122 4.2.3.9 allows us to treat as hard errors
  591         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
  592         * but it is obsoleted by PMTU discovery).
  593         *
  594         * Note that in the modern internet, where routing is unreliable
  595         * and broken firewalls sit in every dark corner sending random
  596         * errors as ordered by their masters, even these two messages
  597         * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
 598         *
 599         * Now we are in compliance with RFCs.
 600         *                                                      --ANK (980905)
 601         */
 602
 603        inet = inet_sk(sk);
 604        if (!sock_owned_by_user(sk) && inet->recverr) {
 605                sk->sk_err = err;
 606                sk->sk_error_report(sk);
 607        } else  { /* Only an error on timeout */
 608                sk->sk_err_soft = err;
 609        }
 610
 611out:
 612        bh_unlock_sock(sk);
 613        sock_put(sk);
 614        return 0;
 615}
 616
 617void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 618{
 619        struct tcphdr *th = tcp_hdr(skb);
 620
 621        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 622        skb->csum_start = skb_transport_header(skb) - skb->head;
 623        skb->csum_offset = offsetof(struct tcphdr, check);
 624}
 625
 626/* This routine computes an IPv4 TCP checksum. */
 627void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 628{
 629        const struct inet_sock *inet = inet_sk(sk);
 630
 631        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 632}
 633EXPORT_SYMBOL(tcp_v4_send_check);
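/* Editor's note (illustrative): with CHECKSUM_PARTIAL, th->check above is
 * seeded with the non-inverted 16-bit ones'-complement sum of the pseudo
 * header only; the NIC (or skb_checksum_help()) then sums everything from
 * csum_start onwards, folds, inverts, and stores the result at
 * csum_start + csum_offset. A plain-C sketch of that final fold-and-invert
 * step over a 32-bit accumulator:
 *
 *	static unsigned short csum_fold_sketch(unsigned int sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold the carries once
 *		sum = (sum & 0xffff) + (sum >> 16);	// ...and any new carry
 *		return (unsigned short)~sum;		// ones'-complement result
 *	}
 */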
 634
 635/*
 636 *      This routine will send an RST to the other tcp.
 637 *
  638 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
  639 *                    for the reset.
  640 *      Answer: if a packet caused an RST, it is not meant for a socket
  641 *              existing in our system; if it is matched to a socket,
  642 *              it is just a duplicate segment or a bug in the other side's TCP.
  643 *              So we build the reply based only on the parameters that
  644 *              arrived with the segment.
  645 *      Exception: precedence violation. We do not implement it in any case.
 646 */
 647
 648static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 649{
 650        const struct tcphdr *th = tcp_hdr(skb);
 651        struct {
 652                struct tcphdr th;
 653#ifdef CONFIG_TCP_MD5SIG
 654                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 655#endif
 656        } rep;
 657        struct ip_reply_arg arg;
 658#ifdef CONFIG_TCP_MD5SIG
 659        struct tcp_md5sig_key *key = NULL;
 660        const __u8 *hash_location = NULL;
 661        unsigned char newhash[16];
 662        int genhash;
 663        struct sock *sk1 = NULL;
 664#endif
 665        u64 transmit_time = 0;
 666        struct sock *ctl_sk;
 667        struct net *net;
 668
 669        /* Never send a reset in response to a reset. */
 670        if (th->rst)
 671                return;
 672
  673        /* If sk is not NULL, it means we did a successful lookup and the
  674         * incoming route had to be correct. prequeue might have dropped our dst.
  675         */
 676        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677                return;
 678
 679        /* Swap the send and the receive. */
 680        memset(&rep, 0, sizeof(rep));
 681        rep.th.dest   = th->source;
 682        rep.th.source = th->dest;
 683        rep.th.doff   = sizeof(struct tcphdr) / 4;
 684        rep.th.rst    = 1;
 685
 686        if (th->ack) {
 687                rep.th.seq = th->ack_seq;
 688        } else {
 689                rep.th.ack = 1;
 690                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691                                       skb->len - (th->doff << 2));
 692        }
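        /* Editor's note (worked example, not upstream): the reset either
         * echoes the peer's ACK value as its own SEQ (th->ack set above), or
         * ACKs every sequence number the offending segment consumed:
         * SEG.SEQ + SYN + FIN + payload length. E.g. a bare SYN with seq 1000
         * yields ack_seq 1001; a segment with seq 5000 carrying 100 bytes of
         * data plus FIN yields ack_seq 5101.
         */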
 693
 694        memset(&arg, 0, sizeof(arg));
 695        arg.iov[0].iov_base = (unsigned char *)&rep;
 696        arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699#ifdef CONFIG_TCP_MD5SIG
 700        rcu_read_lock();
 701        hash_location = tcp_parse_md5sig_option(th);
 702        if (sk && sk_fullsock(sk)) {
 703                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704                                        &ip_hdr(skb)->saddr, AF_INET);
 705        } else if (hash_location) {
  706                /*
  707                 * The active side is gone. Try to find the listening socket
  708                 * through the source port, and then find the md5 key through
  709                 * that listening socket. We do not lose security here:
  710                 * the incoming packet is checked against the md5 hash of the
  711                 * key we found, and no RST is generated if the hash doesn't match.
  712                 */
 713                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714                                             ip_hdr(skb)->saddr,
 715                                             th->source, ip_hdr(skb)->daddr,
 716                                             ntohs(th->source), inet_iif(skb),
 717                                             tcp_v4_sdif(skb));
 718                /* don't send rst if it can't find key */
 719                if (!sk1)
 720                        goto out;
 721
 722                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723                                        &ip_hdr(skb)->saddr, AF_INET);
 724                if (!key)
 725                        goto out;
 726
 727
 728                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729                if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730                        goto out;
 731
 732        }
 733
 734        if (key) {
 735                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736                                   (TCPOPT_NOP << 16) |
 737                                   (TCPOPT_MD5SIG << 8) |
 738                                   TCPOLEN_MD5SIG);
 739                /* Update length and the length the header thinks exists */
 740                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741                rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744                                     key, ip_hdr(skb)->saddr,
 745                                     ip_hdr(skb)->daddr, &rep.th);
 746        }
 747#endif
 748        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749                                      ip_hdr(skb)->saddr, /* XXX */
 750                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
  754        /* When the socket is gone, all binding information is lost and
  755         * routing might fail in this case. No choice here: if we force the
  756         * input interface, we will misroute in case of an asymmetric route.
  757         */
 758        if (sk) {
 759                arg.bound_dev_if = sk->sk_bound_dev_if;
 760                if (sk_fullsock(sk))
 761                        trace_tcp_send_reset(sk, skb);
 762        }
 763
 764        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767        arg.tos = ip_hdr(skb)->tos;
 768        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 769        local_bh_disable();
 770        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 771        if (sk) {
 772                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
 774                transmit_time = tcp_transmit_time(sk);
 775        }
 776        ip_send_unicast_reply(ctl_sk,
 777                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 778                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 779                              &arg, arg.iov[0].iov_len,
 780                              transmit_time);
 781
 782        ctl_sk->sk_mark = 0;
 783        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 784        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 785        local_bh_enable();
 786
 787#ifdef CONFIG_TCP_MD5SIG
 788out:
 789        rcu_read_unlock();
 790#endif
 791}
 792
  793/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
  794   outside of socket context, is certainly ugly. What can I do?
  795 */
 796
 797static void tcp_v4_send_ack(const struct sock *sk,
 798                            struct sk_buff *skb, u32 seq, u32 ack,
 799                            u32 win, u32 tsval, u32 tsecr, int oif,
 800                            struct tcp_md5sig_key *key,
 801                            int reply_flags, u8 tos)
 802{
 803        const struct tcphdr *th = tcp_hdr(skb);
 804        struct {
 805                struct tcphdr th;
 806                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 807#ifdef CONFIG_TCP_MD5SIG
 808                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 809#endif
 810                        ];
 811        } rep;
 812        struct net *net = sock_net(sk);
 813        struct ip_reply_arg arg;
 814        struct sock *ctl_sk;
 815        u64 transmit_time;
 816
 817        memset(&rep.th, 0, sizeof(struct tcphdr));
 818        memset(&arg, 0, sizeof(arg));
 819
 820        arg.iov[0].iov_base = (unsigned char *)&rep;
 821        arg.iov[0].iov_len  = sizeof(rep.th);
 822        if (tsecr) {
 823                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 824                                   (TCPOPT_TIMESTAMP << 8) |
 825                                   TCPOLEN_TIMESTAMP);
 826                rep.opt[1] = htonl(tsval);
 827                rep.opt[2] = htonl(tsecr);
 828                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 829        }
 830
 831        /* Swap the send and the receive. */
 832        rep.th.dest    = th->source;
 833        rep.th.source  = th->dest;
 834        rep.th.doff    = arg.iov[0].iov_len / 4;
 835        rep.th.seq     = htonl(seq);
 836        rep.th.ack_seq = htonl(ack);
 837        rep.th.ack     = 1;
 838        rep.th.window  = htons(win);
 839
 840#ifdef CONFIG_TCP_MD5SIG
 841        if (key) {
 842                int offset = (tsecr) ? 3 : 0;
 843
 844                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 845                                          (TCPOPT_NOP << 16) |
 846                                          (TCPOPT_MD5SIG << 8) |
 847                                          TCPOLEN_MD5SIG);
 848                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 849                rep.th.doff = arg.iov[0].iov_len/4;
 850
 851                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 852                                    key, ip_hdr(skb)->saddr,
 853                                    ip_hdr(skb)->daddr, &rep.th);
 854        }
 855#endif
 856        arg.flags = reply_flags;
 857        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 858                                      ip_hdr(skb)->saddr, /* XXX */
 859                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 860        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 861        if (oif)
 862                arg.bound_dev_if = oif;
 863        arg.tos = tos;
 864        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 865        local_bh_disable();
 866        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 867        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 868                           inet_twsk(sk)->tw_mark : sk->sk_mark;
 869        transmit_time = tcp_transmit_time(sk);
 870        ip_send_unicast_reply(ctl_sk,
 871                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
 872                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 873                              &arg, arg.iov[0].iov_len,
 874                              transmit_time);
 875
 876        ctl_sk->sk_mark = 0;
 877        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 878        local_bh_enable();
 879}
 880
 881static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 882{
 883        struct inet_timewait_sock *tw = inet_twsk(sk);
 884        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 885
 886        tcp_v4_send_ack(sk, skb,
 887                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 888                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 889                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 890                        tcptw->tw_ts_recent,
 891                        tw->tw_bound_dev_if,
 892                        tcp_twsk_md5_key(tcptw),
 893                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 894                        tw->tw_tos
 895                        );
 896
 897        inet_twsk_put(tw);
 898}
 899
 900static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 901                                  struct request_sock *req)
 902{
 903        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 904         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 905         */
 906        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 907                                             tcp_sk(sk)->snd_nxt;
 908
 909        /* RFC 7323 2.3
 910         * The window field (SEG.WND) of every outgoing segment, with the
 911         * exception of <SYN> segments, MUST be right-shifted by
 912         * Rcv.Wind.Shift bits:
 913         */
 914        tcp_v4_send_ack(sk, skb, seq,
 915                        tcp_rsk(req)->rcv_nxt,
 916                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 917                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 918                        req->ts_recent,
 919                        0,
 920                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 921                                          AF_INET),
 922                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 923                        ip_hdr(skb)->tos);
 924}
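/* Editor's note (worked example): the right shift above applies the RFC 7323
 * window scale. With req->rsk_rcv_wnd = 262140 and rcv_wscale = 2, the
 * advertised SEG.WND is 262140 >> 2 = 65535.
 */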
 925
 926/*
 927 *      Send a SYN-ACK after having received a SYN.
 928 *      This still operates on a request_sock only, not on a big
 929 *      socket.
 930 */
 931static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 932                              struct flowi *fl,
 933                              struct request_sock *req,
 934                              struct tcp_fastopen_cookie *foc,
 935                              enum tcp_synack_type synack_type)
 936{
 937        const struct inet_request_sock *ireq = inet_rsk(req);
 938        struct flowi4 fl4;
 939        int err = -1;
 940        struct sk_buff *skb;
 941
 942        /* First, grab a route. */
 943        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 944                return -1;
 945
 946        skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 947
 948        if (skb) {
 949                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 950
 951                rcu_read_lock();
 952                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 953                                            ireq->ir_rmt_addr,
 954                                            rcu_dereference(ireq->ireq_opt));
 955                rcu_read_unlock();
 956                err = net_xmit_eval(err);
 957        }
 958
 959        return err;
 960}
 961
 962/*
 963 *      IPv4 request_sock destructor.
 964 */
 965static void tcp_v4_reqsk_destructor(struct request_sock *req)
 966{
 967        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 968}
 969
 970#ifdef CONFIG_TCP_MD5SIG
 971/*
 972 * RFC2385 MD5 checksumming requires a mapping of
 973 * IP address->MD5 Key.
 974 * We need to maintain these in the sk structure.
 975 */
 976
 977DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 978EXPORT_SYMBOL(tcp_md5_needed);
 979
 980/* Find the Key structure for an address.  */
 981struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 982                                           const union tcp_md5_addr *addr,
 983                                           int family)
 984{
 985        const struct tcp_sock *tp = tcp_sk(sk);
 986        struct tcp_md5sig_key *key;
 987        const struct tcp_md5sig_info *md5sig;
 988        __be32 mask;
 989        struct tcp_md5sig_key *best_match = NULL;
 990        bool match;
 991
 992        /* caller either holds rcu_read_lock() or socket lock */
 993        md5sig = rcu_dereference_check(tp->md5sig_info,
 994                                       lockdep_sock_is_held(sk));
 995        if (!md5sig)
 996                return NULL;
 997
 998        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 999                if (key->family != family)
1000                        continue;
1001
1002                if (family == AF_INET) {
1003                        mask = inet_make_mask(key->prefixlen);
1004                        match = (key->addr.a4.s_addr & mask) ==
1005                                (addr->a4.s_addr & mask);
1006#if IS_ENABLED(CONFIG_IPV6)
1007                } else if (family == AF_INET6) {
1008                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1009                                                  key->prefixlen);
1010#endif
1011                } else {
1012                        match = false;
1013                }
1014
1015                if (match && (!best_match ||
1016                              key->prefixlen > best_match->prefixlen))
1017                        best_match = key;
1018        }
1019        return best_match;
1020}
1021EXPORT_SYMBOL(__tcp_md5_do_lookup);
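/* Editor's note (worked example): the loop above keeps the matching key with
 * the longest prefix. With one key configured for 10.0.0.0/8 and another for
 * 10.1.2.3/32, a lookup for 10.1.2.3 matches both and returns the /32 entry,
 * while any other 10.x.y.z peer falls back to the /8 key.
 */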
1022
1023static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1024                                                      const union tcp_md5_addr *addr,
1025                                                      int family, u8 prefixlen)
1026{
1027        const struct tcp_sock *tp = tcp_sk(sk);
1028        struct tcp_md5sig_key *key;
1029        unsigned int size = sizeof(struct in_addr);
1030        const struct tcp_md5sig_info *md5sig;
1031
1032        /* caller either holds rcu_read_lock() or socket lock */
1033        md5sig = rcu_dereference_check(tp->md5sig_info,
1034                                       lockdep_sock_is_held(sk));
1035        if (!md5sig)
1036                return NULL;
1037#if IS_ENABLED(CONFIG_IPV6)
1038        if (family == AF_INET6)
1039                size = sizeof(struct in6_addr);
1040#endif
1041        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1042                if (key->family != family)
1043                        continue;
1044                if (!memcmp(&key->addr, addr, size) &&
1045                    key->prefixlen == prefixlen)
1046                        return key;
1047        }
1048        return NULL;
1049}
1050
1051struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1052                                         const struct sock *addr_sk)
1053{
1054        const union tcp_md5_addr *addr;
1055
1056        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1057        return tcp_md5_do_lookup(sk, addr, AF_INET);
1058}
1059EXPORT_SYMBOL(tcp_v4_md5_lookup);
1060
1061/* This can be called on a newly created socket, from other files */
1062int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1063                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1064                   gfp_t gfp)
1065{
1066        /* Add Key to the list */
1067        struct tcp_md5sig_key *key;
1068        struct tcp_sock *tp = tcp_sk(sk);
1069        struct tcp_md5sig_info *md5sig;
1070
1071        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1072        if (key) {
1073                /* Pre-existing entry - just update that one. */
1074                memcpy(key->key, newkey, newkeylen);
1075                key->keylen = newkeylen;
1076                return 0;
1077        }
1078
1079        md5sig = rcu_dereference_protected(tp->md5sig_info,
1080                                           lockdep_sock_is_held(sk));
1081        if (!md5sig) {
1082                md5sig = kmalloc(sizeof(*md5sig), gfp);
1083                if (!md5sig)
1084                        return -ENOMEM;
1085
1086                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1087                INIT_HLIST_HEAD(&md5sig->head);
1088                rcu_assign_pointer(tp->md5sig_info, md5sig);
1089        }
1090
1091        key = sock_kmalloc(sk, sizeof(*key), gfp);
1092        if (!key)
1093                return -ENOMEM;
1094        if (!tcp_alloc_md5sig_pool()) {
1095                sock_kfree_s(sk, key, sizeof(*key));
1096                return -ENOMEM;
1097        }
1098
1099        memcpy(key->key, newkey, newkeylen);
1100        key->keylen = newkeylen;
1101        key->family = family;
1102        key->prefixlen = prefixlen;
1103        memcpy(&key->addr, addr,
1104               (family == AF_INET6) ? sizeof(struct in6_addr) :
1105                                      sizeof(struct in_addr));
1106        hlist_add_head_rcu(&key->node, &md5sig->head);
1107        return 0;
1108}
1109EXPORT_SYMBOL(tcp_md5_do_add);
1110
1111int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1112                   u8 prefixlen)
1113{
1114        struct tcp_md5sig_key *key;
1115
1116        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1117        if (!key)
1118                return -ENOENT;
1119        hlist_del_rcu(&key->node);
1120        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1121        kfree_rcu(key, rcu);
1122        return 0;
1123}
1124EXPORT_SYMBOL(tcp_md5_do_del);
1125
1126static void tcp_clear_md5_list(struct sock *sk)
1127{
1128        struct tcp_sock *tp = tcp_sk(sk);
1129        struct tcp_md5sig_key *key;
1130        struct hlist_node *n;
1131        struct tcp_md5sig_info *md5sig;
1132
1133        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1134
1135        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1136                hlist_del_rcu(&key->node);
1137                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1138                kfree_rcu(key, rcu);
1139        }
1140}
1141
1142static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1143                                 char __user *optval, int optlen)
1144{
1145        struct tcp_md5sig cmd;
1146        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1147        u8 prefixlen = 32;
1148
1149        if (optlen < sizeof(cmd))
1150                return -EINVAL;
1151
1152        if (copy_from_user(&cmd, optval, sizeof(cmd)))
1153                return -EFAULT;
1154
1155        if (sin->sin_family != AF_INET)
1156                return -EINVAL;
1157
1158        if (optname == TCP_MD5SIG_EXT &&
1159            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1160                prefixlen = cmd.tcpm_prefixlen;
1161                if (prefixlen > 32)
1162                        return -EINVAL;
1163        }
1164
1165        if (!cmd.tcpm_keylen)
1166                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1167                                      AF_INET, prefixlen);
1168
1169        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1170                return -EINVAL;
1171
1172        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1173                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1174                              GFP_KERNEL);
1175}
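/* Editor's note (illustrative sketch, not part of the upstream file): the
 * setsockopt() parsed above is driven from userspace roughly as follows.
 * glibc's <netinet/tcp.h> is assumed to provide struct tcp_md5sig and
 * TCP_MD5SIG; the peer address and key are arbitrary examples:
 *
 *	#include <string.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	static int set_md5_key(int fd, const char *peer, const char *key)
 *	{
 *		struct tcp_md5sig md5;
 *		struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *		memset(&md5, 0, sizeof(md5));
 *		sin->sin_family = AF_INET;
 *		inet_pton(AF_INET, peer, &sin->sin_addr);
 *		md5.tcpm_keylen = strlen(key);		// must be <= TCP_MD5SIG_MAXKEYLEN
 *		memcpy(md5.tcpm_key, key, md5.tcpm_keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *	}
 */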
1176
1177static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1178                                   __be32 daddr, __be32 saddr,
1179                                   const struct tcphdr *th, int nbytes)
1180{
1181        struct tcp4_pseudohdr *bp;
1182        struct scatterlist sg;
1183        struct tcphdr *_th;
1184
1185        bp = hp->scratch;
1186        bp->saddr = saddr;
1187        bp->daddr = daddr;
1188        bp->pad = 0;
1189        bp->protocol = IPPROTO_TCP;
1190        bp->len = cpu_to_be16(nbytes);
1191
1192        _th = (struct tcphdr *)(bp + 1);
1193        memcpy(_th, th, sizeof(*th));
1194        _th->check = 0;
1195
1196        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1197        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1198                                sizeof(*bp) + sizeof(*th));
1199        return crypto_ahash_update(hp->md5_req);
1200}
1201
1202static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1203                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
1204{
1205        struct tcp_md5sig_pool *hp;
1206        struct ahash_request *req;
1207
1208        hp = tcp_get_md5sig_pool();
1209        if (!hp)
1210                goto clear_hash_noput;
1211        req = hp->md5_req;
1212
1213        if (crypto_ahash_init(req))
1214                goto clear_hash;
1215        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1216                goto clear_hash;
1217        if (tcp_md5_hash_key(hp, key))
1218                goto clear_hash;
1219        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1220        if (crypto_ahash_final(req))
1221                goto clear_hash;
1222
1223        tcp_put_md5sig_pool();
1224        return 0;
1225
1226clear_hash:
1227        tcp_put_md5sig_pool();
1228clear_hash_noput:
1229        memset(md5_hash, 0, 16);
1230        return 1;
1231}
1232
1233int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1234                        const struct sock *sk,
1235                        const struct sk_buff *skb)
1236{
1237        struct tcp_md5sig_pool *hp;
1238        struct ahash_request *req;
1239        const struct tcphdr *th = tcp_hdr(skb);
1240        __be32 saddr, daddr;
1241
1242        if (sk) { /* valid for establish/request sockets */
1243                saddr = sk->sk_rcv_saddr;
1244                daddr = sk->sk_daddr;
1245        } else {
1246                const struct iphdr *iph = ip_hdr(skb);
1247                saddr = iph->saddr;
1248                daddr = iph->daddr;
1249        }
1250
1251        hp = tcp_get_md5sig_pool();
1252        if (!hp)
1253                goto clear_hash_noput;
1254        req = hp->md5_req;
1255
1256        if (crypto_ahash_init(req))
1257                goto clear_hash;
1258
1259        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1260                goto clear_hash;
1261        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1262                goto clear_hash;
1263        if (tcp_md5_hash_key(hp, key))
1264                goto clear_hash;
1265        ahash_request_set_crypt(req, NULL, md5_hash, 0);
1266        if (crypto_ahash_final(req))
1267                goto clear_hash;
1268
1269        tcp_put_md5sig_pool();
1270        return 0;
1271
1272clear_hash:
1273        tcp_put_md5sig_pool();
1274clear_hash_noput:
1275        memset(md5_hash, 0, 16);
1276        return 1;
1277}
1278EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1279
1280#endif
1281
1282/* Called with rcu_read_lock() */
1283static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1284                                    const struct sk_buff *skb)
1285{
1286#ifdef CONFIG_TCP_MD5SIG
1287        /*
1288         * This gets called for each TCP segment that arrives
1289         * so we want to be efficient.
1290         * We have 3 drop cases:
1291         * o No MD5 hash and one expected.
1292         * o MD5 hash and we're not expecting one.
 1293         * o MD5 hash and it is wrong.
1294         */
1295        const __u8 *hash_location = NULL;
1296        struct tcp_md5sig_key *hash_expected;
1297        const struct iphdr *iph = ip_hdr(skb);
1298        const struct tcphdr *th = tcp_hdr(skb);
1299        int genhash;
1300        unsigned char newhash[16];
1301
1302        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1303                                          AF_INET);
1304        hash_location = tcp_parse_md5sig_option(th);
1305
1306        /* We've parsed the options - do we have a hash? */
1307        if (!hash_expected && !hash_location)
1308                return false;
1309
1310        if (hash_expected && !hash_location) {
1311                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1312                return true;
1313        }
1314
1315        if (!hash_expected && hash_location) {
1316                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1317                return true;
1318        }
1319
1320        /* Okay, so this is hash_expected and hash_location -
1321         * so we need to calculate the checksum.
1322         */
1323        genhash = tcp_v4_md5_hash_skb(newhash,
1324                                      hash_expected,
1325                                      NULL, skb);
1326
1327        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1328                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1329                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1330                                     &iph->saddr, ntohs(th->source),
1331                                     &iph->daddr, ntohs(th->dest),
1332                                     genhash ? " tcp_v4_calc_md5_hash failed"
1333                                     : "");
1334                return true;
1335        }
1336        return false;
1337#endif
1338        return false;
1339}
1340
1341static void tcp_v4_init_req(struct request_sock *req,
1342                            const struct sock *sk_listener,
1343                            struct sk_buff *skb)
1344{
1345        struct inet_request_sock *ireq = inet_rsk(req);
1346        struct net *net = sock_net(sk_listener);
1347
1348        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1349        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1350        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1351}
1352
1353static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1354                                          struct flowi *fl,
1355                                          const struct request_sock *req)
1356{
1357        return inet_csk_route_req(sk, &fl->u.ip4, req);
1358}
1359
1360struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1361        .family         =       PF_INET,
1362        .obj_size       =       sizeof(struct tcp_request_sock),
1363        .rtx_syn_ack    =       tcp_rtx_synack,
1364        .send_ack       =       tcp_v4_reqsk_send_ack,
1365        .destructor     =       tcp_v4_reqsk_destructor,
1366        .send_reset     =       tcp_v4_send_reset,
1367        .syn_ack_timeout =      tcp_syn_ack_timeout,
1368};
1369
1370static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1371        .mss_clamp      =       TCP_MSS_DEFAULT,
1372#ifdef CONFIG_TCP_MD5SIG
1373        .req_md5_lookup =       tcp_v4_md5_lookup,
1374        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1375#endif
1376        .init_req       =       tcp_v4_init_req,
1377#ifdef CONFIG_SYN_COOKIES
1378        .cookie_init_seq =      cookie_v4_init_sequence,
1379#endif
1380        .route_req      =       tcp_v4_route_req,
1381        .init_seq       =       tcp_v4_init_seq,
1382        .init_ts_off    =       tcp_v4_init_ts_off,
1383        .send_synack    =       tcp_v4_send_synack,
1384};
1385
1386int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1387{
 1388        /* Never answer SYNs sent to broadcast or multicast */
1389        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1390                goto drop;
1391
1392        return tcp_conn_request(&tcp_request_sock_ops,
1393                                &tcp_request_sock_ipv4_ops, sk, skb);
1394
1395drop:
1396        tcp_listendrop(sk);
1397        return 0;
1398}
1399EXPORT_SYMBOL(tcp_v4_conn_request);
1400
1401
1402/*
1403 * The three way handshake has completed - we got a valid synack -
1404 * now create the new socket.
1405 */
1406struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1407                                  struct request_sock *req,
1408                                  struct dst_entry *dst,
1409                                  struct request_sock *req_unhash,
1410                                  bool *own_req)
1411{
1412        struct inet_request_sock *ireq;
1413        struct inet_sock *newinet;
1414        struct tcp_sock *newtp;
1415        struct sock *newsk;
1416#ifdef CONFIG_TCP_MD5SIG
1417        struct tcp_md5sig_key *key;
1418#endif
1419        struct ip_options_rcu *inet_opt;
1420
1421        if (sk_acceptq_is_full(sk))
1422                goto exit_overflow;
1423
1424        newsk = tcp_create_openreq_child(sk, req, skb);
1425        if (!newsk)
1426                goto exit_nonewsk;
1427
1428        newsk->sk_gso_type = SKB_GSO_TCPV4;
1429        inet_sk_rx_dst_set(newsk, skb);
1430
1431        newtp                 = tcp_sk(newsk);
1432        newinet               = inet_sk(newsk);
1433        ireq                  = inet_rsk(req);
1434        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1435        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1436        newsk->sk_bound_dev_if = ireq->ir_iif;
1437        newinet->inet_saddr   = ireq->ir_loc_addr;
1438        inet_opt              = rcu_dereference(ireq->ireq_opt);
1439        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1440        newinet->mc_index     = inet_iif(skb);
1441        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1442        newinet->rcv_tos      = ip_hdr(skb)->tos;
1443        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444        if (inet_opt)
1445                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1446        newinet->inet_id = newtp->write_seq ^ jiffies;
1447
1448        if (!dst) {
1449                dst = inet_csk_route_child_sock(sk, newsk, req);
1450                if (!dst)
1451                        goto put_and_exit;
1452        } else {
1453                /* syncookie case : see end of cookie_v4_check() */
1454        }
1455        sk_setup_caps(newsk, dst);
1456
1457        tcp_ca_openreq_child(newsk, dst);
1458
1459        tcp_sync_mss(newsk, dst_mtu(dst));
1460        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1461
1462        tcp_initialize_rcv_mss(newsk);
1463
1464#ifdef CONFIG_TCP_MD5SIG
1465        /* Copy over the MD5 key from the original socket */
1466        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1467                                AF_INET);
1468        if (key) {
1469                /*
1470                 * We're using one, so create a matching key
1471                 * on the newsk structure. If we fail to get
1472                 * memory, then we end up not copying the key
1473                 * across. Shucks.
1474                 */
1475                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1476                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1477                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1478        }
1479#endif
1480
1481        if (__inet_inherit_port(sk, newsk) < 0)
1482                goto put_and_exit;
1483        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1484        if (likely(*own_req)) {
1485                tcp_move_syn(newtp, req);
1486                ireq->ireq_opt = NULL;
1487        } else {
1488                newinet->inet_opt = NULL;
1489        }
1490        return newsk;
1491
1492exit_overflow:
1493        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1494exit_nonewsk:
1495        dst_release(dst);
1496exit:
1497        tcp_listendrop(sk);
1498        return NULL;
1499put_and_exit:
1500        newinet->inet_opt = NULL;
1501        inet_csk_prepare_forced_close(newsk);
1502        tcp_done(newsk);
1503        goto exit;
1504}
1505EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1506
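/* When syncookies are compiled in, a non-SYN segment that reaches a listener
 * may be the final ACK of a cookie-based handshake; hand it to
 * cookie_v4_check(), which may replace sk with a newly created child socket.
 */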
1507static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1508{
1509#ifdef CONFIG_SYN_COOKIES
1510        const struct tcphdr *th = tcp_hdr(skb);
1511
1512        if (!th->syn)
1513                sk = cookie_v4_check(sk, skb);
1514#endif
1515        return sk;
1516}
1517
1518/* The socket must have its spinlock held when we get
1519 * here, unless it is a TCP_LISTEN socket.
1520 *
1521 * We have a potential double-lock case here, so even when
1522 * doing backlog processing we use the BH locking scheme.
1523 * This is because we cannot sleep with the original spinlock
1524 * held.
1525 */
1526int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1527{
1528        struct sock *rsk;
1529
1530        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1531                struct dst_entry *dst = sk->sk_rx_dst;
1532
1533                sock_rps_save_rxhash(sk, skb);
1534                sk_mark_napi_id(sk, skb);
1535                if (dst) {
1536                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1537                            !dst->ops->check(dst, 0)) {
1538                                dst_release(dst);
1539                                sk->sk_rx_dst = NULL;
1540                        }
1541                }
1542                tcp_rcv_established(sk, skb);
1543                return 0;
1544        }
1545
1546        if (tcp_checksum_complete(skb))
1547                goto csum_err;
1548
1549        if (sk->sk_state == TCP_LISTEN) {
1550                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1551
1552                if (!nsk)
1553                        goto discard;
1554                if (nsk != sk) {
1555                        if (tcp_child_process(sk, nsk, skb)) {
1556                                rsk = nsk;
1557                                goto reset;
1558                        }
1559                        return 0;
1560                }
1561        } else
1562                sock_rps_save_rxhash(sk, skb);
1563
1564        if (tcp_rcv_state_process(sk, skb)) {
1565                rsk = sk;
1566                goto reset;
1567        }
1568        return 0;
1569
1570reset:
1571        tcp_v4_send_reset(rsk, skb);
1572discard:
1573        kfree_skb(skb);
1574        /* Be careful here. If this function gets more complicated and
1575         * gcc suffers from register pressure on the x86, sk (in %ebx)
1576         * might be destroyed here. This current version compiles correctly,
1577         * but you have been warned.
1578         */
1579        return 0;
1580
1581csum_err:
1582        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1583        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1584        goto discard;
1585}
1586EXPORT_SYMBOL(tcp_v4_do_rcv);
1587
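/* Early demux: called from the IP receive path before routing.  If the packet
 * matches an established socket, attach that socket (and, when still valid,
 * its cached rx dst) to the skb so later lookups and route decisions can be
 * skipped.
 */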
1588int tcp_v4_early_demux(struct sk_buff *skb)
1589{
1590        const struct iphdr *iph;
1591        const struct tcphdr *th;
1592        struct sock *sk;
1593
1594        if (skb->pkt_type != PACKET_HOST)
1595                return 0;
1596
1597        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1598                return 0;
1599
1600        iph = ip_hdr(skb);
1601        th = tcp_hdr(skb);
1602
1603        if (th->doff < sizeof(struct tcphdr) / 4)
1604                return 0;
1605
1606        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1607                                       iph->saddr, th->source,
1608                                       iph->daddr, ntohs(th->dest),
1609                                       skb->skb_iif, inet_sdif(skb));
1610        if (sk) {
1611                skb->sk = sk;
1612                skb->destructor = sock_edemux;
1613                if (sk_fullsock(sk)) {
1614                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1615
1616                        if (dst)
1617                                dst = dst_check(dst, 0);
1618                        if (dst &&
1619                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1620                                skb_dst_set_noref(skb, dst);
1621                }
1622        }
1623        return 0;
1624}
1625
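/* Queue a segment on the backlog of a socket that is currently owned by user
 * context.  Before falling back to sk_add_backlog(), try to coalesce the
 * segment with the last skb already sitting on the backlog, which keeps the
 * backlog short under a flood of small packets.  Returns true (with the
 * socket spinlock already released) if the caller should drop the skb,
 * false if it was queued or coalesced.
 */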
1626bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1627{
1628        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1629        struct skb_shared_info *shinfo;
1630        const struct tcphdr *th;
1631        struct tcphdr *thtail;
1632        struct sk_buff *tail;
1633        unsigned int hdrlen;
1634        bool fragstolen;
1635        u32 gso_segs;
1636        int delta;
1637
1638        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1639         * we can fix skb->truesize to its real value to avoid future drops.
1640         * This is valid because skb is not yet charged to the socket.
1641         * It has been noticed that pure SACK packets were sometimes dropped
1642         * (if cooked by drivers without the copybreak feature).
1643         */
1644        skb_condense(skb);
1645
1646        skb_dst_drop(skb);
1647
1648        if (unlikely(tcp_checksum_complete(skb))) {
1649                bh_unlock_sock(sk);
1650                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1651                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1652                return true;
1653        }
1654
1655        /* Attempt coalescing to last skb in backlog, even if we are
1656         * above the limits.
1657         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1658         */
1659        th = (const struct tcphdr *)skb->data;
1660        hdrlen = th->doff * 4;
1661        shinfo = skb_shinfo(skb);
1662
1663        if (!shinfo->gso_size)
1664                shinfo->gso_size = skb->len - hdrlen;
1665
1666        if (!shinfo->gso_segs)
1667                shinfo->gso_segs = 1;
1668
1669        tail = sk->sk_backlog.tail;
1670        if (!tail)
1671                goto no_coalesce;
1672        thtail = (struct tcphdr *)tail->data;
1673
1674        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1675            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1676            ((TCP_SKB_CB(tail)->tcp_flags |
1677              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1678            !((TCP_SKB_CB(tail)->tcp_flags &
1679              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1680            ((TCP_SKB_CB(tail)->tcp_flags ^
1681              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1682#ifdef CONFIG_TLS_DEVICE
1683            tail->decrypted != skb->decrypted ||
1684#endif
1685            thtail->doff != th->doff ||
1686            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1687                goto no_coalesce;
1688
1689        __skb_pull(skb, hdrlen);
1690        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1691                thtail->window = th->window;
1692
1693                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1694
1695                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1696                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1697
1698                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1699                 * thtail->fin, so that the fast path in tcp_rcv_established()
1700                 * is not entered if we append a packet with a FIN.
1701                 * SYN, RST, URG are not present.
1702                 * ACK is set on both packets.
1703                 * PSH : the TCP stack does not really care,
1704                 *       at least for 'GRO' packets.
1705                 */
1706                thtail->fin |= th->fin;
1707                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1708
1709                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1710                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1711                        tail->tstamp = skb->tstamp;
1712                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1713                }
1714
1715        /* Not as strict as GRO. We only need to carry the max mss value */
1716                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1717                                                 skb_shinfo(tail)->gso_size);
1718
1719                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1720                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1721
1722                sk->sk_backlog.len += delta;
1723                __NET_INC_STATS(sock_net(sk),
1724                                LINUX_MIB_TCPBACKLOGCOALESCE);
1725                kfree_skb_partial(skb, fragstolen);
1726                return false;
1727        }
1728        __skb_push(skb, hdrlen);
1729
1730no_coalesce:
1731        /* Only the socket owner can try to collapse/prune rx queues
1732         * to reduce memory overhead, so add a little headroom here.
1733         * Only a few socket backlogs are likely to be non-empty concurrently.
1734         */
1735        limit += 64*1024;
1736
1737        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1738                bh_unlock_sock(sk);
1739                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1740                return true;
1741        }
1742        return false;
1743}
1744EXPORT_SYMBOL(tcp_add_backlog);
1745
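/* Run the socket filter (if any) on the segment; the filter may trim payload
 * but never the TCP header itself (the first th->doff * 4 bytes are kept).
 */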
1746int tcp_filter(struct sock *sk, struct sk_buff *skb)
1747{
1748        struct tcphdr *th = (struct tcphdr *)skb->data;
1749
1750        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1751}
1752EXPORT_SYMBOL(tcp_filter);
1753
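/* tcp_v4_fill_cb() below overlays TCP metadata on top of the inet control
 * block; tcp_v4_restore_cb() moves the saved IP control block back to its
 * original position before the skb is handed back to a new socket lookup.
 */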
1754static void tcp_v4_restore_cb(struct sk_buff *skb)
1755{
1756        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1757                sizeof(struct inet_skb_parm));
1758}
1759
1760static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1761                           const struct tcphdr *th)
1762{
1763        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1764         * barrier() makes sure the compiler won't play fool^Waliasing games.
1765         */
1766        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1767                sizeof(struct inet_skb_parm));
1768        barrier();
1769
1770        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1771        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1772                                    skb->len - th->doff * 4);
1773        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1774        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1775        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1776        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1777        TCP_SKB_CB(skb)->sacked  = 0;
1778        TCP_SKB_CB(skb)->has_rxtstamp =
1779                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1780}
1781
1782/*
1783 *      From tcp_input.c
1784 */
1785
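/* Main receive routine for IPv4 TCP segments, invoked by the IP layer for
 * every packet carrying IPPROTO_TCP.  It validates the header and checksum,
 * looks up the owning socket, handles the TCP_NEW_SYN_RECV and TCP_TIME_WAIT
 * special cases, and finally delivers the segment either directly via
 * tcp_v4_do_rcv() or through the socket backlog.
 */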
1786int tcp_v4_rcv(struct sk_buff *skb)
1787{
1788        struct net *net = dev_net(skb->dev);
1789        struct sk_buff *skb_to_free;
1790        int sdif = inet_sdif(skb);
1791        const struct iphdr *iph;
1792        const struct tcphdr *th;
1793        bool refcounted;
1794        struct sock *sk;
1795        int ret;
1796
1797        if (skb->pkt_type != PACKET_HOST)
1798                goto discard_it;
1799
1800        /* Count it even if it's bad */
1801        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1802
1803        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1804                goto discard_it;
1805
1806        th = (const struct tcphdr *)skb->data;
1807
1808        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1809                goto bad_packet;
1810        if (!pskb_may_pull(skb, th->doff * 4))
1811                goto discard_it;
1812
1813        /* An explanation is required here, I think.
1814         * Packet length and doff are validated by header prediction,
1815         * provided the case of th->doff == 0 is eliminated.
1816         * So, we defer the checks. */
1817
1818        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1819                goto csum_error;
1820
1821        th = (const struct tcphdr *)skb->data;
1822        iph = ip_hdr(skb);
1823lookup:
1824        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1825                               th->dest, sdif, &refcounted);
1826        if (!sk)
1827                goto no_tcp_socket;
1828
1829process:
1830        if (sk->sk_state == TCP_TIME_WAIT)
1831                goto do_time_wait;
1832
1833        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1834                struct request_sock *req = inet_reqsk(sk);
1835                bool req_stolen = false;
1836                struct sock *nsk;
1837
1838                sk = req->rsk_listener;
1839                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1840                        sk_drops_add(sk, skb);
1841                        reqsk_put(req);
1842                        goto discard_it;
1843                }
1844                if (tcp_checksum_complete(skb)) {
1845                        reqsk_put(req);
1846                        goto csum_error;
1847                }
1848                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1849                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1850                        goto lookup;
1851                }
1852                /* We own a reference on the listener, increase it again
1853                 * as we might lose it too soon.
1854                 */
1855                sock_hold(sk);
1856                refcounted = true;
1857                nsk = NULL;
1858                if (!tcp_filter(sk, skb)) {
1859                        th = (const struct tcphdr *)skb->data;
1860                        iph = ip_hdr(skb);
1861                        tcp_v4_fill_cb(skb, iph, th);
1862                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1863                }
1864                if (!nsk) {
1865                        reqsk_put(req);
1866                        if (req_stolen) {
1867                                /* Another cpu got exclusive access to req
1868                                 * and created a full blown socket.
1869                                 * Try to feed this packet to this socket
1870                                 * instead of discarding it.
1871                                 */
1872                                tcp_v4_restore_cb(skb);
1873                                sock_put(sk);
1874                                goto lookup;
1875                        }
1876                        goto discard_and_relse;
1877                }
1878                if (nsk == sk) {
1879                        reqsk_put(req);
1880                        tcp_v4_restore_cb(skb);
1881                } else if (tcp_child_process(sk, nsk, skb)) {
1882                        tcp_v4_send_reset(nsk, skb);
1883                        goto discard_and_relse;
1884                } else {
1885                        sock_put(sk);
1886                        return 0;
1887                }
1888        }
1889        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1890                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1891                goto discard_and_relse;
1892        }
1893
1894        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1895                goto discard_and_relse;
1896
1897        if (tcp_v4_inbound_md5_hash(sk, skb))
1898                goto discard_and_relse;
1899
1900        nf_reset(skb);
1901
1902        if (tcp_filter(sk, skb))
1903                goto discard_and_relse;
1904        th = (const struct tcphdr *)skb->data;
1905        iph = ip_hdr(skb);
1906        tcp_v4_fill_cb(skb, iph, th);
1907
1908        skb->dev = NULL;
1909
1910        if (sk->sk_state == TCP_LISTEN) {
1911                ret = tcp_v4_do_rcv(sk, skb);
1912                goto put_and_return;
1913        }
1914
1915        sk_incoming_cpu_update(sk);
1916
1917        bh_lock_sock_nested(sk);
1918        tcp_segs_in(tcp_sk(sk), skb);
1919        ret = 0;
1920        if (!sock_owned_by_user(sk)) {
1921                skb_to_free = sk->sk_rx_skb_cache;
1922                sk->sk_rx_skb_cache = NULL;
1923                ret = tcp_v4_do_rcv(sk, skb);
1924        } else {
1925                if (tcp_add_backlog(sk, skb))
1926                        goto discard_and_relse;
1927                skb_to_free = NULL;
1928        }
1929        bh_unlock_sock(sk);
1930        if (skb_to_free)
1931                __kfree_skb(skb_to_free);
1932
1933put_and_return:
1934        if (refcounted)
1935                sock_put(sk);
1936
1937        return ret;
1938
1939no_tcp_socket:
1940        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1941                goto discard_it;
1942
1943        tcp_v4_fill_cb(skb, iph, th);
1944
1945        if (tcp_checksum_complete(skb)) {
1946csum_error:
1947                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1948bad_packet:
1949                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1950        } else {
1951                tcp_v4_send_reset(NULL, skb);
1952        }
1953
1954discard_it:
1955        /* Discard frame. */
1956        kfree_skb(skb);
1957        return 0;
1958
1959discard_and_relse:
1960        sk_drops_add(sk, skb);
1961        if (refcounted)
1962                sock_put(sk);
1963        goto discard_it;
1964
1965do_time_wait:
1966        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1967                inet_twsk_put(inet_twsk(sk));
1968                goto discard_it;
1969        }
1970
1971        tcp_v4_fill_cb(skb, iph, th);
1972
1973        if (tcp_checksum_complete(skb)) {
1974                inet_twsk_put(inet_twsk(sk));
1975                goto csum_error;
1976        }
1977        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1978        case TCP_TW_SYN: {
1979                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1980                                                        &tcp_hashinfo, skb,
1981                                                        __tcp_hdrlen(th),
1982                                                        iph->saddr, th->source,
1983                                                        iph->daddr, th->dest,
1984                                                        inet_iif(skb),
1985                                                        sdif);
1986                if (sk2) {
1987                        inet_twsk_deschedule_put(inet_twsk(sk));
1988                        sk = sk2;
1989                        tcp_v4_restore_cb(skb);
1990                        refcounted = false;
1991                        goto process;
1992                }
1993        }
1994                /* to ACK */
1995                /* fall through */
1996        case TCP_TW_ACK:
1997                tcp_v4_timewait_ack(sk, skb);
1998                break;
1999        case TCP_TW_RST:
2000                tcp_v4_send_reset(sk, skb);
2001                inet_twsk_deschedule_put(inet_twsk(sk));
2002                goto discard_it;
2003        case TCP_TW_SUCCESS:;
2004        }
2005        goto discard_it;
2006}
2007
2008static struct timewait_sock_ops tcp_timewait_sock_ops = {
2009        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2010        .twsk_unique    = tcp_twsk_unique,
2011        .twsk_destructor= tcp_twsk_destructor,
2012};
2013
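/* Cache the input route of a received skb on the socket so that subsequent
 * packets arriving on the same interface can reuse it (see the early demux
 * and established fast paths above).
 */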
2014void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2015{
2016        struct dst_entry *dst = skb_dst(skb);
2017
2018        if (dst && dst_hold_safe(dst)) {
2019                sk->sk_rx_dst = dst;
2020                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2021        }
2022}
2023EXPORT_SYMBOL(inet_sk_rx_dst_set);
2024
2025const struct inet_connection_sock_af_ops ipv4_specific = {
2026        .queue_xmit        = ip_queue_xmit,
2027        .send_check        = tcp_v4_send_check,
2028        .rebuild_header    = inet_sk_rebuild_header,
2029        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2030        .conn_request      = tcp_v4_conn_request,
2031        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2032        .net_header_len    = sizeof(struct iphdr),
2033        .setsockopt        = ip_setsockopt,
2034        .getsockopt        = ip_getsockopt,
2035        .addr2sockaddr     = inet_csk_addr2sockaddr,
2036        .sockaddr_len      = sizeof(struct sockaddr_in),
2037#ifdef CONFIG_COMPAT
2038        .compat_setsockopt = compat_ip_setsockopt,
2039        .compat_getsockopt = compat_ip_getsockopt,
2040#endif
2041        .mtu_reduced       = tcp_v4_mtu_reduced,
2042};
2043EXPORT_SYMBOL(ipv4_specific);
2044
2045#ifdef CONFIG_TCP_MD5SIG
2046static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2047        .md5_lookup             = tcp_v4_md5_lookup,
2048        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2049        .md5_parse              = tcp_v4_parse_md5_keys,
2050};
2051#endif
2052
2053/* NOTE: A lot of things are set to zero explicitly by the call to
2054 *       sk_alloc(), so they need not be done here.
2055 */
2056static int tcp_v4_init_sock(struct sock *sk)
2057{
2058        struct inet_connection_sock *icsk = inet_csk(sk);
2059
2060        tcp_init_sock(sk);
2061
2062        icsk->icsk_af_ops = &ipv4_specific;
2063
2064#ifdef CONFIG_TCP_MD5SIG
2065        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2066#endif
2067
2068        return 0;
2069}
2070
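/* Final per-socket cleanup, called when a TCP socket is destroyed: timers,
 * congestion control and ULP state, queued data, MD5 keys and the bound
 * port are all released here.
 */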
2071void tcp_v4_destroy_sock(struct sock *sk)
2072{
2073        struct tcp_sock *tp = tcp_sk(sk);
2074
2075        trace_tcp_destroy_sock(sk);
2076
2077        tcp_clear_xmit_timers(sk);
2078
2079        tcp_cleanup_congestion_control(sk);
2080
2081        tcp_cleanup_ulp(sk);
2082
2083        /* Clean up the write buffer. */
2084        tcp_write_queue_purge(sk);
2085
2086        /* Check if we want to disable active TFO */
2087        tcp_fastopen_active_disable_ofo_check(sk);
2088
2089        /* Cleans up our, hopefully empty, out_of_order_queue. */
2090        skb_rbtree_purge(&tp->out_of_order_queue);
2091
2092#ifdef CONFIG_TCP_MD5SIG
2093        /* Clean up the MD5 key list, if any */
2094        if (tp->md5sig_info) {
2095                tcp_clear_md5_list(sk);
2096                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2097                tp->md5sig_info = NULL;
2098        }
2099#endif
2100
2101        /* Clean up a referenced TCP bind bucket. */
2102        if (inet_csk(sk)->icsk_bind_hash)
2103                inet_put_port(sk);
2104
2105        BUG_ON(tp->fastopen_rsk);
2106
2107        /* If the socket was aborted during a connect operation */
2108        tcp_free_fastopen_req(tp);
2109        tcp_fastopen_destroy_cipher(sk);
2110        tcp_saved_syn_free(tp);
2111
2112        sk_sockets_allocated_dec(sk);
2113}
2114EXPORT_SYMBOL(tcp_v4_destroy_sock);
2115
2116#ifdef CONFIG_PROC_FS
2117/* Proc filesystem TCP sock list dumping. */
2118
2119/*
2120 * Get the next listener socket following cur.  If cur is NULL, get the first
2121 * socket starting from the bucket given in st->bucket; when st->bucket is zero
2122 * the very first socket in the hash table is returned.
2123 */
2124static void *listening_get_next(struct seq_file *seq, void *cur)
2125{
2126        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2127        struct tcp_iter_state *st = seq->private;
2128        struct net *net = seq_file_net(seq);
2129        struct inet_listen_hashbucket *ilb;
2130        struct sock *sk = cur;
2131
2132        if (!sk) {
2133get_head:
2134                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2135                spin_lock(&ilb->lock);
2136                sk = sk_head(&ilb->head);
2137                st->offset = 0;
2138                goto get_sk;
2139        }
2140        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2141        ++st->num;
2142        ++st->offset;
2143
2144        sk = sk_next(sk);
2145get_sk:
2146        sk_for_each_from(sk) {
2147                if (!net_eq(sock_net(sk), net))
2148                        continue;
2149                if (sk->sk_family == afinfo->family)
2150                        return sk;
2151        }
2152        spin_unlock(&ilb->lock);
2153        st->offset = 0;
2154        if (++st->bucket < INET_LHTABLE_SIZE)
2155                goto get_head;
2156        return NULL;
2157}
2158
2159static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2160{
2161        struct tcp_iter_state *st = seq->private;
2162        void *rc;
2163
2164        st->bucket = 0;
2165        st->offset = 0;
2166        rc = listening_get_next(seq, NULL);
2167
2168        while (rc && *pos) {
2169                rc = listening_get_next(seq, rc);
2170                --*pos;
2171        }
2172        return rc;
2173}
2174
2175static inline bool empty_bucket(const struct tcp_iter_state *st)
2176{
2177        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2178}
2179
2180/*
2181 * Get the first established socket starting from the bucket given in st->bucket.
2182 * If st->bucket is zero, the very first socket in the hash is returned.
2183 */
2184static void *established_get_first(struct seq_file *seq)
2185{
2186        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2187        struct tcp_iter_state *st = seq->private;
2188        struct net *net = seq_file_net(seq);
2189        void *rc = NULL;
2190
2191        st->offset = 0;
2192        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2193                struct sock *sk;
2194                struct hlist_nulls_node *node;
2195                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2196
2197                /* Lockless fast path for the common case of empty buckets */
2198                if (empty_bucket(st))
2199                        continue;
2200
2201                spin_lock_bh(lock);
2202                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2203                        if (sk->sk_family != afinfo->family ||
2204                            !net_eq(sock_net(sk), net)) {
2205                                continue;
2206                        }
2207                        rc = sk;
2208                        goto out;
2209                }
2210                spin_unlock_bh(lock);
2211        }
2212out:
2213        return rc;
2214}
2215
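/* Advance to the next established socket in the current ehash chain, moving
 * on to the following bucket (via established_get_first()) once the chain is
 * exhausted.
 */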
2216static void *established_get_next(struct seq_file *seq, void *cur)
2217{
2218        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2219        struct sock *sk = cur;
2220        struct hlist_nulls_node *node;
2221        struct tcp_iter_state *st = seq->private;
2222        struct net *net = seq_file_net(seq);
2223
2224        ++st->num;
2225        ++st->offset;
2226
2227        sk = sk_nulls_next(sk);
2228
2229        sk_nulls_for_each_from(sk, node) {
2230                if (sk->sk_family == afinfo->family &&
2231                    net_eq(sock_net(sk), net))
2232                        return sk;
2233        }
2234
2235        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2236        ++st->bucket;
2237        return established_get_first(seq);
2238}
2239
2240static void *established_get_idx(struct seq_file *seq, loff_t pos)
2241{
2242        struct tcp_iter_state *st = seq->private;
2243        void *rc;
2244
2245        st->bucket = 0;
2246        rc = established_get_first(seq);
2247
2248        while (rc && pos) {
2249                rc = established_get_next(seq, rc);
2250                --pos;
2251        }
2252        return rc;
2253}
2254
2255static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2256{
2257        void *rc;
2258        struct tcp_iter_state *st = seq->private;
2259
2260        st->state = TCP_SEQ_STATE_LISTENING;
2261        rc        = listening_get_idx(seq, &pos);
2262
2263        if (!rc) {
2264                st->state = TCP_SEQ_STATE_ESTABLISHED;
2265                rc        = established_get_idx(seq, pos);
2266        }
2267
2268        return rc;
2269}
2270
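/* Try to resume iteration at the position recorded by the previous read
 * (st->bucket and st->offset), so that large /proc/net/tcp dumps need not
 * rescan the hash tables from the beginning on every read() call.
 */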
2271static void *tcp_seek_last_pos(struct seq_file *seq)
2272{
2273        struct tcp_iter_state *st = seq->private;
2274        int offset = st->offset;
2275        int orig_num = st->num;
2276        void *rc = NULL;
2277
2278        switch (st->state) {
2279        case TCP_SEQ_STATE_LISTENING:
2280                if (st->bucket >= INET_LHTABLE_SIZE)
2281                        break;
2282                st->state = TCP_SEQ_STATE_LISTENING;
2283                rc = listening_get_next(seq, NULL);
2284                while (offset-- && rc)
2285                        rc = listening_get_next(seq, rc);
2286                if (rc)
2287                        break;
2288                st->bucket = 0;
2289                st->state = TCP_SEQ_STATE_ESTABLISHED;
2290                /* Fallthrough */
2291        case TCP_SEQ_STATE_ESTABLISHED:
2292                if (st->bucket > tcp_hashinfo.ehash_mask)
2293                        break;
2294                rc = established_get_first(seq);
2295                while (offset-- && rc)
2296                        rc = established_get_next(seq, rc);
2297        }
2298
2299        st->num = orig_num;
2300
2301        return rc;
2302}
2303
2304void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2305{
2306        struct tcp_iter_state *st = seq->private;
2307        void *rc;
2308
2309        if (*pos && *pos == st->last_pos) {
2310                rc = tcp_seek_last_pos(seq);
2311                if (rc)
2312                        goto out;
2313        }
2314
2315        st->state = TCP_SEQ_STATE_LISTENING;
2316        st->num = 0;
2317        st->bucket = 0;
2318        st->offset = 0;
2319        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2320
2321out:
2322        st->last_pos = *pos;
2323        return rc;
2324}
2325EXPORT_SYMBOL(tcp_seq_start);
2326
2327void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2328{
2329        struct tcp_iter_state *st = seq->private;
2330        void *rc = NULL;
2331
2332        if (v == SEQ_START_TOKEN) {
2333                rc = tcp_get_idx(seq, 0);
2334                goto out;
2335        }
2336
2337        switch (st->state) {
2338        case TCP_SEQ_STATE_LISTENING:
2339                rc = listening_get_next(seq, v);
2340                if (!rc) {
2341                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2342                        st->bucket = 0;
2343                        st->offset = 0;
2344                        rc        = established_get_first(seq);
2345                }
2346                break;
2347        case TCP_SEQ_STATE_ESTABLISHED:
2348                rc = established_get_next(seq, v);
2349                break;
2350        }
2351out:
2352        ++*pos;
2353        st->last_pos = *pos;
2354        return rc;
2355}
2356EXPORT_SYMBOL(tcp_seq_next);
2357
2358void tcp_seq_stop(struct seq_file *seq, void *v)
2359{
2360        struct tcp_iter_state *st = seq->private;
2361
2362        switch (st->state) {
2363        case TCP_SEQ_STATE_LISTENING:
2364                if (v != SEQ_START_TOKEN)
2365                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2366                break;
2367        case TCP_SEQ_STATE_ESTABLISHED:
2368                if (v)
2369                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2370                break;
2371        }
2372}
2373EXPORT_SYMBOL(tcp_seq_stop);
2374
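/* Format one /proc/net/tcp line for a request sock, i.e. a connection still
 * in SYN_RECV waiting for the final ACK of the three-way handshake.
 */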
2375static void get_openreq4(const struct request_sock *req,
2376                         struct seq_file *f, int i)
2377{
2378        const struct inet_request_sock *ireq = inet_rsk(req);
2379        long delta = req->rsk_timer.expires - jiffies;
2380
2381        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2382                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2383                i,
2384                ireq->ir_loc_addr,
2385                ireq->ir_num,
2386                ireq->ir_rmt_addr,
2387                ntohs(ireq->ir_rmt_port),
2388                TCP_SYN_RECV,
2389                0, 0, /* could print option size, but that is af dependent. */
2390                1,    /* timers active (only the expire timer) */
2391                jiffies_delta_to_clock_t(delta),
2392                req->num_timeout,
2393                from_kuid_munged(seq_user_ns(f),
2394                                 sock_i_uid(req->rsk_listener)),
2395                0,  /* non standard timer */
2396                0, /* open_requests have no inode */
2397                0,
2398                req);
2399}
2400
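/* Format one /proc/net/tcp line for a full socket: addresses and ports,
 * state, queue sizes, the currently pending timer and various
 * retransmission and congestion statistics.
 */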
2401static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2402{
2403        int timer_active;
2404        unsigned long timer_expires;
2405        const struct tcp_sock *tp = tcp_sk(sk);
2406        const struct inet_connection_sock *icsk = inet_csk(sk);
2407        const struct inet_sock *inet = inet_sk(sk);
2408        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2409        __be32 dest = inet->inet_daddr;
2410        __be32 src = inet->inet_rcv_saddr;
2411        __u16 destp = ntohs(inet->inet_dport);
2412        __u16 srcp = ntohs(inet->inet_sport);
2413        int rx_queue;
2414        int state;
2415
2416        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2417            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2418            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2419                timer_active    = 1;
2420                timer_expires   = icsk->icsk_timeout;
2421        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2422                timer_active    = 4;
2423                timer_expires   = icsk->icsk_timeout;
2424        } else if (timer_pending(&sk->sk_timer)) {
2425                timer_active    = 2;
2426                timer_expires   = sk->sk_timer.expires;
2427        } else {
2428                timer_active    = 0;
2429                timer_expires = jiffies;
2430        }
2431
2432        state = inet_sk_state_load(sk);
2433        if (state == TCP_LISTEN)
2434                rx_queue = sk->sk_ack_backlog;
2435        else
2436                /* Because we don't lock the socket,
2437                 * we might find a transient negative value.
2438                 */
2439                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2440
2441        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2442                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2443                i, src, srcp, dest, destp, state,
2444                tp->write_seq - tp->snd_una,
2445                rx_queue,
2446                timer_active,
2447                jiffies_delta_to_clock_t(timer_expires - jiffies),
2448                icsk->icsk_retransmits,
2449                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2450                icsk->icsk_probes_out,
2451                sock_i_ino(sk),
2452                refcount_read(&sk->sk_refcnt), sk,
2453                jiffies_to_clock_t(icsk->icsk_rto),
2454                jiffies_to_clock_t(icsk->icsk_ack.ato),
2455                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2456                tp->snd_cwnd,
2457                state == TCP_LISTEN ?
2458                    fastopenq->max_qlen :
2459                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2460}
2461
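/* Format one /proc/net/tcp line for a TIME-WAIT socket; most fields have no
 * meaning in this state and are printed as zero.
 */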
2462static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2463                               struct seq_file *f, int i)
2464{
2465        long delta = tw->tw_timer.expires - jiffies;
2466        __be32 dest, src;
2467        __u16 destp, srcp;
2468
2469        dest  = tw->tw_daddr;
2470        src   = tw->tw_rcv_saddr;
2471        destp = ntohs(tw->tw_dport);
2472        srcp  = ntohs(tw->tw_sport);
2473
2474        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2475                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2476                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2477                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2478                refcount_read(&tw->tw_refcnt), tw);
2479}
2480
2481#define TMPSZ 150
2482
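/* seq_file ->show() handler: print the header line for SEQ_START_TOKEN,
 * otherwise dispatch on the socket state to one of the helpers above.
 * Every line is padded to TMPSZ - 1 characters.
 */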
2483static int tcp4_seq_show(struct seq_file *seq, void *v)
2484{
2485        struct tcp_iter_state *st;
2486        struct sock *sk = v;
2487
2488        seq_setwidth(seq, TMPSZ - 1);
2489        if (v == SEQ_START_TOKEN) {
2490                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2491                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2492                           "inode");
2493                goto out;
2494        }
2495        st = seq->private;
2496
2497        if (sk->sk_state == TCP_TIME_WAIT)
2498                get_timewait4_sock(v, seq, st->num);
2499        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2500                get_openreq4(v, seq, st->num);
2501        else
2502                get_tcp4_sock(v, seq, st->num);
2503out:
2504        seq_pad(seq, '\n');
2505        return 0;
2506}
2507
2508static const struct seq_operations tcp4_seq_ops = {
2509        .show           = tcp4_seq_show,
2510        .start          = tcp_seq_start,
2511        .next           = tcp_seq_next,
2512        .stop           = tcp_seq_stop,
2513};
2514
2515static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2516        .family         = AF_INET,
2517};
2518
2519static int __net_init tcp4_proc_init_net(struct net *net)
2520{
2521        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2522                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2523                return -ENOMEM;
2524        return 0;
2525}
2526
2527static void __net_exit tcp4_proc_exit_net(struct net *net)
2528{
2529        remove_proc_entry("tcp", net->proc_net);
2530}
2531
2532static struct pernet_operations tcp4_net_ops = {
2533        .init = tcp4_proc_init_net,
2534        .exit = tcp4_proc_exit_net,
2535};
2536
2537int __init tcp4_proc_init(void)
2538{
2539        return register_pernet_subsys(&tcp4_net_ops);
2540}
2541
2542void tcp4_proc_exit(void)
2543{
2544        unregister_pernet_subsys(&tcp4_net_ops);
2545}
2546#endif /* CONFIG_PROC_FS */
2547
2548struct proto tcp_prot = {
2549        .name                   = "TCP",
2550        .owner                  = THIS_MODULE,
2551        .close                  = tcp_close,
2552        .pre_connect            = tcp_v4_pre_connect,
2553        .connect                = tcp_v4_connect,
2554        .disconnect             = tcp_disconnect,
2555        .accept                 = inet_csk_accept,
2556        .ioctl                  = tcp_ioctl,
2557        .init                   = tcp_v4_init_sock,
2558        .destroy                = tcp_v4_destroy_sock,
2559        .shutdown               = tcp_shutdown,
2560        .setsockopt             = tcp_setsockopt,
2561        .getsockopt             = tcp_getsockopt,
2562        .keepalive              = tcp_set_keepalive,
2563        .recvmsg                = tcp_recvmsg,
2564        .sendmsg                = tcp_sendmsg,
2565        .sendpage               = tcp_sendpage,
2566        .backlog_rcv            = tcp_v4_do_rcv,
2567        .release_cb             = tcp_release_cb,
2568        .hash                   = inet_hash,
2569        .unhash                 = inet_unhash,
2570        .get_port               = inet_csk_get_port,
2571        .enter_memory_pressure  = tcp_enter_memory_pressure,
2572        .leave_memory_pressure  = tcp_leave_memory_pressure,
2573        .stream_memory_free     = tcp_stream_memory_free,
2574        .sockets_allocated      = &tcp_sockets_allocated,
2575        .orphan_count           = &tcp_orphan_count,
2576        .memory_allocated       = &tcp_memory_allocated,
2577        .memory_pressure        = &tcp_memory_pressure,
2578        .sysctl_mem             = sysctl_tcp_mem,
2579        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2580        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2581        .max_header             = MAX_TCP_HEADER,
2582        .obj_size               = sizeof(struct tcp_sock),
2583        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2584        .twsk_prot              = &tcp_timewait_sock_ops,
2585        .rsk_prot               = &tcp_request_sock_ops,
2586        .h.hashinfo             = &tcp_hashinfo,
2587        .no_autobind            = true,
2588#ifdef CONFIG_COMPAT
2589        .compat_setsockopt      = compat_tcp_setsockopt,
2590        .compat_getsockopt      = compat_tcp_getsockopt,
2591#endif
2592        .diag_destroy           = tcp_abort,
2593};
2594EXPORT_SYMBOL(tcp_prot);
2595
2596static void __net_exit tcp_sk_exit(struct net *net)
2597{
2598        int cpu;
2599
2600        if (net->ipv4.tcp_congestion_control)
2601                module_put(net->ipv4.tcp_congestion_control->owner);
2602
2603        for_each_possible_cpu(cpu)
2604                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2605        free_percpu(net->ipv4.tcp_sk);
2606}
2607
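/* Per-netns initialisation: create one control socket per possible CPU (used
 * to send RSTs and ACKs on behalf of sockets we do not own) and set the
 * namespace defaults for all TCP sysctls.
 */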
2608static int __net_init tcp_sk_init(struct net *net)
2609{
2610        int res, cpu, cnt;
2611
2612        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2613        if (!net->ipv4.tcp_sk)
2614                return -ENOMEM;
2615
2616        for_each_possible_cpu(cpu) {
2617                struct sock *sk;
2618
2619                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2620                                           IPPROTO_TCP, net);
2621                if (res)
2622                        goto fail;
2623                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2624
2625                /* Please enforce IP_DF and IPID==0 for RSTs and
2626                 * ACKs sent in the SYN-RECV and TIME-WAIT states.
2627                 */
2628                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2629
2630                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2631        }
2632
2633        net->ipv4.sysctl_tcp_ecn = 2;
2634        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2635
2636        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2637        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2638        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2639        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2640
2641        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2642        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2643        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2644
2645        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2646        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2647        net->ipv4.sysctl_tcp_syncookies = 1;
2648        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2649        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2650        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2651        net->ipv4.sysctl_tcp_orphan_retries = 0;
2652        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2653        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2654        net->ipv4.sysctl_tcp_tw_reuse = 2;
2655
2656        cnt = tcp_hashinfo.ehash_mask + 1;
2657        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2658        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2659
2660        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2661        net->ipv4.sysctl_tcp_sack = 1;
2662        net->ipv4.sysctl_tcp_window_scaling = 1;
2663        net->ipv4.sysctl_tcp_timestamps = 1;
2664        net->ipv4.sysctl_tcp_early_retrans = 3;
2665        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2666        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2667        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2668        net->ipv4.sysctl_tcp_max_reordering = 300;
2669        net->ipv4.sysctl_tcp_dsack = 1;
2670        net->ipv4.sysctl_tcp_app_win = 31;
2671        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2672        net->ipv4.sysctl_tcp_frto = 2;
2673        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2674        /* This limits the percentage of the congestion window which we
2675         * will allow a single TSO frame to consume.  Building TSO frames
2676         * which are too large can cause TCP streams to be bursty.
2677         */
2678        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2679        /* Default TSQ limit of 16 TSO segments */
2680        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2681        /* rfc5961 challenge ack rate limiting */
2682        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2683        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2684        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2685        net->ipv4.sysctl_tcp_autocorking = 1;
2686        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2687        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2688        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2689        if (net != &init_net) {
2690                memcpy(net->ipv4.sysctl_tcp_rmem,
2691                       init_net.ipv4.sysctl_tcp_rmem,
2692                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2693                memcpy(net->ipv4.sysctl_tcp_wmem,
2694                       init_net.ipv4.sysctl_tcp_wmem,
2695                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2696        }
2697        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2698        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2699        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2700        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2701        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2702        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2703
2704        /* Reno is always built in */
2705        if (!net_eq(net, &init_net) &&
2706            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2707                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2708        else
2709                net->ipv4.tcp_congestion_control = &tcp_reno;
2710
2711        return 0;
2712fail:
2713        tcp_sk_exit(net);
2714
2715        return res;
2716}
2717
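/* Batched netns teardown: purge TIME-WAIT sockets left in the hash table and
 * destroy the TCP fastopen context of every exiting namespace.
 */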
2718static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2719{
2720        struct net *net;
2721
2722        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2723
2724        list_for_each_entry(net, net_exit_list, exit_list)
2725                tcp_fastopen_ctx_destroy(net);
2726}
2727
2728static struct pernet_operations __net_initdata tcp_sk_ops = {
2729       .init       = tcp_sk_init,
2730       .exit       = tcp_sk_exit,
2731       .exit_batch = tcp_sk_exit_batch,
2732};
2733
2734void __init tcp_v4_init(void)
2735{
2736        if (register_pernet_subsys(&tcp_sk_ops))
2737                panic("Failed to create the TCP control socket.\n");
2738}
2739