linux/net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

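/* Initial sequence numbers and timestamp offsets are keyed by the
 * connection 4-tuple and a boot-time secret (see secure_tcp_seq() and
 * secure_tcp_ts_off()), so distinct connections get unrelated sequence
 * spaces while ISNs for a reincarnated 4-tuple still advance over time.
 */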
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

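        /* Per Documentation/networking/ip-sysctl.txt, tcp_tw_reuse is
         * 0: never reuse, 1: global reuse when safe from the protocol
         * viewpoint, 2: reuse only for loopback traffic (checked below).
         */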
        if (reuse == 2) {
                /* Still does not detect *everything* that goes through
                 * lo, since we require a loopback src or dst address
                 * or direct binding to 'lo' interface.
                 */
                bool loopback = false;
                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
                        loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == AF_INET6) {
                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
                            (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
                             (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
                            (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
                             (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
                                loopback = true;
                } else
#endif
                {
                        if (ipv4_is_loopback(tw->tw_daddr) ||
                            ipv4_is_loopback(tw->tw_rcv_saddr))
                                loopback = true;
                }
                if (!loopback)
                        reuse = 0;
        }

        /* With PAWS, it is safe from the viewpoint
         * of data integrity. Even without PAWS it is safe provided sequence
         * spaces do not overlap i.e. at data rates <= 80Mbit/sec.
         *
         * Actually, the idea is close to VJ's one, only timestamp cache is
         * held not per host, but per port pair and TW bucket is used as state
         * holder.
         *
         * If TW bucket has been already destroyed we fall back to VJ's scheme
         * and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (reuse && time_after32(ktime_get_seconds(),
                                            tcptw->tw_ts_recent_stamp)))) {
                /* In case of repair and re-using TIME-WAIT sockets we still
                 * want to be sure that it is safe as above but honor the
                 * sequence numbers and time stamps set as part of the repair
                 * process.
                 *
                 * Without this check re-using a TIME-WAIT socket with TCP
                 * repair would accumulate a -1 on the repair assigned
                 * sequence number. The first time it is reused the sequence
                 * is -1, the second time -2, etc. This fixes that issue
                 * without appearing to create any others.
                 */
                if (likely(!tp->repair)) {
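                        /* Advance past the old send window (65535 + 2)
                         * so the new sequence space cannot overlap the
                         * old connection's; a write_seq of 0 means
                         * "pick a fresh ISN", hence the fixup to 1.
                         */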
                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

                        if (!seq)
                                seq = 1;
                        WRITE_ONCE(tp->write_seq, seq);
                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                }
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        /* This check is replicated from tcp_v4_connect() and intended to
         * prevent BPF program called below from accessing bytes that are out
         * of the bound specified by user in addr_len.
         */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        sock_owned_by_me(sk);

        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        WRITE_ONCE(tp->write_seq, 0);
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set the state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        WRITE_ONCE(tp->write_seq,
                                   secure_tcp_seq(inet->inet_saddr,
                                                  inet->inet_daddr,
                                                  inet->inet_sport,
                                                  usin->sin_port));
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

        inet->inet_id = prandom_u32();

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
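
/* A minimal sketch of the flow this implements for a blocking connect()
 * to a hypothetical peer 192.0.2.1:80: tcp_v4_pre_connect() runs the BPF
 * hook, tcp_v4_connect() routes the flow and moves to TCP_SYN_SENT,
 * inet_hash_connect() picks the ephemeral source port, and tcp_connect()
 * builds and transmits the SYN.
 */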

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

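/* Propagate an ICMP redirect to the socket's cached route; the dst's
 * redirect handler (normally ip_do_redirect() for IPv4 routes) updates
 * the next hop.
 */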
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
                                     type == ICMP_TIME_EXCEEDED ||
                                     (type == ICMP_DEST_UNREACH &&
                                      (code == ICMP_NET_UNREACH ||
                                       code == ICMP_HOST_UNREACH)));
                return 0;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of the PMTU discovery (RFC 1191) special case:
         * we can receive locally generated ICMP messages while the socket
         * is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = rcu_dereference(tp->fastopen_rsk);
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                skb = tcp_rtx_queue_head(sk);
                if (WARN_ON_ONCE(!skb))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
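
                /* Worked example with made-up numbers: if the base RTO
                 * is 200 ms and icsk_backoff was 3 (RTO 1.6 s), one
                 * revert leaves RTO at 800 ms; with 300 ms already
                 * elapsed since the retransmit, the timer below is
                 * re-armed for the remaining 500 ms.
                 */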

                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
         * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
         * obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}

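/* Prepare a partially-computed checksum for offload: seed th->check
 * with the pseudo-header sum and point csum_start/csum_offset at the
 * TCP checksum field so the NIC (or skb_checksum_help()) can finish it.
 */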
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset?
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        u64 transmit_time = 0;
        struct sock *ctl_sk;
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk is not NULL, it means we did a successful lookup and the
         * incoming route had to be correct. prequeue might have dropped
         * our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }
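        /* Example: a SYN with sequence number S and no payload elicits
         * an RST|ACK with ack_seq = S + 1 (the SYN occupies one sequence
         * unit); a pure data segment of L bytes gets ack_seq = S + L.
         */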

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked against the md5 hash of the
                 * key we find, and no RST is generated if the hash doesn't
                 * match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all of its binding information is lost
         * and routing might fail. No choice here: if we chose to force the
         * input interface, we would misroute in the case of an asymmetric
         * route.
         */
        if (sk) {
                arg.bound_dev_if = sk->sk_bound_dev_if;
                if (sk_fullsock(sk))
                        trace_tcp_send_reset(sk, skb);
        }

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        if (sk) {
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
                transmit_time = tcp_transmit_time(sk);
        }
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;
        struct sock *ctl_sk;
        u64 transmit_time;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
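        /* If a timestamp echo is needed, the block below encodes the
         * option as NOP, NOP, TIMESTAMP per RFC 7323: bytes 01 01 08 0a
         * followed by tsval and tsecr.
         */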
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_mark : sk->sk_mark;
        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_priority : sk->sk_priority;
        transmit_time = tcp_transmit_time(sk);
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                rcu_read_lock();
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt));
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
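/* Lookup is longest-prefix match: e.g. keys for 10.0.0.0/8 and
 * 10.1.0.0/16 can coexist, and a peer at 10.1.2.3 matches the /16 key
 * (illustrative addresses only).
 */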
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
                                           const union tcp_md5_addr *addr,
                                           int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

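/* Userspace entry point for managing TCP-MD5 keys. A sketch of the
 * userspace side (illustrative address, error handling omitted):
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      a->sin_family = AF_INET;
 *      a->sin_addr.s_addr = inet_addr("192.0.2.1");
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that address instead.
 */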
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

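/* Hash the RFC 2385 input prefix: the IPv4 pseudo-header (saddr, daddr,
 * zero pad, protocol, TCP length) followed by the TCP header with its
 * checksum field zeroed. Callers then mix in the key (and, for full
 * segments, the payload).
 */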
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req)
{
        return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_seq,
        .init_ts_off    =       tcp_v4_init_ts_off,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
1396        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1397                goto drop;
1398
1399        return tcp_conn_request(&tcp_request_sock_ops,
1400                                &tcp_request_sock_ipv4_ops, sk, skb);
1401
1402drop:
1403        tcp_listendrop(sk);
1404        return 0;
1405}
1406EXPORT_SYMBOL(tcp_v4_conn_request);
1407
1408
1409/*
1410 * The three way handshake has completed - we got a valid synack -
1411 * now create the new socket.
1412 */
1413struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1414                                  struct request_sock *req,
1415                                  struct dst_entry *dst,
1416                                  struct request_sock *req_unhash,
1417                                  bool *own_req)
1418{
1419        struct inet_request_sock *ireq;
1420        struct inet_sock *newinet;
1421        struct tcp_sock *newtp;
1422        struct sock *newsk;
1423#ifdef CONFIG_TCP_MD5SIG
1424        struct tcp_md5sig_key *key;
1425#endif
1426        struct ip_options_rcu *inet_opt;
1427
1428        if (sk_acceptq_is_full(sk))
1429                goto exit_overflow;
1430
1431        newsk = tcp_create_openreq_child(sk, req, skb);
1432        if (!newsk)
1433                goto exit_nonewsk;
1434
1435        newsk->sk_gso_type = SKB_GSO_TCPV4;
1436        inet_sk_rx_dst_set(newsk, skb);
1437
1438        newtp                 = tcp_sk(newsk);
1439        newinet               = inet_sk(newsk);
1440        ireq                  = inet_rsk(req);
1441        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1442        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1443        newsk->sk_bound_dev_if = ireq->ir_iif;
1444        newinet->inet_saddr   = ireq->ir_loc_addr;
1445        inet_opt              = rcu_dereference(ireq->ireq_opt);
1446        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1447        newinet->mc_index     = inet_iif(skb);
1448        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1449        newinet->rcv_tos      = ip_hdr(skb)->tos;
1450        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1451        if (inet_opt)
1452                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
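            /* inet_id seeds this connection's IP identification counter;
             * starting from a random value keeps the ID sequence
             * unpredictable to off-path observers.
             */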
1453        newinet->inet_id = prandom_u32();
1454
1455        if (!dst) {
1456                dst = inet_csk_route_child_sock(sk, newsk, req);
1457                if (!dst)
1458                        goto put_and_exit;
1459        } else {
1460                /* syncookie case: see end of cookie_v4_check() */
1461        }
1462        sk_setup_caps(newsk, dst);
1463
1464        tcp_ca_openreq_child(newsk, dst);
1465
1466        tcp_sync_mss(newsk, dst_mtu(dst));
1467        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1468
1469        tcp_initialize_rcv_mss(newsk);
1470
1471#ifdef CONFIG_TCP_MD5SIG
1472        /* Copy over the MD5 key from the original socket */
1473        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1474                                AF_INET);
1475        if (key) {
1476                /*
1477                 * We're using one, so create a matching key
1478                 * on the newsk structure. If we fail to get
1479                 * memory, then we end up not copying the key
1480                 * across. Shucks.
1481                 */
1482                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1483                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1484                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1485        }
1486#endif
1487
1488        if (__inet_inherit_port(sk, newsk) < 0)
1489                goto put_and_exit;
1490        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1491        if (likely(*own_req)) {
1492                tcp_move_syn(newtp, req);
1493                ireq->ireq_opt = NULL;
1494        } else {
1495                newinet->inet_opt = NULL;
1496        }
1497        return newsk;
1498
1499exit_overflow:
1500        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1501exit_nonewsk:
1502        dst_release(dst);
1503exit:
1504        tcp_listendrop(sk);
1505        return NULL;
1506put_and_exit:
1507        newinet->inet_opt = NULL;
1508        inet_csk_prepare_forced_close(newsk);
1509        tcp_done(newsk);
1510        goto exit;
1511}
1512EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1513
1514static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1515{
1516#ifdef CONFIG_SYN_COOKIES
1517        const struct tcphdr *th = tcp_hdr(skb);
1518
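            /* Syncookies are validated on the returning ACK: the SYN was
             * answered statelessly, so only a non-SYN segment can carry a
             * cookie worth checking here.
             */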
1519        if (!th->syn)
1520                sk = cookie_v4_check(sk, skb);
1521#endif
1522        return sk;
1523}
1524
1525u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1526                         struct tcphdr *th, u32 *cookie)
1527{
1528        u16 mss = 0;
1529#ifdef CONFIG_SYN_COOKIES
1530        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1531                                    &tcp_request_sock_ipv4_ops, sk, th);
1532        if (mss) {
1533                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1534                tcp_synq_overflow(sk);
1535        }
1536#endif
1537        return mss;
1538}
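
    /* For context: the cookie written above is an initial sequence number
     * that encodes, roughly, a keyed hash over the connection 4-tuple plus
     * a coarse time counter, with a few low bits selecting one of a small
     * table of canned MSS values, so the returning ACK can be validated
     * without keeping any state (see net/ipv4/syncookies.c).
     */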
1539
1540/* The socket must have its spinlock held when we get
1541 * here, unless it is a TCP_LISTEN socket.
1542 *
1543 * We have a potential double-lock case here, so even when
1544 * doing backlog processing we use the BH locking scheme.
1545 * This is because we cannot sleep with the original spinlock
1546 * held.
1547 */
1548int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1549{
1550        struct sock *rsk;
1551
1552        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1553                struct dst_entry *dst = sk->sk_rx_dst;
1554
1555                sock_rps_save_rxhash(sk, skb);
1556                sk_mark_napi_id(sk, skb);
1557                if (dst) {
1558                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1559                            !dst->ops->check(dst, 0)) {
1560                                dst_release(dst);
1561                                sk->sk_rx_dst = NULL;
1562                        }
1563                }
1564                tcp_rcv_established(sk, skb);
1565                return 0;
1566        }
1567
1568        if (tcp_checksum_complete(skb))
1569                goto csum_err;
1570
1571        if (sk->sk_state == TCP_LISTEN) {
1572                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1573
1574                if (!nsk)
1575                        goto discard;
1576                if (nsk != sk) {
1577                        if (tcp_child_process(sk, nsk, skb)) {
1578                                rsk = nsk;
1579                                goto reset;
1580                        }
1581                        return 0;
1582                }
1583        } else
1584                sock_rps_save_rxhash(sk, skb);
1585
1586        if (tcp_rcv_state_process(sk, skb)) {
1587                rsk = sk;
1588                goto reset;
1589        }
1590        return 0;
1591
1592reset:
1593        tcp_v4_send_reset(rsk, skb);
1594discard:
1595        kfree_skb(skb);
1596        /* Be careful here. If this function gets more complicated and
1597         * gcc suffers from register pressure on the x86, sk (in %ebx)
1598         * might be destroyed here. This current version compiles correctly,
1599         * but you have been warned.
1600         */
1601        return 0;
1602
1603csum_err:
1604        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1605        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1606        goto discard;
1607}
1608EXPORT_SYMBOL(tcp_v4_do_rcv);
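
    /* The BH locking scheme described above, seen from the caller's side;
     * this is a condensed sketch of the pattern tcp_v4_rcv() uses below,
     * not additional code:
     *
     *	bh_lock_sock_nested(sk);
     *	if (!sock_owned_by_user(sk))
     *		ret = tcp_v4_do_rcv(sk, skb);
     *	else if (tcp_add_backlog(sk, skb))
     *		goto discard_and_relse;
     *	bh_unlock_sock(sk);
     */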
1609
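    /* Early demux runs from the IP receive path before any route lookup:
     * if an established socket matches the incoming segment, its cached
     * rx_dst can be attached to the skb, letting the fast path skip the
     * routing step entirely.
     */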
1610int tcp_v4_early_demux(struct sk_buff *skb)
1611{
1612        const struct iphdr *iph;
1613        const struct tcphdr *th;
1614        struct sock *sk;
1615
1616        if (skb->pkt_type != PACKET_HOST)
1617                return 0;
1618
1619        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1620                return 0;
1621
1622        iph = ip_hdr(skb);
1623        th = tcp_hdr(skb);
1624
1625        if (th->doff < sizeof(struct tcphdr) / 4)
1626                return 0;
1627
1628        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1629                                       iph->saddr, th->source,
1630                                       iph->daddr, ntohs(th->dest),
1631                                       skb->skb_iif, inet_sdif(skb));
1632        if (sk) {
1633                skb->sk = sk;
1634                skb->destructor = sock_edemux;
1635                if (sk_fullsock(sk)) {
1636                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1637
1638                        if (dst)
1639                                dst = dst_check(dst, 0);
1640                        if (dst &&
1641                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1642                                skb_dst_set_noref(skb, dst);
1643                }
1644        }
1645        return 0;
1646}
1647
1648bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1649{
1650        u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1651        struct skb_shared_info *shinfo;
1652        const struct tcphdr *th;
1653        struct tcphdr *thtail;
1654        struct sk_buff *tail;
1655        unsigned int hdrlen;
1656        bool fragstolen;
1657        u32 gso_segs;
1658        int delta;
1659
1660        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1661         * we can fix skb->truesize to its real value to avoid future drops.
1662         * This is valid because skb is not yet charged to the socket.
1663         * It has been noticed that pure SACK packets were sometimes dropped
1664         * (if cooked by drivers without the copybreak feature).
1665         */
1666        skb_condense(skb);
1667
1668        skb_dst_drop(skb);
1669
1670        if (unlikely(tcp_checksum_complete(skb))) {
1671                bh_unlock_sock(sk);
1672                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1673                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1674                return true;
1675        }
1676
1677        /* Attempt coalescing to last skb in backlog, even if we are
1678         * above the limits.
1679         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1680         */
1681        th = (const struct tcphdr *)skb->data;
1682        hdrlen = th->doff * 4;
1683        shinfo = skb_shinfo(skb);
1684
1685        if (!shinfo->gso_size)
1686                shinfo->gso_size = skb->len - hdrlen;
1687
1688        if (!shinfo->gso_segs)
1689                shinfo->gso_segs = 1;
1690
1691        tail = sk->sk_backlog.tail;
1692        if (!tail)
1693                goto no_coalesce;
1694        thtail = (struct tcphdr *)tail->data;
1695
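            /* Coalescing requires skb to directly continue tail in sequence
             * space, with the same IP DSCP/ECN byte, no SYN/RST/URG on
             * either segment, ACK set on both, matching ECE/CWR flags, and
             * byte-identical TCP options (same doff).
             */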
1696        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1697            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1698            ((TCP_SKB_CB(tail)->tcp_flags |
1699              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1700            !((TCP_SKB_CB(tail)->tcp_flags &
1701              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1702            ((TCP_SKB_CB(tail)->tcp_flags ^
1703              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1704#ifdef CONFIG_TLS_DEVICE
1705            tail->decrypted != skb->decrypted ||
1706#endif
1707            thtail->doff != th->doff ||
1708            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1709                goto no_coalesce;
1710
1711        __skb_pull(skb, hdrlen);
1712        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1713                thtail->window = th->window;
1714
1715                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1716
1717                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1718                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1719
1720                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1721                 * thtail->fin, so that the fast path in tcp_rcv_established()
1722                 * is not entered if we append a packet with a FIN.
1723                 * SYN, RST, URG are not present.
1724                 * ACK is set on both packets.
1725                 * PSH : we do not really care in TCP stack,
1726                 *       at least for 'GRO' packets.
1727                 */
1728                thtail->fin |= th->fin;
1729                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1730
1731                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1732                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1733                        tail->tstamp = skb->tstamp;
1734                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1735                }
1736
1737                /* Not as strict as GRO. We only need to carry mss max value */
1738                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1739                                                 skb_shinfo(tail)->gso_size);
1740
1741                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1742                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1743
1744                sk->sk_backlog.len += delta;
1745                __NET_INC_STATS(sock_net(sk),
1746                                LINUX_MIB_TCPBACKLOGCOALESCE);
1747                kfree_skb_partial(skb, fragstolen);
1748                return false;
1749        }
1750        __skb_push(skb, hdrlen);
1751
1752no_coalesce:
1753        /* Only the socket owner can try to collapse/prune rx queues
1754         * to reduce memory overhead, so add a little headroom here.
1755         * Only a few socket backlogs are likely to be non-empty concurrently.
1756         */
1757        limit += 64*1024;
1758
1759        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1760                bh_unlock_sock(sk);
1761                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1762                return true;
1763        }
1764        return false;
1765}
1766EXPORT_SYMBOL(tcp_add_backlog);
1767
1768int tcp_filter(struct sock *sk, struct sk_buff *skb)
1769{
1770        struct tcphdr *th = (struct tcphdr *)skb->data;
1771
1772        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1773}
1774EXPORT_SYMBOL(tcp_filter);
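
    /* Note: sk_filter_trim_cap() runs the socket's attached BPF filter but
     * caps any trimming at th->doff * 4 bytes, so a filter may shorten the
     * payload yet can never cut into the TCP header itself.
     */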
1775
1776static void tcp_v4_restore_cb(struct sk_buff *skb)
1777{
1778        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1779                sizeof(struct inet_skb_parm));
1780}
1781
1782static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1783                           const struct tcphdr *th)
1784{
1785        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1786         * barrier() makes sure the compiler won't play fool^Waliasing games.
1787         */
1788        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1789                sizeof(struct inet_skb_parm));
1790        barrier();
1791
1792        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1793        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1794                                    skb->len - th->doff * 4);
1795        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1796        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1797        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1798        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1799        TCP_SKB_CB(skb)->sacked  = 0;
1800        TCP_SKB_CB(skb)->has_rxtstamp =
1801                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1802}
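
    /* Worked example of the end_seq arithmetic above: SYN and FIN each
     * consume one unit of sequence space, so a segment with seq = 1000,
     * 100 bytes of payload and FIN set yields:
     *
     *	end_seq = 1000 + 0 (syn) + 1 (fin) + 100 = 1101
     */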
1803
1804/*
1805 *      From tcp_input.c
1806 */
1807
1808int tcp_v4_rcv(struct sk_buff *skb)
1809{
1810        struct net *net = dev_net(skb->dev);
1811        struct sk_buff *skb_to_free;
1812        int sdif = inet_sdif(skb);
1813        const struct iphdr *iph;
1814        const struct tcphdr *th;
1815        bool refcounted;
1816        struct sock *sk;
1817        int ret;
1818
1819        if (skb->pkt_type != PACKET_HOST)
1820                goto discard_it;
1821
1822        /* Count it even if it's bad */
1823        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1824
1825        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1826                goto discard_it;
1827
1828        th = (const struct tcphdr *)skb->data;
1829
1830        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1831                goto bad_packet;
1832        if (!pskb_may_pull(skb, th->doff * 4))
1833                goto discard_it;
1834
1835        /* An explanation is required here, I think.
1836         * Packet length and doff are validated by header prediction,
1837         * provided the case of th->doff == 0 is eliminated.
1838         * So, we defer the checks. */
1839
1840        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1841                goto csum_error;
1842
1843        th = (const struct tcphdr *)skb->data;
1844        iph = ip_hdr(skb);
1845lookup:
1846        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1847                               th->dest, sdif, &refcounted);
1848        if (!sk)
1849                goto no_tcp_socket;
1850
1851process:
1852        if (sk->sk_state == TCP_TIME_WAIT)
1853                goto do_time_wait;
1854
1855        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1856                struct request_sock *req = inet_reqsk(sk);
1857                bool req_stolen = false;
1858                struct sock *nsk;
1859
1860                sk = req->rsk_listener;
1861                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1862                        sk_drops_add(sk, skb);
1863                        reqsk_put(req);
1864                        goto discard_it;
1865                }
1866                if (tcp_checksum_complete(skb)) {
1867                        reqsk_put(req);
1868                        goto csum_error;
1869                }
1870                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1871                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1872                        goto lookup;
1873                }
1874                /* We own a reference on the listener, increase it again
1875                 * as we might lose it too soon.
1876                 */
1877                sock_hold(sk);
1878                refcounted = true;
1879                nsk = NULL;
1880                if (!tcp_filter(sk, skb)) {
1881                        th = (const struct tcphdr *)skb->data;
1882                        iph = ip_hdr(skb);
1883                        tcp_v4_fill_cb(skb, iph, th);
1884                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1885                }
1886                if (!nsk) {
1887                        reqsk_put(req);
1888                        if (req_stolen) {
1889                                /* Another cpu got exclusive access to req
1890                                 * and created a full blown socket.
1891                                 * Try to feed this packet to this socket
1892                                 * instead of discarding it.
1893                                 */
1894                                tcp_v4_restore_cb(skb);
1895                                sock_put(sk);
1896                                goto lookup;
1897                        }
1898                        goto discard_and_relse;
1899                }
1900                if (nsk == sk) {
1901                        reqsk_put(req);
1902                        tcp_v4_restore_cb(skb);
1903                } else if (tcp_child_process(sk, nsk, skb)) {
1904                        tcp_v4_send_reset(nsk, skb);
1905                        goto discard_and_relse;
1906                } else {
1907                        sock_put(sk);
1908                        return 0;
1909                }
1910        }
1911        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1912                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1913                goto discard_and_relse;
1914        }
1915
1916        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1917                goto discard_and_relse;
1918
1919        if (tcp_v4_inbound_md5_hash(sk, skb))
1920                goto discard_and_relse;
1921
1922        nf_reset_ct(skb);
1923
1924        if (tcp_filter(sk, skb))
1925                goto discard_and_relse;
1926        th = (const struct tcphdr *)skb->data;
1927        iph = ip_hdr(skb);
1928        tcp_v4_fill_cb(skb, iph, th);
1929
1930        skb->dev = NULL;
1931
1932        if (sk->sk_state == TCP_LISTEN) {
1933                ret = tcp_v4_do_rcv(sk, skb);
1934                goto put_and_return;
1935        }
1936
1937        sk_incoming_cpu_update(sk);
1938
1939        bh_lock_sock_nested(sk);
1940        tcp_segs_in(tcp_sk(sk), skb);
1941        ret = 0;
1942        if (!sock_owned_by_user(sk)) {
1943                skb_to_free = sk->sk_rx_skb_cache;
1944                sk->sk_rx_skb_cache = NULL;
1945                ret = tcp_v4_do_rcv(sk, skb);
1946        } else {
1947                if (tcp_add_backlog(sk, skb))
1948                        goto discard_and_relse;
1949                skb_to_free = NULL;
1950        }
1951        bh_unlock_sock(sk);
1952        if (skb_to_free)
1953                __kfree_skb(skb_to_free);
1954
1955put_and_return:
1956        if (refcounted)
1957                sock_put(sk);
1958
1959        return ret;
1960
1961no_tcp_socket:
1962        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1963                goto discard_it;
1964
1965        tcp_v4_fill_cb(skb, iph, th);
1966
1967        if (tcp_checksum_complete(skb)) {
1968csum_error:
1969                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1970bad_packet:
1971                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1972        } else {
1973                tcp_v4_send_reset(NULL, skb);
1974        }
1975
1976discard_it:
1977        /* Discard frame. */
1978        kfree_skb(skb);
1979        return 0;
1980
1981discard_and_relse:
1982        sk_drops_add(sk, skb);
1983        if (refcounted)
1984                sock_put(sk);
1985        goto discard_it;
1986
1987do_time_wait:
1988        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1989                inet_twsk_put(inet_twsk(sk));
1990                goto discard_it;
1991        }
1992
1993        tcp_v4_fill_cb(skb, iph, th);
1994
1995        if (tcp_checksum_complete(skb)) {
1996                inet_twsk_put(inet_twsk(sk));
1997                goto csum_error;
1998        }
1999        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2000        case TCP_TW_SYN: {
2001                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2002                                                        &tcp_hashinfo, skb,
2003                                                        __tcp_hdrlen(th),
2004                                                        iph->saddr, th->source,
2005                                                        iph->daddr, th->dest,
2006                                                        inet_iif(skb),
2007                                                        sdif);
2008                if (sk2) {
2009                        inet_twsk_deschedule_put(inet_twsk(sk));
2010                        sk = sk2;
2011                        tcp_v4_restore_cb(skb);
2012                        refcounted = false;
2013                        goto process;
2014                }
2015        }
2016                /* to ACK */
2017                /* fall through */
2018        case TCP_TW_ACK:
2019                tcp_v4_timewait_ack(sk, skb);
2020                break;
2021        case TCP_TW_RST:
2022                tcp_v4_send_reset(sk, skb);
2023                inet_twsk_deschedule_put(inet_twsk(sk));
2024                goto discard_it;
2025        case TCP_TW_SUCCESS:;
2026        }
2027        goto discard_it;
2028}
2029
2030static struct timewait_sock_ops tcp_timewait_sock_ops = {
2031        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2032        .twsk_unique    = tcp_twsk_unique,
2033        .twsk_destructor= tcp_twsk_destructor,
2034};
2035
2036void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2037{
2038        struct dst_entry *dst = skb_dst(skb);
2039
2040        if (dst && dst_hold_safe(dst)) {
2041                sk->sk_rx_dst = dst;
2042                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2043        }
2044}
2045EXPORT_SYMBOL(inet_sk_rx_dst_set);
2046
2047const struct inet_connection_sock_af_ops ipv4_specific = {
2048        .queue_xmit        = ip_queue_xmit,
2049        .send_check        = tcp_v4_send_check,
2050        .rebuild_header    = inet_sk_rebuild_header,
2051        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2052        .conn_request      = tcp_v4_conn_request,
2053        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2054        .net_header_len    = sizeof(struct iphdr),
2055        .setsockopt        = ip_setsockopt,
2056        .getsockopt        = ip_getsockopt,
2057        .addr2sockaddr     = inet_csk_addr2sockaddr,
2058        .sockaddr_len      = sizeof(struct sockaddr_in),
2059#ifdef CONFIG_COMPAT
2060        .compat_setsockopt = compat_ip_setsockopt,
2061        .compat_getsockopt = compat_ip_getsockopt,
2062#endif
2063        .mtu_reduced       = tcp_v4_mtu_reduced,
2064};
2065EXPORT_SYMBOL(ipv4_specific);
2066
2067#ifdef CONFIG_TCP_MD5SIG
2068static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2069        .md5_lookup             = tcp_v4_md5_lookup,
2070        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2071        .md5_parse              = tcp_v4_parse_md5_keys,
2072};
2073#endif
2074
2075/* NOTE: A lot of things are set to zero explicitly by the call to
2076 *       sk_alloc(), so they need not be done here.
2077 */
2078static int tcp_v4_init_sock(struct sock *sk)
2079{
2080        struct inet_connection_sock *icsk = inet_csk(sk);
2081
2082        tcp_init_sock(sk);
2083
2084        icsk->icsk_af_ops = &ipv4_specific;
2085
2086#ifdef CONFIG_TCP_MD5SIG
2087        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2088#endif
2089
2090        return 0;
2091}
2092
2093void tcp_v4_destroy_sock(struct sock *sk)
2094{
2095        struct tcp_sock *tp = tcp_sk(sk);
2096
2097        trace_tcp_destroy_sock(sk);
2098
2099        tcp_clear_xmit_timers(sk);
2100
2101        tcp_cleanup_congestion_control(sk);
2102
2103        tcp_cleanup_ulp(sk);
2104
2105        /* Clean up the write buffer. */
2106        tcp_write_queue_purge(sk);
2107
2108        /* Check if we want to disable active TFO */
2109        tcp_fastopen_active_disable_ofo_check(sk);
2110
2111        /* Cleans up our, hopefully empty, out_of_order_queue. */
2112        skb_rbtree_purge(&tp->out_of_order_queue);
2113
2114#ifdef CONFIG_TCP_MD5SIG
2115        /* Clean up the MD5 key list, if any */
2116        if (tp->md5sig_info) {
2117                tcp_clear_md5_list(sk);
2118                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2119                tp->md5sig_info = NULL;
2120        }
2121#endif
2122
2123        /* Clean up a referenced TCP bind bucket. */
2124        if (inet_csk(sk)->icsk_bind_hash)
2125                inet_put_port(sk);
2126
2127        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2128
2129        /* If socket is aborted during connect operation */
2130        tcp_free_fastopen_req(tp);
2131        tcp_fastopen_destroy_cipher(sk);
2132        tcp_saved_syn_free(tp);
2133
2134        sk_sockets_allocated_dec(sk);
2135}
2136EXPORT_SYMBOL(tcp_v4_destroy_sock);
2137
2138#ifdef CONFIG_PROC_FS
2139/* Proc filesystem TCP sock list dumping. */
2140
2141/*
2142 * Get the next listening socket following cur.  If cur is NULL, get the first socket
2143 * starting from bucket given in st->bucket; when st->bucket is zero the
2144 * very first socket in the hash table is returned.
2145 */
2146static void *listening_get_next(struct seq_file *seq, void *cur)
2147{
2148        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2149        struct tcp_iter_state *st = seq->private;
2150        struct net *net = seq_file_net(seq);
2151        struct inet_listen_hashbucket *ilb;
2152        struct sock *sk = cur;
2153
2154        if (!sk) {
2155get_head:
2156                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2157                spin_lock(&ilb->lock);
2158                sk = sk_head(&ilb->head);
2159                st->offset = 0;
2160                goto get_sk;
2161        }
2162        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2163        ++st->num;
2164        ++st->offset;
2165
2166        sk = sk_next(sk);
2167get_sk:
2168        sk_for_each_from(sk) {
2169                if (!net_eq(sock_net(sk), net))
2170                        continue;
2171                if (sk->sk_family == afinfo->family)
2172                        return sk;
2173        }
2174        spin_unlock(&ilb->lock);
2175        st->offset = 0;
2176        if (++st->bucket < INET_LHTABLE_SIZE)
2177                goto get_head;
2178        return NULL;
2179}
2180
2181static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2182{
2183        struct tcp_iter_state *st = seq->private;
2184        void *rc;
2185
2186        st->bucket = 0;
2187        st->offset = 0;
2188        rc = listening_get_next(seq, NULL);
2189
2190        while (rc && *pos) {
2191                rc = listening_get_next(seq, rc);
2192                --*pos;
2193        }
2194        return rc;
2195}
2196
2197static inline bool empty_bucket(const struct tcp_iter_state *st)
2198{
2199        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2200}
2201
2202/*
2203 * Get first established socket starting from bucket given in st->bucket.
2204 * If st->bucket is zero, the very first socket in the hash is returned.
2205 */
2206static void *established_get_first(struct seq_file *seq)
2207{
2208        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2209        struct tcp_iter_state *st = seq->private;
2210        struct net *net = seq_file_net(seq);
2211        void *rc = NULL;
2212
2213        st->offset = 0;
2214        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2215                struct sock *sk;
2216                struct hlist_nulls_node *node;
2217                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2218
2219                /* Lockless fast path for the common case of empty buckets */
2220                if (empty_bucket(st))
2221                        continue;
2222
2223                spin_lock_bh(lock);
2224                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2225                        if (sk->sk_family != afinfo->family ||
2226                            !net_eq(sock_net(sk), net)) {
2227                                continue;
2228                        }
2229                        rc = sk;
2230                        goto out;
2231                }
2232                spin_unlock_bh(lock);
2233        }
2234out:
2235        return rc;
2236}
2237
2238static void *established_get_next(struct seq_file *seq, void *cur)
2239{
2240        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2241        struct sock *sk = cur;
2242        struct hlist_nulls_node *node;
2243        struct tcp_iter_state *st = seq->private;
2244        struct net *net = seq_file_net(seq);
2245
2246        ++st->num;
2247        ++st->offset;
2248
2249        sk = sk_nulls_next(sk);
2250
2251        sk_nulls_for_each_from(sk, node) {
2252                if (sk->sk_family == afinfo->family &&
2253                    net_eq(sock_net(sk), net))
2254                        return sk;
2255        }
2256
2257        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2258        ++st->bucket;
2259        return established_get_first(seq);
2260}
2261
2262static void *established_get_idx(struct seq_file *seq, loff_t pos)
2263{
2264        struct tcp_iter_state *st = seq->private;
2265        void *rc;
2266
2267        st->bucket = 0;
2268        rc = established_get_first(seq);
2269
2270        while (rc && pos) {
2271                rc = established_get_next(seq, rc);
2272                --pos;
2273        }
2274        return rc;
2275}
2276
2277static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2278{
2279        void *rc;
2280        struct tcp_iter_state *st = seq->private;
2281
2282        st->state = TCP_SEQ_STATE_LISTENING;
2283        rc        = listening_get_idx(seq, &pos);
2284
2285        if (!rc) {
2286                st->state = TCP_SEQ_STATE_ESTABLISHED;
2287                rc        = established_get_idx(seq, pos);
2288        }
2289
2290        return rc;
2291}
2292
2293static void *tcp_seek_last_pos(struct seq_file *seq)
2294{
2295        struct tcp_iter_state *st = seq->private;
2296        int offset = st->offset;
2297        int orig_num = st->num;
2298        void *rc = NULL;
2299
2300        switch (st->state) {
2301        case TCP_SEQ_STATE_LISTENING:
2302                if (st->bucket >= INET_LHTABLE_SIZE)
2303                        break;
2304                st->state = TCP_SEQ_STATE_LISTENING;
2305                rc = listening_get_next(seq, NULL);
2306                while (offset-- && rc)
2307                        rc = listening_get_next(seq, rc);
2308                if (rc)
2309                        break;
2310                st->bucket = 0;
2311                st->state = TCP_SEQ_STATE_ESTABLISHED;
2312                /* Fallthrough */
2313        case TCP_SEQ_STATE_ESTABLISHED:
2314                if (st->bucket > tcp_hashinfo.ehash_mask)
2315                        break;
2316                rc = established_get_first(seq);
2317                while (offset-- && rc)
2318                        rc = established_get_next(seq, rc);
2319        }
2320
2321        st->num = orig_num;
2322
2323        return rc;
2324}
2325
2326void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2327{
2328        struct tcp_iter_state *st = seq->private;
2329        void *rc;
2330
2331        if (*pos && *pos == st->last_pos) {
2332                rc = tcp_seek_last_pos(seq);
2333                if (rc)
2334                        goto out;
2335        }
2336
2337        st->state = TCP_SEQ_STATE_LISTENING;
2338        st->num = 0;
2339        st->bucket = 0;
2340        st->offset = 0;
2341        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2342
2343out:
2344        st->last_pos = *pos;
2345        return rc;
2346}
2347EXPORT_SYMBOL(tcp_seq_start);
2348
2349void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2350{
2351        struct tcp_iter_state *st = seq->private;
2352        void *rc = NULL;
2353
2354        if (v == SEQ_START_TOKEN) {
2355                rc = tcp_get_idx(seq, 0);
2356                goto out;
2357        }
2358
2359        switch (st->state) {
2360        case TCP_SEQ_STATE_LISTENING:
2361                rc = listening_get_next(seq, v);
2362                if (!rc) {
2363                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2364                        st->bucket = 0;
2365                        st->offset = 0;
2366                        rc        = established_get_first(seq);
2367                }
2368                break;
2369        case TCP_SEQ_STATE_ESTABLISHED:
2370                rc = established_get_next(seq, v);
2371                break;
2372        }
2373out:
2374        ++*pos;
2375        st->last_pos = *pos;
2376        return rc;
2377}
2378EXPORT_SYMBOL(tcp_seq_next);
2379
2380void tcp_seq_stop(struct seq_file *seq, void *v)
2381{
2382        struct tcp_iter_state *st = seq->private;
2383
2384        switch (st->state) {
2385        case TCP_SEQ_STATE_LISTENING:
2386                if (v != SEQ_START_TOKEN)
2387                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2388                break;
2389        case TCP_SEQ_STATE_ESTABLISHED:
2390                if (v)
2391                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2392                break;
2393        }
2394}
2395EXPORT_SYMBOL(tcp_seq_stop);
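
    /* These hooks follow the usual seq_file contract: ->start() resumes at
     * *pos (with st->last_pos as a shortcut), ->next() advances one socket
     * and increments *pos, and ->stop() releases whichever hash-bucket
     * lock the iterator still holds.
     */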
2396
2397static void get_openreq4(const struct request_sock *req,
2398                         struct seq_file *f, int i)
2399{
2400        const struct inet_request_sock *ireq = inet_rsk(req);
2401        long delta = req->rsk_timer.expires - jiffies;
2402
2403        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2404                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2405                i,
2406                ireq->ir_loc_addr,
2407                ireq->ir_num,
2408                ireq->ir_rmt_addr,
2409                ntohs(ireq->ir_rmt_port),
2410                TCP_SYN_RECV,
2411                0, 0, /* could print option size, but that is af dependent. */
2412                1,    /* timers active (only the expire timer) */
2413                jiffies_delta_to_clock_t(delta),
2414                req->num_timeout,
2415                from_kuid_munged(seq_user_ns(f),
2416                                 sock_i_uid(req->rsk_listener)),
2417                0,  /* non standard timer */
2418                0, /* open_requests have no inode */
2419                0,
2420                req);
2421}
2422
2423static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2424{
2425        int timer_active;
2426        unsigned long timer_expires;
2427        const struct tcp_sock *tp = tcp_sk(sk);
2428        const struct inet_connection_sock *icsk = inet_csk(sk);
2429        const struct inet_sock *inet = inet_sk(sk);
2430        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2431        __be32 dest = inet->inet_daddr;
2432        __be32 src = inet->inet_rcv_saddr;
2433        __u16 destp = ntohs(inet->inet_dport);
2434        __u16 srcp = ntohs(inet->inet_sport);
2435        int rx_queue;
2436        int state;
2437
2438        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2439            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2440            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2441                timer_active    = 1;
2442                timer_expires   = icsk->icsk_timeout;
2443        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2444                timer_active    = 4;
2445                timer_expires   = icsk->icsk_timeout;
2446        } else if (timer_pending(&sk->sk_timer)) {
2447                timer_active    = 2;
2448                timer_expires   = sk->sk_timer.expires;
2449        } else {
2450                timer_active    = 0;
2451                timer_expires = jiffies;
2452        }
2453
2454        state = inet_sk_state_load(sk);
2455        if (state == TCP_LISTEN)
2456                rx_queue = sk->sk_ack_backlog;
2457        else
2458                /* Because we don't lock the socket,
2459                 * we might find a transient negative value.
2460                 */
2461                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2462                                      READ_ONCE(tp->copied_seq), 0);
2463
2464        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2465                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2466                i, src, srcp, dest, destp, state,
2467                READ_ONCE(tp->write_seq) - tp->snd_una,
2468                rx_queue,
2469                timer_active,
2470                jiffies_delta_to_clock_t(timer_expires - jiffies),
2471                icsk->icsk_retransmits,
2472                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2473                icsk->icsk_probes_out,
2474                sock_i_ino(sk),
2475                refcount_read(&sk->sk_refcnt), sk,
2476                jiffies_to_clock_t(icsk->icsk_rto),
2477                jiffies_to_clock_t(icsk->icsk_ack.ato),
2478                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2479                tp->snd_cwnd,
2480                state == TCP_LISTEN ?
2481                    fastopenq->max_qlen :
2482                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2483}
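
    /* Illustrative output: on a little-endian machine a listener on
     * 127.0.0.1:8080 would be rendered by the format above roughly as
     *
     *	0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000 0 12345 1 0000000000000000 100 0 0 10 0
     *
     * (addresses and ports in hex, 0A == TCP_LISTEN; the %pK socket
     * pointer typically prints as zeros for unprivileged readers).
     */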
2484
2485static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2486                               struct seq_file *f, int i)
2487{
2488        long delta = tw->tw_timer.expires - jiffies;
2489        __be32 dest, src;
2490        __u16 destp, srcp;
2491
2492        dest  = tw->tw_daddr;
2493        src   = tw->tw_rcv_saddr;
2494        destp = ntohs(tw->tw_dport);
2495        srcp  = ntohs(tw->tw_sport);
2496
2497        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2498                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2499                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2500                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2501                refcount_read(&tw->tw_refcnt), tw);
2502}
2503
2504#define TMPSZ 150
2505
2506static int tcp4_seq_show(struct seq_file *seq, void *v)
2507{
2508        struct tcp_iter_state *st;
2509        struct sock *sk = v;
2510
2511        seq_setwidth(seq, TMPSZ - 1);
2512        if (v == SEQ_START_TOKEN) {
2513                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2514                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2515                           "inode");
2516                goto out;
2517        }
2518        st = seq->private;
2519
2520        if (sk->sk_state == TCP_TIME_WAIT)
2521                get_timewait4_sock(v, seq, st->num);
2522        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2523                get_openreq4(v, seq, st->num);
2524        else
2525                get_tcp4_sock(v, seq, st->num);
2526out:
2527        seq_pad(seq, '\n');
2528        return 0;
2529}
2530
2531static const struct seq_operations tcp4_seq_ops = {
2532        .show           = tcp4_seq_show,
2533        .start          = tcp_seq_start,
2534        .next           = tcp_seq_next,
2535        .stop           = tcp_seq_stop,
2536};
2537
2538static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2539        .family         = AF_INET,
2540};
2541
2542static int __net_init tcp4_proc_init_net(struct net *net)
2543{
2544        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2545                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2546                return -ENOMEM;
2547        return 0;
2548}
2549
2550static void __net_exit tcp4_proc_exit_net(struct net *net)
2551{
2552        remove_proc_entry("tcp", net->proc_net);
2553}
2554
2555static struct pernet_operations tcp4_net_ops = {
2556        .init = tcp4_proc_init_net,
2557        .exit = tcp4_proc_exit_net,
2558};
2559
2560int __init tcp4_proc_init(void)
2561{
2562        return register_pernet_subsys(&tcp4_net_ops);
2563}
2564
2565void tcp4_proc_exit(void)
2566{
2567        unregister_pernet_subsys(&tcp4_net_ops);
2568}
2569#endif /* CONFIG_PROC_FS */
2570
2571struct proto tcp_prot = {
2572        .name                   = "TCP",
2573        .owner                  = THIS_MODULE,
2574        .close                  = tcp_close,
2575        .pre_connect            = tcp_v4_pre_connect,
2576        .connect                = tcp_v4_connect,
2577        .disconnect             = tcp_disconnect,
2578        .accept                 = inet_csk_accept,
2579        .ioctl                  = tcp_ioctl,
2580        .init                   = tcp_v4_init_sock,
2581        .destroy                = tcp_v4_destroy_sock,
2582        .shutdown               = tcp_shutdown,
2583        .setsockopt             = tcp_setsockopt,
2584        .getsockopt             = tcp_getsockopt,
2585        .keepalive              = tcp_set_keepalive,
2586        .recvmsg                = tcp_recvmsg,
2587        .sendmsg                = tcp_sendmsg,
2588        .sendpage               = tcp_sendpage,
2589        .backlog_rcv            = tcp_v4_do_rcv,
2590        .release_cb             = tcp_release_cb,
2591        .hash                   = inet_hash,
2592        .unhash                 = inet_unhash,
2593        .get_port               = inet_csk_get_port,
2594        .enter_memory_pressure  = tcp_enter_memory_pressure,
2595        .leave_memory_pressure  = tcp_leave_memory_pressure,
2596        .stream_memory_free     = tcp_stream_memory_free,
2597        .sockets_allocated      = &tcp_sockets_allocated,
2598        .orphan_count           = &tcp_orphan_count,
2599        .memory_allocated       = &tcp_memory_allocated,
2600        .memory_pressure        = &tcp_memory_pressure,
2601        .sysctl_mem             = sysctl_tcp_mem,
2602        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2603        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2604        .max_header             = MAX_TCP_HEADER,
2605        .obj_size               = sizeof(struct tcp_sock),
2606        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2607        .twsk_prot              = &tcp_timewait_sock_ops,
2608        .rsk_prot               = &tcp_request_sock_ops,
2609        .h.hashinfo             = &tcp_hashinfo,
2610        .no_autobind            = true,
2611#ifdef CONFIG_COMPAT
2612        .compat_setsockopt      = compat_tcp_setsockopt,
2613        .compat_getsockopt      = compat_tcp_getsockopt,
2614#endif
2615        .diag_destroy           = tcp_abort,
2616};
2617EXPORT_SYMBOL(tcp_prot);
2618
2619static void __net_exit tcp_sk_exit(struct net *net)
2620{
2621        int cpu;
2622
2623        if (net->ipv4.tcp_congestion_control)
2624                module_put(net->ipv4.tcp_congestion_control->owner);
2625
2626        for_each_possible_cpu(cpu)
2627                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2628        free_percpu(net->ipv4.tcp_sk);
2629}
2630
2631static int __net_init tcp_sk_init(struct net *net)
2632{
2633        int res, cpu, cnt;
2634
2635        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2636        if (!net->ipv4.tcp_sk)
2637                return -ENOMEM;
2638
2639        for_each_possible_cpu(cpu) {
2640                struct sock *sk;
2641
2642                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2643                                           IPPROTO_TCP, net);
2644                if (res)
2645                        goto fail;
2646                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2647
2648                /* Enforce IP_DF and IPID == 0 for RSTs and
2649                 * ACKs sent in SYN-RECV and TIME-WAIT state.
2650                 */
2651                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2652
2653                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2654        }
2655
2656        net->ipv4.sysctl_tcp_ecn = 2;
2657        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2658
2659        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2660        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2661        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2662        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2663        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2664
2665        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2666        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2667        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2668
2669        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2670        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2671        net->ipv4.sysctl_tcp_syncookies = 1;
2672        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2673        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2674        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2675        net->ipv4.sysctl_tcp_orphan_retries = 0;
2676        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2677        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2678        net->ipv4.sysctl_tcp_tw_reuse = 2;
2679
2680        cnt = tcp_hashinfo.ehash_mask + 1;
2681        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2682        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2683
2684        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2685        net->ipv4.sysctl_tcp_sack = 1;
2686        net->ipv4.sysctl_tcp_window_scaling = 1;
2687        net->ipv4.sysctl_tcp_timestamps = 1;
2688        net->ipv4.sysctl_tcp_early_retrans = 3;
2689        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2690        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2691        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2692        net->ipv4.sysctl_tcp_max_reordering = 300;
2693        net->ipv4.sysctl_tcp_dsack = 1;
2694        net->ipv4.sysctl_tcp_app_win = 31;
2695        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2696        net->ipv4.sysctl_tcp_frto = 2;
2697        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2698        /* This limits the percentage of the congestion window which we
2699         * will allow a single TSO frame to consume.  Building TSO frames
2700         * which are too large can cause TCP streams to be bursty.
2701         */
2702        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2703        /* Default TSQ limit of 16 TSO segments */
2704        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2705        /* rfc5961 challenge ack rate limiting */
2706        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2707        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2708        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2709        net->ipv4.sysctl_tcp_autocorking = 1;
2710        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2711        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2712        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2713        if (net != &init_net) {
2714                memcpy(net->ipv4.sysctl_tcp_rmem,
2715                       init_net.ipv4.sysctl_tcp_rmem,
2716                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2717                memcpy(net->ipv4.sysctl_tcp_wmem,
2718                       init_net.ipv4.sysctl_tcp_wmem,
2719                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2720        }
2721        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2722        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2723        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2724        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2725        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2726        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2727
2728        /* Reno is always built in */
2729        if (!net_eq(net, &init_net) &&
2730            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2731                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2732        else
2733                net->ipv4.tcp_congestion_control = &tcp_reno;
2734
2735        return 0;
2736fail:
2737        tcp_sk_exit(net);
2738
2739        return res;
2740}
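
    /* The per-netns defaults chosen above are visible (and tunable) under
     * /proc/sys/net/ipv4/ in each network namespace, e.g.:
     *
     *	# cat /proc/sys/net/ipv4/tcp_syncookies
     *	1
     *	# cat /proc/sys/net/ipv4/tcp_tw_reuse
     *	2
     */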
2741
2742static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2743{
2744        struct net *net;
2745
2746        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2747
2748        list_for_each_entry(net, net_exit_list, exit_list)
2749                tcp_fastopen_ctx_destroy(net);
2750}
2751
2752static struct pernet_operations __net_initdata tcp_sk_ops = {
2753       .init       = tcp_sk_init,
2754       .exit       = tcp_sk_exit,
2755       .exit_batch = tcp_sk_exit_batch,
2756};
2757
2758void __init tcp_v4_init(void)
2759{
2760        if (register_pernet_subsys(&tcp_sk_ops))
2761                panic("Failed to create the TCP control socket.\n");
2762}
2763