linux/net/ipv4/tcp_ipv4.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                      to a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
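
/* Both helpers lean on net/core/secure_seq.c, which follows the RFC 6528
 * scheme: the ISN (and the per-connection timestamp offset) is a keyed
 * hash of the connection 4-tuple plus a clock component, roughly
 *
 *      ISN = hash(saddr, daddr, sport, dport, secret) + clock;
 *
 * so initial sequence numbers stay unpredictable to off-path attackers
 * while remaining monotonic for a reused 4-tuple.
 */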

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

        if (reuse == 2) {
                /* Still does not detect *everything* that goes through
                 * lo, since we require a loopback src or dst address
                 * or direct binding to 'lo' interface.
                 */
                bool loopback = false;

                if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
                        loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == AF_INET6) {
                        if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
                            ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
                            ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
                                loopback = true;
                } else
#endif
                {
                        if (ipv4_is_loopback(tw->tw_daddr) ||
                            ipv4_is_loopback(tw->tw_rcv_saddr))
                                loopback = true;
                }
                if (!loopback)
                        reuse = 0;
        }

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only timestamp cache is
           held not per host, but per port pair and TW bucket is used as state
           holder.

           If TW bucket has been already destroyed we fall back to VJ's scheme
           and use initial timestamp retrieved from peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (reuse && time_after32(ktime_get_seconds(),
                                            tcptw->tw_ts_recent_stamp)))) {
                /* In case of repair and re-using TIME-WAIT sockets we still
                 * want to be sure that it is safe as above but honor the
                 * sequence numbers and time stamps set as part of the repair
                 * process.
                 *
                 * Without this check re-using a TIME-WAIT socket with TCP
                 * repair would accumulate a -1 on the repair assigned
                 * sequence number. The first time it is reused the sequence
                 * is -1, the second time -2, etc. This fixes that issue
                 * without appearing to create any others.
                 */
                if (likely(!tp->repair)) {
                        u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

                        if (!seq)
                                seq = 1;
                        WRITE_ONCE(tp->write_seq, seq);
                        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                }
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
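
/* The reuse policy above is selected by the per-netns sysctl; for
 * example, from the command line:
 *
 *      sysctl -w net.ipv4.tcp_tw_reuse=1   # reuse for any connection
 *      sysctl -w net.ipv4.tcp_tw_reuse=2   # reuse for loopback traffic only
 */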

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        /* This check is replicated from tcp_v4_connect() and intended to
         * prevent the BPF program called below from accessing bytes that are
         * outside the bounds specified by the user in addr_len.
         */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        sock_owned_by_me(sk);

        return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        WRITE_ONCE(tp->write_seq, 0);
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the
         * hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        WRITE_ONCE(tp->write_seq,
                                   secure_tcp_seq(inet->inet_saddr,
                                                  inet->inet_daddr,
                                                  inet->inet_sport,
                                                  usin->sin_port));
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

        inet->inet_id = prandom_u32();

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
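
/* From userspace this path is reached through connect(2) on a TCP
 * socket; a minimal sketch of the caller's side (address is illustrative):
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      struct sockaddr_in dst = {
 *              .sin_family = AF_INET,
 *              .sin_port   = htons(80),
 *      };
 *
 *      inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *      connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */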

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the
 * user at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;
        /* Something is about to go wrong... Remember the soft error
         * for the case where this connection cannot recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
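
/* Applications that do not want this behaviour can opt out of path MTU
 * discovery per socket; for example:
 *
 *      int val = IP_PMTUDISC_DONT;
 *
 *      setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */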

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
                                     type == ICMP_TIME_EXCEEDED ||
                                     (type == ICMP_DEST_UNREACH &&
                                      (code == ICMP_NET_UNREACH ||
                                       code == ICMP_HOST_UNREACH)));
                return 0;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of the PMTU discovery (RFC1191) special case:
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
        fastopen = rcu_dereference(tp->fastopen_rsk);
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                skb = tcp_rtx_queue_head(sk);
                if (WARN_ON_ONCE(!skb))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
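                /* i.e. the timeout is re-derived as roughly
                 * rto = min(base_rto << icsk_backoff, TCP_RTO_MAX),
                 * now one backoff step smaller than before.
                 */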

                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now. */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in each dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
}
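
/* Only the one's-complement sum of the pseudo-header is filled in above;
 * csum_start/csum_offset tell the NIC (or the software fallback) where to
 * fold in the sum over the TCP header and payload, i.e. the skb is left
 * in CHECKSUM_PARTIAL state.
 */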

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                    for reset.
 *      Answer: if a packet caused an RST, it is not for a socket
 *              existing in our system. If it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        u64 transmit_time = 0;
        struct sock *ctl_sk;
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;
        /* If sk is not NULL, it means we did a successful lookup and the
         * incoming route had to be correct. prequeue might have dropped our
         * dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose security here:
                 * the incoming packet is checked against the md5 hash of the
                 * found key; no RST is generated if the md5 hash doesn't
                 * match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send an RST if we can't find a key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose to
         * force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk) {
                arg.bound_dev_if = sk->sk_bound_dev_if;
                if (sk_fullsock(sk))
                        trace_tcp_send_reset(sk, skb);
        }

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        if (sk) {
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_priority : sk->sk_priority;
                transmit_time = tcp_transmit_time(sk);
        }
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;
        struct sock *ctl_sk;
        u64 transmit_time;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_mark : sk->sk_mark;
        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_priority : sk->sk_priority;
        transmit_time = tcp_transmit_time(sk);
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time);

        ctl_sk->sk_mark = 0;
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                                 tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                rcu_read_lock();
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt));
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
                                           const union tcp_md5_addr *addr,
                                           int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);
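
/* The loop above is a longest-prefix match: with keys configured for,
 * say, 10.0.0.0/8 and 10.1.0.0/16, a lookup for 10.1.2.3 matches both,
 * but the /16 key wins because best_match tracks the largest prefixlen.
 */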

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
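
/* Userspace drives the add/del paths above through setsockopt(2); a
 * minimal sketch (the peer address and key value are illustrative):
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *
 *      memcpy(&md5.tcpm_addr, &peer_sin, sizeof(peer_sin));
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * With TCP_MD5SIG_EXT, setting tcpm_flags = TCP_MD5SIG_FLAG_PREFIX and
 * tcpm_prefixlen lets one key cover a whole subnet.
 */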

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);

                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req)
{
        return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_seq,
        .init_ts_off    =       tcp_v4_init_ts_off,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
1394        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1395                goto drop;
1396
1397        return tcp_conn_request(&tcp_request_sock_ops,
1398                                &tcp_request_sock_ipv4_ops, sk, skb);
1399
1400drop:
1401        tcp_listendrop(sk);
1402        return 0;
1403}
1404EXPORT_SYMBOL(tcp_v4_conn_request);
1405
1406
1407/*
1408 * The three-way handshake has completed - we received a valid ACK
1409 * for our SYN-ACK - so now create the new socket.
1410 */
1411struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1412                                  struct request_sock *req,
1413                                  struct dst_entry *dst,
1414                                  struct request_sock *req_unhash,
1415                                  bool *own_req)
1416{
1417        struct inet_request_sock *ireq;
1418        struct inet_sock *newinet;
1419        struct tcp_sock *newtp;
1420        struct sock *newsk;
1421#ifdef CONFIG_TCP_MD5SIG
1422        struct tcp_md5sig_key *key;
1423#endif
1424        struct ip_options_rcu *inet_opt;
1425
1426        if (sk_acceptq_is_full(sk))
1427                goto exit_overflow;
1428
1429        newsk = tcp_create_openreq_child(sk, req, skb);
1430        if (!newsk)
1431                goto exit_nonewsk;
1432
1433        newsk->sk_gso_type = SKB_GSO_TCPV4;
1434        inet_sk_rx_dst_set(newsk, skb);
1435
1436        newtp                 = tcp_sk(newsk);
1437        newinet               = inet_sk(newsk);
1438        ireq                  = inet_rsk(req);
1439        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1440        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1441        newsk->sk_bound_dev_if = ireq->ir_iif;
1442        newinet->inet_saddr   = ireq->ir_loc_addr;
1443        inet_opt              = rcu_dereference(ireq->ireq_opt);
1444        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1445        newinet->mc_index     = inet_iif(skb);
1446        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1447        newinet->rcv_tos      = ip_hdr(skb)->tos;
1448        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1449        if (inet_opt)
1450                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1451        newinet->inet_id = prandom_u32();
1452
1453        if (!dst) {
1454                dst = inet_csk_route_child_sock(sk, newsk, req);
1455                if (!dst)
1456                        goto put_and_exit;
1457        } else {
1458                /* syncookie case : see end of cookie_v4_check() */
1459        }
1460        sk_setup_caps(newsk, dst);
1461
1462        tcp_ca_openreq_child(newsk, dst);
1463
1464        tcp_sync_mss(newsk, dst_mtu(dst));
1465        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1466
1467        tcp_initialize_rcv_mss(newsk);
1468
1469#ifdef CONFIG_TCP_MD5SIG
1470        /* Copy over the MD5 key from the original socket */
1471        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472                                AF_INET);
1473        if (key) {
1474                /*
1475                 * We're using one, so create a matching key
1476                 * on the newsk structure. If we fail to get
1477                 * memory, then we end up not copying the key
1478                 * across. Shucks.
1479                 */
1480                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1481                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1482                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1483        }
1484#endif
1485
1486        if (__inet_inherit_port(sk, newsk) < 0)
1487                goto put_and_exit;
1488        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1489        if (likely(*own_req)) {
1490                tcp_move_syn(newtp, req);
1491                ireq->ireq_opt = NULL;
1492        } else {
1493                newinet->inet_opt = NULL;
1494        }
1495        return newsk;
1496
1497exit_overflow:
1498        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1499exit_nonewsk:
1500        dst_release(dst);
1501exit:
1502        tcp_listendrop(sk);
1503        return NULL;
1504put_and_exit:
1505        newinet->inet_opt = NULL;
1506        inet_csk_prepare_forced_close(newsk);
1507        tcp_done(newsk);
1508        goto exit;
1509}
1510EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
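
/* The child returned above has already been hashed into the established
 * table; *own_req reports whether this CPU won the race to insert it
 * (another CPU processing a retransmit may have created a socket for the
 * same request first, in which case the caller drops the duplicate).
 */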
1511
1512static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1513{
1514#ifdef CONFIG_SYN_COOKIES
1515        const struct tcphdr *th = tcp_hdr(skb);
1516
1517        if (!th->syn)
1518                sk = cookie_v4_check(sk, skb);
1519#endif
1520        return sk;
1521}
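
/* Syncookie recovery sketch: if the SYN queue overflowed, the ACK that
 * completes the handshake finds no request sock here. When syncookies
 * are enabled and the listener recently overflowed, cookie_v4_check()
 * recomputes the cookie from the addresses, ports and acknowledged
 * sequence number; on a match it rebuilds the request sock and returns
 * a child socket, establishing the connection with no stored SYN state.
 */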
1522
1523u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1524                         struct tcphdr *th, u32 *cookie)
1525{
1526        u16 mss = 0;
1527#ifdef CONFIG_SYN_COOKIES
1528        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1529                                    &tcp_request_sock_ipv4_ops, sk, th);
1530        if (mss) {
1531                *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1532                tcp_synq_overflow(sk);
1533        }
1534#endif
1535        return mss;
1536}
1537
1538/* The socket must have its spinlock held when we get
1539 * here, unless it is a TCP_LISTEN socket.
1540 *
1541 * We have a potential double-lock case here, so even when
1542 * doing backlog processing we use the BH locking scheme.
1543 * This is because we cannot sleep with the original spinlock
1544 * held.
1545 */
1546int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1547{
1548        struct sock *rsk;
1549
1550        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551                struct dst_entry *dst = sk->sk_rx_dst;
1552
1553                sock_rps_save_rxhash(sk, skb);
1554                sk_mark_napi_id(sk, skb);
1555                if (dst) {
1556                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1557                            !dst->ops->check(dst, 0)) {
1558                                dst_release(dst);
1559                                sk->sk_rx_dst = NULL;
1560                        }
1561                }
1562                tcp_rcv_established(sk, skb);
1563                return 0;
1564        }
1565
1566        if (tcp_checksum_complete(skb))
1567                goto csum_err;
1568
1569        if (sk->sk_state == TCP_LISTEN) {
1570                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1571
1572                if (!nsk)
1573                        goto discard;
1574                if (nsk != sk) {
1575                        if (tcp_child_process(sk, nsk, skb)) {
1576                                rsk = nsk;
1577                                goto reset;
1578                        }
1579                        return 0;
1580                }
1581        } else
1582                sock_rps_save_rxhash(sk, skb);
1583
1584        if (tcp_rcv_state_process(sk, skb)) {
1585                rsk = sk;
1586                goto reset;
1587        }
1588        return 0;
1589
1590reset:
1591        tcp_v4_send_reset(rsk, skb);
1592discard:
1593        kfree_skb(skb);
1594        /* Be careful here. If this function gets more complicated and
1595         * gcc suffers from register pressure on the x86, sk (in %ebx)
1596         * might be destroyed here. This current version compiles correctly,
1597         * but you have been warned.
1598         */
1599        return 0;
1600
1601csum_err:
1602        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1603        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1604        goto discard;
1605}
1606EXPORT_SYMBOL(tcp_v4_do_rcv);
1607
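/* Called from the IP input path before routing: if the segment belongs
 * to an established socket, attach that socket and its cached rx dst to
 * the skb so the later socket lookup and route lookup can be skipped.
 */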
1608int tcp_v4_early_demux(struct sk_buff *skb)
1609{
1610        const struct iphdr *iph;
1611        const struct tcphdr *th;
1612        struct sock *sk;
1613
1614        if (skb->pkt_type != PACKET_HOST)
1615                return 0;
1616
1617        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1618                return 0;
1619
1620        iph = ip_hdr(skb);
1621        th = tcp_hdr(skb);
1622
1623        if (th->doff < sizeof(struct tcphdr) / 4)
1624                return 0;
1625
1626        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1627                                       iph->saddr, th->source,
1628                                       iph->daddr, ntohs(th->dest),
1629                                       skb->skb_iif, inet_sdif(skb));
1630        if (sk) {
1631                skb->sk = sk;
1632                skb->destructor = sock_edemux;
1633                if (sk_fullsock(sk)) {
1634                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1635
1636                        if (dst)
1637                                dst = dst_check(dst, 0);
1638                        if (dst &&
1639                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1640                                skb_dst_set_noref(skb, dst);
1641                }
1642        }
1643        return 0;
1644}
1645
1646bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1647{
1648        u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1649        struct skb_shared_info *shinfo;
1650        const struct tcphdr *th;
1651        struct tcphdr *thtail;
1652        struct sk_buff *tail;
1653        unsigned int hdrlen;
1654        bool fragstolen;
1655        u32 gso_segs;
1656        int delta;
1657
1658        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1659         * we can fix skb->truesize to its real value to avoid future drops.
1660         * This is valid because skb is not yet charged to the socket.
1661         * It has been noticed that pure SACK packets were sometimes dropped
1662         * (when built by drivers lacking a copybreak feature).
1663         */
1664        skb_condense(skb);
1665
1666        skb_dst_drop(skb);
1667
1668        if (unlikely(tcp_checksum_complete(skb))) {
1669                bh_unlock_sock(sk);
1670                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1671                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1672                return true;
1673        }
1674
1675        /* Attempt coalescing to last skb in backlog, even if we are
1676         * above the limits.
1677         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1678         */
1679        th = (const struct tcphdr *)skb->data;
1680        hdrlen = th->doff * 4;
1681        shinfo = skb_shinfo(skb);
1682
1683        if (!shinfo->gso_size)
1684                shinfo->gso_size = skb->len - hdrlen;
1685
1686        if (!shinfo->gso_segs)
1687                shinfo->gso_segs = 1;
1688
1689        tail = sk->sk_backlog.tail;
1690        if (!tail)
1691                goto no_coalesce;
1692        thtail = (struct tcphdr *)tail->data;
1693
1694        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1695            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1696            ((TCP_SKB_CB(tail)->tcp_flags |
1697              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1698            !((TCP_SKB_CB(tail)->tcp_flags &
1699              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1700            ((TCP_SKB_CB(tail)->tcp_flags ^
1701              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1702#ifdef CONFIG_TLS_DEVICE
1703            tail->decrypted != skb->decrypted ||
1704#endif
1705            thtail->doff != th->doff ||
1706            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1707                goto no_coalesce;
1708
1709        __skb_pull(skb, hdrlen);
1710        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1711                thtail->window = th->window;
1712
1713                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1714
1715                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1716                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1717
1718                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1719                 * thtail->fin, so that the fast path in tcp_rcv_established()
1720                 * is not entered if we append a packet with a FIN.
1721                 * SYN, RST, URG are not present.
1722                 * ACK is set on both packets.
1723                 * PSH : we do not really care in TCP stack,
1724                 *       at least for 'GRO' packets.
1725                 */
1726                thtail->fin |= th->fin;
1727                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1728
1729                if (TCP_SKB_CB(skb)->has_rxtstamp) {
1730                        TCP_SKB_CB(tail)->has_rxtstamp = true;
1731                        tail->tstamp = skb->tstamp;
1732                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1733                }
1734
1735                /* Not as strict as GRO. We only need to carry the max mss value */
1736                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1737                                                 skb_shinfo(tail)->gso_size);
1738
1739                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1740                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1741
1742                sk->sk_backlog.len += delta;
1743                __NET_INC_STATS(sock_net(sk),
1744                                LINUX_MIB_TCPBACKLOGCOALESCE);
1745                kfree_skb_partial(skb, fragstolen);
1746                return false;
1747        }
1748        __skb_push(skb, hdrlen);
1749
1750no_coalesce:
1751        /* Only the socket owner can try to collapse/prune rx queues
1752         * to reduce memory overhead, so add a little headroom here.
1753         * Only a few socket backlogs are likely to be non-empty at once.
1754         */
1755        limit += 64*1024;
1756
1757        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1758                bh_unlock_sock(sk);
1759                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1760                return true;
1761        }
1762        return false;
1763}
1764EXPORT_SYMBOL(tcp_add_backlog);
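
/* Coalescing sketch with made-up numbers: if the backlog tail covers
 * sequence range [1000,2000) and the new skb covers [2000,2448) with
 * matching header bits, the tail is extended to [1000,2448), its gso
 * fields are updated, and the new skb is freed - the owner later
 * processes one aggregate instead of two separate skbs.
 */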
1765
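/* Run the socket's attached filter; sk_filter_trim_cap() may trim the skb
 * to the length the filter returns, but never below doff * 4 bytes, so
 * the TCP header always survives even when the payload is dropped.
 */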
1766int tcp_filter(struct sock *sk, struct sk_buff *skb)
1767{
1768        struct tcphdr *th = (struct tcphdr *)skb->data;
1769
1770        return sk_filter_trim_cap(sk, skb, th->doff * 4);
1771}
1772EXPORT_SYMBOL(tcp_filter);
1773
1774static void tcp_v4_restore_cb(struct sk_buff *skb)
1775{
1776        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1777                sizeof(struct inet_skb_parm));
1778}
1779
1780static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1781                           const struct tcphdr *th)
1782{
1783        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1784         * barrier() makes sure the compiler won't play fool^W aliasing games.
1785         */
1786        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1787                sizeof(struct inet_skb_parm));
1788        barrier();
1789
1790        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1791        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1792                                    skb->len - th->doff * 4);
1793        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1794        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1795        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1796        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1797        TCP_SKB_CB(skb)->sacked  = 0;
1798        TCP_SKB_CB(skb)->has_rxtstamp =
1799                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1800}
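
/* Sequence-space sketch: end_seq counts SYN and FIN as one unit each, so
 * a data-less SYN has end_seq == seq + 1, a pure ACK has end_seq == seq,
 * and a segment carrying 100 bytes of payload has end_seq == seq + 100.
 */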
1801
1802/*
1803 *      From tcp_input.c
1804 */
1805
1806int tcp_v4_rcv(struct sk_buff *skb)
1807{
1808        struct net *net = dev_net(skb->dev);
1809        struct sk_buff *skb_to_free;
1810        int sdif = inet_sdif(skb);
1811        const struct iphdr *iph;
1812        const struct tcphdr *th;
1813        bool refcounted;
1814        struct sock *sk;
1815        int ret;
1816
1817        if (skb->pkt_type != PACKET_HOST)
1818                goto discard_it;
1819
1820        /* Count it even if it's bad */
1821        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1822
1823        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1824                goto discard_it;
1825
1826        th = (const struct tcphdr *)skb->data;
1827
1828        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1829                goto bad_packet;
1830        if (!pskb_may_pull(skb, th->doff * 4))
1831                goto discard_it;
1832
1833        /* An explanation is required here, I think.
1834         * Packet length and doff are validated by header prediction,
1835         * provided the case of th->doff == 0 is eliminated.
1836         * So, we defer the checks. */
1837
1838        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1839                goto csum_error;
1840
1841        th = (const struct tcphdr *)skb->data;
1842        iph = ip_hdr(skb);
1843lookup:
1844        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1845                               th->dest, sdif, &refcounted);
1846        if (!sk)
1847                goto no_tcp_socket;
1848
1849process:
1850        if (sk->sk_state == TCP_TIME_WAIT)
1851                goto do_time_wait;
1852
1853        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1854                struct request_sock *req = inet_reqsk(sk);
1855                bool req_stolen = false;
1856                struct sock *nsk;
1857
1858                sk = req->rsk_listener;
1859                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1860                        sk_drops_add(sk, skb);
1861                        reqsk_put(req);
1862                        goto discard_it;
1863                }
1864                if (tcp_checksum_complete(skb)) {
1865                        reqsk_put(req);
1866                        goto csum_error;
1867                }
1868                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1869                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1870                        goto lookup;
1871                }
1872                /* We own a reference on the listener, increase it again
1873                 * as we might lose it too soon.
1874                 */
1875                sock_hold(sk);
1876                refcounted = true;
1877                nsk = NULL;
1878                if (!tcp_filter(sk, skb)) {
1879                        th = (const struct tcphdr *)skb->data;
1880                        iph = ip_hdr(skb);
1881                        tcp_v4_fill_cb(skb, iph, th);
1882                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1883                }
1884                if (!nsk) {
1885                        reqsk_put(req);
1886                        if (req_stolen) {
1887                                /* Another cpu got exclusive access to req
1888                                 * and created a full-blown socket.
1889                                 * Try to feed this packet to this socket
1890                                 * instead of discarding it.
1891                                 */
1892                                tcp_v4_restore_cb(skb);
1893                                sock_put(sk);
1894                                goto lookup;
1895                        }
1896                        goto discard_and_relse;
1897                }
1898                if (nsk == sk) {
1899                        reqsk_put(req);
1900                        tcp_v4_restore_cb(skb);
1901                } else if (tcp_child_process(sk, nsk, skb)) {
1902                        tcp_v4_send_reset(nsk, skb);
1903                        goto discard_and_relse;
1904                } else {
1905                        sock_put(sk);
1906                        return 0;
1907                }
1908        }
1909        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1910                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1911                goto discard_and_relse;
1912        }
1913
1914        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1915                goto discard_and_relse;
1916
1917        if (tcp_v4_inbound_md5_hash(sk, skb))
1918                goto discard_and_relse;
1919
1920        nf_reset_ct(skb);
1921
1922        if (tcp_filter(sk, skb))
1923                goto discard_and_relse;
1924        th = (const struct tcphdr *)skb->data;
1925        iph = ip_hdr(skb);
1926        tcp_v4_fill_cb(skb, iph, th);
1927
1928        skb->dev = NULL;
1929
1930        if (sk->sk_state == TCP_LISTEN) {
1931                ret = tcp_v4_do_rcv(sk, skb);
1932                goto put_and_return;
1933        }
1934
1935        sk_incoming_cpu_update(sk);
1936
1937        bh_lock_sock_nested(sk);
1938        tcp_segs_in(tcp_sk(sk), skb);
1939        ret = 0;
1940        if (!sock_owned_by_user(sk)) {
1941                skb_to_free = sk->sk_rx_skb_cache;
1942                sk->sk_rx_skb_cache = NULL;
1943                ret = tcp_v4_do_rcv(sk, skb);
1944        } else {
1945                if (tcp_add_backlog(sk, skb))
1946                        goto discard_and_relse;
1947                skb_to_free = NULL;
1948        }
1949        bh_unlock_sock(sk);
1950        if (skb_to_free)
1951                __kfree_skb(skb_to_free);
1952
1953put_and_return:
1954        if (refcounted)
1955                sock_put(sk);
1956
1957        return ret;
1958
1959no_tcp_socket:
1960        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1961                goto discard_it;
1962
1963        tcp_v4_fill_cb(skb, iph, th);
1964
1965        if (tcp_checksum_complete(skb)) {
1966csum_error:
1967                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1968bad_packet:
1969                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1970        } else {
1971                tcp_v4_send_reset(NULL, skb);
1972        }
1973
1974discard_it:
1975        /* Discard frame. */
1976        kfree_skb(skb);
1977        return 0;
1978
1979discard_and_relse:
1980        sk_drops_add(sk, skb);
1981        if (refcounted)
1982                sock_put(sk);
1983        goto discard_it;
1984
1985do_time_wait:
1986        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1987                inet_twsk_put(inet_twsk(sk));
1988                goto discard_it;
1989        }
1990
1991        tcp_v4_fill_cb(skb, iph, th);
1992
1993        if (tcp_checksum_complete(skb)) {
1994                inet_twsk_put(inet_twsk(sk));
1995                goto csum_error;
1996        }
1997        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1998        case TCP_TW_SYN: {
1999                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2000                                                        &tcp_hashinfo, skb,
2001                                                        __tcp_hdrlen(th),
2002                                                        iph->saddr, th->source,
2003                                                        iph->daddr, th->dest,
2004                                                        inet_iif(skb),
2005                                                        sdif);
2006                if (sk2) {
2007                        inet_twsk_deschedule_put(inet_twsk(sk));
2008                        sk = sk2;
2009                        tcp_v4_restore_cb(skb);
2010                        refcounted = false;
2011                        goto process;
2012                }
2013        }
2014                /* to ACK */
2015                /* fall through */
2016        case TCP_TW_ACK:
2017                tcp_v4_timewait_ack(sk, skb);
2018                break;
2019        case TCP_TW_RST:
2020                tcp_v4_send_reset(sk, skb);
2021                inet_twsk_deschedule_put(inet_twsk(sk));
2022                goto discard_it;
2023        case TCP_TW_SUCCESS:;
2024        }
2025        goto discard_it;
2026}
2027
2028static struct timewait_sock_ops tcp_timewait_sock_ops = {
2029        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2030        .twsk_unique    = tcp_twsk_unique,
2031        .twsk_destructor= tcp_twsk_destructor,
2032};
2033
2034void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2035{
2036        struct dst_entry *dst = skb_dst(skb);
2037
2038        if (dst && dst_hold_safe(dst)) {
2039                sk->sk_rx_dst = dst;
2040                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2041        }
2042}
2043EXPORT_SYMBOL(inet_sk_rx_dst_set);
2044
2045const struct inet_connection_sock_af_ops ipv4_specific = {
2046        .queue_xmit        = ip_queue_xmit,
2047        .send_check        = tcp_v4_send_check,
2048        .rebuild_header    = inet_sk_rebuild_header,
2049        .sk_rx_dst_set     = inet_sk_rx_dst_set,
2050        .conn_request      = tcp_v4_conn_request,
2051        .syn_recv_sock     = tcp_v4_syn_recv_sock,
2052        .net_header_len    = sizeof(struct iphdr),
2053        .setsockopt        = ip_setsockopt,
2054        .getsockopt        = ip_getsockopt,
2055        .addr2sockaddr     = inet_csk_addr2sockaddr,
2056        .sockaddr_len      = sizeof(struct sockaddr_in),
2057#ifdef CONFIG_COMPAT
2058        .compat_setsockopt = compat_ip_setsockopt,
2059        .compat_getsockopt = compat_ip_getsockopt,
2060#endif
2061        .mtu_reduced       = tcp_v4_mtu_reduced,
2062};
2063EXPORT_SYMBOL(ipv4_specific);
2064
2065#ifdef CONFIG_TCP_MD5SIG
2066static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2067        .md5_lookup             = tcp_v4_md5_lookup,
2068        .calc_md5_hash          = tcp_v4_md5_hash_skb,
2069        .md5_parse              = tcp_v4_parse_md5_keys,
2070};
2071#endif
2072
2073/* NOTE: A lot of fields are set to zero explicitly by the call to
2074 *       sk_alloc(), so they need not be initialized here.
2075 */
2076static int tcp_v4_init_sock(struct sock *sk)
2077{
2078        struct inet_connection_sock *icsk = inet_csk(sk);
2079
2080        tcp_init_sock(sk);
2081
2082        icsk->icsk_af_ops = &ipv4_specific;
2083
2084#ifdef CONFIG_TCP_MD5SIG
2085        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2086#endif
2087
2088        return 0;
2089}
2090
2091void tcp_v4_destroy_sock(struct sock *sk)
2092{
2093        struct tcp_sock *tp = tcp_sk(sk);
2094
2095        trace_tcp_destroy_sock(sk);
2096
2097        tcp_clear_xmit_timers(sk);
2098
2099        tcp_cleanup_congestion_control(sk);
2100
2101        tcp_cleanup_ulp(sk);
2102
2103        /* Clean up the write buffer. */
2104        tcp_write_queue_purge(sk);
2105
2106        /* Check if we want to disable active TFO */
2107        tcp_fastopen_active_disable_ofo_check(sk);
2108
2109        /* Cleans up our, hopefully empty, out_of_order_queue. */
2110        skb_rbtree_purge(&tp->out_of_order_queue);
2111
2112#ifdef CONFIG_TCP_MD5SIG
2113        /* Clean up the MD5 key list, if any */
2114        if (tp->md5sig_info) {
2115                tcp_clear_md5_list(sk);
2116                kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2117                tp->md5sig_info = NULL;
2118        }
2119#endif
2120
2121        /* Clean up a referenced TCP bind bucket. */
2122        if (inet_csk(sk)->icsk_bind_hash)
2123                inet_put_port(sk);
2124
2125        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2126
2127        /* If socket is aborted during connect operation */
2128        tcp_free_fastopen_req(tp);
2129        tcp_fastopen_destroy_cipher(sk);
2130        tcp_saved_syn_free(tp);
2131
2132        sk_sockets_allocated_dec(sk);
2133}
2134EXPORT_SYMBOL(tcp_v4_destroy_sock);
2135
2136#ifdef CONFIG_PROC_FS
2137/* Proc filesystem TCP sock list dumping. */
2138
2139/*
2140 * Get the next listener socket after cur.  If cur is NULL, get the first
2141 * socket starting from the bucket given in st->bucket; when st->bucket is
2142 * zero the very first socket in the hash table is returned.
2143 */
2144static void *listening_get_next(struct seq_file *seq, void *cur)
2145{
2146        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2147        struct tcp_iter_state *st = seq->private;
2148        struct net *net = seq_file_net(seq);
2149        struct inet_listen_hashbucket *ilb;
2150        struct hlist_nulls_node *node;
2151        struct sock *sk = cur;
2152
2153        if (!sk) {
2154get_head:
2155                ilb = &tcp_hashinfo.listening_hash[st->bucket];
2156                spin_lock(&ilb->lock);
2157                sk = sk_nulls_head(&ilb->nulls_head);
2158                st->offset = 0;
2159                goto get_sk;
2160        }
2161        ilb = &tcp_hashinfo.listening_hash[st->bucket];
2162        ++st->num;
2163        ++st->offset;
2164
2165        sk = sk_nulls_next(sk);
2166get_sk:
2167        sk_nulls_for_each_from(sk, node) {
2168                if (!net_eq(sock_net(sk), net))
2169                        continue;
2170                if (sk->sk_family == afinfo->family)
2171                        return sk;
2172        }
2173        spin_unlock(&ilb->lock);
2174        st->offset = 0;
2175        if (++st->bucket < INET_LHTABLE_SIZE)
2176                goto get_head;
2177        return NULL;
2178}
2179
2180static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2181{
2182        struct tcp_iter_state *st = seq->private;
2183        void *rc;
2184
2185        st->bucket = 0;
2186        st->offset = 0;
2187        rc = listening_get_next(seq, NULL);
2188
2189        while (rc && *pos) {
2190                rc = listening_get_next(seq, rc);
2191                --*pos;
2192        }
2193        return rc;
2194}
2195
2196static inline bool empty_bucket(const struct tcp_iter_state *st)
2197{
2198        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2199}
2200
2201/*
2202 * Get first established socket starting from bucket given in st->bucket.
2203 * If st->bucket is zero, the very first socket in the hash is returned.
2204 */
2205static void *established_get_first(struct seq_file *seq)
2206{
2207        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2208        struct tcp_iter_state *st = seq->private;
2209        struct net *net = seq_file_net(seq);
2210        void *rc = NULL;
2211
2212        st->offset = 0;
2213        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2214                struct sock *sk;
2215                struct hlist_nulls_node *node;
2216                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2217
2218                /* Lockless fast path for the common case of empty buckets */
2219                if (empty_bucket(st))
2220                        continue;
2221
2222                spin_lock_bh(lock);
2223                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2224                        if (sk->sk_family != afinfo->family ||
2225                            !net_eq(sock_net(sk), net)) {
2226                                continue;
2227                        }
2228                        rc = sk;
2229                        goto out;
2230                }
2231                spin_unlock_bh(lock);
2232        }
2233out:
2234        return rc;
2235}
2236
2237static void *established_get_next(struct seq_file *seq, void *cur)
2238{
2239        struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2240        struct sock *sk = cur;
2241        struct hlist_nulls_node *node;
2242        struct tcp_iter_state *st = seq->private;
2243        struct net *net = seq_file_net(seq);
2244
2245        ++st->num;
2246        ++st->offset;
2247
2248        sk = sk_nulls_next(sk);
2249
2250        sk_nulls_for_each_from(sk, node) {
2251                if (sk->sk_family == afinfo->family &&
2252                    net_eq(sock_net(sk), net))
2253                        return sk;
2254        }
2255
2256        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2257        ++st->bucket;
2258        return established_get_first(seq);
2259}
2260
2261static void *established_get_idx(struct seq_file *seq, loff_t pos)
2262{
2263        struct tcp_iter_state *st = seq->private;
2264        void *rc;
2265
2266        st->bucket = 0;
2267        rc = established_get_first(seq);
2268
2269        while (rc && pos) {
2270                rc = established_get_next(seq, rc);
2271                --pos;
2272        }
2273        return rc;
2274}
2275
2276static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2277{
2278        void *rc;
2279        struct tcp_iter_state *st = seq->private;
2280
2281        st->state = TCP_SEQ_STATE_LISTENING;
2282        rc        = listening_get_idx(seq, &pos);
2283
2284        if (!rc) {
2285                st->state = TCP_SEQ_STATE_ESTABLISHED;
2286                rc        = established_get_idx(seq, pos);
2287        }
2288
2289        return rc;
2290}
2291
2292static void *tcp_seek_last_pos(struct seq_file *seq)
2293{
2294        struct tcp_iter_state *st = seq->private;
2295        int offset = st->offset;
2296        int orig_num = st->num;
2297        void *rc = NULL;
2298
2299        switch (st->state) {
2300        case TCP_SEQ_STATE_LISTENING:
2301                if (st->bucket >= INET_LHTABLE_SIZE)
2302                        break;
2303                st->state = TCP_SEQ_STATE_LISTENING;
2304                rc = listening_get_next(seq, NULL);
2305                while (offset-- && rc)
2306                        rc = listening_get_next(seq, rc);
2307                if (rc)
2308                        break;
2309                st->bucket = 0;
2310                st->state = TCP_SEQ_STATE_ESTABLISHED;
2311                /* Fallthrough */
2312        case TCP_SEQ_STATE_ESTABLISHED:
2313                if (st->bucket > tcp_hashinfo.ehash_mask)
2314                        break;
2315                rc = established_get_first(seq);
2316                while (offset-- && rc)
2317                        rc = established_get_next(seq, rc);
2318        }
2319
2320        st->num = orig_num;
2321
2322        return rc;
2323}
2324
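/* Restart sketch: userspace reads /proc/net/tcp in chunks, re-entering
 * tcp_seq_start() for each read(). st->last_pos lets tcp_seek_last_pos()
 * resume from the saved bucket and offset instead of rescanning the
 * whole hash table from the beginning on every call.
 */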
2325void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2326{
2327        struct tcp_iter_state *st = seq->private;
2328        void *rc;
2329
2330        if (*pos && *pos == st->last_pos) {
2331                rc = tcp_seek_last_pos(seq);
2332                if (rc)
2333                        goto out;
2334        }
2335
2336        st->state = TCP_SEQ_STATE_LISTENING;
2337        st->num = 0;
2338        st->bucket = 0;
2339        st->offset = 0;
2340        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2341
2342out:
2343        st->last_pos = *pos;
2344        return rc;
2345}
2346EXPORT_SYMBOL(tcp_seq_start);
2347
2348void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2349{
2350        struct tcp_iter_state *st = seq->private;
2351        void *rc = NULL;
2352
2353        if (v == SEQ_START_TOKEN) {
2354                rc = tcp_get_idx(seq, 0);
2355                goto out;
2356        }
2357
2358        switch (st->state) {
2359        case TCP_SEQ_STATE_LISTENING:
2360                rc = listening_get_next(seq, v);
2361                if (!rc) {
2362                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2363                        st->bucket = 0;
2364                        st->offset = 0;
2365                        rc        = established_get_first(seq);
2366                }
2367                break;
2368        case TCP_SEQ_STATE_ESTABLISHED:
2369                rc = established_get_next(seq, v);
2370                break;
2371        }
2372out:
2373        ++*pos;
2374        st->last_pos = *pos;
2375        return rc;
2376}
2377EXPORT_SYMBOL(tcp_seq_next);
2378
2379void tcp_seq_stop(struct seq_file *seq, void *v)
2380{
2381        struct tcp_iter_state *st = seq->private;
2382
2383        switch (st->state) {
2384        case TCP_SEQ_STATE_LISTENING:
2385                if (v != SEQ_START_TOKEN)
2386                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2387                break;
2388        case TCP_SEQ_STATE_ESTABLISHED:
2389                if (v)
2390                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2391                break;
2392        }
2393}
2394EXPORT_SYMBOL(tcp_seq_stop);
2395
2396static void get_openreq4(const struct request_sock *req,
2397                         struct seq_file *f, int i)
2398{
2399        const struct inet_request_sock *ireq = inet_rsk(req);
2400        long delta = req->rsk_timer.expires - jiffies;
2401
2402        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2403                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2404                i,
2405                ireq->ir_loc_addr,
2406                ireq->ir_num,
2407                ireq->ir_rmt_addr,
2408                ntohs(ireq->ir_rmt_port),
2409                TCP_SYN_RECV,
2410                0, 0, /* could print option size, but that is af dependent. */
2411                1,    /* timers active (only the expire timer) */
2412                jiffies_delta_to_clock_t(delta),
2413                req->num_timeout,
2414                from_kuid_munged(seq_user_ns(f),
2415                                 sock_i_uid(req->rsk_listener)),
2416                0,  /* non standard timer */
2417                0, /* open_requests have no inode */
2418                0,
2419                req);
2420}
2421
2422static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2423{
2424        int timer_active;
2425        unsigned long timer_expires;
2426        const struct tcp_sock *tp = tcp_sk(sk);
2427        const struct inet_connection_sock *icsk = inet_csk(sk);
2428        const struct inet_sock *inet = inet_sk(sk);
2429        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2430        __be32 dest = inet->inet_daddr;
2431        __be32 src = inet->inet_rcv_saddr;
2432        __u16 destp = ntohs(inet->inet_dport);
2433        __u16 srcp = ntohs(inet->inet_sport);
2434        int rx_queue;
2435        int state;
2436
2437        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2438            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2439            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2440                timer_active    = 1;
2441                timer_expires   = icsk->icsk_timeout;
2442        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2443                timer_active    = 4;
2444                timer_expires   = icsk->icsk_timeout;
2445        } else if (timer_pending(&sk->sk_timer)) {
2446                timer_active    = 2;
2447                timer_expires   = sk->sk_timer.expires;
2448        } else {
2449                timer_active    = 0;
2450                timer_expires = jiffies;
2451        }
2452
2453        state = inet_sk_state_load(sk);
2454        if (state == TCP_LISTEN)
2455                rx_queue = READ_ONCE(sk->sk_ack_backlog);
2456        else
2457                /* Because we don't lock the socket,
2458                 * we might find a transient negative value.
2459                 */
2460                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2461                                      READ_ONCE(tp->copied_seq), 0);
2462
2463        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2464                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2465                i, src, srcp, dest, destp, state,
2466                READ_ONCE(tp->write_seq) - tp->snd_una,
2467                rx_queue,
2468                timer_active,
2469                jiffies_delta_to_clock_t(timer_expires - jiffies),
2470                icsk->icsk_retransmits,
2471                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2472                icsk->icsk_probes_out,
2473                sock_i_ino(sk),
2474                refcount_read(&sk->sk_refcnt), sk,
2475                jiffies_to_clock_t(icsk->icsk_rto),
2476                jiffies_to_clock_t(icsk->icsk_ack.ato),
2477                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2478                tp->snd_cwnd,
2479                state == TCP_LISTEN ?
2480                    fastopenq->max_qlen :
2481                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2482}
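
/* Example /proc/net/tcp line (illustrative values): a socket listening
 * on 127.0.0.1:8080 prints roughly as
 *
 *	0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 ...
 *
 * Addresses and ports are hex; 0100007F is 127.0.0.1 as stored on a
 * little-endian host, 1F90 is port 8080, and 0A is TCP_LISTEN.
 */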
2483
2484static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2485                               struct seq_file *f, int i)
2486{
2487        long delta = tw->tw_timer.expires - jiffies;
2488        __be32 dest, src;
2489        __u16 destp, srcp;
2490
2491        dest  = tw->tw_daddr;
2492        src   = tw->tw_rcv_saddr;
2493        destp = ntohs(tw->tw_dport);
2494        srcp  = ntohs(tw->tw_sport);
2495
2496        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2497                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2498                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2499                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2500                refcount_read(&tw->tw_refcnt), tw);
2501}
2502
2503#define TMPSZ 150
2504
2505static int tcp4_seq_show(struct seq_file *seq, void *v)
2506{
2507        struct tcp_iter_state *st;
2508        struct sock *sk = v;
2509
2510        seq_setwidth(seq, TMPSZ - 1);
2511        if (v == SEQ_START_TOKEN) {
2512                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2513                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2514                           "inode");
2515                goto out;
2516        }
2517        st = seq->private;
2518
2519        if (sk->sk_state == TCP_TIME_WAIT)
2520                get_timewait4_sock(v, seq, st->num);
2521        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2522                get_openreq4(v, seq, st->num);
2523        else
2524                get_tcp4_sock(v, seq, st->num);
2525out:
2526        seq_pad(seq, '\n');
2527        return 0;
2528}
2529
2530static const struct seq_operations tcp4_seq_ops = {
2531        .show           = tcp4_seq_show,
2532        .start          = tcp_seq_start,
2533        .next           = tcp_seq_next,
2534        .stop           = tcp_seq_stop,
2535};
2536
2537static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2538        .family         = AF_INET,
2539};
2540
2541static int __net_init tcp4_proc_init_net(struct net *net)
2542{
2543        if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2544                        sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2545                return -ENOMEM;
2546        return 0;
2547}
2548
2549static void __net_exit tcp4_proc_exit_net(struct net *net)
2550{
2551        remove_proc_entry("tcp", net->proc_net);
2552}
2553
2554static struct pernet_operations tcp4_net_ops = {
2555        .init = tcp4_proc_init_net,
2556        .exit = tcp4_proc_exit_net,
2557};
2558
2559int __init tcp4_proc_init(void)
2560{
2561        return register_pernet_subsys(&tcp4_net_ops);
2562}
2563
2564void tcp4_proc_exit(void)
2565{
2566        unregister_pernet_subsys(&tcp4_net_ops);
2567}
2568#endif /* CONFIG_PROC_FS */
2569
2570struct proto tcp_prot = {
2571        .name                   = "TCP",
2572        .owner                  = THIS_MODULE,
2573        .close                  = tcp_close,
2574        .pre_connect            = tcp_v4_pre_connect,
2575        .connect                = tcp_v4_connect,
2576        .disconnect             = tcp_disconnect,
2577        .accept                 = inet_csk_accept,
2578        .ioctl                  = tcp_ioctl,
2579        .init                   = tcp_v4_init_sock,
2580        .destroy                = tcp_v4_destroy_sock,
2581        .shutdown               = tcp_shutdown,
2582        .setsockopt             = tcp_setsockopt,
2583        .getsockopt             = tcp_getsockopt,
2584        .keepalive              = tcp_set_keepalive,
2585        .recvmsg                = tcp_recvmsg,
2586        .sendmsg                = tcp_sendmsg,
2587        .sendpage               = tcp_sendpage,
2588        .backlog_rcv            = tcp_v4_do_rcv,
2589        .release_cb             = tcp_release_cb,
2590        .hash                   = inet_hash,
2591        .unhash                 = inet_unhash,
2592        .get_port               = inet_csk_get_port,
2593        .enter_memory_pressure  = tcp_enter_memory_pressure,
2594        .leave_memory_pressure  = tcp_leave_memory_pressure,
2595        .stream_memory_free     = tcp_stream_memory_free,
2596        .sockets_allocated      = &tcp_sockets_allocated,
2597        .orphan_count           = &tcp_orphan_count,
2598        .memory_allocated       = &tcp_memory_allocated,
2599        .memory_pressure        = &tcp_memory_pressure,
2600        .sysctl_mem             = sysctl_tcp_mem,
2601        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2602        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2603        .max_header             = MAX_TCP_HEADER,
2604        .obj_size               = sizeof(struct tcp_sock),
2605        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2606        .twsk_prot              = &tcp_timewait_sock_ops,
2607        .rsk_prot               = &tcp_request_sock_ops,
2608        .h.hashinfo             = &tcp_hashinfo,
2609        .no_autobind            = true,
2610#ifdef CONFIG_COMPAT
2611        .compat_setsockopt      = compat_tcp_setsockopt,
2612        .compat_getsockopt      = compat_tcp_getsockopt,
2613#endif
2614        .diag_destroy           = tcp_abort,
2615};
2616EXPORT_SYMBOL(tcp_prot);
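
/* tcp_prot is registered by the af_inet core via proto_register() and
 * inet_register_protosw(), so a userspace socket(AF_INET, SOCK_STREAM, 0)
 * is dispatched through the handlers above.
 */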
2617
2618static void __net_exit tcp_sk_exit(struct net *net)
2619{
2620        int cpu;
2621
2622        if (net->ipv4.tcp_congestion_control)
2623                module_put(net->ipv4.tcp_congestion_control->owner);
2624
2625        for_each_possible_cpu(cpu)
2626                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2627        free_percpu(net->ipv4.tcp_sk);
2628}
2629
2630static int __net_init tcp_sk_init(struct net *net)
2631{
2632        int res, cpu, cnt;
2633
2634        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2635        if (!net->ipv4.tcp_sk)
2636                return -ENOMEM;
2637
2638        for_each_possible_cpu(cpu) {
2639                struct sock *sk;
2640
2641                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2642                                           IPPROTO_TCP, net);
2643                if (res)
2644                        goto fail;
2645                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2646
2647                /* Enforce IP_DF and IPID == 0 for RSTs and
2648                 * ACKs sent in SYN-RECV and TIME-WAIT state.
2649                 */
2650                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2651
2652                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2653        }
2654
2655        net->ipv4.sysctl_tcp_ecn = 2;
2656        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2657
2658        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2659        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2660        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2661        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2662        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2663
2664        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2665        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2666        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2667
2668        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2669        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2670        net->ipv4.sysctl_tcp_syncookies = 1;
2671        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2672        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2673        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2674        net->ipv4.sysctl_tcp_orphan_retries = 0;
2675        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2676        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2677        net->ipv4.sysctl_tcp_tw_reuse = 2;
2678
2679        cnt = tcp_hashinfo.ehash_mask + 1;
2680        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2681        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2682
2683        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2684        net->ipv4.sysctl_tcp_sack = 1;
2685        net->ipv4.sysctl_tcp_window_scaling = 1;
2686        net->ipv4.sysctl_tcp_timestamps = 1;
2687        net->ipv4.sysctl_tcp_early_retrans = 3;
2688        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2689        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2690        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2691        net->ipv4.sysctl_tcp_max_reordering = 300;
2692        net->ipv4.sysctl_tcp_dsack = 1;
2693        net->ipv4.sysctl_tcp_app_win = 31;
2694        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2695        net->ipv4.sysctl_tcp_frto = 2;
2696        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2697        /* This limits the percentage of the congestion window which we
2698         * will allow a single TSO frame to consume.  Building TSO frames
2699         * which are too large can cause TCP streams to be bursty.
2700         */
2701        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2702        /* Default TSQ limit of 16 TSO segments */
2703        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2704        /* rfc5961 challenge ack rate limiting */
2705        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2706        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2707        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2708        net->ipv4.sysctl_tcp_autocorking = 1;
2709        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2710        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2711        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2712        if (net != &init_net) {
2713                memcpy(net->ipv4.sysctl_tcp_rmem,
2714                       init_net.ipv4.sysctl_tcp_rmem,
2715                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2716                memcpy(net->ipv4.sysctl_tcp_wmem,
2717                       init_net.ipv4.sysctl_tcp_wmem,
2718                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2719        }
2720        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2721        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2722        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2723        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2724        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2725        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2726
2727        /* Reno is always built in */
2728        if (!net_eq(net, &init_net) &&
2729            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2730                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2731        else
2732                net->ipv4.tcp_congestion_control = &tcp_reno;
2733
2734        return 0;
2735fail:
2736        tcp_sk_exit(net);
2737
2738        return res;
2739}
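
/* The defaults above surface as per-netns sysctls, e.g. (illustrative
 * values):
 *
 *	sysctl -w net.ipv4.tcp_syncookies=1
 *	sysctl -w net.ipv4.tcp_tw_reuse=2
 *
 * which write net->ipv4.sysctl_tcp_syncookies and
 * net->ipv4.sysctl_tcp_tw_reuse for the current network namespace.
 */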
2740
2741static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2742{
2743        struct net *net;
2744
2745        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2746
2747        list_for_each_entry(net, net_exit_list, exit_list)
2748                tcp_fastopen_ctx_destroy(net);
2749}
2750
2751static struct pernet_operations __net_initdata tcp_sk_ops = {
2752       .init       = tcp_sk_init,
2753       .exit       = tcp_sk_exit,
2754       .exit_batch = tcp_sk_exit_batch,
2755};
2756
2757void __init tcp_v4_init(void)
2758{
2759        if (register_pernet_subsys(&tcp_sk_ops))
2760                panic("Failed to create the TCP control socket.\n");
2761}
2762