linux/net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only the timestamp cache
           is held not per host, but per port pair, and the TW bucket is used
           as state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
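
/* Illustrative note (editor's sketch, not from the original source): the
 * write_seq chosen above is tw_snd_nxt + 65535 + 2, i.e. the old
 * connection's next send sequence pushed past a maximal unscaled window,
 * presumably so the new incarnation's sequence space starts beyond anything
 * the previous one could legitimately have had in flight. E.g. with
 * tw_snd_nxt == 1000 the reused connection starts at 66537.
 */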

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the
         * socket lock, select a source port, enter ourselves into the hash
         * tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        tp->write_seq = secure_tcp_seq(inet->inet_saddr,
                                                       inet->inet_daddr,
                                                       inet->inet_sport,
                                                       usin->sin_port);
                tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
                                                 inet->inet_saddr,
                                                 inet->inet_daddr);
        }

        inet->inet_id = tp->write_seq ^ jiffies;

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
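
/* Illustrative note (editor's sketch, not from the original source): when
 * tcp_v4_err() below finds the socket owned by the user, it cannot shrink
 * the MSS right away. It stores the new MTU in tp->mtu_info, sets
 * TCP_MTU_REDUCED_DEFERRED in sk->sk_tsq_flags and takes a reference;
 * tcp_release_cb() then invokes tcp_v4_mtu_reduced() once the lock owner
 * releases the socket.
 */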

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment,
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        s32 remaining;
        u32 delta_us;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of the PMTU discovery (RFC 1191) special case:
         * we can receive locally generated ICMP messages while the socket
         * is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (see tcp_create_openreq_child()) */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
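
                /* Illustrative note (editor's sketch, not from the original
                 * source): each net/host unreachable ICMP undoes one
                 * exponential backoff step. E.g. with a base RTO of 200ms
                 * and icsk_backoff decremented from 3 to 2,
                 * inet_csk_rto_backoff() yields min(200ms << 2, TCP_RTO_MAX)
                 * = 800ms instead of 1600ms.
                 */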

                skb = tcp_rtx_queue_head(sk);
                BUG_ON(!skb);

                tcp_mstamp_refresh(tp);
                delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
                remaining = icsk->icsk_rto -
                            usecs_to_jiffies(delta_us);

                if (remaining > 0) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket
                 * is already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * RFC 1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by PMTU discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
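
/* Illustrative note (editor's sketch, not from the original source): in the
 * CHECKSUM_PARTIAL branch above only the pseudo-header sum is written into
 * th->check, while csum_start/csum_offset tell the device (or
 * skb_checksum_help() as a fallback) where to compute and store the final
 * one's-complement sum over the TCP header and payload. Otherwise the full
 * checksum is computed in software right here.
 */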

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset.
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other side's
 *              TCP. So we build the reply based only on the parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk is not NULL, it means we did a successful lookup and the
         * incoming route had to be correct. prequeue might have dropped
         * our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not lose any security here:
                 * the incoming packet is checked against the md5 hash of the
                 * found key; no RST is generated if the md5 hash doesn't
                 * match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb),
                                             tcp_v4_sdif(skb));
                /* don't send rst if we can't find the key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose to
         * force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk) {
                arg.bound_dev_if = sk->sk_bound_dev_if;
                trace_tcp_send_reset(sk, skb);
        }

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }
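
        /* Illustrative note (editor's sketch, not from the original source):
         * with tsecr set, rep.opt[] now carries the 12-byte aligned
         * timestamp option exactly as it appears on the wire:
         *
         *      01 01 08 0a  <tsval:4>  <tsecr:4>
         *      NOP NOP kind=8 len=10
         */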

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
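        /* Illustrative note (editor's sketch, not from the original source):
         * e.g. with req->rsk_rcv_wnd == 65536 and rcv_wscale == 7, the
         * advertised window field below becomes 65536 >> 7 = 512.
         */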
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq_opt_deref(ireq));
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        const struct tcp_md5sig_info *md5sig;
        __be32 mask;
        struct tcp_md5sig_key *best_match = NULL;
        bool match;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;

        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;

                if (family == AF_INET) {
                        mask = inet_make_mask(key->prefixlen);
                        match = (key->addr.a4.s_addr & mask) ==
                                (addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
                } else if (family == AF_INET6) {
                        match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
                                                  key->prefixlen);
#endif
                } else {
                        match = false;
                }

                if (match && (!best_match ||
                              key->prefixlen > best_match->prefixlen))
                        best_match = key;
        }
        return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
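
/* Illustrative note (editor's sketch, not from the original source): the
 * lookup above is a longest-prefix match. E.g. with keys configured for
 * 10.0.0.0/8 and 10.1.2.0/24, a peer at 10.1.2.3 matches both, and the /24
 * key is returned because its prefixlen is larger.
 */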

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size) &&
                    key->prefixlen == prefixlen)
                        return key;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
                   gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        key->prefixlen = prefixlen;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 char __user *optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 prefixlen = 32;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (optname == TCP_MD5SIG_EXT &&
            cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 32)
                        return -EINVAL;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET, prefixlen);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}
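
/* Illustrative userspace usage (editor's sketch, not part of this file):
 * the handler above is reached via setsockopt() with TCP_MD5SIG (or
 * TCP_MD5SIG_EXT for prefix-bound keys), e.g.:
 *
 *      struct tcp_md5sig md5 = { };
 *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      sin->sin_family = AF_INET;
 *      sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *      md5.tcpm_keylen = 6;
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */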

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}
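
/* Illustrative note (editor's sketch, not from the original source): per
 * RFC 2385 the MD5 digest covers, in order, the pseudo-header built above
 * (saddr, daddr, zero pad, protocol, segment length), the TCP header
 * without options and with its checksum field zeroed, the segment data,
 * and finally the key itself.
 */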

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req)
{
        return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_seq,
        .init_ts_off    =       tcp_v4_init_ts_off,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
1315        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1316                goto drop;
1317
1318        return tcp_conn_request(&tcp_request_sock_ops,
1319                                &tcp_request_sock_ipv4_ops, sk, skb);
1320
1321drop:
1322        tcp_listendrop(sk);
1323        return 0;
1324}
1325EXPORT_SYMBOL(tcp_v4_conn_request);
1326
1327
1328/*
1329 * The three way handshake has completed - we got a valid synack -
1330 * now create the new socket.
1331 */
1332struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1333                                  struct request_sock *req,
1334                                  struct dst_entry *dst,
1335                                  struct request_sock *req_unhash,
1336                                  bool *own_req)
1337{
1338        struct inet_request_sock *ireq;
1339        struct inet_sock *newinet;
1340        struct tcp_sock *newtp;
1341        struct sock *newsk;
1342#ifdef CONFIG_TCP_MD5SIG
1343        struct tcp_md5sig_key *key;
1344#endif
1345        struct ip_options_rcu *inet_opt;
1346
1347        if (sk_acceptq_is_full(sk))
1348                goto exit_overflow;
1349
1350        newsk = tcp_create_openreq_child(sk, req, skb);
1351        if (!newsk)
1352                goto exit_nonewsk;
1353
1354        newsk->sk_gso_type = SKB_GSO_TCPV4;
1355        inet_sk_rx_dst_set(newsk, skb);
1356
1357        newtp                 = tcp_sk(newsk);
1358        newinet               = inet_sk(newsk);
1359        ireq                  = inet_rsk(req);
1360        sk_daddr_set(newsk, ireq->ir_rmt_addr);
1361        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1362        newsk->sk_bound_dev_if = ireq->ir_iif;
1363        newinet->inet_saddr   = ireq->ir_loc_addr;
1364        inet_opt              = rcu_dereference(ireq->ireq_opt);
1365        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1366        newinet->mc_index     = inet_iif(skb);
1367        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1368        newinet->rcv_tos      = ip_hdr(skb)->tos;
1369        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1370        if (inet_opt)
1371                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1372        newinet->inet_id = newtp->write_seq ^ jiffies;
1373
1374        if (!dst) {
1375                dst = inet_csk_route_child_sock(sk, newsk, req);
1376                if (!dst)
1377                        goto put_and_exit;
1378        } else {
1379                /* syncookie case : see end of cookie_v4_check() */
1380        }
1381        sk_setup_caps(newsk, dst);
1382
1383        tcp_ca_openreq_child(newsk, dst);
1384
1385        tcp_sync_mss(newsk, dst_mtu(dst));
1386        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1387
1388        tcp_initialize_rcv_mss(newsk);
1389
1390#ifdef CONFIG_TCP_MD5SIG
1391        /* Copy over the MD5 key from the original socket */
1392        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1393                                AF_INET);
1394        if (key) {
1395                /*
1396                 * We're using one, so create a matching key
1397                 * on the newsk structure. If we fail to get
1398                 * memory, then we end up not copying the key
1399                 * across. Shucks.
1400                 */
1401                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402                               AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1403                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1404        }
1405#endif
1406
1407        if (__inet_inherit_port(sk, newsk) < 0)
1408                goto put_and_exit;
1409        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1410        if (likely(*own_req)) {
1411                tcp_move_syn(newtp, req);
1412                ireq->ireq_opt = NULL;
1413        } else {
1414                newinet->inet_opt = NULL;
1415        }
1416        return newsk;
1417
1418exit_overflow:
1419        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1420exit_nonewsk:
1421        dst_release(dst);
1422exit:
1423        tcp_listendrop(sk);
1424        return NULL;
1425put_and_exit:
1426        newinet->inet_opt = NULL;
1427        inet_csk_prepare_forced_close(newsk);
1428        tcp_done(newsk);
1429        goto exit;
1430}
1431EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1432
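/* With syncookies the listener keeps no request state: the SYN-ACK we
 * sent encoded the cookie in its initial sequence number, so the only
 * packet we can validate here is the ACK completing the handshake.
 * That is why tcp_v4_cookie_check() below only acts on !th->syn; a SYN
 * itself is handled by tcp_v4_conn_request().
 */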
1433static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1434{
1435#ifdef CONFIG_SYN_COOKIES
1436        const struct tcphdr *th = tcp_hdr(skb);
1437
1438        if (!th->syn)
1439                sk = cookie_v4_check(sk, skb);
1440#endif
1441        return sk;
1442}
1443
1444/* The socket must have its spinlock held when we get
1445 * here, unless it is a TCP_LISTEN socket.
1446 *
1447 * We have a potential double-lock case here, so even when
1448 * doing backlog processing we use the BH locking scheme.
1449 * This is because we cannot sleep with the original spinlock
1450 * held.
1451 */
1452int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1453{
1454        struct sock *rsk;
1455
1456        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1457                struct dst_entry *dst = sk->sk_rx_dst;
1458
1459                sock_rps_save_rxhash(sk, skb);
1460                sk_mark_napi_id(sk, skb);
1461                if (dst) {
1462                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1463                            !dst->ops->check(dst, 0)) {
1464                                dst_release(dst);
1465                                sk->sk_rx_dst = NULL;
1466                        }
1467                }
1468                tcp_rcv_established(sk, skb, tcp_hdr(skb));
1469                return 0;
1470        }
1471
1472        if (tcp_checksum_complete(skb))
1473                goto csum_err;
1474
1475        if (sk->sk_state == TCP_LISTEN) {
1476                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1477
1478                if (!nsk)
1479                        goto discard;
1480                if (nsk != sk) {
1481                        if (tcp_child_process(sk, nsk, skb)) {
1482                                rsk = nsk;
1483                                goto reset;
1484                        }
1485                        return 0;
1486                }
1487        } else
1488                sock_rps_save_rxhash(sk, skb);
1489
1490        if (tcp_rcv_state_process(sk, skb)) {
1491                rsk = sk;
1492                goto reset;
1493        }
1494        return 0;
1495
1496reset:
1497        tcp_v4_send_reset(rsk, skb);
1498discard:
1499        kfree_skb(skb);
1500        /* Be careful here. If this function gets more complicated and
1501         * gcc suffers from register pressure on the x86, sk (in %ebx)
1502         * might be destroyed here. This current version compiles correctly,
1503         * but you have been warned.
1504         */
1505        return 0;
1506
1507csum_err:
1508        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1509        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1510        goto discard;
1511}
1512EXPORT_SYMBOL(tcp_v4_do_rcv);
1513
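/* Called before the route lookup on the input path: if the segment
 * belongs to an established socket, attach that socket (and, when it is
 * still valid for the incoming interface, its cached rx dst) to the skb
 * so that tcp_v4_rcv() can skip the socket lookup and routing later.
 */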
1514int tcp_v4_early_demux(struct sk_buff *skb)
1515{
1516        const struct iphdr *iph;
1517        const struct tcphdr *th;
1518        struct sock *sk;
1519
1520        if (skb->pkt_type != PACKET_HOST)
1521                return 0;
1522
1523        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1524                return 0;
1525
1526        iph = ip_hdr(skb);
1527        th = tcp_hdr(skb);
1528
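        /* th->doff counts 32-bit words, so anything below
         * sizeof(struct tcphdr) / 4 == 5 (a bare 20-byte header)
         * cannot be a valid TCP header.
         */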
1529        if (th->doff < sizeof(struct tcphdr) / 4)
1530                return 0;
1531
1532        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1533                                       iph->saddr, th->source,
1534                                       iph->daddr, ntohs(th->dest),
1535                                       skb->skb_iif, inet_sdif(skb));
1536        if (sk) {
1537                skb->sk = sk;
1538                skb->destructor = sock_edemux;
1539                if (sk_fullsock(sk)) {
1540                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1541
1542                        if (dst)
1543                                dst = dst_check(dst, 0);
1544                        if (dst &&
1545                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1546                                skb_dst_set_noref(skb, dst);
1547                }
1548        }
1549        return 0;
1550}
1551
1552bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1553{
1554        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1555
1556        /* Only the socket owner can try to collapse/prune rx queues
1557         * to reduce memory overhead, so add a little headroom here.
1558         * Only a few socket backlogs are likely to be non-empty concurrently.
1559         */
1560        limit += 64*1024;
1561
1562        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1563         * we can fix skb->truesize to its real value to avoid future drops.
1564         * This is valid because skb is not yet charged to the socket.
1565         * It has been noticed that pure SACK packets were sometimes dropped
1566         * (if cooked by drivers lacking the copybreak feature).
1567         */
1568        skb_condense(skb);
1569
1570        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1571                bh_unlock_sock(sk);
1572                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1573                return true;
1574        }
1575        return false;
1576}
1577EXPORT_SYMBOL(tcp_add_backlog);
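/* A sketch of the intended caller pattern (cf. tcp_v4_rcv() below).
 * Note that on failure tcp_add_backlog() has already dropped the socket
 * spinlock, so the error path must not unlock again:
 *
 *      bh_lock_sock_nested(sk);
 *      if (!sock_owned_by_user(sk))
 *              ret = tcp_v4_do_rcv(sk, skb);
 *      else if (tcp_add_backlog(sk, skb))
 *              goto discard_and_relse;   (lock already released)
 *      bh_unlock_sock(sk);
 */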
1578
1579int tcp_filter(struct sock *sk, struct sk_buff *skb)
1580{
1581        struct tcphdr *th = (struct tcphdr *)skb->data;
1582        unsigned int eaten = skb->len;
1583        int err;
1584
1585        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1586        if (!err) {
1587                eaten -= skb->len;
1588                TCP_SKB_CB(skb)->end_seq -= eaten;
1589        }
1590        return err;
1591}
1592EXPORT_SYMBOL(tcp_filter);
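/* Worked example: for a segment carrying a 20-byte header plus 100
 * bytes of payload, a filter verdict of 20 makes sk_filter_trim_cap()
 * trim the skb down to the bare header (it can never trim below
 * th->doff * 4).  eaten is then 100, and end_seq is pulled back by 100
 * so the recorded sequence range matches the data actually kept.
 */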
1593
1594static void tcp_v4_restore_cb(struct sk_buff *skb)
1595{
1596        memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1597                sizeof(struct inet_skb_parm));
1598}
1599
1600static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1601                           const struct tcphdr *th)
1602{
1603        /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1604         * barrier() makes sure the compiler won't play fool^Waliasing games.
1605         */
1606        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1607                sizeof(struct inet_skb_parm));
1608        barrier();
1609
1610        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1611        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1612                                    skb->len - th->doff * 4);
1613        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1614        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1615        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1616        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1617        TCP_SKB_CB(skb)->sacked  = 0;
1618        TCP_SKB_CB(skb)->has_rxtstamp =
1619                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1620}
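/* The SYN and FIN flags each occupy one unit of sequence space, hence
 * the th->syn + th->fin terms above: a bare SYN yields
 * end_seq = seq + 1, while a pure data segment with 100 bytes of
 * payload yields end_seq = seq + 100.
 */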
1621
1622/*
1623 *      From tcp_input.c
1624 */
1625
1626int tcp_v4_rcv(struct sk_buff *skb)
1627{
1628        struct net *net = dev_net(skb->dev);
1629        int sdif = inet_sdif(skb);
1630        const struct iphdr *iph;
1631        const struct tcphdr *th;
1632        bool refcounted;
1633        struct sock *sk;
1634        int ret;
1635
1636        if (skb->pkt_type != PACKET_HOST)
1637                goto discard_it;
1638
1639        /* Count it even if it's bad */
1640        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1641
1642        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1643                goto discard_it;
1644
1645        th = (const struct tcphdr *)skb->data;
1646
1647        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1648                goto bad_packet;
1649        if (!pskb_may_pull(skb, th->doff * 4))
1650                goto discard_it;
1651
1652        /* An explanation is required here, I think.
1653         * Packet length and doff are validated by header prediction,
1654         * provided the case of th->doff==0 is eliminated.
1655         * So, we defer the checks. */
1656
1657        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1658                goto csum_error;
1659
1660        th = (const struct tcphdr *)skb->data;
1661        iph = ip_hdr(skb);
1662lookup:
1663        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1664                               th->dest, sdif, &refcounted);
1665        if (!sk)
1666                goto no_tcp_socket;
1667
1668process:
1669        if (sk->sk_state == TCP_TIME_WAIT)
1670                goto do_time_wait;
1671
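        /* A TCP_NEW_SYN_RECV "socket" is a request_sock minisock created
         * by tcp_conn_request().  The ACK completing the handshake is
         * matched against it by tcp_check_req(), which creates the full
         * child socket via tcp_v4_syn_recv_sock() above.
         */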
1672        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1673                struct request_sock *req = inet_reqsk(sk);
1674                struct sock *nsk;
1675
1676                sk = req->rsk_listener;
1677                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1678                        sk_drops_add(sk, skb);
1679                        reqsk_put(req);
1680                        goto discard_it;
1681                }
1682                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1683                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1684                        goto lookup;
1685                }
1686                /* We own a reference on the listener; increase it again,
1687                 * as we might lose it too soon.
1688                 */
1689                sock_hold(sk);
1690                refcounted = true;
1691                nsk = NULL;
1692                if (!tcp_filter(sk, skb)) {
1693                        th = (const struct tcphdr *)skb->data;
1694                        iph = ip_hdr(skb);
1695                        tcp_v4_fill_cb(skb, iph, th);
1696                        nsk = tcp_check_req(sk, skb, req, false);
1697                }
1698                if (!nsk) {
1699                        reqsk_put(req);
1700                        goto discard_and_relse;
1701                }
1702                if (nsk == sk) {
1703                        reqsk_put(req);
1704                        tcp_v4_restore_cb(skb);
1705                } else if (tcp_child_process(sk, nsk, skb)) {
1706                        tcp_v4_send_reset(nsk, skb);
1707                        goto discard_and_relse;
1708                } else {
1709                        sock_put(sk);
1710                        return 0;
1711                }
1712        }
1713        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1714                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1715                goto discard_and_relse;
1716        }
1717
1718        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1719                goto discard_and_relse;
1720
1721        if (tcp_v4_inbound_md5_hash(sk, skb))
1722                goto discard_and_relse;
1723
1724        nf_reset(skb);
1725
1726        if (tcp_filter(sk, skb))
1727                goto discard_and_relse;
1728        th = (const struct tcphdr *)skb->data;
1729        iph = ip_hdr(skb);
1730        tcp_v4_fill_cb(skb, iph, th);
1731
1732        skb->dev = NULL;
1733
1734        if (sk->sk_state == TCP_LISTEN) {
1735                ret = tcp_v4_do_rcv(sk, skb);
1736                goto put_and_return;
1737        }
1738
1739        sk_incoming_cpu_update(sk);
1740
1741        bh_lock_sock_nested(sk);
1742        tcp_segs_in(tcp_sk(sk), skb);
1743        ret = 0;
1744        if (!sock_owned_by_user(sk)) {
1745                ret = tcp_v4_do_rcv(sk, skb);
1746        } else if (tcp_add_backlog(sk, skb)) {
1747                goto discard_and_relse;
1748        }
1749        bh_unlock_sock(sk);
1750
1751put_and_return:
1752        if (refcounted)
1753                sock_put(sk);
1754
1755        return ret;
1756
1757no_tcp_socket:
1758        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1759                goto discard_it;
1760
1761        tcp_v4_fill_cb(skb, iph, th);
1762
1763        if (tcp_checksum_complete(skb)) {
1764csum_error:
1765                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1766bad_packet:
1767                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1768        } else {
1769                tcp_v4_send_reset(NULL, skb);
1770        }
1771
1772discard_it:
1773        /* Discard frame. */
1774        kfree_skb(skb);
1775        return 0;
1776
1777discard_and_relse:
1778        sk_drops_add(sk, skb);
1779        if (refcounted)
1780                sock_put(sk);
1781        goto discard_it;
1782
1783do_time_wait:
1784        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1785                inet_twsk_put(inet_twsk(sk));
1786                goto discard_it;
1787        }
1788
1789        tcp_v4_fill_cb(skb, iph, th);
1790
1791        if (tcp_checksum_complete(skb)) {
1792                inet_twsk_put(inet_twsk(sk));
1793                goto csum_error;
1794        }
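        /* tcp_timewait_state_process() tells us how to dispose of the
         * segment: TCP_TW_SYN means a new SYN may legitimately reuse
         * this tuple, so hand it to a current listener; TCP_TW_ACK
         * re-sends the final ACK; TCP_TW_RST answers with a reset and
         * kills the timewait sock; TCP_TW_SUCCESS needs nothing more.
         */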
1795        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1796        case TCP_TW_SYN: {
1797                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1798                                                        &tcp_hashinfo, skb,
1799                                                        __tcp_hdrlen(th),
1800                                                        iph->saddr, th->source,
1801                                                        iph->daddr, th->dest,
1802                                                        inet_iif(skb),
1803                                                        sdif);
1804                if (sk2) {
1805                        inet_twsk_deschedule_put(inet_twsk(sk));
1806                        sk = sk2;
1807                        tcp_v4_restore_cb(skb);
1808                        refcounted = false;
1809                        goto process;
1810                }
1811        }
1812                /* to ACK */
1813                /* fall through */
1814        case TCP_TW_ACK:
1815                tcp_v4_timewait_ack(sk, skb);
1816                break;
1817        case TCP_TW_RST:
1818                tcp_v4_send_reset(sk, skb);
1819                inet_twsk_deschedule_put(inet_twsk(sk));
1820                goto discard_it;
1821        case TCP_TW_SUCCESS:;
1822        }
1823        goto discard_it;
1824}
1825
1826static struct timewait_sock_ops tcp_timewait_sock_ops = {
1827        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1828        .twsk_unique    = tcp_twsk_unique,
1829        .twsk_destructor= tcp_twsk_destructor,
1830};
1831
1832void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1833{
1834        struct dst_entry *dst = skb_dst(skb);
1835
1836        if (dst && dst_hold_safe(dst)) {
1837                sk->sk_rx_dst = dst;
1838                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1839        }
1840}
1841EXPORT_SYMBOL(inet_sk_rx_dst_set);
1842
1843const struct inet_connection_sock_af_ops ipv4_specific = {
1844        .queue_xmit        = ip_queue_xmit,
1845        .send_check        = tcp_v4_send_check,
1846        .rebuild_header    = inet_sk_rebuild_header,
1847        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1848        .conn_request      = tcp_v4_conn_request,
1849        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1850        .net_header_len    = sizeof(struct iphdr),
1851        .setsockopt        = ip_setsockopt,
1852        .getsockopt        = ip_getsockopt,
1853        .addr2sockaddr     = inet_csk_addr2sockaddr,
1854        .sockaddr_len      = sizeof(struct sockaddr_in),
1855#ifdef CONFIG_COMPAT
1856        .compat_setsockopt = compat_ip_setsockopt,
1857        .compat_getsockopt = compat_ip_getsockopt,
1858#endif
1859        .mtu_reduced       = tcp_v4_mtu_reduced,
1860};
1861EXPORT_SYMBOL(ipv4_specific);
1862
1863#ifdef CONFIG_TCP_MD5SIG
1864static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1865        .md5_lookup             = tcp_v4_md5_lookup,
1866        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1867        .md5_parse              = tcp_v4_parse_md5_keys,
1868};
1869#endif
1870
1871/* NOTE: A lot of things are set to zero explicitly by the call to
1872 *       sk_alloc(), so they need not be done here.
1873 */
1874static int tcp_v4_init_sock(struct sock *sk)
1875{
1876        struct inet_connection_sock *icsk = inet_csk(sk);
1877
1878        tcp_init_sock(sk);
1879
1880        icsk->icsk_af_ops = &ipv4_specific;
1881
1882#ifdef CONFIG_TCP_MD5SIG
1883        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1884#endif
1885
1886        return 0;
1887}
1888
1889void tcp_v4_destroy_sock(struct sock *sk)
1890{
1891        struct tcp_sock *tp = tcp_sk(sk);
1892
1893        trace_tcp_destroy_sock(sk);
1894
1895        tcp_clear_xmit_timers(sk);
1896
1897        tcp_cleanup_congestion_control(sk);
1898
1899        tcp_cleanup_ulp(sk);
1900
1901        /* Clean up the write buffer. */
1902        tcp_write_queue_purge(sk);
1903
1904        /* Check if we want to disable active TFO */
1905        tcp_fastopen_active_disable_ofo_check(sk);
1906
1907        /* Cleans up our, hopefully empty, out_of_order_queue. */
1908        skb_rbtree_purge(&tp->out_of_order_queue);
1909
1910#ifdef CONFIG_TCP_MD5SIG
1911        /* Clean up the MD5 key list, if any */
1912        if (tp->md5sig_info) {
1913                tcp_clear_md5_list(sk);
1914                kfree_rcu(tp->md5sig_info, rcu);
1915                tp->md5sig_info = NULL;
1916        }
1917#endif
1918
1919        /* Clean up a referenced TCP bind bucket. */
1920        if (inet_csk(sk)->icsk_bind_hash)
1921                inet_put_port(sk);
1922
1923        BUG_ON(tp->fastopen_rsk);
1924
1925        /* If socket is aborted during connect operation */
1926        tcp_free_fastopen_req(tp);
1927        tcp_fastopen_destroy_cipher(sk);
1928        tcp_saved_syn_free(tp);
1929
1930        sk_sockets_allocated_dec(sk);
1931}
1932EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933
1934#ifdef CONFIG_PROC_FS
1935/* Proc filesystem TCP sock list dumping. */
1936
1937/*
1938 * Get the next listener socket following cur.  If cur is NULL, get the
1939 * first socket starting from the bucket given in st->bucket; when
1940 * st->bucket is zero the very first socket in the hash table is returned.
1941 */
1942static void *listening_get_next(struct seq_file *seq, void *cur)
1943{
1944        struct tcp_iter_state *st = seq->private;
1945        struct net *net = seq_file_net(seq);
1946        struct inet_listen_hashbucket *ilb;
1947        struct sock *sk = cur;
1948
1949        if (!sk) {
1950get_head:
1951                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1952                spin_lock(&ilb->lock);
1953                sk = sk_head(&ilb->head);
1954                st->offset = 0;
1955                goto get_sk;
1956        }
1957        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1958        ++st->num;
1959        ++st->offset;
1960
1961        sk = sk_next(sk);
1962get_sk:
1963        sk_for_each_from(sk) {
1964                if (!net_eq(sock_net(sk), net))
1965                        continue;
1966                if (sk->sk_family == st->family)
1967                        return sk;
1968        }
1969        spin_unlock(&ilb->lock);
1970        st->offset = 0;
1971        if (++st->bucket < INET_LHTABLE_SIZE)
1972                goto get_head;
1973        return NULL;
1974}
1975
1976static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1977{
1978        struct tcp_iter_state *st = seq->private;
1979        void *rc;
1980
1981        st->bucket = 0;
1982        st->offset = 0;
1983        rc = listening_get_next(seq, NULL);
1984
1985        while (rc && *pos) {
1986                rc = listening_get_next(seq, rc);
1987                --*pos;
1988        }
1989        return rc;
1990}
1991
1992static inline bool empty_bucket(const struct tcp_iter_state *st)
1993{
1994        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1995}
1996
1997/*
1998 * Get the first established socket, starting from the bucket given in st->bucket.
1999 * If st->bucket is zero, the very first socket in the hash is returned.
2000 */
2001static void *established_get_first(struct seq_file *seq)
2002{
2003        struct tcp_iter_state *st = seq->private;
2004        struct net *net = seq_file_net(seq);
2005        void *rc = NULL;
2006
2007        st->offset = 0;
2008        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2009                struct sock *sk;
2010                struct hlist_nulls_node *node;
2011                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2012
2013                /* Lockless fast path for the common case of empty buckets */
2014                if (empty_bucket(st))
2015                        continue;
2016
2017                spin_lock_bh(lock);
2018                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2019                        if (sk->sk_family != st->family ||
2020                            !net_eq(sock_net(sk), net)) {
2021                                continue;
2022                        }
2023                        rc = sk;
2024                        goto out;
2025                }
2026                spin_unlock_bh(lock);
2027        }
2028out:
2029        return rc;
2030}
2031
2032static void *established_get_next(struct seq_file *seq, void *cur)
2033{
2034        struct sock *sk = cur;
2035        struct hlist_nulls_node *node;
2036        struct tcp_iter_state *st = seq->private;
2037        struct net *net = seq_file_net(seq);
2038
2039        ++st->num;
2040        ++st->offset;
2041
2042        sk = sk_nulls_next(sk);
2043
2044        sk_nulls_for_each_from(sk, node) {
2045                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2046                        return sk;
2047        }
2048
2049        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2050        ++st->bucket;
2051        return established_get_first(seq);
2052}
2053
2054static void *established_get_idx(struct seq_file *seq, loff_t pos)
2055{
2056        struct tcp_iter_state *st = seq->private;
2057        void *rc;
2058
2059        st->bucket = 0;
2060        rc = established_get_first(seq);
2061
2062        while (rc && pos) {
2063                rc = established_get_next(seq, rc);
2064                --pos;
2065        }
2066        return rc;
2067}
2068
2069static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2070{
2071        void *rc;
2072        struct tcp_iter_state *st = seq->private;
2073
2074        st->state = TCP_SEQ_STATE_LISTENING;
2075        rc        = listening_get_idx(seq, &pos);
2076
2077        if (!rc) {
2078                st->state = TCP_SEQ_STATE_ESTABLISHED;
2079                rc        = established_get_idx(seq, pos);
2080        }
2081
2082        return rc;
2083}
2084
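/* Resume the iteration at the bucket/offset saved by the previous pass,
 * so that a sequential read of the whole table does not rescan earlier
 * buckets on every chunk.  Returns NULL when the saved position no
 * longer exists, in which case tcp_seq_start() falls back to a full
 * walk through tcp_get_idx().
 */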
2085static void *tcp_seek_last_pos(struct seq_file *seq)
2086{
2087        struct tcp_iter_state *st = seq->private;
2088        int offset = st->offset;
2089        int orig_num = st->num;
2090        void *rc = NULL;
2091
2092        switch (st->state) {
2093        case TCP_SEQ_STATE_LISTENING:
2094                if (st->bucket >= INET_LHTABLE_SIZE)
2095                        break;
2096                st->state = TCP_SEQ_STATE_LISTENING;
2097                rc = listening_get_next(seq, NULL);
2098                while (offset-- && rc)
2099                        rc = listening_get_next(seq, rc);
2100                if (rc)
2101                        break;
2102                st->bucket = 0;
2103                st->state = TCP_SEQ_STATE_ESTABLISHED;
2104                /* Fallthrough */
2105        case TCP_SEQ_STATE_ESTABLISHED:
2106                if (st->bucket > tcp_hashinfo.ehash_mask)
2107                        break;
2108                rc = established_get_first(seq);
2109                while (offset-- && rc)
2110                        rc = established_get_next(seq, rc);
2111        }
2112
2113        st->num = orig_num;
2114
2115        return rc;
2116}
2117
2118static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2119{
2120        struct tcp_iter_state *st = seq->private;
2121        void *rc;
2122
2123        if (*pos && *pos == st->last_pos) {
2124                rc = tcp_seek_last_pos(seq);
2125                if (rc)
2126                        goto out;
2127        }
2128
2129        st->state = TCP_SEQ_STATE_LISTENING;
2130        st->num = 0;
2131        st->bucket = 0;
2132        st->offset = 0;
2133        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2134
2135out:
2136        st->last_pos = *pos;
2137        return rc;
2138}
2139
2140static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2141{
2142        struct tcp_iter_state *st = seq->private;
2143        void *rc = NULL;
2144
2145        if (v == SEQ_START_TOKEN) {
2146                rc = tcp_get_idx(seq, 0);
2147                goto out;
2148        }
2149
2150        switch (st->state) {
2151        case TCP_SEQ_STATE_LISTENING:
2152                rc = listening_get_next(seq, v);
2153                if (!rc) {
2154                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2155                        st->bucket = 0;
2156                        st->offset = 0;
2157                        rc        = established_get_first(seq);
2158                }
2159                break;
2160        case TCP_SEQ_STATE_ESTABLISHED:
2161                rc = established_get_next(seq, v);
2162                break;
2163        }
2164out:
2165        ++*pos;
2166        st->last_pos = *pos;
2167        return rc;
2168}
2169
2170static void tcp_seq_stop(struct seq_file *seq, void *v)
2171{
2172        struct tcp_iter_state *st = seq->private;
2173
2174        switch (st->state) {
2175        case TCP_SEQ_STATE_LISTENING:
2176                if (v != SEQ_START_TOKEN)
2177                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2178                break;
2179        case TCP_SEQ_STATE_ESTABLISHED:
2180                if (v)
2181                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2182                break;
2183        }
2184}
2185
2186int tcp_seq_open(struct inode *inode, struct file *file)
2187{
2188        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2189        struct tcp_iter_state *s;
2190        int err;
2191
2192        err = seq_open_net(inode, file, &afinfo->seq_ops,
2193                          sizeof(struct tcp_iter_state));
2194        if (err < 0)
2195                return err;
2196
2197        s = ((struct seq_file *)file->private_data)->private;
2198        s->family               = afinfo->family;
2199        s->last_pos             = 0;
2200        return 0;
2201}
2202EXPORT_SYMBOL(tcp_seq_open);
2203
2204int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2205{
2206        int rc = 0;
2207        struct proc_dir_entry *p;
2208
2209        afinfo->seq_ops.start           = tcp_seq_start;
2210        afinfo->seq_ops.next            = tcp_seq_next;
2211        afinfo->seq_ops.stop            = tcp_seq_stop;
2212
2213        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2214                             afinfo->seq_fops, afinfo);
2215        if (!p)
2216                rc = -ENOMEM;
2217        return rc;
2218}
2219EXPORT_SYMBOL(tcp_proc_register);
2220
2221void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2222{
2223        remove_proc_entry(afinfo->name, net->proc_net);
2224}
2225EXPORT_SYMBOL(tcp_proc_unregister);
2226
2227static void get_openreq4(const struct request_sock *req,
2228                         struct seq_file *f, int i)
2229{
2230        const struct inet_request_sock *ireq = inet_rsk(req);
2231        long delta = req->rsk_timer.expires - jiffies;
2232
2233        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2234                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2235                i,
2236                ireq->ir_loc_addr,
2237                ireq->ir_num,
2238                ireq->ir_rmt_addr,
2239                ntohs(ireq->ir_rmt_port),
2240                TCP_SYN_RECV,
2241                0, 0, /* could print option size, but that is af dependent. */
2242                1,    /* timers active (only the expire timer) */
2243                jiffies_delta_to_clock_t(delta),
2244                req->num_timeout,
2245                from_kuid_munged(seq_user_ns(f),
2246                                 sock_i_uid(req->rsk_listener)),
2247                0,  /* non standard timer */
2248                0, /* open_requests have no inode */
2249                0,
2250                req);
2251}
2252
2253static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2254{
2255        int timer_active;
2256        unsigned long timer_expires;
2257        const struct tcp_sock *tp = tcp_sk(sk);
2258        const struct inet_connection_sock *icsk = inet_csk(sk);
2259        const struct inet_sock *inet = inet_sk(sk);
2260        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2261        __be32 dest = inet->inet_daddr;
2262        __be32 src = inet->inet_rcv_saddr;
2263        __u16 destp = ntohs(inet->inet_dport);
2264        __u16 srcp = ntohs(inet->inet_sport);
2265        int rx_queue;
2266        int state;
2267
2268        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2269            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2270            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2271                timer_active    = 1;
2272                timer_expires   = icsk->icsk_timeout;
2273        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2274                timer_active    = 4;
2275                timer_expires   = icsk->icsk_timeout;
2276        } else if (timer_pending(&sk->sk_timer)) {
2277                timer_active    = 2;
2278                timer_expires   = sk->sk_timer.expires;
2279        } else {
2280                timer_active    = 0;
2281                timer_expires = jiffies;
2282        }
2283
2284        state = sk_state_load(sk);
2285        if (state == TCP_LISTEN)
2286                rx_queue = sk->sk_ack_backlog;
2287        else
2288                /* Because we don't lock the socket,
2289                 * we might find a transient negative value.
2290                 */
2291                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2292
2293        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2295                i, src, srcp, dest, destp, state,
2296                tp->write_seq - tp->snd_una,
2297                rx_queue,
2298                timer_active,
2299                jiffies_delta_to_clock_t(timer_expires - jiffies),
2300                icsk->icsk_retransmits,
2301                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2302                icsk->icsk_probes_out,
2303                sock_i_ino(sk),
2304                refcount_read(&sk->sk_refcnt), sk,
2305                jiffies_to_clock_t(icsk->icsk_rto),
2306                jiffies_to_clock_t(icsk->icsk_ack.ato),
2307                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2308                tp->snd_cwnd,
2309                state == TCP_LISTEN ?
2310                    fastopenq->max_qlen :
2311                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2312}
2313
2314static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2315                               struct seq_file *f, int i)
2316{
2317        long delta = tw->tw_timer.expires - jiffies;
2318        __be32 dest, src;
2319        __u16 destp, srcp;
2320
2321        dest  = tw->tw_daddr;
2322        src   = tw->tw_rcv_saddr;
2323        destp = ntohs(tw->tw_dport);
2324        srcp  = ntohs(tw->tw_sport);
2325
2326        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2327                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2328                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2329                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2330                refcount_read(&tw->tw_refcnt), tw);
2331}
2332
2333#define TMPSZ 150
2334
2335static int tcp4_seq_show(struct seq_file *seq, void *v)
2336{
2337        struct tcp_iter_state *st;
2338        struct sock *sk = v;
2339
2340        seq_setwidth(seq, TMPSZ - 1);
2341        if (v == SEQ_START_TOKEN) {
2342                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2343                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2344                           "inode");
2345                goto out;
2346        }
2347        st = seq->private;
2348
2349        if (sk->sk_state == TCP_TIME_WAIT)
2350                get_timewait4_sock(v, seq, st->num);
2351        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2352                get_openreq4(v, seq, st->num);
2353        else
2354                get_tcp4_sock(v, seq, st->num);
2355out:
2356        seq_pad(seq, '\n');
2357        return 0;
2358}
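/* Illustration only (values made up): addresses are raw __be32 printed
 * with %08X, so on a little-endian machine 127.0.0.1 port 22 in LISTEN
 * (state 0A) shows up in /proc/net/tcp roughly as:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 ...
 */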
2359
2360static const struct file_operations tcp_afinfo_seq_fops = {
2361        .owner   = THIS_MODULE,
2362        .open    = tcp_seq_open,
2363        .read    = seq_read,
2364        .llseek  = seq_lseek,
2365        .release = seq_release_net
2366};
2367
2368static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2369        .name           = "tcp",
2370        .family         = AF_INET,
2371        .seq_fops       = &tcp_afinfo_seq_fops,
2372        .seq_ops        = {
2373                .show           = tcp4_seq_show,
2374        },
2375};
2376
2377static int __net_init tcp4_proc_init_net(struct net *net)
2378{
2379        return tcp_proc_register(net, &tcp4_seq_afinfo);
2380}
2381
2382static void __net_exit tcp4_proc_exit_net(struct net *net)
2383{
2384        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2385}
2386
2387static struct pernet_operations tcp4_net_ops = {
2388        .init = tcp4_proc_init_net,
2389        .exit = tcp4_proc_exit_net,
2390};
2391
2392int __init tcp4_proc_init(void)
2393{
2394        return register_pernet_subsys(&tcp4_net_ops);
2395}
2396
2397void tcp4_proc_exit(void)
2398{
2399        unregister_pernet_subsys(&tcp4_net_ops);
2400}
2401#endif /* CONFIG_PROC_FS */
2402
2403struct proto tcp_prot = {
2404        .name                   = "TCP",
2405        .owner                  = THIS_MODULE,
2406        .close                  = tcp_close,
2407        .connect                = tcp_v4_connect,
2408        .disconnect             = tcp_disconnect,
2409        .accept                 = inet_csk_accept,
2410        .ioctl                  = tcp_ioctl,
2411        .init                   = tcp_v4_init_sock,
2412        .destroy                = tcp_v4_destroy_sock,
2413        .shutdown               = tcp_shutdown,
2414        .setsockopt             = tcp_setsockopt,
2415        .getsockopt             = tcp_getsockopt,
2416        .keepalive              = tcp_set_keepalive,
2417        .recvmsg                = tcp_recvmsg,
2418        .sendmsg                = tcp_sendmsg,
2419        .sendpage               = tcp_sendpage,
2420        .backlog_rcv            = tcp_v4_do_rcv,
2421        .release_cb             = tcp_release_cb,
2422        .hash                   = inet_hash,
2423        .unhash                 = inet_unhash,
2424        .get_port               = inet_csk_get_port,
2425        .enter_memory_pressure  = tcp_enter_memory_pressure,
2426        .leave_memory_pressure  = tcp_leave_memory_pressure,
2427        .stream_memory_free     = tcp_stream_memory_free,
2428        .sockets_allocated      = &tcp_sockets_allocated,
2429        .orphan_count           = &tcp_orphan_count,
2430        .memory_allocated       = &tcp_memory_allocated,
2431        .memory_pressure        = &tcp_memory_pressure,
2432        .sysctl_mem             = sysctl_tcp_mem,
2433        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2434        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2435        .max_header             = MAX_TCP_HEADER,
2436        .obj_size               = sizeof(struct tcp_sock),
2437        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
2438        .twsk_prot              = &tcp_timewait_sock_ops,
2439        .rsk_prot               = &tcp_request_sock_ops,
2440        .h.hashinfo             = &tcp_hashinfo,
2441        .no_autobind            = true,
2442#ifdef CONFIG_COMPAT
2443        .compat_setsockopt      = compat_tcp_setsockopt,
2444        .compat_getsockopt      = compat_tcp_getsockopt,
2445#endif
2446        .diag_destroy           = tcp_abort,
2447};
2448EXPORT_SYMBOL(tcp_prot);
2449
2450static void __net_exit tcp_sk_exit(struct net *net)
2451{
2452        int cpu;
2453
2454        module_put(net->ipv4.tcp_congestion_control->owner);
2455
2456        for_each_possible_cpu(cpu)
2457                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2458        free_percpu(net->ipv4.tcp_sk);
2459}
2460
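/* Per-namespace TCP setup: create one control socket per possible CPU
 * (used e.g. by tcp_v4_send_reset() to transmit RSTs and ACKs on behalf
 * of sockets we do not own), then install the namespace-local sysctl
 * defaults.
 */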
2461static int __net_init tcp_sk_init(struct net *net)
2462{
2463        int res, cpu, cnt;
2464
2465        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2466        if (!net->ipv4.tcp_sk)
2467                return -ENOMEM;
2468
2469        for_each_possible_cpu(cpu) {
2470                struct sock *sk;
2471
2472                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2473                                           IPPROTO_TCP, net);
2474                if (res)
2475                        goto fail;
2476                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2477                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2478        }
2479
2480        net->ipv4.sysctl_tcp_ecn = 2;
2481        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2482
2483        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2484        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2485        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2486
2487        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2488        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2489        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2490
2491        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2492        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2493        net->ipv4.sysctl_tcp_syncookies = 1;
2494        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2495        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2496        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2497        net->ipv4.sysctl_tcp_orphan_retries = 0;
2498        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2499        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2500        net->ipv4.sysctl_tcp_tw_reuse = 0;
2501
2502        cnt = tcp_hashinfo.ehash_mask + 1;
2503        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2504        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2505
2506        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2507        net->ipv4.sysctl_tcp_sack = 1;
2508        net->ipv4.sysctl_tcp_window_scaling = 1;
2509        net->ipv4.sysctl_tcp_timestamps = 1;
2510        net->ipv4.sysctl_tcp_early_retrans = 3;
2511        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2512        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2513        net->ipv4.sysctl_tcp_retrans_collapse = 1;
2514        net->ipv4.sysctl_tcp_max_reordering = 300;
2515        net->ipv4.sysctl_tcp_dsack = 1;
2516        net->ipv4.sysctl_tcp_app_win = 31;
2517        net->ipv4.sysctl_tcp_adv_win_scale = 1;
2518        net->ipv4.sysctl_tcp_frto = 2;
2519        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2520        /* This limits the percentage of the congestion window which we
2521         * will allow a single TSO frame to consume.  Building TSO frames
2522         * which are too large can cause TCP streams to be bursty.
2523         */
2524        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2525        /* Default TSQ limit of four TSO segments */
2526        net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2527        /* rfc5961 challenge ack rate limiting */
2528        net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2529        net->ipv4.sysctl_tcp_min_tso_segs = 2;
2530        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2531        net->ipv4.sysctl_tcp_autocorking = 1;
2532        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2533        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2534        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2535        if (net != &init_net) {
2536                memcpy(net->ipv4.sysctl_tcp_rmem,
2537                       init_net.ipv4.sysctl_tcp_rmem,
2538                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2539                memcpy(net->ipv4.sysctl_tcp_wmem,
2540                       init_net.ipv4.sysctl_tcp_wmem,
2541                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2542        }
2543        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2544        spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2545        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2546        atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2547
2548        /* Reno is always built in */
2549        if (!net_eq(net, &init_net) &&
2550            try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2551                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2552        else
2553                net->ipv4.tcp_congestion_control = &tcp_reno;
2554
2555        return 0;
2556fail:
2557        tcp_sk_exit(net);
2558
2559        return res;
2560}
2561
2562static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2563{
2564        struct net *net;
2565
2566        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2567
2568        list_for_each_entry(net, net_exit_list, exit_list)
2569                tcp_fastopen_ctx_destroy(net);
2570}
2571
2572static struct pernet_operations __net_initdata tcp_sk_ops = {
2573       .init       = tcp_sk_init,
2574       .exit       = tcp_sk_exit,
2575       .exit_batch = tcp_sk_exit_batch,
2576};
2577
2578void __init tcp_v4_init(void)
2579{
2580        if (register_pernet_subsys(&tcp_sk_ops))
2581                panic("Failed to create the TCP control socket.\n");
2582}
2583