linux/net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol (TCP).
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

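/* Derive the initial sequence number (and timestamp offset) for an
 * incoming connection from the addresses and ports of the SYN, so that
 * sequence numbers are hard to predict yet stable per 4-tuple.
 */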
static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source, tsoff);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's one, only the timestamp cache
           is held not per host, but per port pair, and the TW bucket is used
           as state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        u32 seq;
        struct ip_options_rcu *inet_opt;
        struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                              IPPROTO_TCP,
                              orig_sport, orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        if (!inet->inet_saddr)
                inet->inet_saddr = fl4->saddr;
        sk_rcv_saddr_set(sk, inet->inet_saddr);

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        tp->write_seq      = 0;
        }

        if (tcp_death_row->sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
                tcp_fetch_timewait_stamp(sk, &rt->dst);

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and, without releasing the socket
         * lock, select a source port, enter ourselves into the hash tables
         * and complete initialization afterwards.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                rt = NULL;
                goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        rt = NULL;

        if (likely(!tp->repair)) {
                seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                 inet->inet_daddr,
                                                 inet->inet_sport,
                                                 usin->sin_port,
                                                 &tp->tsoffset);
                if (!tp->write_seq)
                        tp->write_seq = seq;
        }

        inet->inet_id = tp->write_seq ^ jiffies;

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 mtu;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
        mtu = tcp_sk(sk)->mtu_info;
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            ip_sk_accept_pmtu(sk) &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

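/* An ICMP redirect arrived for this connection: have the routing layer
 * update the cached destination, if we still have a valid one.
 */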
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_check(sk, 0);

        if (dst)
                dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
        struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
        struct inet_connection_sock *icsk;
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
        struct sock *sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        __u32 seq, snd_una;
        __u32 remaining;
        int err;
        struct net *net = dev_net(icmp_skb->dev);

        sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                       th->dest, iph->saddr, ntohs(th->source),
                                       inet_iif(icmp_skb));
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq,
                                  type == ICMP_PARAMETERPROB ||
                                  type == ICMP_TIME_EXCEEDED ||
                                  (type == ICMP_DEST_UNREACH &&
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto out;
        }

        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(icmp_skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs sent out by Linux are always < 576 bytes,
                         * so they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        tp->mtu_info = info;
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if icmp_skb allows revert of backoff
                 * (see draft-zimmermann-tcp-lcd) */
                if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
                        break;
                if (seq != tp->snd_una || !icsk->icsk_retransmits ||
                    !icsk->icsk_backoff || fastopen)
                        break;

                if (sock_owned_by_user(sk))
                        break;

                icsk->icsk_backoff--;
                icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

                skb = tcp_write_queue_head(sk);
                BUG_ON(!skb);

                remaining = icsk->icsk_rto -
                            min(icsk->icsk_rto,
                                tcp_time_stamp - tcp_skb_timestamp(skb));

                if (remaining) {
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  remaining, TCP_RTO_MAX);
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
                        tcp_retransmit_timer(sk);
                }

                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket is
                 * already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

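/* Fill in the TCP checksum of an outgoing segment. With CHECKSUM_PARTIAL
 * we only seed the pseudo-header sum and let the device (or the checksum
 * helpers) finish the job; otherwise the full checksum is computed in
 * software over the header and payload.
 */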
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset.
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other side's
 *              TCP. So we build the reply based only on the parameters
 *              that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        const __u8 *hash_location = NULL;
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif
        struct net *net;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
        rcu_read_lock();
        hash_location = tcp_parse_md5sig_option(th);
        if (sk && sk_fullsock(sk)) {
                key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
        } else if (hash_location) {
                /*
                 * The active side is lost. Try to find the listening socket
                 * through the source port, and then find the md5 key through
                 * the listening socket. We do not loosen security here:
                 * the incoming packet is checked against the md5 hash of the
                 * found key; no RST is generated if the hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
                                             ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), inet_iif(skb));
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                        &ip_hdr(skb)->saddr, AF_INET);
                if (!key)
                        goto out;

                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(hash_location, newhash, 16) != 0)
                        goto out;
        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When the socket is gone, all binding information is lost.
         * Routing might fail in this case. No choice here: if we choose to
         * force the input interface, we will misroute in case of an
         * asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
        rcu_read_unlock();
#endif
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (key) {
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        local_bh_disable();
        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);

        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_bh_enable();
}

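/* A segment arrived for a connection in TIME-WAIT: answer with the ACK
 * remembered in the time-wait bucket, then drop our reference to it.
 */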
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos
                        );

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                                 tcp_sk(sk)->snd_nxt;

        /* RFC 7323 2.3
         * The window field (SEG.WND) of every outgoing segment, with the
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp + tcp_rsk(req)->ts_off,
                        req->ts_recent,
                        0,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
                return -1;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type);

        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        unsigned int size = sizeof(struct in_addr);
        const struct tcp_md5sig_info *md5sig;

        /* caller either holds rcu_read_lock() or socket lock */
        md5sig = rcu_dereference_check(tp->md5sig_info,
                                       lockdep_sock_is_held(sk));
        if (!md5sig)
                return NULL;
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                size = sizeof(struct in6_addr);
#endif
        hlist_for_each_entry_rcu(key, &md5sig->head, node) {
                if (key->family != family)
                        continue;
                if (!memcmp(&key->addr, addr, size))
                        return key;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *addr;

        addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *md5sig;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (key) {
                /* Pre-existing entry - just update that one. */
                memcpy(key->key, newkey, newkeylen);
                key->keylen = newkeylen;
                return 0;
        }

        md5sig = rcu_dereference_protected(tp->md5sig_info,
                                           lockdep_sock_is_held(sk));
        if (!md5sig) {
                md5sig = kmalloc(sizeof(*md5sig), gfp);
                if (!md5sig)
                        return -ENOMEM;

                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                INIT_HLIST_HEAD(&md5sig->head);
                rcu_assign_pointer(tp->md5sig_info, md5sig);
        }

        key = sock_kmalloc(sk, sizeof(*key), gfp);
        if (!key)
                return -ENOMEM;
        if (!tcp_alloc_md5sig_pool()) {
                sock_kfree_s(sk, key, sizeof(*key));
                return -ENOMEM;
        }

        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        key->family = family;
        memcpy(&key->addr, addr,
               (family == AF_INET6) ? sizeof(struct in6_addr) :
                                      sizeof(struct in_addr));
        hlist_add_head_rcu(&key->node, &md5sig->head);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
        struct tcp_md5sig_key *key;

        key = tcp_md5_do_lookup(sk, addr, family);
        if (!key)
                return -ENOENT;
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
        return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

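/* Release every configured MD5 key when the socket is torn down; the
 * RCU-deferred frees let concurrent readers finish safely.
 */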
static void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

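/* setsockopt(TCP_MD5SIG) handler: copy the key description from user
 * space, then add, replace or (for a zero-length key) delete the entry.
 */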
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                                      AF_INET);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                              GFP_KERNEL);
}

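/* Feed the RFC 2385 input prefix into the hash: the IPv4 pseudo-header
 * (saddr, daddr, protocol, segment length) followed by the TCP header
 * with its checksum field zeroed.
 */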
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *bp;
        struct scatterlist sg;
        struct tcphdr *_th;

        bp = hp->scratch;
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = IPPROTO_TCP;
        bp->len = cpu_to_be16(nbytes);

        _th = (struct tcphdr *)(bp + 1);
        memcpy(_th, th, sizeof(*th));
        _th->check = 0;

        sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
        ahash_request_set_crypt(hp->md5_req, &sg, NULL,
                                sizeof(*bp) + sizeof(*th));
        return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;
        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        struct tcp_md5sig_pool *hp;
        struct ahash_request *req;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;
        req = hp->md5_req;

        if (crypto_ahash_init(req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(hp, key))
                goto clear_hash;
        ahash_request_set_crypt(req, NULL, md5_hash, 0);
        if (crypto_ahash_final(req))
                goto clear_hash;

        tcp_put_md5sig_pool();
        return 0;

clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
        /*
         * This gets called for each TCP segment that arrives
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        const __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
        int genhash;
        unsigned char newhash[16];

        hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                          AF_INET);
        hash_location = tcp_parse_md5sig_option(th);

        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return false;

        if (hash_expected && !hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                return true;
        }

        if (!hash_expected && hash_location) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                return true;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_md5_hash_skb(newhash,
                                      hash_expected,
                                      NULL, skb);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
                net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                                     &iph->saddr, ntohs(th->source),
                                     &iph->daddr, ntohs(th->dest),
                                     genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
                return true;
        }
        return false;
#endif
        return false;
}

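/* Record the IPv4 addresses and any IP options of the incoming SYN in
 * the request sock, before a full socket exists.
 */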
static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        ireq->opt = tcp_v4_save_options(skb);
}

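/* Find a route for the SYN-ACK. Via *strict the caller also learns
 * whether the route's destination matches the peer address exactly
 * (the tw_recycle path only trusts cached timestamps when it does).
 */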
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
        struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

        if (strict) {
                if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
                        *strict = true;
                else
                        *strict = false;
        }

        return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
        .syn_ack_timeout =      tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
        .calc_md5_hash  =       tcp_v4_md5_hash_skb,
#endif
        .init_req       =       tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =      cookie_v4_init_sequence,
#endif
        .route_req      =       tcp_v4_route_req,
        .init_seq       =       tcp_v4_init_sequence,
        .send_synack    =       tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif
        struct ip_options_rcu *inet_opt;

        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                 = tcp_sk(newsk);
        newinet               = inet_sk(newsk);
        ireq                  = inet_rsk(req);
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr   = ireq->ir_loc_addr;
        inet_opt              = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt);
        ireq->opt             = NULL;
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl       = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                                AF_INET);
        if (key) {
                /*
                 * We're using one, so create a matching key
                 * on the newsk structure. If we fail to get
                 * memory, then we end up not copying the key
                 * across. Shucks.
                 */
                tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                               AF_INET, key->key, key->keylen, GFP_ATOMIC);
                sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
        if (*own_req)
                tcp_move_syn(newtp, req);

        return newsk;

exit_overflow:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

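/* A non-SYN segment hitting a listener may be the final ACK of a
 * syncookie handshake; cookie_v4_check() validates it and, if genuine,
 * returns the newly created child socket.
 */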
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);

        if (!th->syn)
                sk = cookie_v4_check(sk, skb);
#endif
        return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1397int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1398{
1399        struct sock *rsk;
1400
1401        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1402                struct dst_entry *dst = sk->sk_rx_dst;
1403
1404                sock_rps_save_rxhash(sk, skb);
1405                sk_mark_napi_id(sk, skb);
1406                if (dst) {
1407                        if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1408                            !dst->ops->check(dst, 0)) {
1409                                dst_release(dst);
1410                                sk->sk_rx_dst = NULL;
1411                        }
1412                }
1413                tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1414                return 0;
1415        }
1416
1417        if (tcp_checksum_complete(skb))
1418                goto csum_err;
1419
1420        if (sk->sk_state == TCP_LISTEN) {
1421                struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1422
1423                if (!nsk)
1424                        goto discard;
1425                if (nsk != sk) {
1426                        sock_rps_save_rxhash(nsk, skb);
1427                        sk_mark_napi_id(nsk, skb);
1428                        if (tcp_child_process(sk, nsk, skb)) {
1429                                rsk = nsk;
1430                                goto reset;
1431                        }
1432                        return 0;
1433                }
1434        } else
1435                sock_rps_save_rxhash(sk, skb);
1436
1437        if (tcp_rcv_state_process(sk, skb)) {
1438                rsk = sk;
1439                goto reset;
1440        }
1441        return 0;
1442
1443reset:
1444        tcp_v4_send_reset(rsk, skb);
1445discard:
1446        kfree_skb(skb);
1447        /* Be careful here. If this function gets more complicated and
1448         * gcc suffers from register pressure on the x86, sk (in %ebx)
1449         * might be destroyed here. The current version compiles correctly,
1450         * but you have been warned.
1451         */
1452        return 0;
1453
1454csum_err:
1455        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1456        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1457        goto discard;
1458}
1459EXPORT_SYMBOL(tcp_v4_do_rcv);
1460
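    /* Early demux: called from the IP input path before routing.  We try to
     * find an established socket for the packet and attach it (and, when
     * still valid, its cached rx dst) to the skb, so the stack can skip a
     * full route lookup and a second socket lookup for this packet.
     */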
1461void tcp_v4_early_demux(struct sk_buff *skb)
1462{
1463        const struct iphdr *iph;
1464        const struct tcphdr *th;
1465        struct sock *sk;
1466
1467        if (skb->pkt_type != PACKET_HOST)
1468                return;
1469
1470        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1471                return;
1472
1473        iph = ip_hdr(skb);
1474        th = tcp_hdr(skb);
1475
1476        if (th->doff < sizeof(struct tcphdr) / 4)
1477                return;
1478
1479        sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1480                                       iph->saddr, th->source,
1481                                       iph->daddr, ntohs(th->dest),
1482                                       skb->skb_iif);
1483        if (sk) {
1484                skb->sk = sk;
1485                skb->destructor = sock_edemux;
1486                if (sk_fullsock(sk)) {
1487                        struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1488
1489                        if (dst)
1490                                dst = dst_check(dst, 0);
1491                        if (dst &&
1492                            inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1493                                skb_dst_set_noref(skb, dst);
1494                }
1495        }
1496}
1497
1498/* The packet is added to the VJ-style prequeue for processing in process
1499 * context, if a reader task is waiting. Apparently, this exciting
1500 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1501 * failed somewhere. Latency? Burstiness? Well, at least now we will
1502 * see why it failed. 8)8)                                --ANK
1503 *
1504 */
1505bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1506{
1507        struct tcp_sock *tp = tcp_sk(sk);
1508
1509        if (sysctl_tcp_low_latency || !tp->ucopy.task)
1510                return false;
1511
1512        if (skb->len <= tcp_hdrlen(skb) &&
1513            skb_queue_len(&tp->ucopy.prequeue) == 0)
1514                return false;
1515
1516        /* Before escaping RCU protected region, we need to take care of skb
1517         * dst. Prequeue is only enabled for established sockets.
1518         * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1519         * Instead of doing a full sk_rx_dst validity check here, let's perform
1520         * an optimistic check.
1521         */
1522        if (likely(sk->sk_rx_dst))
1523                skb_dst_drop(skb);
1524        else
1525                skb_dst_force_safe(skb);
1526
1527        __skb_queue_tail(&tp->ucopy.prequeue, skb);
1528        tp->ucopy.memory += skb->truesize;
1529        if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1530            tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
1531                struct sk_buff *skb1;
1532
1533                BUG_ON(sock_owned_by_user(sk));
1534                __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1535                                skb_queue_len(&tp->ucopy.prequeue));
1536
1537                while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1538                        sk_backlog_rcv(sk, skb1);
1539
1540                tp->ucopy.memory = 0;
1541        } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1542                wake_up_interruptible_sync_poll(sk_sleep(sk),
1543                                           POLLIN | POLLRDNORM | POLLRDBAND);
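                    /* No ACK is scheduled yet: arm the delayed-ACK timer at a
                     * fraction of the minimum RTO, so an ACK still goes out in
                     * reasonable time even if the reader task is slow to drain
                     * the prequeue.
                     */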
1544                if (!inet_csk_ack_scheduled(sk))
1545                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1546                                                  (3 * tcp_rto_min(sk)) / 4,
1547                                                  TCP_RTO_MAX);
1548        }
1549        return true;
1550}
1551EXPORT_SYMBOL(tcp_prequeue);
1552
1553bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1554{
1555        u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1556
1557        /* Only the socket owner can try to collapse/prune the rx queues
1558         * to reduce memory overhead, so add a little headroom here.
1559         * Only a few sockets' backlogs are likely to be non-empty at once.
1560         */
1561        limit += 64*1024;
1562
1563        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1564         * we can fix skb->truesize to its real value to avoid future drops.
1565         * This is valid because skb is not yet charged to the socket.
1566         * It has been observed that pure SACK packets were sometimes dropped
1567         * (when built by drivers without the copybreak feature).
1568         */
1569        skb_condense(skb);
1570
1571        if (unlikely(sk_add_backlog(sk, skb, limit))) {
1572                bh_unlock_sock(sk);
1573                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1574                return true;
1575        }
1576        return false;
1577}
1578EXPORT_SYMBOL(tcp_add_backlog);
1579
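    /* Run the socket filter (e.g. a classic BPF program attached with
     * SO_ATTACH_FILTER) on the skb.  sk_filter_trim_cap() may trim the
     * packet, but never below the TCP header (th->doff * 4 bytes); end_seq
     * is then adjusted for whatever payload the filter ate.
     */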
1580int tcp_filter(struct sock *sk, struct sk_buff *skb)
1581{
1582        struct tcphdr *th = (struct tcphdr *)skb->data;
1583        unsigned int eaten = skb->len;
1584        int err;
1585
1586        err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1587        if (!err) {
1588                eaten -= skb->len;
1589                TCP_SKB_CB(skb)->end_seq -= eaten;
1590        }
1591        return err;
1592}
1593EXPORT_SYMBOL(tcp_filter);
1594
1595/*
1596 *      From tcp_input.c
1597 */
1598
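    /* Main receive entry point for IPv4 TCP segments, called from the IP
     * layer for every packet with protocol IPPROTO_TCP.  It validates the
     * header and checksum, stashes the parsed header fields in
     * TCP_SKB_CB(), looks up the owning socket and dispatches on its state.
     */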
1599int tcp_v4_rcv(struct sk_buff *skb)
1600{
1601        struct net *net = dev_net(skb->dev);
1602        const struct iphdr *iph;
1603        const struct tcphdr *th;
1604        bool refcounted;
1605        struct sock *sk;
1606        int ret;
1607
1608        if (skb->pkt_type != PACKET_HOST)
1609                goto discard_it;
1610
1611        /* Count it even if it's bad */
1612        __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1613
1614        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1615                goto discard_it;
1616
1617        th = (const struct tcphdr *)skb->data;
1618
1619        if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1620                goto bad_packet;
1621        if (!pskb_may_pull(skb, th->doff * 4))
1622                goto discard_it;
1623
1624        /* An explanation is required here, I think.
1625         * Packet length and doff are validated by header prediction,
1626         * provided the case of th->doff == 0 is eliminated.
1627         * So, we defer the checks. */
1628
1629        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1630                goto csum_error;
1631
1632        th = (const struct tcphdr *)skb->data;
1633        iph = ip_hdr(skb);
1634        /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB();
1635         * barrier() makes sure the compiler won't play fool^Waliasing games.
1636         */
1637        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1638                sizeof(struct inet_skb_parm));
1639        barrier();
1640
1641        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1642        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1643                                    skb->len - th->doff * 4);
1644        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1645        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1646        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1647        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1648        TCP_SKB_CB(skb)->sacked  = 0;
1649
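        /* Note: we can come back to this lookup if the socket found below
         * turns out to be a request sock whose listener has meanwhile left
         * TCP_LISTEN; the stale request is dropped and the lookup redone.
         */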
1650lookup:
1651        sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1652                               th->dest, &refcounted);
1653        if (!sk)
1654                goto no_tcp_socket;
1655
1656process:
1657        if (sk->sk_state == TCP_TIME_WAIT)
1658                goto do_time_wait;
1659
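        /* TCP_NEW_SYN_RECV means the lookup returned a request sock: the
         * 3WHS is not complete yet.  Process the packet on behalf of the
         * listener; tcp_check_req() may finish the handshake and hand us
         * a new child socket.
         */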
1660        if (sk->sk_state == TCP_NEW_SYN_RECV) {
1661                struct request_sock *req = inet_reqsk(sk);
1662                struct sock *nsk;
1663
1664                sk = req->rsk_listener;
1665                if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1666                        sk_drops_add(sk, skb);
1667                        reqsk_put(req);
1668                        goto discard_it;
1669                }
1670                if (unlikely(sk->sk_state != TCP_LISTEN)) {
1671                        inet_csk_reqsk_queue_drop_and_put(sk, req);
1672                        goto lookup;
1673                }
1674                /* We own a reference on the listener; increase it again,
1675                 * as we might lose it too soon.
1676                 */
1677                sock_hold(sk);
1678                refcounted = true;
1679                nsk = tcp_check_req(sk, skb, req, false);
1680                if (!nsk) {
1681                        reqsk_put(req);
1682                        goto discard_and_relse;
1683                }
1684                if (nsk == sk) {
1685                        reqsk_put(req);
1686                } else if (tcp_child_process(sk, nsk, skb)) {
1687                        tcp_v4_send_reset(nsk, skb);
1688                        goto discard_and_relse;
1689                } else {
1690                        sock_put(sk);
1691                        return 0;
1692                }
1693        }
1694        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1695                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1696                goto discard_and_relse;
1697        }
1698
1699        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1700                goto discard_and_relse;
1701
1702        if (tcp_v4_inbound_md5_hash(sk, skb))
1703                goto discard_and_relse;
1704
1705        nf_reset(skb);
1706
1707        if (tcp_filter(sk, skb))
1708                goto discard_and_relse;
1709        th = (const struct tcphdr *)skb->data;
1710        iph = ip_hdr(skb);
1711
1712        skb->dev = NULL;
1713
1714        if (sk->sk_state == TCP_LISTEN) {
1715                ret = tcp_v4_do_rcv(sk, skb);
1716                goto put_and_return;
1717        }
1718
1719        sk_incoming_cpu_update(sk);
1720
1721        bh_lock_sock_nested(sk);
1722        tcp_segs_in(tcp_sk(sk), skb);
1723        ret = 0;
1724        if (!sock_owned_by_user(sk)) {
1725                if (!tcp_prequeue(sk, skb))
1726                        ret = tcp_v4_do_rcv(sk, skb);
1727        } else if (tcp_add_backlog(sk, skb)) {
1728                goto discard_and_relse;
1729        }
1730        bh_unlock_sock(sk);
1731
1732put_and_return:
1733        if (refcounted)
1734                sock_put(sk);
1735
1736        return ret;
1737
1738no_tcp_socket:
1739        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1740                goto discard_it;
1741
1742        if (tcp_checksum_complete(skb)) {
1743csum_error:
1744                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1745bad_packet:
1746                __TCP_INC_STATS(net, TCP_MIB_INERRS);
1747        } else {
1748                tcp_v4_send_reset(NULL, skb);
1749        }
1750
1751discard_it:
1752        /* Discard frame. */
1753        kfree_skb(skb);
1754        return 0;
1755
1756discard_and_relse:
1757        sk_drops_add(sk, skb);
1758        if (refcounted)
1759                sock_put(sk);
1760        goto discard_it;
1761
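        /* TIME_WAIT processing: tcp_timewait_state_process() tells us whether
         * to ACK, send a RST, or silently ignore.  TCP_TW_SYN means this SYN
         * is acceptable for reuse of the old 4-tuple; if a current listener
         * exists, retire the timewait sock and process the SYN against it.
         */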
1762do_time_wait:
1763        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1764                inet_twsk_put(inet_twsk(sk));
1765                goto discard_it;
1766        }
1767
1768        if (tcp_checksum_complete(skb)) {
1769                inet_twsk_put(inet_twsk(sk));
1770                goto csum_error;
1771        }
1772        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1773        case TCP_TW_SYN: {
1774                struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1775                                                        &tcp_hashinfo, skb,
1776                                                        __tcp_hdrlen(th),
1777                                                        iph->saddr, th->source,
1778                                                        iph->daddr, th->dest,
1779                                                        inet_iif(skb));
1780                if (sk2) {
1781                        inet_twsk_deschedule_put(inet_twsk(sk));
1782                        sk = sk2;
1783                        refcounted = false;
1784                        goto process;
1785                }
1786                /* Fall through to ACK */
1787        }
1788        case TCP_TW_ACK:
1789                tcp_v4_timewait_ack(sk, skb);
1790                break;
1791        case TCP_TW_RST:
1792                tcp_v4_send_reset(sk, skb);
1793                inet_twsk_deschedule_put(inet_twsk(sk));
1794                goto discard_it;
1795        case TCP_TW_SUCCESS:;
1796        }
1797        goto discard_it;
1798}
1799
1800static struct timewait_sock_ops tcp_timewait_sock_ops = {
1801        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1802        .twsk_unique    = tcp_twsk_unique,
1803        .twsk_destructor = tcp_twsk_destructor,
1804};
1805
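    /* Cache the input route of this skb on the socket, along with the
     * interface it is valid for.  This is what the early-demux path and the
     * tcp_v4_do_rcv() fast path consult to avoid per-packet route lookups.
     */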
1806void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1807{
1808        struct dst_entry *dst = skb_dst(skb);
1809
1810        if (dst && dst_hold_safe(dst)) {
1811                sk->sk_rx_dst = dst;
1812                inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1813        }
1814}
1815EXPORT_SYMBOL(inet_sk_rx_dst_set);
1816
1817const struct inet_connection_sock_af_ops ipv4_specific = {
1818        .queue_xmit        = ip_queue_xmit,
1819        .send_check        = tcp_v4_send_check,
1820        .rebuild_header    = inet_sk_rebuild_header,
1821        .sk_rx_dst_set     = inet_sk_rx_dst_set,
1822        .conn_request      = tcp_v4_conn_request,
1823        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1824        .net_header_len    = sizeof(struct iphdr),
1825        .setsockopt        = ip_setsockopt,
1826        .getsockopt        = ip_getsockopt,
1827        .addr2sockaddr     = inet_csk_addr2sockaddr,
1828        .sockaddr_len      = sizeof(struct sockaddr_in),
1829#ifdef CONFIG_COMPAT
1830        .compat_setsockopt = compat_ip_setsockopt,
1831        .compat_getsockopt = compat_ip_getsockopt,
1832#endif
1833        .mtu_reduced       = tcp_v4_mtu_reduced,
1834};
1835EXPORT_SYMBOL(ipv4_specific);
1836
1837#ifdef CONFIG_TCP_MD5SIG
1838static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1839        .md5_lookup             = tcp_v4_md5_lookup,
1840        .calc_md5_hash          = tcp_v4_md5_hash_skb,
1841        .md5_parse              = tcp_v4_parse_md5_keys,
1842};
1843#endif
1844
1845/* NOTE: A lot of things are set to zero explicitly by the call to
1846 *       sk_alloc(), so they need not be done here.
1847 */
1848static int tcp_v4_init_sock(struct sock *sk)
1849{
1850        struct inet_connection_sock *icsk = inet_csk(sk);
1851
1852        tcp_init_sock(sk);
1853
1854        icsk->icsk_af_ops = &ipv4_specific;
1855
1856#ifdef CONFIG_TCP_MD5SIG
1857        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1858#endif
1859
1860        return 0;
1861}
1862
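    /* Final per-socket TCP cleanup, wired up as tcp_prot.destroy below:
     * stop the timers, purge the queues, free MD5 state and release the
     * bound port.
     */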
1863void tcp_v4_destroy_sock(struct sock *sk)
1864{
1865        struct tcp_sock *tp = tcp_sk(sk);
1866
1867        tcp_clear_xmit_timers(sk);
1868
1869        tcp_cleanup_congestion_control(sk);
1870
1871        /* Clean up the write buffer. */
1872        tcp_write_queue_purge(sk);
1873
1874        /* Cleans up our, hopefully empty, out_of_order_queue. */
1875        skb_rbtree_purge(&tp->out_of_order_queue);
1876
1877#ifdef CONFIG_TCP_MD5SIG
1878        /* Clean up the MD5 key list, if any */
1879        if (tp->md5sig_info) {
1880                tcp_clear_md5_list(sk);
1881                kfree_rcu(tp->md5sig_info, rcu);
1882                tp->md5sig_info = NULL;
1883        }
1884#endif
1885
1886        /* Clean up the prequeue; it really should be empty by now. */
1887        __skb_queue_purge(&tp->ucopy.prequeue);
1888
1889        /* Clean up a referenced TCP bind bucket. */
1890        if (inet_csk(sk)->icsk_bind_hash)
1891                inet_put_port(sk);
1892
1893        BUG_ON(tp->fastopen_rsk);
1894
1895        /* If the socket was aborted during a connect operation */
1896        tcp_free_fastopen_req(tp);
1897        tcp_saved_syn_free(tp);
1898
1899        sk_sockets_allocated_dec(sk);
1900}
1901EXPORT_SYMBOL(tcp_v4_destroy_sock);
1902
1903#ifdef CONFIG_PROC_FS
1904/* Proc filesystem TCP sock list dumping. */
1905
1906/*
1907 * Get the next listener socket following cur.  If cur is NULL, get the
1908 * first socket, starting from the bucket given in st->bucket; when
1909 * st->bucket is zero, the very first socket in the hash table is returned.
1910 */
1911static void *listening_get_next(struct seq_file *seq, void *cur)
1912{
1913        struct tcp_iter_state *st = seq->private;
1914        struct net *net = seq_file_net(seq);
1915        struct inet_listen_hashbucket *ilb;
1916        struct sock *sk = cur;
1917
1918        if (!sk) {
1919get_head:
1920                ilb = &tcp_hashinfo.listening_hash[st->bucket];
1921                spin_lock(&ilb->lock);
1922                sk = sk_head(&ilb->head);
1923                st->offset = 0;
1924                goto get_sk;
1925        }
1926        ilb = &tcp_hashinfo.listening_hash[st->bucket];
1927        ++st->num;
1928        ++st->offset;
1929
1930        sk = sk_next(sk);
1931get_sk:
1932        sk_for_each_from(sk) {
1933                if (!net_eq(sock_net(sk), net))
1934                        continue;
1935                if (sk->sk_family == st->family)
1936                        return sk;
1937        }
1938        spin_unlock(&ilb->lock);
1939        st->offset = 0;
1940        if (++st->bucket < INET_LHTABLE_SIZE)
1941                goto get_head;
1942        return NULL;
1943}
1944
1945static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946{
1947        struct tcp_iter_state *st = seq->private;
1948        void *rc;
1949
1950        st->bucket = 0;
1951        st->offset = 0;
1952        rc = listening_get_next(seq, NULL);
1953
1954        while (rc && *pos) {
1955                rc = listening_get_next(seq, rc);
1956                --*pos;
1957        }
1958        return rc;
1959}
1960
1961static inline bool empty_bucket(const struct tcp_iter_state *st)
1962{
1963        return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1964}
1965
1966/*
1967 * Get the first established socket, starting from the bucket given in st->bucket.
1968 * If st->bucket is zero, the very first socket in the hash is returned.
1969 */
1970static void *established_get_first(struct seq_file *seq)
1971{
1972        struct tcp_iter_state *st = seq->private;
1973        struct net *net = seq_file_net(seq);
1974        void *rc = NULL;
1975
1976        st->offset = 0;
1977        for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1978                struct sock *sk;
1979                struct hlist_nulls_node *node;
1980                spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1981
1982                /* Lockless fast path for the common case of empty buckets */
1983                if (empty_bucket(st))
1984                        continue;
1985
1986                spin_lock_bh(lock);
1987                sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1988                        if (sk->sk_family != st->family ||
1989                            !net_eq(sock_net(sk), net)) {
1990                                continue;
1991                        }
1992                        rc = sk;
1993                        goto out;
1994                }
1995                spin_unlock_bh(lock);
1996        }
1997out:
1998        return rc;
1999}
2000
2001static void *established_get_next(struct seq_file *seq, void *cur)
2002{
2003        struct sock *sk = cur;
2004        struct hlist_nulls_node *node;
2005        struct tcp_iter_state *st = seq->private;
2006        struct net *net = seq_file_net(seq);
2007
2008        ++st->num;
2009        ++st->offset;
2010
2011        sk = sk_nulls_next(sk);
2012
2013        sk_nulls_for_each_from(sk, node) {
2014                if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2015                        return sk;
2016        }
2017
2018        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2019        ++st->bucket;
2020        return established_get_first(seq);
2021}
2022
2023static void *established_get_idx(struct seq_file *seq, loff_t pos)
2024{
2025        struct tcp_iter_state *st = seq->private;
2026        void *rc;
2027
2028        st->bucket = 0;
2029        rc = established_get_first(seq);
2030
2031        while (rc && pos) {
2032                rc = established_get_next(seq, rc);
2033                --pos;
2034        }
2035        return rc;
2036}
2037
2038static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2039{
2040        void *rc;
2041        struct tcp_iter_state *st = seq->private;
2042
2043        st->state = TCP_SEQ_STATE_LISTENING;
2044        rc        = listening_get_idx(seq, &pos);
2045
2046        if (!rc) {
2047                st->state = TCP_SEQ_STATE_ESTABLISHED;
2048                rc        = established_get_idx(seq, pos);
2049        }
2050
2051        return rc;
2052}
2053
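    /* Resume iteration at the bucket/offset remembered from the previous
     * read of the seq_file.  Without this, every read(2) of /proc/net/tcp
     * would have to rewalk the hash tables from the beginning, which gets
     * quadratic on large tables.
     */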
2054static void *tcp_seek_last_pos(struct seq_file *seq)
2055{
2056        struct tcp_iter_state *st = seq->private;
2057        int offset = st->offset;
2058        int orig_num = st->num;
2059        void *rc = NULL;
2060
2061        switch (st->state) {
2062        case TCP_SEQ_STATE_LISTENING:
2063                if (st->bucket >= INET_LHTABLE_SIZE)
2064                        break;
2065                st->state = TCP_SEQ_STATE_LISTENING;
2066                rc = listening_get_next(seq, NULL);
2067                while (offset-- && rc)
2068                        rc = listening_get_next(seq, rc);
2069                if (rc)
2070                        break;
2071                st->bucket = 0;
2072                st->state = TCP_SEQ_STATE_ESTABLISHED;
2073                /* Fallthrough */
2074        case TCP_SEQ_STATE_ESTABLISHED:
2075                if (st->bucket > tcp_hashinfo.ehash_mask)
2076                        break;
2077                rc = established_get_first(seq);
2078                while (offset-- && rc)
2079                        rc = established_get_next(seq, rc);
2080        }
2081
2082        st->num = orig_num;
2083
2084        return rc;
2085}
2086
2087static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2088{
2089        struct tcp_iter_state *st = seq->private;
2090        void *rc;
2091
2092        if (*pos && *pos == st->last_pos) {
2093                rc = tcp_seek_last_pos(seq);
2094                if (rc)
2095                        goto out;
2096        }
2097
2098        st->state = TCP_SEQ_STATE_LISTENING;
2099        st->num = 0;
2100        st->bucket = 0;
2101        st->offset = 0;
2102        rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2103
2104out:
2105        st->last_pos = *pos;
2106        return rc;
2107}
2108
2109static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2110{
2111        struct tcp_iter_state *st = seq->private;
2112        void *rc = NULL;
2113
2114        if (v == SEQ_START_TOKEN) {
2115                rc = tcp_get_idx(seq, 0);
2116                goto out;
2117        }
2118
2119        switch (st->state) {
2120        case TCP_SEQ_STATE_LISTENING:
2121                rc = listening_get_next(seq, v);
2122                if (!rc) {
2123                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2124                        st->bucket = 0;
2125                        st->offset = 0;
2126                        rc        = established_get_first(seq);
2127                }
2128                break;
2129        case TCP_SEQ_STATE_ESTABLISHED:
2130                rc = established_get_next(seq, v);
2131                break;
2132        }
2133out:
2134        ++*pos;
2135        st->last_pos = *pos;
2136        return rc;
2137}
2138
2139static void tcp_seq_stop(struct seq_file *seq, void *v)
2140{
2141        struct tcp_iter_state *st = seq->private;
2142
2143        switch (st->state) {
2144        case TCP_SEQ_STATE_LISTENING:
2145                if (v != SEQ_START_TOKEN)
2146                        spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2147                break;
2148        case TCP_SEQ_STATE_ESTABLISHED:
2149                if (v)
2150                        spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2151                break;
2152        }
2153}
2154
2155int tcp_seq_open(struct inode *inode, struct file *file)
2156{
2157        struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2158        struct tcp_iter_state *s;
2159        int err;
2160
2161        err = seq_open_net(inode, file, &afinfo->seq_ops,
2162                          sizeof(struct tcp_iter_state));
2163        if (err < 0)
2164                return err;
2165
2166        s = ((struct seq_file *)file->private_data)->private;
2167        s->family               = afinfo->family;
2168        s->last_pos             = 0;
2169        return 0;
2170}
2171EXPORT_SYMBOL(tcp_seq_open);
2172
2173int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2174{
2175        int rc = 0;
2176        struct proc_dir_entry *p;
2177
2178        afinfo->seq_ops.start           = tcp_seq_start;
2179        afinfo->seq_ops.next            = tcp_seq_next;
2180        afinfo->seq_ops.stop            = tcp_seq_stop;
2181
2182        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2183                             afinfo->seq_fops, afinfo);
2184        if (!p)
2185                rc = -ENOMEM;
2186        return rc;
2187}
2188EXPORT_SYMBOL(tcp_proc_register);
2189
2190void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2191{
2192        remove_proc_entry(afinfo->name, net->proc_net);
2193}
2194EXPORT_SYMBOL(tcp_proc_unregister);
2195
2196static void get_openreq4(const struct request_sock *req,
2197                         struct seq_file *f, int i)
2198{
2199        const struct inet_request_sock *ireq = inet_rsk(req);
2200        long delta = req->rsk_timer.expires - jiffies;
2201
2202        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2203                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2204                i,
2205                ireq->ir_loc_addr,
2206                ireq->ir_num,
2207                ireq->ir_rmt_addr,
2208                ntohs(ireq->ir_rmt_port),
2209                TCP_SYN_RECV,
2210                0, 0, /* could print option size, but that is af dependent. */
2211                1,    /* timers active (only the expire timer) */
2212                jiffies_delta_to_clock_t(delta),
2213                req->num_timeout,
2214                from_kuid_munged(seq_user_ns(f),
2215                                 sock_i_uid(req->rsk_listener)),
2216                0,  /* non-standard timer */
2217                0, /* open_requests have no inode */
2218                0,
2219                req);
2220}
2221
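    /* Format one /proc/net/tcp line for a full socket.  The "tr" field
     * encodes the pending timer: 1 retransmit (or loss probe), 4 zero
     * window probe, 2 keepalive, 0 none; timewait sockets report 3 (see
     * get_timewait4_sock() below).
     */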
2222static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2223{
2224        int timer_active;
2225        unsigned long timer_expires;
2226        const struct tcp_sock *tp = tcp_sk(sk);
2227        const struct inet_connection_sock *icsk = inet_csk(sk);
2228        const struct inet_sock *inet = inet_sk(sk);
2229        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2230        __be32 dest = inet->inet_daddr;
2231        __be32 src = inet->inet_rcv_saddr;
2232        __u16 destp = ntohs(inet->inet_dport);
2233        __u16 srcp = ntohs(inet->inet_sport);
2234        int rx_queue;
2235        int state;
2236
2237        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2238            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2239            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2240                timer_active    = 1;
2241                timer_expires   = icsk->icsk_timeout;
2242        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2243                timer_active    = 4;
2244                timer_expires   = icsk->icsk_timeout;
2245        } else if (timer_pending(&sk->sk_timer)) {
2246                timer_active    = 2;
2247                timer_expires   = sk->sk_timer.expires;
2248        } else {
2249                timer_active    = 0;
2250                timer_expires = jiffies;
2251        }
2252
2253        state = sk_state_load(sk);
2254        if (state == TCP_LISTEN)
2255                rx_queue = sk->sk_ack_backlog;
2256        else
2257                /* Because we don't lock the socket,
2258                 * we might find a transient negative value.
2259                 */
2260                rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2261
2262        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2263                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2264                i, src, srcp, dest, destp, state,
2265                tp->write_seq - tp->snd_una,
2266                rx_queue,
2267                timer_active,
2268                jiffies_delta_to_clock_t(timer_expires - jiffies),
2269                icsk->icsk_retransmits,
2270                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2271                icsk->icsk_probes_out,
2272                sock_i_ino(sk),
2273                atomic_read(&sk->sk_refcnt), sk,
2274                jiffies_to_clock_t(icsk->icsk_rto),
2275                jiffies_to_clock_t(icsk->icsk_ack.ato),
2276                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2277                tp->snd_cwnd,
2278                state == TCP_LISTEN ?
2279                    fastopenq->max_qlen :
2280                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2281}
2282
2283static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2284                               struct seq_file *f, int i)
2285{
2286        long delta = tw->tw_timer.expires - jiffies;
2287        __be32 dest, src;
2288        __u16 destp, srcp;
2289
2290        dest  = tw->tw_daddr;
2291        src   = tw->tw_rcv_saddr;
2292        destp = ntohs(tw->tw_dport);
2293        srcp  = ntohs(tw->tw_sport);
2294
2295        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2296                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2297                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2298                3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2299                atomic_read(&tw->tw_refcnt), tw);
2300}
2301
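    /* Every record is padded to a fixed width of TMPSZ - 1 characters plus
     * the trailing newline (see seq_setwidth()/seq_pad() below), so that
     * /proc/net/tcp consists of fixed-size lines.
     */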
2302#define TMPSZ 150
2303
2304static int tcp4_seq_show(struct seq_file *seq, void *v)
2305{
2306        struct tcp_iter_state *st;
2307        struct sock *sk = v;
2308
2309        seq_setwidth(seq, TMPSZ - 1);
2310        if (v == SEQ_START_TOKEN) {
2311                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2312                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2313                           "inode");
2314                goto out;
2315        }
2316        st = seq->private;
2317
2318        if (sk->sk_state == TCP_TIME_WAIT)
2319                get_timewait4_sock(v, seq, st->num);
2320        else if (sk->sk_state == TCP_NEW_SYN_RECV)
2321                get_openreq4(v, seq, st->num);
2322        else
2323                get_tcp4_sock(v, seq, st->num);
2324out:
2325        seq_pad(seq, '\n');
2326        return 0;
2327}
2328
2329static const struct file_operations tcp_afinfo_seq_fops = {
2330        .owner   = THIS_MODULE,
2331        .open    = tcp_seq_open,
2332        .read    = seq_read,
2333        .llseek  = seq_lseek,
2334        .release = seq_release_net,
2335};
2336
2337static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2338        .name           = "tcp",
2339        .family         = AF_INET,
2340        .seq_fops       = &tcp_afinfo_seq_fops,
2341        .seq_ops        = {
2342                .show           = tcp4_seq_show,
2343        },
2344};
2345
2346static int __net_init tcp4_proc_init_net(struct net *net)
2347{
2348        return tcp_proc_register(net, &tcp4_seq_afinfo);
2349}
2350
2351static void __net_exit tcp4_proc_exit_net(struct net *net)
2352{
2353        tcp_proc_unregister(net, &tcp4_seq_afinfo);
2354}
2355
2356static struct pernet_operations tcp4_net_ops = {
2357        .init = tcp4_proc_init_net,
2358        .exit = tcp4_proc_exit_net,
2359};
2360
2361int __init tcp4_proc_init(void)
2362{
2363        return register_pernet_subsys(&tcp4_net_ops);
2364}
2365
2366void tcp4_proc_exit(void)
2367{
2368        unregister_pernet_subsys(&tcp4_net_ops);
2369}
2370#endif /* CONFIG_PROC_FS */
2371
2372struct proto tcp_prot = {
2373        .name                   = "TCP",
2374        .owner                  = THIS_MODULE,
2375        .close                  = tcp_close,
2376        .connect                = tcp_v4_connect,
2377        .disconnect             = tcp_disconnect,
2378        .accept                 = inet_csk_accept,
2379        .ioctl                  = tcp_ioctl,
2380        .init                   = tcp_v4_init_sock,
2381        .destroy                = tcp_v4_destroy_sock,
2382        .shutdown               = tcp_shutdown,
2383        .setsockopt             = tcp_setsockopt,
2384        .getsockopt             = tcp_getsockopt,
2385        .keepalive              = tcp_set_keepalive,
2386        .recvmsg                = tcp_recvmsg,
2387        .sendmsg                = tcp_sendmsg,
2388        .sendpage               = tcp_sendpage,
2389        .backlog_rcv            = tcp_v4_do_rcv,
2390        .release_cb             = tcp_release_cb,
2391        .hash                   = inet_hash,
2392        .unhash                 = inet_unhash,
2393        .get_port               = inet_csk_get_port,
2394        .enter_memory_pressure  = tcp_enter_memory_pressure,
2395        .stream_memory_free     = tcp_stream_memory_free,
2396        .sockets_allocated      = &tcp_sockets_allocated,
2397        .orphan_count           = &tcp_orphan_count,
2398        .memory_allocated       = &tcp_memory_allocated,
2399        .memory_pressure        = &tcp_memory_pressure,
2400        .sysctl_mem             = sysctl_tcp_mem,
2401        .sysctl_wmem            = sysctl_tcp_wmem,
2402        .sysctl_rmem            = sysctl_tcp_rmem,
2403        .max_header             = MAX_TCP_HEADER,
2404        .obj_size               = sizeof(struct tcp_sock),
2405        .slab_flags             = SLAB_DESTROY_BY_RCU,
2406        .twsk_prot              = &tcp_timewait_sock_ops,
2407        .rsk_prot               = &tcp_request_sock_ops,
2408        .h.hashinfo             = &tcp_hashinfo,
2409        .no_autobind            = true,
2410#ifdef CONFIG_COMPAT
2411        .compat_setsockopt      = compat_tcp_setsockopt,
2412        .compat_getsockopt      = compat_tcp_getsockopt,
2413#endif
2414        .diag_destroy           = tcp_abort,
2415};
2416EXPORT_SYMBOL(tcp_prot);
2417
2418static void __net_exit tcp_sk_exit(struct net *net)
2419{
2420        int cpu;
2421
2422        for_each_possible_cpu(cpu)
2423                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2424        free_percpu(net->ipv4.tcp_sk);
2425}
2426
2427static int __net_init tcp_sk_init(struct net *net)
2428{
2429        int res, cpu, cnt;
2430
2431        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2432        if (!net->ipv4.tcp_sk)
2433                return -ENOMEM;
2434
2435        for_each_possible_cpu(cpu) {
2436                struct sock *sk;
2437
2438                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2439                                           IPPROTO_TCP, net);
2440                if (res)
2441                        goto fail;
2442                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2443                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2444        }
2445
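        /* Per-netns sysctl defaults.  tcp_ecn == 2 means: use ECN when an
         * incoming connection requests it, but do not request ECN on
         * outgoing connections.
         */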
2446        net->ipv4.sysctl_tcp_ecn = 2;
2447        net->ipv4.sysctl_tcp_ecn_fallback = 1;
2448
2449        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2450        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2451        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2452
2453        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2454        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2455        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2456
2457        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2458        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2459        net->ipv4.sysctl_tcp_syncookies = 1;
2460        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2461        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2462        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2463        net->ipv4.sysctl_tcp_orphan_retries = 0;
2464        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2465        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2466        net->ipv4.sysctl_tcp_tw_reuse = 0;
2467
2468        cnt = tcp_hashinfo.ehash_mask + 1;
2469        net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
2470        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2471        net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2472
2473        net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2474
2475        return 0;
2476fail:
2477        tcp_sk_exit(net);
2478
2479        return res;
2480}
2481
2482static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2483{
2484        inet_twsk_purge(&tcp_hashinfo, AF_INET);
2485}
2486
2487static struct pernet_operations __net_initdata tcp_sk_ops = {
2488        .init       = tcp_sk_init,
2489        .exit       = tcp_sk_exit,
2490        .exit_batch = tcp_sk_exit_batch,
2491};
2492
2493void __init tcp_v4_init(void)
2494{
2495        if (register_pernet_subsys(&tcp_sk_ops))
2496                panic("Failed to create the TCP control socket.\n");
2497}
2498