linux/net/ipv4/tcp_ipv4.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year
 *                                      coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                      a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket __read_mostly;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        return inet_csk_get_port(&tcp_hashinfo, sk, snum,
                                 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap, i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's: the timestamp cache is held
           not per host but per port pair, and the TW bucket is used as the
           state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
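
/*
 * A minimal user-space sketch (not part of this file's build) of the
 * sequence-space bump performed above: the new connection's write_seq
 * starts 65535 + 2 beyond the old TIME-WAIT socket's snd_nxt, so the two
 * incarnations cannot overlap, and 0 is avoided because write_seq == 0
 * means "no sequence number chosen yet".  Plain uint32_t arithmetic
 * wraps exactly like the kernel's __u32 here.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t reuse_write_seq(uint32_t tw_snd_nxt)
{
        uint32_t write_seq = tw_snd_nxt + 65535 + 2;    /* may wrap */

        if (write_seq == 0)                             /* 0 is reserved */
                write_seq = 1;
        return write_seq;
}

int main(void)
{
        printf("%u\n", reuse_write_seq(0xffffffffu));   /* wraps to 65536 */
        return 0;
}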

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table when entering
                 * TIME-WAIT, and initialize rx_opt.ts_recent from it
                 * when trying a new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set the state to SYN-SENT and, without releasing
         * the socket lock, select a source port, enter ourselves into
         * the hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
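
/*
 * A minimal user-space counterpart (not part of this file's build) of the
 * entry point above: connect() on an AF_INET stream socket is what lands
 * in tcp_v4_connect(), and the same -EINVAL / -EAFNOSUPPORT argument
 * checks apply.  Address 192.0.2.1:80 is a documentation placeholder.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in dst;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        memset(&dst, 0, sizeof(dst));
        dst.sin_family = AF_INET;               /* must be AF_INET here */
        dst.sin_port   = htons(80);
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("connect");              /* e.g. ENETUNREACH */
        close(fd);
        return 0;
}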

/*
 * This routine does path MTU discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst entry whether PMTU discovery is
         * forbidden on this route; we just assume that no packet-too-big
         * messages are sent back when PMTU discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path MTU
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
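
/*
 * A small user-space sketch (not part of this file's build) of how an
 * application observes the path MTU that the machinery above maintains:
 * force per-route PMTU discovery with IP_MTU_DISCOVER, then read the
 * cached value back with IP_MTU once the socket is connected.
 */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void show_path_mtu(int connected_fd)
{
        int val = IP_PMTUDISC_DO;       /* always set DF, do PMTU discovery */
        int mtu;
        socklen_t len = sizeof(mtu);

        setsockopt(connected_fd, IPPROTO_IP, IP_MTU_DISCOVER,
                   &val, sizeof(val));
        if (getsockopt(connected_fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                printf("path MTU: %d\n", mtu);  /* e.g. 1500, or less */
}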

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               Well, it can, e.g. if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * RFC 1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by PMTU discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages finally
         * lose their original sense (even Linux sends invalid PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}
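
/*
 * A user-space sketch (not part of this file's build) of the other end of
 * this path: with inet->recverr enabled via IP_RECVERR, the errors that
 * tcp_v4_err() records can be drained from the socket error queue with
 * recvmsg(MSG_ERRQUEUE) instead of being seen only as a failing syscall.
 */
#include <linux/errqueue.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void drain_error_queue(int fd)
{
        int one = 1;
        char data[256], control[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cm;

        setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));
        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return;
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IP &&
                    cm->cmsg_type == IP_RECVERR) {
                        struct sock_extended_err ee;

                        memcpy(&ee, CMSG_DATA(cm), sizeof(ee));
                        printf("err=%u origin=%u type=%u code=%u\n",
                               ee.ee_errno, ee.ee_origin,
                               ee.ee_type, ee.ee_code);
                }
        }
}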

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(len, inet->saddr,
                                          inet->daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}
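
/*
 * A self-contained sketch (not part of this file's build) of the 16-bit
 * one's-complement sum that tcp_v4_check() and csum_partial() compute
 * incrementally above, folded as in RFC 1071.  The real TCP checksum is
 * this sum taken over the pseudo-header, the TCP header and the payload.
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t inet_checksum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {                       /* sum 16-bit words */
                sum += (uint32_t)(p[0] << 8 | p[1]);
                p += 2;
                len -= 2;
        }
        if (len)                                /* odd trailing byte */
                sum += (uint32_t)(p[0] << 8);
        while (sum >> 16)                       /* fold the carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;                  /* one's complement */
}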

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *                    for the reset?
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
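
/*
 * A minimal sketch (not part of this file's build) of the ack_seq
 * arithmetic above: when the offending segment carried no ACK, the RST
 * must acknowledge everything the segment consumed in sequence space,
 * i.e. its payload length plus one for a SYN flag and one for a FIN flag.
 */
#include <stdint.h>

static uint32_t rst_ack_seq(uint32_t seg_seq, int syn, int fin,
                            uint32_t payload_len)
{
        /* mirrors: ntohl(th->seq) + th->syn + th->fin + payload */
        return seg_seq + (uint32_t)syn + (uint32_t)fin + payload_len;
}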

/* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (twsk)
                arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff * skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = tcp_hdr(skb);

                th->check = tcp_v4_check(skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(tcp_hdr(skb)->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return &tp->md5sig_info->keys4[i].base;
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add Key to the list */
        struct tcp_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free old key list, and reference new one */
                        kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
                md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].base.key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Need to do some manipulation */
                                memmove(&tp->md5sig_info->keys4[i],
                                        &tp->md5sig_info->keys4[i+1],
                                        (tp->md5sig_info->entries4 - i) *
                                         sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the set of keys, the crypto element,
         * and then decrement our hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].base.key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4  = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -EINVAL;

                tp->md5sig_info = p;
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}
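
/*
 * A user-space sketch (not part of this file's build) of the setsockopt()
 * call that this parser serves: struct tcp_md5sig from <linux/tcp.h>
 * carries the peer address and the key.  The peer address and key bytes
 * passed in are placeholders supplied by the caller.
 */
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int set_md5_key(int fd, const char *peer, const void *key, int len)
{
        struct tcp_md5sig md5;
        struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

        memset(&md5, 0, sizeof(md5));
        sin->sin_family = AF_INET;              /* must be AF_INET here */
        inet_pton(AF_INET, peer, &sin->sin_addr);
        md5.tcpm_keylen = len;                  /* <= TCP_MD5SIG_MAXKEYLEN */
        memcpy(md5.tcpm_key, key, len);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}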

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen)
{
        struct scatterlist sg[4];
        __u16 data_len;
        int block = 0;
        __sum16 old_checksum;
        struct tcp_md5sig_pool *hp;
        struct tcp4_pseudohdr *bp;
        struct hash_desc *desc;
        int err;
        unsigned int nbytes = 0;

        /*
         * Okay, so RFC2385 is turned on for this connection,
         * so we need to generate the MD5 hash for the packet now.
         */

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;

        bp = &hp->md5_blk.ip4;
        desc = &hp->md5_desc;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = protocol;
        bp->len = htons(tcplen);

        sg_init_table(sg, 4);

        sg_set_buf(&sg[block++], bp, sizeof(*bp));
        nbytes += sizeof(*bp);

        /* 2. the TCP header, excluding options, and assuming a
         * checksum of zero.
         */
        old_checksum = th->check;
        th->check = 0;
        sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
        nbytes += sizeof(struct tcphdr);

        /* 3. the TCP segment data (if any) */
        data_len = tcplen - (th->doff << 2);
        if (data_len > 0) {
                unsigned char *data = (unsigned char *)th + (th->doff << 2);
                sg_set_buf(&sg[block++], data, data_len);
                nbytes += data_len;
        }

        /* 4. an independently-specified key or password, known to both
         * TCPs and presumably connection-specific
         */
        sg_set_buf(&sg[block++], key->key, key->keylen);
        nbytes += key->keylen;

        sg_mark_end(&sg[block - 1]);

        /* Now store the Hash into the packet */
        err = crypto_hash_init(desc);
        if (err)
                goto clear_hash;
        err = crypto_hash_update(desc, sg, nbytes);
        if (err)
                goto clear_hash;
        err = crypto_hash_final(desc, md5_hash);
        if (err)
                goto clear_hash;

        /* Reset header, and free up the crypto */
        tcp_put_md5sig_pool();
        th->check = old_checksum;

out:
        return 0;
clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        goto out;
}
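
/*
 * A user-space sketch (not part of this file's build) of the same RFC 2385
 * digest, assuming OpenSSL's MD5 routines for the hashing: the digest
 * covers the pseudo-header, the TCP header with a zeroed checksum, the
 * payload, and finally the shared key, in exactly the order used above.
 */
#include <netinet/tcp.h>
#include <openssl/md5.h>
#include <stdint.h>
#include <string.h>

struct pseudohdr {              /* mirrors struct tcp4_pseudohdr */
        uint32_t saddr, daddr;
        uint8_t  pad, protocol;
        uint16_t len;
} __attribute__((packed));

static void rfc2385_md5(uint8_t out[16], struct pseudohdr *ph,
                        struct tcphdr *th, const uint8_t *data, size_t dlen,
                        const uint8_t *key, size_t klen)
{
        MD5_CTX ctx;
        uint16_t saved = th->check;

        th->check = 0;                          /* checksum assumed zero */
        MD5_Init(&ctx);
        MD5_Update(&ctx, ph, sizeof(*ph));      /* 1. pseudo-header */
        MD5_Update(&ctx, th, sizeof(*th));      /* 2. TCP header, no options */
        if (dlen)
                MD5_Update(&ctx, data, dlen);   /* 3. segment data */
        MD5_Update(&ctx, key, klen);            /* 4. shared key */
        MD5_Final(out, &ctx);
        th->check = saved;
}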

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                         struct sock *sk,
                         struct dst_entry *dst,
                         struct request_sock *req,
                         struct tcphdr *th, int protocol,
                         int tcplen)
{
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else {
                struct rtable *rt = (struct rtable *)dst;
                BUG_ON(!rt);
                saddr = rt->rt_src;
                daddr = rt->rt_dst;
        }
        return tcp_v4_do_calc_md5_hash(md5_hash, key,
                                       saddr, daddr,
                                       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int length = (th->doff << 2) - sizeof(struct tcphdr);
        int genhash;
        unsigned char *ptr;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

        /*
         * If the TCP option length is less than the TCP_MD5SIG
         * option length, then we can shortcut
         */
        if (length < TCPOLEN_MD5SIG) {
                if (hash_expected)
                        return 1;
                else
                        return 0;
        }

        /* Okay, we can't shortcut - we have to grub through the options */
        ptr = (unsigned char *)(th + 1);
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        goto done_opts;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        opsize = *ptr++;
                        if (opsize < 2)
                                goto done_opts;
                        if (opsize > length)
                                goto done_opts;

                        if (opcode == TCPOPT_MD5SIG) {
                                hash_location = ptr;
                                goto done_opts;
                        }
                }
                ptr += opsize-2;
                length -= opsize;
        }
done_opts:
        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        if (!hash_expected && hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_do_calc_md5_hash(newhash,
                                          hash_expected,
                                          iph->saddr, iph->daddr,
                                          th, sk->sk_protocol,
                                          skb->len);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}
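
/*
 * A self-contained sketch (not part of this file's build) of the
 * kind/length option walk above, over a raw options buffer: kind 0 (EOL)
 * ends the list, kind 1 (NOP) is a single pad byte, and every other
 * option carries a length byte that covers the kind and length bytes
 * themselves, so a length under 2 or past the buffer is malformed.
 */
#include <stddef.h>
#include <stdint.h>

/* Returns a pointer to the payload of option `want`, or NULL. */
static const uint8_t *find_tcp_option(const uint8_t *opt, size_t len,
                                      uint8_t want)
{
        size_t i = 0;

        while (i < len) {
                uint8_t kind = opt[i];

                if (kind == 0)                  /* TCPOPT_EOL */
                        return NULL;
                if (kind == 1) {                /* TCPOPT_NOP */
                        i++;
                        continue;
                }
                if (i + 1 >= len || opt[i + 1] < 2 || i + opt[i + 1] > len)
                        return NULL;            /* malformed */
                if (kind == want)
                        return &opt[i + 2];     /* skip kind + length */
                i += opt[i + 1];
        }
        return NULL;
}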

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor= tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitation; they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the SYN queue, drop the request. That is better
         * than clogging the SYN queue with openreqs with exponentially
         * increasing timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on a web server
                 * which contains information interesting only for Windows
                 * users) do not send their timestamp in the SYN. It is an
                 * easy case: we simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save the last timestamp seen
                 * from the destination in the peer table when entering
                 * TIME-WAIT, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies, the last quarter of the
                         * backlog is reserved for destinations proven
                         * to be alive. It means that we continue to
                         * communicate with destinations already
                         * remembered at the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(tcp_hdr(skb)->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}
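
/*
 * A small sketch (not part of this file's build) of how an administrator
 * turns on the syncookie fallback consulted above: sysctl_tcp_syncookies
 * is exposed as /proc/sys/net/ipv4/tcp_syncookies, so writing "1" to it
 * enables cookies once the request queue fills.
 */
#include <fcntl.h>
#include <unistd.h>

static int enable_syncookies(void)
{
        int fd = open("/proc/sys/net/ipv4/tcp_syncookies", O_WRONLY);
        int ok = fd >= 0 && write(fd, "1\n", 2) == 2;

        if (fd >= 0)
                close(fd);
        return ok ? 0 : -1;
}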
1414
1415
1416/*
1417 * The three way handshake has completed - we got a valid synack -
1418 * now create the new socket.
1419 */
1420struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1421                                  struct request_sock *req,
1422                                  struct dst_entry *dst)
1423{
1424        struct inet_request_sock *ireq;
1425        struct inet_sock *newinet;
1426        struct tcp_sock *newtp;
1427        struct sock *newsk;
1428#ifdef CONFIG_TCP_MD5SIG
1429        struct tcp_md5sig_key *key;
1430#endif
1431
1432        if (sk_acceptq_is_full(sk))
1433                goto exit_overflow;
1434
1435        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1436                goto exit;
1437
1438        newsk = tcp_create_openreq_child(sk, req, skb);
1439        if (!newsk)
1440                goto exit;
1441
1442        newsk->sk_gso_type = SKB_GSO_TCPV4;
1443        sk_setup_caps(newsk, dst);
1444
1445        newtp                 = tcp_sk(newsk);
1446        newinet               = inet_sk(newsk);
1447        ireq                  = inet_rsk(req);
1448        newinet->daddr        = ireq->rmt_addr;
1449        newinet->rcv_saddr    = ireq->loc_addr;
1450        newinet->saddr        = ireq->loc_addr;
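            /* The IP options move from the request to the child here;
             * clearing ireq->opt keeps reqsk_free() from freeing them
             * out from under us.
             */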
1451        newinet->opt          = ireq->opt;
1452        ireq->opt             = NULL;
1453        newinet->mc_index     = inet_iif(skb);
1454        newinet->mc_ttl       = ip_hdr(skb)->ttl;
1455        inet_csk(newsk)->icsk_ext_hdr_len = 0;
1456        if (newinet->opt)
1457                inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1458        newinet->id = newtp->write_seq ^ jiffies;
1459
1460        tcp_mtup_init(newsk);
1461        tcp_sync_mss(newsk, dst_mtu(dst));
1462        newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1463        tcp_initialize_rcv_mss(newsk);
1464
1465#ifdef CONFIG_TCP_MD5SIG
1466        /* Copy over the MD5 key from the original socket */
1467        if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1468                /*
1469                 * We're using one, so create a matching key
1470                 * on the newsk structure. If we fail to get
1471                 * memory, then we end up not copying the key
1472                 * across. Shucks.
1473                 */
1474                char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1475                if (newkey != NULL)
1476                        tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1477                                          newkey, key->keylen);
1478        }
1479#endif
1480
1481        __inet_hash(&tcp_hashinfo, newsk, 0);
1482        __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1483
1484        return newsk;
1485
1486exit_overflow:
1487        NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1488exit:
1489        NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1490        dst_release(dst);
1491        return NULL;
1492}
1493
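    /* Find the socket that should handle a segment arriving on a
     * listening socket: a pending open request (handshake still in
     * progress), an already established child, or - when syncookies
     * are enabled - a connection rebuilt from a valid cookie ACK.
     */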
1494static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1495{
1496        struct tcphdr *th = tcp_hdr(skb);
1497        const struct iphdr *iph = ip_hdr(skb);
1498        struct sock *nsk;
1499        struct request_sock **prev;
1500        /* Find possible connection requests. */
1501        struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1502                                                       iph->saddr, iph->daddr);
1503        if (req)
1504                return tcp_check_req(sk, skb, req, prev);
1505
1506        nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1507                                      iph->daddr, th->dest, inet_iif(skb));
1508
1509        if (nsk) {
1510                if (nsk->sk_state != TCP_TIME_WAIT) {
1511                        bh_lock_sock(nsk);
1512                        return nsk;
1513                }
1514                inet_twsk_put(inet_twsk(nsk));
1515                return NULL;
1516        }
1517
1518#ifdef CONFIG_SYN_COOKIES
1519        if (!th->rst && !th->syn && th->ack)
1520                sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1521#endif
1522        return sk;
1523}
1524
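    /* Verify or prepare the TCP checksum.  With CHECKSUM_COMPLETE the
     * device has already summed the data, so folding in the
     * pseudo-header is enough to verify it here.  Otherwise seed
     * skb->csum with the pseudo-header sum and let later copy routines
     * finish the job, except for short packets which are checked
     * immediately.
     */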
1525static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1526{
1527        const struct iphdr *iph = ip_hdr(skb);
1528
1529        if (skb->ip_summed == CHECKSUM_COMPLETE) {
1530                if (!tcp_v4_check(skb->len, iph->saddr,
1531                                  iph->daddr, skb->csum)) {
1532                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1533                        return 0;
1534                }
1535        }
1536
1537        skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1538                                       skb->len, IPPROTO_TCP, 0);
1539
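            /* For short packets, deferring the checksum presumably buys
             * nothing, so complete it immediately (76 bytes being the
             * historical cut-off).
             */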
1540        if (skb->len <= 76) {
1541                return __skb_checksum_complete(skb);
1542        }
1543        return 0;
1544}
1545
1546
1547/* The socket must have its spinlock held when we get
1548 * here.
1549 *
1550 * We have a potential double-lock case here, so even when
1551 * doing backlog processing we use the BH locking scheme.
1552 * This is because we cannot sleep with the original spinlock
1553 * held.
1554 */
1555int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1556{
1557        struct sock *rsk;
1558#ifdef CONFIG_TCP_MD5SIG
1559        /*
1560         * We really want to reject the packet as early as possible
1561         * if:
1562         *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1563         *  o There is an MD5 option and we're not expecting one
1564         */
1565        if (tcp_v4_inbound_md5_hash(sk, skb))
1566                goto discard;
1567#endif
1568
1569        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1570                TCP_CHECK_TIMER(sk);
1571                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1572                        rsk = sk;
1573                        goto reset;
1574                }
1575                TCP_CHECK_TIMER(sk);
1576                return 0;
1577        }
1578
1579        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1580                goto csum_err;
1581
1582        if (sk->sk_state == TCP_LISTEN) {
1583                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1584                if (!nsk)
1585                        goto discard;
1586
1587                if (nsk != sk) {
1588                        if (tcp_child_process(sk, nsk, skb)) {
1589                                rsk = nsk;
1590                                goto reset;
1591                        }
1592                        return 0;
1593                }
1594        }
1595
1596        TCP_CHECK_TIMER(sk);
1597        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1598                rsk = sk;
1599                goto reset;
1600        }
1601        TCP_CHECK_TIMER(sk);
1602        return 0;
1603
1604reset:
1605        tcp_v4_send_reset(rsk, skb);
1606discard:
1607        kfree_skb(skb);
1608        /* Be careful here. If this function gets more complicated and
1609         * gcc suffers from register pressure on the x86, sk (in %ebx)
1610         * might be destroyed here. This current version compiles correctly,
1611         * but you have been warned.
1612         */
1613        return 0;
1614
1615csum_err:
1616        TCP_INC_STATS_BH(TCP_MIB_INERRS);
1617        goto discard;
1618}
1619
1620/*
1621 *      From tcp_input.c
1622 */
1623
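    /* Main IPv4 receive entry, called from the IP layer in softirq
     * context.  Validate the header and checksum, look up the owning
     * socket, then either process the segment directly, prequeue it
     * for a sleeping reader, or park it on the backlog when user
     * context owns the socket.
     */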
1624int tcp_v4_rcv(struct sk_buff *skb)
1625{
1626        const struct iphdr *iph;
1627        struct tcphdr *th;
1628        struct sock *sk;
1629        int ret;
1630
1631        if (skb->pkt_type != PACKET_HOST)
1632                goto discard_it;
1633
1634        /* Count it even if it's bad */
1635        TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1636
1637        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1638                goto discard_it;
1639
1640        th = tcp_hdr(skb);
1641
1642        if (th->doff < sizeof(struct tcphdr) / 4)
1643                goto bad_packet;
1644        if (!pskb_may_pull(skb, th->doff * 4))
1645                goto discard_it;
1646
1647        /* An explanation is required here, I think.
1648         * Packet length and doff are validated by header prediction,
1649         * provided the case of th->doff == 0 is eliminated above.
1650         * So we defer the checksum checks. */
1651        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1652                goto bad_packet;
1653
1654        th = tcp_hdr(skb);
1655        iph = ip_hdr(skb);
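            /* Stash the header fields the rest of TCP needs into the
             * skb control block.
             */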
1656        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1657        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1658                                    skb->len - th->doff * 4);
1659        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1660        TCP_SKB_CB(skb)->when    = 0;
1661        TCP_SKB_CB(skb)->flags   = iph->tos;
1662        TCP_SKB_CB(skb)->sacked  = 0;
1663
1664        sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1665                           iph->daddr, th->dest, inet_iif(skb));
1666        if (!sk)
1667                goto no_tcp_socket;
1668
1669process:
1670        if (sk->sk_state == TCP_TIME_WAIT)
1671                goto do_time_wait;
1672
1673        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1674                goto discard_and_relse;
1675        nf_reset(skb);
1676
1677        if (sk_filter(sk, skb))
1678                goto discard_and_relse;
1679
1680        skb->dev = NULL;
1681
1682        bh_lock_sock_nested(sk);
1683        ret = 0;
1684        if (!sock_owned_by_user(sk)) {
1685#ifdef CONFIG_NET_DMA
1686                struct tcp_sock *tp = tcp_sk(sk);
1687                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1688                        tp->ucopy.dma_chan = get_softnet_dma();
1689                if (tp->ucopy.dma_chan)
1690                        ret = tcp_v4_do_rcv(sk, skb);
1691                else
1692#endif
1693                {
1694                        if (!tcp_prequeue(sk, skb))
1695                                ret = tcp_v4_do_rcv(sk, skb);
1696                }
1697        } else
1698                sk_add_backlog(sk, skb);
1699        bh_unlock_sock(sk);
1700
1701        sock_put(sk);
1702
1703        return ret;
1704
1705no_tcp_socket:
1706        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1707                goto discard_it;
1708
1709        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1710bad_packet:
1711                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1712        } else {
1713                tcp_v4_send_reset(NULL, skb);
1714        }
1715
1716discard_it:
1717        /* Discard frame. */
1718        kfree_skb(skb);
1719        return 0;
1720
1721discard_and_relse:
1722        sock_put(sk);
1723        goto discard_it;
1724
1725do_time_wait:
1726        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1727                inet_twsk_put(inet_twsk(sk));
1728                goto discard_it;
1729        }
1730
1731        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1732                TCP_INC_STATS_BH(TCP_MIB_INERRS);
1733                inet_twsk_put(inet_twsk(sk));
1734                goto discard_it;
1735        }
1736        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1737        case TCP_TW_SYN: {
1738                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1739                                                        iph->daddr, th->dest,
1740                                                        inet_iif(skb));
1741                if (sk2) {
1742                        inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743                        inet_twsk_put(inet_twsk(sk));
1744                        sk = sk2;
1745                        goto process;
1746                }
1747                /* Fall through to ACK */
1748        }
1749        case TCP_TW_ACK:
1750                tcp_v4_timewait_ack(sk, skb);
1751                break;
1752        case TCP_TW_RST:
1753                goto no_tcp_socket;
1754        case TCP_TW_SUCCESS:;
1755        }
1756        goto discard_it;
1757}
1758
1759/* VJ's idea. Save the last timestamp seen from this destination and
1760 * hold it for at least the normal timewait interval, so it can be used
1761 * for duplicate segment detection in subsequent connections before
1762 * they enter synchronized state.
1763 */
1764
1765int tcp_v4_remember_stamp(struct sock *sk)
1766{
1767        struct inet_sock *inet = inet_sk(sk);
1768        struct tcp_sock *tp = tcp_sk(sk);
1769        struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1770        struct inet_peer *peer = NULL;
1771        int release_it = 0;
1772
1773        if (!rt || rt->rt_dst != inet->daddr) {
1774                peer = inet_getpeer(inet->daddr, 1);
1775                release_it = 1;
1776        } else {
1777                if (!rt->peer)
1778                        rt_bind_peer(rt, 1);
1779                peer = rt->peer;
1780        }
1781
1782        if (peer) {
1783                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1784                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1785                     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1786                        peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1787                        peer->tcp_ts = tp->rx_opt.ts_recent;
1788                }
1789                if (release_it)
1790                        inet_putpeer(peer);
1791                return 1;
1792        }
1793
1794        return 0;
1795}
1796
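    /* As tcp_v4_remember_stamp(), but for a connection that has already
     * been demoted to a TIME_WAIT mini-socket.
     */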
1797int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1798{
1799        struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1800
1801        if (peer) {
1802                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1803
1804                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1805                    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1806                     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1807                        peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1808                        peer->tcp_ts       = tcptw->tw_ts_recent;
1809                }
1810                inet_putpeer(peer);
1811                return 1;
1812        }
1813
1814        return 0;
1815}
1816
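    /* The IPv4 instantiation of the AF-independent connection ops. */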
1817struct inet_connection_sock_af_ops ipv4_specific = {
1818        .queue_xmit        = ip_queue_xmit,
1819        .send_check        = tcp_v4_send_check,
1820        .rebuild_header    = inet_sk_rebuild_header,
1821        .conn_request      = tcp_v4_conn_request,
1822        .syn_recv_sock     = tcp_v4_syn_recv_sock,
1823        .remember_stamp    = tcp_v4_remember_stamp,
1824        .net_header_len    = sizeof(struct iphdr),
1825        .setsockopt        = ip_setsockopt,
1826        .getsockopt        = ip_getsockopt,
1827        .addr2sockaddr     = inet_csk_addr2sockaddr,
1828        .sockaddr_len      = sizeof(struct sockaddr_in),
1829#ifdef CONFIG_COMPAT
1830        .compat_setsockopt = compat_ip_setsockopt,
1831        .compat_getsockopt = compat_ip_getsockopt,
1832#endif
1833};
1834
1835#ifdef CONFIG_TCP_MD5SIG
1836static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1837        .md5_lookup             = tcp_v4_md5_lookup,
1838        .calc_md5_hash          = tcp_v4_calc_md5_hash,
1839        .md5_add                = tcp_v4_md5_add_func,
1840        .md5_parse              = tcp_v4_parse_md5_keys,
1841};
1842#endif
1843
1844/* NOTE: A lot of things are set to zero explicitly by the call to
1845 *       sk_alloc(), so they need not be done here.
1846 */
1847static int tcp_v4_init_sock(struct sock *sk)
1848{
1849        struct inet_connection_sock *icsk = inet_csk(sk);
1850        struct tcp_sock *tp = tcp_sk(sk);
1851
1852        skb_queue_head_init(&tp->out_of_order_queue);
1853        tcp_init_xmit_timers(sk);
1854        tcp_prequeue_init(tp);
1855
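            /* Use the conservative initial RTO, and seed the deviation
             * estimate the same way, until real RTT samples arrive.
             */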
1856        icsk->icsk_rto = TCP_TIMEOUT_INIT;
1857        tp->mdev = TCP_TIMEOUT_INIT;
1858
1859        /* So many TCP implementations out there (incorrectly) count the
1860         * initial SYN frame in their delayed-ACK and congestion control
1861         * algorithms that we must have the following bandaid to talk
1862         * efficiently to them.  -DaveM
1863         */
1864        tp->snd_cwnd = 2;
1865
1866        /* See draft-stevens-tcpca-spec-01 for discussion of the
1867         * initialization of these values.
1868         */
1869        tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1870        tp->snd_cwnd_clamp = ~0;
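            /* Assume the classic 536 byte default MSS (RFC 1122) until
             * the route or the peer tells us otherwise.
             */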
1871        tp->mss_cache = 536;
1872
1873        tp->reordering = sysctl_tcp_reordering;
1874        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1875
1876        sk->sk_state = TCP_CLOSE;
1877
1878        sk->sk_write_space = sk_stream_write_space;
1879        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1880
1881        icsk->icsk_af_ops = &ipv4_specific;
1882        icsk->icsk_sync_mss = tcp_sync_mss;
1883#ifdef CONFIG_TCP_MD5SIG
1884        tp->af_specific = &tcp_sock_ipv4_specific;
1885#endif
1886
1887        sk->sk_sndbuf = sysctl_tcp_wmem[1];
1888        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1889
1890        atomic_inc(&tcp_sockets_allocated);
1891
1892        return 0;
1893}
1894
1895int tcp_v4_destroy_sock(struct sock *sk)
1896{
1897        struct tcp_sock *tp = tcp_sk(sk);
1898
1899        tcp_clear_xmit_timers(sk);
1900
1901        tcp_cleanup_congestion_control(sk);
1902
1903        /* Clean up the write buffer. */
1904        tcp_write_queue_purge(sk);
1905
1906        /* Cleans up our, hopefully empty, out_of_order_queue. */
1907        __skb_queue_purge(&tp->out_of_order_queue);
1908
1909#ifdef CONFIG_TCP_MD5SIG
1910        /* Clean up the MD5 key list, if any */
1911        if (tp->md5sig_info) {
1912                tcp_v4_clear_md5_list(sk);
1913                kfree(tp->md5sig_info);
1914                tp->md5sig_info = NULL;
1915        }
1916#endif
1917
1918#ifdef CONFIG_NET_DMA
1919        /* Cleans up our sk_async_wait_queue */
1920        __skb_queue_purge(&sk->sk_async_wait_queue);
1921#endif
1922
1923        /* Clean up the prequeue; it really should be empty by now. */
1924        __skb_queue_purge(&tp->ucopy.prequeue);
1925
1926        /* Clean up a referenced TCP bind bucket. */
1927        if (inet_csk(sk)->icsk_bind_hash)
1928                inet_put_port(&tcp_hashinfo, sk);
1929
1930        /*
1931         * If a cached sendmsg page exists, toss it.
1932         */
1933        if (sk->sk_sndmsg_page) {
1934                __free_page(sk->sk_sndmsg_page);
1935                sk->sk_sndmsg_page = NULL;
1936        }
1937
1938        atomic_dec(&tcp_sockets_allocated);
1939
1940        return 0;
1941}
1942
1943EXPORT_SYMBOL(tcp_v4_destroy_sock);
1944
1945#ifdef CONFIG_PROC_FS
1946/* Proc filesystem TCP sock list dumping. */
1947
1948static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1949{
1950        return hlist_empty(head) ? NULL :
1951                hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1952}
1953
1954static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1955{
1956        return tw->tw_node.next ?
1957                hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1958}
1959
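    /* Advance the seq_file iterator over the listening hash.  Each
     * listener is followed by the open requests in its SYN table, then
     * by the next listener in the chain, then by the next hash bucket.
     */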
1960static void *listening_get_next(struct seq_file *seq, void *cur)
1961{
1962        struct inet_connection_sock *icsk;
1963        struct hlist_node *node;
1964        struct sock *sk = cur;
1965        struct tcp_iter_state *st = seq->private;
1966
1967        if (!sk) {
1968                st->bucket = 0;
1969                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1970                goto get_sk;
1971        }
1972
1973        ++st->num;
1974
1975        if (st->state == TCP_SEQ_STATE_OPENREQ) {
1976                struct request_sock *req = cur;
1977
1978                icsk = inet_csk(st->syn_wait_sk);
1979                req = req->dl_next;
1980                while (1) {
1981                        while (req) {
1982                                if (req->rsk_ops->family == st->family) {
1983                                        cur = req;
1984                                        goto out;
1985                                }
1986                                req = req->dl_next;
1987                        }
1988                        if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1989                                break;
1990get_req:
1991                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1992                }
1993                sk        = sk_next(st->syn_wait_sk);
1994                st->state = TCP_SEQ_STATE_LISTENING;
1995                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1996        } else {
1997                icsk = inet_csk(sk);
1998                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1999                if (reqsk_queue_len(&icsk->icsk_accept_queue))
2000                        goto start_req;
2001                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2002                sk = sk_next(sk);
2003        }
2004get_sk:
2005        sk_for_each_from(sk, node) {
2006                if (sk->sk_family == st->family) {
2007                        cur = sk;
2008                        goto out;
2009                }
2010                icsk = inet_csk(sk);
2011                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2013start_req:
2014                        st->uid         = sock_i_uid(sk);
2015                        st->syn_wait_sk = sk;
2016                        st->state       = TCP_SEQ_STATE_OPENREQ;
2017                        st->sbucket     = 0;
2018                        goto get_req;
2019                }
2020                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021        }
2022        if (++st->bucket < INET_LHTABLE_SIZE) {
2023                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2024                goto get_sk;
2025        }
2026        cur = NULL;
2027out:
2028        return cur;
2029}
2030
2031static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2032{
2033        void *rc = listening_get_next(seq, NULL);
2034
2035        while (rc && *pos) {
2036                rc = listening_get_next(seq, rc);
2037                --*pos;
2038        }
2039        return rc;
2040}
2041
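    /* Return the first socket of the wanted family in the established
     * hash, scanning each bucket's live chain before its TIME_WAIT
     * twchain.  On a hit, the bucket lock is left held for the caller.
     */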
2042static void *established_get_first(struct seq_file *seq)
2043{
2044        struct tcp_iter_state *st = seq->private;
2045        void *rc = NULL;
2046
2047        for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2048                struct sock *sk;
2049                struct hlist_node *node;
2050                struct inet_timewait_sock *tw;
2051                rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2052
2053                read_lock_bh(lock);
2054                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2055                        if (sk->sk_family != st->family) {
2056                                continue;
2057                        }
2058                        rc = sk;
2059                        goto out;
2060                }
2061                st->state = TCP_SEQ_STATE_TIME_WAIT;
2062                inet_twsk_for_each(tw, node,
2063                                   &tcp_hashinfo.ehash[st->bucket].twchain) {
2064                        if (tw->tw_family != st->family) {
2065                                continue;
2066                        }
2067                        rc = tw;
2068                        goto out;
2069                }
2070                read_unlock_bh(lock);
2071                st->state = TCP_SEQ_STATE_ESTABLISHED;
2072        }
2073out:
2074        return rc;
2075}
2076
2077static void *established_get_next(struct seq_file *seq, void *cur)
2078{
2079        struct sock *sk = cur;
2080        struct inet_timewait_sock *tw;
2081        struct hlist_node *node;
2082        struct tcp_iter_state *st = seq->private;
2083
2084        ++st->num;
2085
2086        if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2087                tw = cur;
2088                tw = tw_next(tw);
2089get_tw:
2090                while (tw && tw->tw_family != st->family) {
2091                        tw = tw_next(tw);
2092                }
2093                if (tw) {
2094                        cur = tw;
2095                        goto out;
2096                }
2097                read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098                st->state = TCP_SEQ_STATE_ESTABLISHED;
2099
2100                if (++st->bucket < tcp_hashinfo.ehash_size) {
2101                        read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2102                        sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2103                } else {
2104                        cur = NULL;
2105                        goto out;
2106                }
2107        } else
2108                sk = sk_next(sk);
2109
2110        sk_for_each_from(sk, node) {
2111                if (sk->sk_family == st->family)
2112                        goto found;
2113        }
2114
2115        st->state = TCP_SEQ_STATE_TIME_WAIT;
2116        tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2117        goto get_tw;
2118found:
2119        cur = sk;
2120out:
2121        return cur;
2122}
2123
2124static void *established_get_idx(struct seq_file *seq, loff_t pos)
2125{
2126        void *rc = established_get_first(seq);
2127
2128        while (rc && pos) {
2129                rc = established_get_next(seq, rc);
2130                --pos;
2131        }
2132        return rc;
2133}
2134
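    /* Position the iterator at logical offset pos: the listening table
     * is walked first, then the established and TIME_WAIT tables.
     */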
2135static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2136{
2137        void *rc;
2138        struct tcp_iter_state *st = seq->private;
2139
2140        inet_listen_lock(&tcp_hashinfo);
2141        st->state = TCP_SEQ_STATE_LISTENING;
2142        rc        = listening_get_idx(seq, &pos);
2143
2144        if (!rc) {
2145                inet_listen_unlock(&tcp_hashinfo);
2146                st->state = TCP_SEQ_STATE_ESTABLISHED;
2147                rc        = established_get_idx(seq, pos);
2148        }
2149
2150        return rc;
2151}
2152
2153static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2154{
2155        struct tcp_iter_state *st = seq->private;
2156        st->state = TCP_SEQ_STATE_LISTENING;
2157        st->num = 0;
2158        return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2159}
2160
2161static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2162{
2163        void *rc = NULL;
2164        struct tcp_iter_state *st;
2165
2166        if (v == SEQ_START_TOKEN) {
2167                rc = tcp_get_idx(seq, 0);
2168                goto out;
2169        }
2170        st = seq->private;
2171
2172        switch (st->state) {
2173        case TCP_SEQ_STATE_OPENREQ:
2174        case TCP_SEQ_STATE_LISTENING:
2175                rc = listening_get_next(seq, v);
2176                if (!rc) {
2177                        inet_listen_unlock(&tcp_hashinfo);
2178                        st->state = TCP_SEQ_STATE_ESTABLISHED;
2179                        rc        = established_get_first(seq);
2180                }
2181                break;
2182        case TCP_SEQ_STATE_ESTABLISHED:
2183        case TCP_SEQ_STATE_TIME_WAIT:
2184                rc = established_get_next(seq, v);
2185                break;
2186        }
2187out:
2188        ++*pos;
2189        return rc;
2190}
2191
2192static void tcp_seq_stop(struct seq_file *seq, void *v)
2193{
2194        struct tcp_iter_state *st = seq->private;
2195
2196        switch (st->state) {
2197        case TCP_SEQ_STATE_OPENREQ:
2198                if (v) {
2199                        struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2200                        read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2201                }
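                    /* fall through */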
2202        case TCP_SEQ_STATE_LISTENING:
2203                if (v != SEQ_START_TOKEN)
2204                        inet_listen_unlock(&tcp_hashinfo);
2205                break;
2206        case TCP_SEQ_STATE_TIME_WAIT:
2207        case TCP_SEQ_STATE_ESTABLISHED:
2208                if (v)
2209                        read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2210                break;
2211        }
2212}
2213
2214static int tcp_seq_open(struct inode *inode, struct file *file)
2215{
2216        struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2217        struct seq_file *seq;
2218        struct tcp_iter_state *s;
2219        int rc;
2220
2221        if (unlikely(afinfo == NULL))
2222                return -EINVAL;
2223
2224        s = kzalloc(sizeof(*s), GFP_KERNEL);
2225        if (!s)
2226                return -ENOMEM;
2227        s->family               = afinfo->family;
2228        s->seq_ops.start        = tcp_seq_start;
2229        s->seq_ops.next         = tcp_seq_next;
2230        s->seq_ops.show         = afinfo->seq_show;
2231        s->seq_ops.stop         = tcp_seq_stop;
2232
2233        rc = seq_open(file, &s->seq_ops);
2234        if (rc)
2235                goto out_kfree;
2236        seq          = file->private_data;
2237        seq->private = s;
2238out:
2239        return rc;
2240out_kfree:
2241        kfree(s);
2242        goto out;
2243}
2244
2245int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2246{
2247        int rc = 0;
2248        struct proc_dir_entry *p;
2249
2250        if (!afinfo)
2251                return -EINVAL;
2252        afinfo->seq_fops->owner         = afinfo->owner;
2253        afinfo->seq_fops->open          = tcp_seq_open;
2254        afinfo->seq_fops->read          = seq_read;
2255        afinfo->seq_fops->llseek        = seq_lseek;
2256        afinfo->seq_fops->release       = seq_release_private;
2257
2258        p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2259        if (p)
2260                p->data = afinfo;
2261        else
2262                rc = -ENOMEM;
2263        return rc;
2264}
2265
2266void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2267{
2268        if (!afinfo)
2269                return;
2270        proc_net_remove(&init_net, afinfo->name);
2271        memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2272}
2273
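    /* Format one SYN_RECV open request in /proc/net/tcp layout. */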
2274static void get_openreq4(struct sock *sk, struct request_sock *req,
2275                         char *tmpbuf, int i, int uid)
2276{
2277        const struct inet_request_sock *ireq = inet_rsk(req);
2278        int ttd = req->expires - jiffies;
2279
2280        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2281                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2282                i,
2283                ireq->loc_addr,
2284                ntohs(inet_sk(sk)->sport),
2285                ireq->rmt_addr,
2286                ntohs(ireq->rmt_port),
2287                TCP_SYN_RECV,
2288                0, 0, /* could print option size, but that is af dependent. */
2289                1,    /* timers active (only the expire timer) */
2290                jiffies_to_clock_t(ttd),
2291                req->retrans,
2292                uid,
2293                0,  /* non standard timer */
2294                0, /* open_requests have no inode */
2295                atomic_read(&sk->sk_refcnt),
2296                req);
2297}
2298
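    /* Format one full socket in /proc/net/tcp layout.  timer_active
     * encodes the pending timer: 1 retransmit, 2 keepalive (sk_timer),
     * 4 zero-window probe; 3 is used for TIME_WAIT sockets.
     */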
2299static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2300{
2301        int timer_active;
2302        unsigned long timer_expires;
2303        struct tcp_sock *tp = tcp_sk(sk);
2304        const struct inet_connection_sock *icsk = inet_csk(sk);
2305        struct inet_sock *inet = inet_sk(sk);
2306        __be32 dest = inet->daddr;
2307        __be32 src = inet->rcv_saddr;
2308        __u16 destp = ntohs(inet->dport);
2309        __u16 srcp = ntohs(inet->sport);
2310
2311        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2312                timer_active    = 1;
2313                timer_expires   = icsk->icsk_timeout;
2314        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2315                timer_active    = 4;
2316                timer_expires   = icsk->icsk_timeout;
2317        } else if (timer_pending(&sk->sk_timer)) {
2318                timer_active    = 2;
2319                timer_expires   = sk->sk_timer.expires;
2320        } else {
2321                timer_active    = 0;
2322                timer_expires   = jiffies;
2323        }
2324
2325        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2326                        "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2327                i, src, srcp, dest, destp, sk->sk_state,
2328                tp->write_seq - tp->snd_una,
2329                sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2330                                             (tp->rcv_nxt - tp->copied_seq),
2331                timer_active,
2332                jiffies_to_clock_t(timer_expires - jiffies),
2333                icsk->icsk_retransmits,
2334                sock_i_uid(sk),
2335                icsk->icsk_probes_out,
2336                sock_i_ino(sk),
2337                atomic_read(&sk->sk_refcnt), sk,
2338                icsk->icsk_rto,
2339                icsk->icsk_ack.ato,
2340                (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2341                tp->snd_cwnd,
2342                tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2343}
2344
2345static void get_timewait4_sock(struct inet_timewait_sock *tw,
2346                               char *tmpbuf, int i)
2347{
2348        __be32 dest, src;
2349        __u16 destp, srcp;
2350        int ttd = tw->tw_ttd - jiffies;
2351
2352        if (ttd < 0)
2353                ttd = 0;
2354
2355        dest  = tw->tw_daddr;
2356        src   = tw->tw_rcv_saddr;
2357        destp = ntohs(tw->tw_dport);
2358        srcp  = ntohs(tw->tw_sport);
2359
2360        sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2361                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2362                i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2363                3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2364                atomic_read(&tw->tw_refcnt), tw);
2365}
2366
2367#define TMPSZ 150
2368
2369static int tcp4_seq_show(struct seq_file *seq, void *v)
2370{
2371        struct tcp_iter_state *st;
2372        char tmpbuf[TMPSZ + 1];
2373
2374        if (v == SEQ_START_TOKEN) {
2375                seq_printf(seq, "%-*s\n", TMPSZ - 1,
2376                           "  sl  local_address rem_address   st tx_queue "
2377                           "rx_queue tr tm->when retrnsmt   uid  timeout "
2378                           "inode");
2379                goto out;
2380        }
2381        st = seq->private;
2382
2383        switch (st->state) {
2384        case TCP_SEQ_STATE_LISTENING:
2385        case TCP_SEQ_STATE_ESTABLISHED:
2386                get_tcp4_sock(v, tmpbuf, st->num);
2387                break;
2388        case TCP_SEQ_STATE_OPENREQ:
2389                get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2390                break;
2391        case TCP_SEQ_STATE_TIME_WAIT:
2392                get_timewait4_sock(v, tmpbuf, st->num);
2393                break;
2394        }
2395        seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2396out:
2397        return 0;
2398}
2399
2400static struct file_operations tcp4_seq_fops;
2401static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2402        .owner          = THIS_MODULE,
2403        .name           = "tcp",
2404        .family         = AF_INET,
2405        .seq_show       = tcp4_seq_show,
2406        .seq_fops       = &tcp4_seq_fops,
2407};
2408
2409int __init tcp4_proc_init(void)
2410{
2411        return tcp_proc_register(&tcp4_seq_afinfo);
2412}
2413
2414void tcp4_proc_exit(void)
2415{
2416        tcp_proc_unregister(&tcp4_seq_afinfo);
2417}
2418#endif /* CONFIG_PROC_FS */
2419
2420DEFINE_PROTO_INUSE(tcp)
2421
2422struct proto tcp_prot = {
2423        .name                   = "TCP",
2424        .owner                  = THIS_MODULE,
2425        .close                  = tcp_close,
2426        .connect                = tcp_v4_connect,
2427        .disconnect             = tcp_disconnect,
2428        .accept                 = inet_csk_accept,
2429        .ioctl                  = tcp_ioctl,
2430        .init                   = tcp_v4_init_sock,
2431        .destroy                = tcp_v4_destroy_sock,
2432        .shutdown               = tcp_shutdown,
2433        .setsockopt             = tcp_setsockopt,
2434        .getsockopt             = tcp_getsockopt,
2435        .recvmsg                = tcp_recvmsg,
2436        .backlog_rcv            = tcp_v4_do_rcv,
2437        .hash                   = tcp_v4_hash,
2438        .unhash                 = tcp_unhash,
2439        .get_port               = tcp_v4_get_port,
2440        .enter_memory_pressure  = tcp_enter_memory_pressure,
2441        .sockets_allocated      = &tcp_sockets_allocated,
2442        .orphan_count           = &tcp_orphan_count,
2443        .memory_allocated       = &tcp_memory_allocated,
2444        .memory_pressure        = &tcp_memory_pressure,
2445        .sysctl_mem             = sysctl_tcp_mem,
2446        .sysctl_wmem            = sysctl_tcp_wmem,
2447        .sysctl_rmem            = sysctl_tcp_rmem,
2448        .max_header             = MAX_TCP_HEADER,
2449        .obj_size               = sizeof(struct tcp_sock),
2450        .twsk_prot              = &tcp_timewait_sock_ops,
2451        .rsk_prot               = &tcp_request_sock_ops,
2452#ifdef CONFIG_COMPAT
2453        .compat_setsockopt      = compat_tcp_setsockopt,
2454        .compat_getsockopt      = compat_tcp_getsockopt,
2455#endif
2456        REF_PROTO_INUSE(tcp)
2457};
2458
2459void __init tcp_v4_init(struct net_proto_family *ops)
2460{
2461        if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2462                                     IPPROTO_TCP) < 0)
2463                panic("Failed to create the TCP control socket.\n");
2464}
2465
2466EXPORT_SYMBOL(ipv4_specific);
2467EXPORT_SYMBOL(tcp_hashinfo);
2468EXPORT_SYMBOL(tcp_prot);
2469EXPORT_SYMBOL(tcp_unhash);
2470EXPORT_SYMBOL(tcp_v4_conn_request);
2471EXPORT_SYMBOL(tcp_v4_connect);
2472EXPORT_SYMBOL(tcp_v4_do_rcv);
2473EXPORT_SYMBOL(tcp_v4_remember_stamp);
2474EXPORT_SYMBOL(tcp_v4_send_check);
2475EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2476
2477#ifdef CONFIG_PROC_FS
2478EXPORT_SYMBOL(tcp_proc_register);
2479EXPORT_SYMBOL(tcp_proc_unregister);
2480#endif
2481EXPORT_SYMBOL(sysctl_tcp_low_latency);
2482