linux/net/ipv4/inet_connection_sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Support for INET connection oriented protocols.
   7 *
   8 * Authors:     See the TCP sources
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or(at your option) any later version.
  14 */
  15
  16#include <linux/module.h>
  17#include <linux/jhash.h>
  18
  19#include <net/inet_connection_sock.h>
  20#include <net/inet_hashtables.h>
  21#include <net/inet_timewait_sock.h>
  22#include <net/ip.h>
  23#include <net/route.h>
  24#include <net/tcp_states.h>
  25#include <net/xfrm.h>
  26
  27#ifdef INET_CSK_DEBUG
  28const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
  29EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  30#endif
  31
  32/*
  33 * This struct holds the first and last local port number.
  34 */
  35struct local_ports sysctl_local_ports __read_mostly = {
  36        .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
  37        .range = { 32768, 61000 },
  38};
  39
  40unsigned long *sysctl_local_reserved_ports;
  41EXPORT_SYMBOL(sysctl_local_reserved_ports);
  42
  43void inet_get_local_port_range(int *low, int *high)
  44{
  45        unsigned int seq;
  46
  47        do {
  48                seq = read_seqbegin(&sysctl_local_ports.lock);
  49
  50                *low = sysctl_local_ports.range[0];
  51                *high = sysctl_local_ports.range[1];
  52        } while (read_seqretry(&sysctl_local_ports.lock, seq));
  53}
  54EXPORT_SYMBOL(inet_get_local_port_range);
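
    /* Note on the loop above: this is the standard seqlock read side.
     * read_seqbegin()/read_seqretry() make the reader retry until no
     * writer touched sysctl_local_ports.lock in between, so callers
     * always see a consistent (low, high) pair even while the range
     * sysctl is being rewritten concurrently.
     */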
  55
  56int inet_csk_bind_conflict(const struct sock *sk,
  57                           const struct inet_bind_bucket *tb, bool relax)
  58{
  59        struct sock *sk2;
  60        int reuse = sk->sk_reuse;
  61        int reuseport = sk->sk_reuseport;
  62        kuid_t uid = sock_i_uid((struct sock *)sk);
  63
  64        /*
  65         * Unlike other sk lookup places we do not check
  66         * for sk_net here, since _all_ the socks listed
  67         * in tb->owners list belong to the same net - the
  68         * one this bucket belongs to.
  69         */
  70
  71        sk_for_each_bound(sk2, &tb->owners) {
  72                if (sk != sk2 &&
  73                    !inet_v6_ipv6only(sk2) &&
  74                    (!sk->sk_bound_dev_if ||
  75                     !sk2->sk_bound_dev_if ||
  76                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
  77                        if ((!reuse || !sk2->sk_reuse ||
  78                            sk2->sk_state == TCP_LISTEN) &&
  79                            (!reuseport || !sk2->sk_reuseport ||
  80                            (sk2->sk_state != TCP_TIME_WAIT &&
  81                             !uid_eq(uid, sock_i_uid(sk2))))) {
  82                                const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
  83                                if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
  84                                    sk2_rcv_saddr == sk_rcv_saddr(sk))
  85                                        break;
  86                        }
  87                        if (!relax && reuse && sk2->sk_reuse &&
  88                            sk2->sk_state != TCP_LISTEN) {
  89                                const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
  90
  91                                if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
  92                                    sk2_rcv_saddr == sk_rcv_saddr(sk))
  93                                        break;
  94                        }
  95                }
  96        }
  97        return sk2 != NULL;
  98}
  99EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
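
    /* Roughly, the rules implemented above: another socket bound to the
     * same port only conflicts if the sockets are not bound to different
     * devices, the two rcv_saddrs overlap (one is a wildcard or they are
     * equal), and neither escape applies:
     *   - SO_REUSEADDR: both sockets set it and the existing one is not
     *     listening;
     *   - SO_REUSEPORT: both sockets set it and the existing one is in
     *     TIME_WAIT or owned by the same effective uid.
     * With relax == false the SO_REUSEADDR escape is ignored, which is
     * the stricter variant used while searching for an ephemeral port.
     */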
 100
 101/* Obtain a reference to a local port for the given sock.
 102 * If snum is zero, it means select any available local port.
 103 */
 104int inet_csk_get_port(struct sock *sk, unsigned short snum)
 105{
 106        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 107        struct inet_bind_hashbucket *head;
 108        struct inet_bind_bucket *tb;
 109        int ret, attempts = 5;
 110        struct net *net = sock_net(sk);
 111        int smallest_size = -1, smallest_rover;
 112        kuid_t uid = sock_i_uid(sk);
 113
 114        local_bh_disable();
 115        if (!snum) {
 116                int remaining, rover, low, high;
 117
 118again:
 119                inet_get_local_port_range(&low, &high);
 120                remaining = (high - low) + 1;
 121                smallest_rover = rover = net_random() % remaining + low;
 122
 123                smallest_size = -1;
 124                do {
 125                        if (inet_is_reserved_local_port(rover))
 126                                goto next_nolock;
 127                        head = &hashinfo->bhash[inet_bhashfn(net, rover,
 128                                        hashinfo->bhash_size)];
 129                        spin_lock(&head->lock);
 130                        inet_bind_bucket_for_each(tb, &head->chain)
 131                                if (net_eq(ib_net(tb), net) && tb->port == rover) {
 132                                        if (((tb->fastreuse > 0 &&
 133                                              sk->sk_reuse &&
 134                                              sk->sk_state != TCP_LISTEN) ||
 135                                             (tb->fastreuseport > 0 &&
 136                                              sk->sk_reuseport &&
 137                                              uid_eq(tb->fastuid, uid))) &&
 138                                            (tb->num_owners < smallest_size || smallest_size == -1)) {
 139                                                smallest_size = tb->num_owners;
 140                                                smallest_rover = rover;
 141                                                if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
 142                                                    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
 143                                                        snum = smallest_rover;
 144                                                        goto tb_found;
 145                                                }
 146                                        }
 147                                        if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
 148                                                snum = rover;
 149                                                goto tb_found;
 150                                        }
 151                                        goto next;
 152                                }
 153                        break;
 154                next:
 155                        spin_unlock(&head->lock);
 156                next_nolock:
 157                        if (++rover > high)
 158                                rover = low;
 159                } while (--remaining > 0);
 160
 161                /* Exhausted local port range during search?  It is not
 162                 * possible for us to be holding one of the bind hash
 163                 * locks if this test triggers, because if 'remaining'
 164                 * drops to zero, we broke out of the do/while loop at
 165                 * the top level, not from the 'break;' statement.
 166                 */
 167                ret = 1;
 168                if (remaining <= 0) {
 169                        if (smallest_size != -1) {
 170                                snum = smallest_rover;
 171                                goto have_snum;
 172                        }
 173                        goto fail;
 174                }
 175                /* OK, here is the one we will use.  HEAD is
 176         * non-NULL and we hold its lock.
 177                 */
 178                snum = rover;
 179        } else {
 180have_snum:
 181                head = &hashinfo->bhash[inet_bhashfn(net, snum,
 182                                hashinfo->bhash_size)];
 183                spin_lock(&head->lock);
 184                inet_bind_bucket_for_each(tb, &head->chain)
 185                        if (net_eq(ib_net(tb), net) && tb->port == snum)
 186                                goto tb_found;
 187        }
 188        tb = NULL;
 189        goto tb_not_found;
 190tb_found:
 191        if (!hlist_empty(&tb->owners)) {
 192                if (sk->sk_reuse == SK_FORCE_REUSE)
 193                        goto success;
 194
 195                if (((tb->fastreuse > 0 &&
 196                      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 197                     (tb->fastreuseport > 0 &&
 198                      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 199                    smallest_size == -1) {
 200                        goto success;
 201                } else {
 202                        ret = 1;
 203                        if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
 204                                if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 205                                     (tb->fastreuseport > 0 &&
 206                                      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 207                                    smallest_size != -1 && --attempts >= 0) {
 208                                        spin_unlock(&head->lock);
 209                                        goto again;
 210                                }
 211
 212                                goto fail_unlock;
 213                        }
 214                }
 215        }
 216tb_not_found:
 217        ret = 1;
 218        if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
 219                                        net, head, snum)) == NULL)
 220                goto fail_unlock;
 221        if (hlist_empty(&tb->owners)) {
 222                if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 223                        tb->fastreuse = 1;
 224                else
 225                        tb->fastreuse = 0;
 226                if (sk->sk_reuseport) {
 227                        tb->fastreuseport = 1;
 228                        tb->fastuid = uid;
 229                } else
 230                        tb->fastreuseport = 0;
 231        } else {
 232                if (tb->fastreuse &&
 233                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
 234                        tb->fastreuse = 0;
 235                if (tb->fastreuseport &&
 236                    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
 237                        tb->fastreuseport = 0;
 238        }
 239success:
 240        if (!inet_csk(sk)->icsk_bind_hash)
 241                inet_bind_hash(sk, tb, snum);
 242        WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 243        ret = 0;
 244
 245fail_unlock:
 246        spin_unlock(&head->lock);
 247fail:
 248        local_bh_enable();
 249        return ret;
 250}
 251EXPORT_SYMBOL_GPL(inet_csk_get_port);
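
    /* For context, a minimal sketch of how this is normally reached from
     * userspace (assuming the ordinary TCP bind path): binding with
     * sin_port == 0, e.g.
     *
     *      struct sockaddr_in a = { .sin_family = AF_INET };
     *      bind(fd, (struct sockaddr *)&a, sizeof(a));
     *
     * arrives here with snum == 0 and runs the ephemeral-port search,
     * while a non-zero sin_port takes the have_snum path directly.
     */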
 252
 253/*
 254 * Wait for an incoming connection, avoid race conditions. This must be called
 255 * with the socket locked.
 256 */
 257static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 258{
 259        struct inet_connection_sock *icsk = inet_csk(sk);
 260        DEFINE_WAIT(wait);
 261        int err;
 262
 263        /*
 264         * True wake-one mechanism for incoming connections: only
 265         * one process gets woken up, not the 'whole herd'.
 266         * Since we do not 'race & poll' for established sockets
 267         * anymore, the common case will execute the loop only once.
 268         *
 269         * Subtle issue: a waiter added with "add_wait_queue_exclusive()"
 270         * goes after any current non-exclusive waiters, and we know that
 271         * it will always _stay_ after any new non-exclusive waiters
 272         * because all non-exclusive waiters are added at the
 273         * beginning of the wait-queue. As such, it's ok to "drop"
 274         * our exclusiveness temporarily when we get woken up without
 275         * having to remove and re-insert us on the wait queue.
 276         */
 277        for (;;) {
 278                prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 279                                          TASK_INTERRUPTIBLE);
 280                release_sock(sk);
 281                if (reqsk_queue_empty(&icsk->icsk_accept_queue))
 282                        timeo = schedule_timeout(timeo);
 283                lock_sock(sk);
 284                err = 0;
 285                if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
 286                        break;
 287                err = -EINVAL;
 288                if (sk->sk_state != TCP_LISTEN)
 289                        break;
 290                err = sock_intr_errno(timeo);
 291                if (signal_pending(current))
 292                        break;
 293                err = -EAGAIN;
 294                if (!timeo)
 295                        break;
 296        }
 297        finish_wait(sk_sleep(sk), &wait);
 298        return err;
 299}
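
    /* Return values of the helper above: 0 once the accept queue is
     * non-empty, -EINVAL if the socket left TCP_LISTEN while we slept,
     * -EAGAIN when the timeout expires, or the errno chosen by
     * sock_intr_errno() if a signal is pending.  The socket lock is
     * dropped around schedule_timeout() so the receive path can queue
     * new connections while we sleep.
     */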
 300
 301/*
 302 * This will accept the next outstanding connection.
 303 */
 304struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 305{
 306        struct inet_connection_sock *icsk = inet_csk(sk);
 307        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 308        struct sock *newsk;
 309        struct request_sock *req;
 310        int error;
 311
 312        lock_sock(sk);
 313
 314        /* We need to make sure that this socket is listening,
 315         * and that it has something pending.
 316         */
 317        error = -EINVAL;
 318        if (sk->sk_state != TCP_LISTEN)
 319                goto out_err;
 320
 321        /* Find already established connection */
 322        if (reqsk_queue_empty(queue)) {
 323                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 324
 325                /* If this is a non-blocking socket, don't sleep */
 326                error = -EAGAIN;
 327                if (!timeo)
 328                        goto out_err;
 329
 330                error = inet_csk_wait_for_connect(sk, timeo);
 331                if (error)
 332                        goto out_err;
 333        }
 334        req = reqsk_queue_remove(queue);
 335        newsk = req->sk;
 336
 337        sk_acceptq_removed(sk);
 338        if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
 339                spin_lock_bh(&queue->fastopenq->lock);
 340                if (tcp_rsk(req)->listener) {
 341                        /* We are still waiting for the final ACK of the 3WHS,
 342                         * so we can't free req now. Instead, we set req->sk to
 343                         * NULL to signify that the child socket is taken
 344                         * so reqsk_fastopen_remove() will free the req
 345                         * when 3WHS finishes (or is aborted).
 346                         */
 347                        req->sk = NULL;
 348                        req = NULL;
 349                }
 350                spin_unlock_bh(&queue->fastopenq->lock);
 351        }
 352out:
 353        release_sock(sk);
 354        if (req)
 355                __reqsk_free(req);
 356        return newsk;
 357out_err:
 358        newsk = NULL;
 359        req = NULL;
 360        *err = error;
 361        goto out;
 362}
 363EXPORT_SYMBOL(inet_csk_accept);
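
    /* Error convention: on failure inet_csk_accept() returns NULL and
     * stores the negative errno in *err; on success *err is left alone
     * and the already-established child socket taken from the accept
     * queue is returned (for Fast Open the request may outlive the
     * accept, as handled above).
     */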
 364
 365/*
 366 * Using different timers for retransmit, delayed acks and probes.
 367 * We may wish to use just one timer maintaining a list of expire jiffies
 368 * to optimize.
 369 */
 370void inet_csk_init_xmit_timers(struct sock *sk,
 371                               void (*retransmit_handler)(unsigned long),
 372                               void (*delack_handler)(unsigned long),
 373                               void (*keepalive_handler)(unsigned long))
 374{
 375        struct inet_connection_sock *icsk = inet_csk(sk);
 376
 377        setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
 378                        (unsigned long)sk);
 379        setup_timer(&icsk->icsk_delack_timer, delack_handler,
 380                        (unsigned long)sk);
 381        setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
 382        icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 383}
 384EXPORT_SYMBOL(inet_csk_init_xmit_timers);
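
    /* All three handlers receive the struct sock pointer, cast to
     * unsigned long, as their timer argument.  Retransmit and delayed
     * ACK use timers embedded in inet_connection_sock, while keepalive
     * reuses the generic sk->sk_timer, which is why the keepalive
     * helpers below only touch sk->sk_timer.
     */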
 385
 386void inet_csk_clear_xmit_timers(struct sock *sk)
 387{
 388        struct inet_connection_sock *icsk = inet_csk(sk);
 389
 390        icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
 391
 392        sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
 393        sk_stop_timer(sk, &icsk->icsk_delack_timer);
 394        sk_stop_timer(sk, &sk->sk_timer);
 395}
 396EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
 397
 398void inet_csk_delete_keepalive_timer(struct sock *sk)
 399{
 400        sk_stop_timer(sk, &sk->sk_timer);
 401}
 402EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
 403
 404void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
 405{
 406        sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
 407}
 408EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 409
 410struct dst_entry *inet_csk_route_req(struct sock *sk,
 411                                     struct flowi4 *fl4,
 412                                     const struct request_sock *req)
 413{
 414        struct rtable *rt;
 415        const struct inet_request_sock *ireq = inet_rsk(req);
 416        struct ip_options_rcu *opt = inet_rsk(req)->opt;
 417        struct net *net = sock_net(sk);
 418        int flags = inet_sk_flowi_flags(sk);
 419
 420        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 421                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 422                           sk->sk_protocol,
 423                           flags,
 424                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 425                           ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 426        security_req_classify_flow(req, flowi4_to_flowi(fl4));
 427        rt = ip_route_output_flow(net, fl4, sk);
 428        if (IS_ERR(rt))
 429                goto no_route;
 430        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 431                goto route_err;
 432        return &rt->dst;
 433
 434route_err:
 435        ip_rt_put(rt);
 436no_route:
 437        IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 438        return NULL;
 439}
 440EXPORT_SYMBOL_GPL(inet_csk_route_req);
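
    /* Note on source routing above: if the request carries an SRR
     * option, the flow is built toward the first hop (opt->opt.faddr)
     * rather than the peer address, and a strict source route whose
     * next hop would have to go through a gateway is rejected via
     * route_err.
     */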
 441
 442struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 443                                            struct sock *newsk,
 444                                            const struct request_sock *req)
 445{
 446        const struct inet_request_sock *ireq = inet_rsk(req);
 447        struct inet_sock *newinet = inet_sk(newsk);
 448        struct ip_options_rcu *opt;
 449        struct net *net = sock_net(sk);
 450        struct flowi4 *fl4;
 451        struct rtable *rt;
 452
 453        fl4 = &newinet->cork.fl.u.ip4;
 454
 455        rcu_read_lock();
 456        opt = rcu_dereference(newinet->inet_opt);
 457        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 458                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 459                           sk->sk_protocol, inet_sk_flowi_flags(sk),
 460                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 461                           ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 462        security_req_classify_flow(req, flowi4_to_flowi(fl4));
 463        rt = ip_route_output_flow(net, fl4, sk);
 464        if (IS_ERR(rt))
 465                goto no_route;
 466        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 467                goto route_err;
 468        rcu_read_unlock();
 469        return &rt->dst;
 470
 471route_err:
 472        ip_rt_put(rt);
 473no_route:
 474        rcu_read_unlock();
 475        IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 476        return NULL;
 477}
 478EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 479
 480static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
 481                                 const u32 rnd, const u32 synq_hsize)
 482{
 483        return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
 484}
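
    /* The '& (synq_hsize - 1)' above only works if synq_hsize is a power
     * of two; nr_table_entries is expected to have been rounded up
     * accordingly when the listen queue was allocated (see
     * reqsk_queue_alloc()).
     */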
 485
 486#if IS_ENABLED(CONFIG_IPV6)
 487#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
 488#else
 489#define AF_INET_FAMILY(fam) 1
 490#endif
 491
 492struct request_sock *inet_csk_search_req(const struct sock *sk,
 493                                         struct request_sock ***prevp,
 494                                         const __be16 rport, const __be32 raddr,
 495                                         const __be32 laddr)
 496{
 497        const struct inet_connection_sock *icsk = inet_csk(sk);
 498        struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
 499        struct request_sock *req, **prev;
 500
 501        for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
 502                                                    lopt->nr_table_entries)];
 503             (req = *prev) != NULL;
 504             prev = &req->dl_next) {
 505                const struct inet_request_sock *ireq = inet_rsk(req);
 506
 507                if (ireq->rmt_port == rport &&
 508                    ireq->rmt_addr == raddr &&
 509                    ireq->loc_addr == laddr &&
 510                    AF_INET_FAMILY(req->rsk_ops->family)) {
 511                        WARN_ON(req->sk);
 512                        *prevp = prev;
 513                        break;
 514                }
 515        }
 516
 517        return req;
 518}
 519EXPORT_SYMBOL_GPL(inet_csk_search_req);
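
    /* On a hit, *prevp points at the dl_next slot (or bucket head) that
     * references the request, so the caller can unlink it from the
     * singly linked hash chain without walking the bucket again.
     */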
 520
 521void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 522                                   unsigned long timeout)
 523{
 524        struct inet_connection_sock *icsk = inet_csk(sk);
 525        struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
 526        const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
 527                                     lopt->hash_rnd, lopt->nr_table_entries);
 528
 529        reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
 530        inet_csk_reqsk_queue_added(sk, timeout);
 531}
 532EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 533
 534/* Only thing we need from tcp.h */
 535extern int sysctl_tcp_synack_retries;
 536
 537
 538/* Decide when to expire the request and when to resend SYN-ACK */
 539static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 540                                  const int max_retries,
 541                                  const u8 rskq_defer_accept,
 542                                  int *expire, int *resend)
 543{
 544        if (!rskq_defer_accept) {
 545                *expire = req->num_timeout >= thresh;
 546                *resend = 1;
 547                return;
 548        }
 549        *expire = req->num_timeout >= thresh &&
 550                  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
 551        /*
 552         * Do not resend while waiting for data after ACK;
 553         * start resending at the end of the deferring period to give
 554         * a last chance for data or an ACK to create an established socket.
 555         */
 556        *resend = !inet_rsk(req)->acked ||
 557                  req->num_timeout >= rskq_defer_accept - 1;
 558}
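
    /* Worked example (values chosen for illustration): with
     * rskq_defer_accept == 3 and thresh == max_retries == 5, a request
     * whose ACK has already been seen is not resent while
     * num_timeout < 2, starts resending SYN-ACKs once num_timeout
     * reaches 2, and only expires once num_timeout reaches 5; an
     * un-acked request behaves exactly as in the non-deferred case.
     */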
 559
 560int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
 561{
 562        int err = req->rsk_ops->rtx_syn_ack(parent, req);
 563
 564        if (!err)
 565                req->num_retrans++;
 566        return err;
 567}
 568EXPORT_SYMBOL(inet_rtx_syn_ack);
 569
 570void inet_csk_reqsk_queue_prune(struct sock *parent,
 571                                const unsigned long interval,
 572                                const unsigned long timeout,
 573                                const unsigned long max_rto)
 574{
 575        struct inet_connection_sock *icsk = inet_csk(parent);
 576        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 577        struct listen_sock *lopt = queue->listen_opt;
 578        int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
 579        int thresh = max_retries;
 580        unsigned long now = jiffies;
 581        struct request_sock **reqp, *req;
 582        int i, budget;
 583
 584        if (lopt == NULL || lopt->qlen == 0)
 585                return;
 586
 587        /* Normally all the openreqs are young and become mature
 588         * (i.e. converted to an established socket) within the first timeout.
 589         * If a synack was not acknowledged for 1 second, it means
 590         * one of the following: the synack was lost, the ack was lost,
 591         * the rtt is high, or nobody planned to ack (i.e. synflood).
 592         * When the server is a bit loaded, the queue is populated with old
 593         * open requests, reducing the effective size of the queue.
 594         * When the server is heavily loaded, the queue size drops to zero
 595         * after several minutes of work. That is not a synflood,
 596         * it is normal operation. The solution is to prune entries that
 597         * are too old, overriding the normal timeout, when the
 598         * situation becomes dangerous.
 599         *
 600         * Essentially, we reserve half of the room for young
 601         * embryos and abort old ones without pity if the old
 602         * ones are about to clog our table.
 603         */
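            /* Worked example of the scaling below (illustrative numbers):
             * with max_qlen_log == 8 the check fires once qlen reaches 128;
             * if only 16 of those requests are still young (young == 32),
             * thresh drops from the usual 5 SYN-ACK retries down to 2, so
             * stale embryos are timed out much more aggressively.
             */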
 604        if (lopt->qlen >> (lopt->max_qlen_log - 1)) {
 605                int young = (lopt->qlen_young << 1);
 606
 607                while (thresh > 2) {
 608                        if (lopt->qlen < young)
 609                                break;
 610                        thresh--;
 611                        young <<= 1;
 612                }
 613        }
 614
 615        if (queue->rskq_defer_accept)
 616                max_retries = queue->rskq_defer_accept;
 617
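            /* The clock hand below walks 'budget' buckets per invocation;
             * since the prune timer refires every 'interval' jiffies, the
             * whole syn_table is swept roughly twice per 'timeout' period.
             */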
 618        budget = 2 * (lopt->nr_table_entries / (timeout / interval));
 619        i = lopt->clock_hand;
 620
 621        do {
 622                reqp = &lopt->syn_table[i];
 623                while ((req = *reqp) != NULL) {
 624                        if (time_after_eq(now, req->expires)) {
 625                                int expire = 0, resend = 0;
 626
 627                                syn_ack_recalc(req, thresh, max_retries,
 628                                               queue->rskq_defer_accept,
 629                                               &expire, &resend);
 630                                req->rsk_ops->syn_ack_timeout(parent, req);
 631                                if (!expire &&
 632                                    (!resend ||
 633                                     !inet_rtx_syn_ack(parent, req) ||
 634                                     inet_rsk(req)->acked)) {
 635                                        unsigned long timeo;
 636
 637                                        if (req->num_timeout++ == 0)
 638                                                lopt->qlen_young--;
 639                                        timeo = min(timeout << req->num_timeout,
 640                                                    max_rto);
 641                                        req->expires = now + timeo;
 642                                        reqp = &req->dl_next;
 643                                        continue;
 644                                }
 645
 646                                /* Drop this request */
 647                                inet_csk_reqsk_queue_unlink(parent, req, reqp);
 648                                reqsk_queue_removed(queue, req);
 649                                reqsk_free(req);
 650                                continue;
 651                        }
 652                        reqp = &req->dl_next;
 653                }
 654
 655                i = (i + 1) & (lopt->nr_table_entries - 1);
 656
 657        } while (--budget > 0);
 658
 659        lopt->clock_hand = i;
 660
 661        if (lopt->qlen)
 662                inet_csk_reset_keepalive_timer(parent, interval);
 663}
 664EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 665
 666/**
 667 *      inet_csk_clone_lock - clone an inet socket, and lock its clone
 668 *      @sk: the socket to clone
 669 *      @req: request_sock
 670 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 671 *
 672 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 673 */
 674struct sock *inet_csk_clone_lock(const struct sock *sk,
 675                                 const struct request_sock *req,
 676                                 const gfp_t priority)
 677{
 678        struct sock *newsk = sk_clone_lock(sk, priority);
 679
 680        if (newsk != NULL) {
 681                struct inet_connection_sock *newicsk = inet_csk(newsk);
 682
 683                newsk->sk_state = TCP_SYN_RECV;
 684                newicsk->icsk_bind_hash = NULL;
 685
 686                inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
 687                inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
 688                inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
 689                newsk->sk_write_space = sk_stream_write_space;
 690
 691                newicsk->icsk_retransmits = 0;
 692                newicsk->icsk_backoff     = 0;
 693                newicsk->icsk_probes_out  = 0;
 694
 695                /* Deinitialize accept_queue to trap illegal accesses. */
 696                memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
 697
 698                security_inet_csk_clone(newsk, req);
 699        }
 700        return newsk;
 701}
 702EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 703
 704/*
 705 * At this point, there should be no process reference to this
 706 * socket, and thus no user references at all.  Therefore we
 707 * can assume the socket waitqueue is inactive and nobody will
 708 * try to jump onto it.
 709 */
 710void inet_csk_destroy_sock(struct sock *sk)
 711{
 712        WARN_ON(sk->sk_state != TCP_CLOSE);
 713        WARN_ON(!sock_flag(sk, SOCK_DEAD));
 714
 715        /* It cannot be in hash table! */
 716        WARN_ON(!sk_unhashed(sk));
 717
 718        /* If inet_sk(sk)->inet_num is not 0, it must be bound */
 719        WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
 720
 721        sk->sk_prot->destroy(sk);
 722
 723        sk_stream_kill_queues(sk);
 724
 725        xfrm_sk_free_policy(sk);
 726
 727        sk_refcnt_debug_release(sk);
 728
 729        percpu_counter_dec(sk->sk_prot->orphan_count);
 730        sock_put(sk);
 731}
 732EXPORT_SYMBOL(inet_csk_destroy_sock);
 733
 734/* This function allows us to force the closure of a socket after the call to
 735 * tcp/dccp_create_openreq_child().
 736 */
 737void inet_csk_prepare_forced_close(struct sock *sk)
 738        __releases(&sk->sk_lock.slock)
 739{
 740        /* sk_clone_lock locked the socket and set refcnt to 2 */
 741        bh_unlock_sock(sk);
 742        sock_put(sk);
 743
 744        /* The below has to be done to allow calling inet_csk_destroy_sock */
 745        sock_set_flag(sk, SOCK_DEAD);
 746        percpu_counter_inc(sk->sk_prot->orphan_count);
 747        inet_sk(sk)->inet_num = 0;
 748}
 749EXPORT_SYMBOL(inet_csk_prepare_forced_close);
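
    /* Typical use (a sketch of the expected caller pattern, not a
     * requirement): the *_syn_recv_sock() error paths call
     * inet_csk_prepare_forced_close(newsk) and then tcp_done()/dccp_done()
     * on the child, which ends in inet_csk_destroy_sock() now that the
     * socket is marked dead, orphaned and unbound.
     */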
 750
 751int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 752{
 753        struct inet_sock *inet = inet_sk(sk);
 754        struct inet_connection_sock *icsk = inet_csk(sk);
 755        int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
 756
 757        if (rc != 0)
 758                return rc;
 759
 760        sk->sk_max_ack_backlog = 0;
 761        sk->sk_ack_backlog = 0;
 762        inet_csk_delack_init(sk);
 763
 764        /* There is a race window here: we announce ourselves listening,
 765         * but this transition is still not validated by get_port().
 766         * It is OK, because this socket enters the hash table only
 767         * after validation is complete.
 768         */
 769        sk->sk_state = TCP_LISTEN;
 770        if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
 771                inet->inet_sport = htons(inet->inet_num);
 772
 773                sk_dst_reset(sk);
 774                sk->sk_prot->hash(sk);
 775
 776                return 0;
 777        }
 778
 779        sk->sk_state = TCP_CLOSE;
 780        __reqsk_queue_destroy(&icsk->icsk_accept_queue);
 781        return -EADDRINUSE;
 782}
 783EXPORT_SYMBOL_GPL(inet_csk_listen_start);
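
    /* This is the protocol half of listen(): inet_listen() (and the DCCP
     * equivalent) normally call it with the listen() backlog as
     * nr_table_entries.  On success the socket has been hashed into the
     * listening table; on failure the accept queue allocated above is
     * torn down and -EADDRINUSE is returned.
     */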
 784
 785/*
 786 *      This routine closes sockets which have been at least partially
 787 *      opened, but not yet accepted.
 788 */
 789void inet_csk_listen_stop(struct sock *sk)
 790{
 791        struct inet_connection_sock *icsk = inet_csk(sk);
 792        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 793        struct request_sock *acc_req;
 794        struct request_sock *req;
 795
 796        inet_csk_delete_keepalive_timer(sk);
 797
 798        /* make all the listen_opt local to us */
 799        acc_req = reqsk_queue_yank_acceptq(queue);
 800
 801        /* Following the specs, it would be better either to send a FIN
 802         * (and enter FIN-WAIT-1, i.e. a normal close)
 803         * or to send an active reset (abort).
 804         * Certainly, it is pretty dangerous during a synflood, but that is
 805         * a bad justification for our negligence 8)
 806         * To be honest, we are not able to implement either
 807         * of the variants now.                 --ANK
 808         */
 809        reqsk_queue_destroy(queue);
 810
 811        while ((req = acc_req) != NULL) {
 812                struct sock *child = req->sk;
 813
 814                acc_req = req->dl_next;
 815
 816                local_bh_disable();
 817                bh_lock_sock(child);
 818                WARN_ON(sock_owned_by_user(child));
 819                sock_hold(child);
 820
 821                sk->sk_prot->disconnect(child, O_NONBLOCK);
 822
 823                sock_orphan(child);
 824
 825                percpu_counter_inc(sk->sk_prot->orphan_count);
 826
 827                if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 828                        BUG_ON(tcp_sk(child)->fastopen_rsk != req);
 829                        BUG_ON(sk != tcp_rsk(req)->listener);
 830
 831                        /* Paranoid, to prevent a race condition if
 832                         * an inbound pkt destined for the child is
 833                         * blocked by sock lock in tcp_v4_rcv().
 834                         * Also to satisfy an assertion in
 835                         * tcp_v4_destroy_sock().
 836                         */
 837                        tcp_sk(child)->fastopen_rsk = NULL;
 838                        sock_put(sk);
 839                }
 840                inet_csk_destroy_sock(child);
 841
 842                bh_unlock_sock(child);
 843                local_bh_enable();
 844                sock_put(child);
 845
 846                sk_acceptq_removed(sk);
 847                __reqsk_free(req);
 848        }
 849        if (queue->fastopenq != NULL) {
 850                /* Free all the reqs queued in rskq_rst_head. */
 851                spin_lock_bh(&queue->fastopenq->lock);
 852                acc_req = queue->fastopenq->rskq_rst_head;
 853                queue->fastopenq->rskq_rst_head = NULL;
 854                spin_unlock_bh(&queue->fastopenq->lock);
 855                while ((req = acc_req) != NULL) {
 856                        acc_req = req->dl_next;
 857                        __reqsk_free(req);
 858                }
 859        }
 860        WARN_ON(sk->sk_ack_backlog);
 861}
 862EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
 863
 864void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 865{
 866        struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 867        const struct inet_sock *inet = inet_sk(sk);
 868
 869        sin->sin_family         = AF_INET;
 870        sin->sin_addr.s_addr    = inet->inet_daddr;
 871        sin->sin_port           = inet->inet_dport;
 872}
 873EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
 874
 875#ifdef CONFIG_COMPAT
 876int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 877                               char __user *optval, int __user *optlen)
 878{
 879        const struct inet_connection_sock *icsk = inet_csk(sk);
 880
 881        if (icsk->icsk_af_ops->compat_getsockopt != NULL)
 882                return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
 883                                                            optval, optlen);
 884        return icsk->icsk_af_ops->getsockopt(sk, level, optname,
 885                                             optval, optlen);
 886}
 887EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
 888
 889int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 890                               char __user *optval, unsigned int optlen)
 891{
 892        const struct inet_connection_sock *icsk = inet_csk(sk);
 893
 894        if (icsk->icsk_af_ops->compat_setsockopt != NULL)
 895                return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
 896                                                            optval, optlen);
 897        return icsk->icsk_af_ops->setsockopt(sk, level, optname,
 898                                             optval, optlen);
 899}
 900EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 901#endif
 902
 903static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
 904{
 905        const struct inet_sock *inet = inet_sk(sk);
 906        const struct ip_options_rcu *inet_opt;
 907        __be32 daddr = inet->inet_daddr;
 908        struct flowi4 *fl4;
 909        struct rtable *rt;
 910
 911        rcu_read_lock();
 912        inet_opt = rcu_dereference(inet->inet_opt);
 913        if (inet_opt && inet_opt->opt.srr)
 914                daddr = inet_opt->opt.faddr;
 915        fl4 = &fl->u.ip4;
 916        rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
 917                                   inet->inet_saddr, inet->inet_dport,
 918                                   inet->inet_sport, sk->sk_protocol,
 919                                   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
 920        if (IS_ERR(rt))
 921                rt = NULL;
 922        if (rt)
 923                sk_setup_caps(sk, &rt->dst);
 924        rcu_read_unlock();
 925
 926        return &rt->dst;
 927}
 928
 929struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
 930{
 931        struct dst_entry *dst = __sk_dst_check(sk, 0);
 932        struct inet_sock *inet = inet_sk(sk);
 933
 934        if (!dst) {
 935                dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
 936                if (!dst)
 937                        goto out;
 938        }
 939        dst->ops->update_pmtu(dst, sk, NULL, mtu);
 940
 941        dst = __sk_dst_check(sk, 0);
 942        if (!dst)
 943                dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
 944out:
 945        return dst;
 946}
 947EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
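
    /* update_pmtu() may obsolete or replace the cached route, so the
     * function above re-validates the dst with __sk_dst_check() and, if
     * it was dropped, rebuilds a fresh route before returning it.
     */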
 948