linux/net/ipv4/inet_connection_sock.c
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              Support for INET connection oriented protocols.
   7 *
   8 * Authors:     See the TCP sources
   9 *
  10 *              This program is free software; you can redistribute it and/or
  11 *              modify it under the terms of the GNU General Public License
  12 *              as published by the Free Software Foundation; either version
  13 *              2 of the License, or (at your option) any later version.
  14 */
  15
  16#include <linux/module.h>
  17#include <linux/jhash.h>
  18
  19#include <net/inet_connection_sock.h>
  20#include <net/inet_hashtables.h>
  21#include <net/inet_timewait_sock.h>
  22#include <net/ip.h>
  23#include <net/route.h>
  24#include <net/tcp_states.h>
  25#include <net/xfrm.h>
  26#include <net/tcp.h>
  27#include <net/sock_reuseport.h>
  28
  29#ifdef INET_CSK_DEBUG
  30const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
  31EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  32#endif
  33
  34void inet_get_local_port_range(struct net *net, int *low, int *high)
  35{
  36        unsigned int seq;
  37
  38        do {
  39                seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
  40
  41                *low = net->ipv4.ip_local_ports.range[0];
  42                *high = net->ipv4.ip_local_ports.range[1];
  43        } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
  44}
  45EXPORT_SYMBOL(inet_get_local_port_range);
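/*
 * Illustrative sketch (not part of the original file): a caller can use
 * inet_get_local_port_range() to test whether a port falls inside the
 * ephemeral range.  The helper below is hypothetical.
 */
static bool __maybe_unused example_port_is_ephemeral(struct net *net, u16 port)
{
        int low, high;

        /* The seqlock loop above guarantees a consistent low/high pair. */
        inet_get_local_port_range(net, &low, &high);
        return port >= low && port <= high;     /* the range is inclusive */
}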
  46
  47int inet_csk_bind_conflict(const struct sock *sk,
  48                           const struct inet_bind_bucket *tb, bool relax,
  49                           bool reuseport_ok)
  50{
  51        struct sock *sk2;
  52        bool reuse = sk->sk_reuse;
  53        bool reuseport = !!sk->sk_reuseport && reuseport_ok;
  54        kuid_t uid = sock_i_uid((struct sock *)sk);
  55
  56        /*
  57         * Unlike other sk lookup places, we do not check
  58         * for sk_net here, since _all_ the socks listed
  59         * in the tb->owners list belong to the same net - the
  60         * one this bucket belongs to.
  61         */
  62
  63        sk_for_each_bound(sk2, &tb->owners) {
  64                if (sk != sk2 &&
  65                    !inet_v6_ipv6only(sk2) &&
  66                    (!sk->sk_bound_dev_if ||
  67                     !sk2->sk_bound_dev_if ||
  68                     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
  69                        if ((!reuse || !sk2->sk_reuse ||
  70                            sk2->sk_state == TCP_LISTEN) &&
  71                            (!reuseport || !sk2->sk_reuseport ||
  72                             rcu_access_pointer(sk->sk_reuseport_cb) ||
  73                             (sk2->sk_state != TCP_TIME_WAIT &&
  74                             !uid_eq(uid, sock_i_uid(sk2))))) {
  75
  76                                if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
  77                                    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
  78                                        break;
  79                        }
  80                        if (!relax && reuse && sk2->sk_reuse &&
  81                            sk2->sk_state != TCP_LISTEN) {
  82
  83                                if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
  84                                    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
  85                                        break;
  86                        }
  87                }
  88        }
  89        return sk2 != NULL;
  90}
  91EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
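/*
 * Illustrative sketch (not part of the original file): the conflict check
 * is reached through the per-family ops table (see the
 * icsk_af_ops->bind_conflict() calls in inet_csk_get_port() below), so a
 * hypothetical IPv4 protocol would wire it up roughly like this.
 */
static const struct inet_connection_sock_af_ops example_af_ops __maybe_unused = {
        .bind_conflict = inet_csk_bind_conflict,
        /* the remaining ops a real protocol must provide are elided */
};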
  92
  93/* Obtain a reference to a local port for the given sock;
  94 * if snum is zero, select any available local port.
  95 * We try to allocate an odd port (and leave even ports for connect()).
  96 */
  97int inet_csk_get_port(struct sock *sk, unsigned short snum)
  98{
  99        bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 100        struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 101        int ret = 1, attempts = 5, port = snum;
 102        int smallest_size = -1, smallest_port;
 103        struct inet_bind_hashbucket *head;
 104        struct net *net = sock_net(sk);
 105        int i, low, high, attempt_half;
 106        struct inet_bind_bucket *tb;
 107        kuid_t uid = sock_i_uid(sk);
 108        u32 remaining, offset;
 109        bool reuseport_ok = !!snum;
 110
 111        if (port) {
 112have_port:
 113                head = &hinfo->bhash[inet_bhashfn(net, port,
 114                                                  hinfo->bhash_size)];
 115                spin_lock_bh(&head->lock);
 116                inet_bind_bucket_for_each(tb, &head->chain)
 117                        if (net_eq(ib_net(tb), net) && tb->port == port)
 118                                goto tb_found;
 119
 120                goto tb_not_found;
 121        }
 122again:
 123        attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 124other_half_scan:
 125        inet_get_local_port_range(net, &low, &high);
 126        high++; /* [32768, 60999] -> [32768, 61000[ */
 127        if (high - low < 4)
 128                attempt_half = 0;
 129        if (attempt_half) {
 130                int half = low + (((high - low) >> 2) << 1);
 131
 132                if (attempt_half == 1)
 133                        high = half;
 134                else
 135                        low = half;
 136        }
 137        remaining = high - low;
 138        if (likely(remaining > 1))
 139                remaining &= ~1U;
 140
 141        offset = prandom_u32() % remaining;
 142        /* __inet_hash_connect() favors ports having @low parity.
 143         * We do the opposite so as not to interfere with connect() users.
 144         */
 145        offset |= 1U;
 146        smallest_size = -1;
 147        smallest_port = low; /* avoid compiler warning */
 148
 149other_parity_scan:
 150        port = low + offset;
 151        for (i = 0; i < remaining; i += 2, port += 2) {
 152                if (unlikely(port >= high))
 153                        port -= remaining;
 154                if (inet_is_local_reserved_port(net, port))
 155                        continue;
 156                head = &hinfo->bhash[inet_bhashfn(net, port,
 157                                                  hinfo->bhash_size)];
 158                spin_lock_bh(&head->lock);
 159                inet_bind_bucket_for_each(tb, &head->chain)
 160                        if (net_eq(ib_net(tb), net) && tb->port == port) {
 161                                if (((tb->fastreuse > 0 && reuse) ||
 162                                     (tb->fastreuseport > 0 &&
 163                                      sk->sk_reuseport &&
 164                                      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 165                                      uid_eq(tb->fastuid, uid))) &&
 166                                    (tb->num_owners < smallest_size || smallest_size == -1)) {
 167                                        smallest_size = tb->num_owners;
 168                                        smallest_port = port;
 169                                }
 170                                if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
 171                                                                              reuseport_ok))
 172                                        goto tb_found;
 173                                goto next_port;
 174                        }
 175                goto tb_not_found;
 176next_port:
 177                spin_unlock_bh(&head->lock);
 178                cond_resched();
 179        }
 180
 181        if (smallest_size != -1) {
 182                port = smallest_port;
 183                goto have_port;
 184        }
 185        offset--;
 186        if (!(offset & 1))
 187                goto other_parity_scan;
 188
 189        if (attempt_half == 1) {
 190                /* OK we now try the upper half of the range */
 191                attempt_half = 2;
 192                goto other_half_scan;
 193        }
 194        return ret;
 195
 196tb_not_found:
 197        tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 198                                     net, head, port);
 199        if (!tb)
 200                goto fail_unlock;
 201tb_found:
 202        if (!hlist_empty(&tb->owners)) {
 203                if (sk->sk_reuse == SK_FORCE_REUSE)
 204                        goto success;
 205
 206                if (((tb->fastreuse > 0 && reuse) ||
 207                     (tb->fastreuseport > 0 &&
 208                      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 209                      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 210                    smallest_size == -1)
 211                        goto success;
 212                if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
 213                                                             reuseport_ok)) {
 214                        if ((reuse ||
 215                             (tb->fastreuseport > 0 &&
 216                              sk->sk_reuseport &&
 217                              !rcu_access_pointer(sk->sk_reuseport_cb) &&
 218                              uid_eq(tb->fastuid, uid))) &&
 219                            !snum && smallest_size != -1 && --attempts >= 0) {
 220                                spin_unlock_bh(&head->lock);
 221                                goto again;
 222                        }
 223                        goto fail_unlock;
 224                }
 225                if (!reuse)
 226                        tb->fastreuse = 0;
 227                if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
 228                        tb->fastreuseport = 0;
 229        } else {
 230                tb->fastreuse = reuse;
 231                if (sk->sk_reuseport) {
 232                        tb->fastreuseport = 1;
 233                        tb->fastuid = uid;
 234                } else {
 235                        tb->fastreuseport = 0;
 236                }
 237        }
 238success:
 239        if (!inet_csk(sk)->icsk_bind_hash)
 240                inet_bind_hash(sk, tb, port);
 241        WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 242        ret = 0;
 243
 244fail_unlock:
 245        spin_unlock_bh(&head->lock);
 246        return ret;
 247}
 248EXPORT_SYMBOL_GPL(inet_csk_get_port);
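/*
 * Illustrative sketch (not part of the original file): protocols normally
 * expose inet_csk_get_port() as their ->get_port hook.  Passing 0 asks for
 * an automatically chosen (odd) ephemeral port, a non-zero value requests
 * that exact port; the return value is 0 on success and non-zero on
 * conflict or exhaustion.
 */
static int __maybe_unused example_autobind(struct sock *sk)
{
        return inet_csk_get_port(sk, 0);        /* pick any free local port */
}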
 249
 250/*
 251 * Wait for an incoming connection, avoid race conditions. This must be called
 252 * with the socket locked.
 253 */
 254static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 255{
 256        struct inet_connection_sock *icsk = inet_csk(sk);
 257        DEFINE_WAIT(wait);
 258        int err;
 259
 260        /*
 261         * True wake-one mechanism for incoming connections: only
 262         * one process gets woken up, not the 'whole herd'.
 263         * Since we do not 'race & poll' for established sockets
 264         * anymore, the common case will execute the loop only once.
 265         *
 266         * Subtle issue: an "add_wait_queue_exclusive()" waiter is added
 267         * after any current non-exclusive waiters, and we know that
 268         * it will always _stay_ after any new non-exclusive waiters
 269         * because all non-exclusive waiters are added at the
 270         * beginning of the wait-queue. As such, it's ok to "drop"
 271         * our exclusiveness temporarily when we get woken up without
 272         * having to remove and re-insert us on the wait queue.
 273         */
 274        for (;;) {
 275                prepare_to_wait_exclusive(sk_sleep(sk), &wait,
 276                                          TASK_INTERRUPTIBLE);
 277                release_sock(sk);
 278                if (reqsk_queue_empty(&icsk->icsk_accept_queue))
 279                        timeo = schedule_timeout(timeo);
 280                sched_annotate_sleep();
 281                lock_sock(sk);
 282                err = 0;
 283                if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
 284                        break;
 285                err = -EINVAL;
 286                if (sk->sk_state != TCP_LISTEN)
 287                        break;
 288                err = sock_intr_errno(timeo);
 289                if (signal_pending(current))
 290                        break;
 291                err = -EAGAIN;
 292                if (!timeo)
 293                        break;
 294        }
 295        finish_wait(sk_sleep(sk), &wait);
 296        return err;
 297}
 298
 299/*
 300 * This will accept the next outstanding connection.
 301 */
 302struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 303{
 304        struct inet_connection_sock *icsk = inet_csk(sk);
 305        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 306        struct request_sock *req;
 307        struct sock *newsk;
 308        int error;
 309
 310        lock_sock(sk);
 311
 312        /* We need to make sure that this socket is listening,
 313         * and that it has something pending.
 314         */
 315        error = -EINVAL;
 316        if (sk->sk_state != TCP_LISTEN)
 317                goto out_err;
 318
 319        /* Find already established connection */
 320        if (reqsk_queue_empty(queue)) {
 321                long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 322
 323                /* If this is a non-blocking socket, don't sleep */
 324                error = -EAGAIN;
 325                if (!timeo)
 326                        goto out_err;
 327
 328                error = inet_csk_wait_for_connect(sk, timeo);
 329                if (error)
 330                        goto out_err;
 331        }
 332        req = reqsk_queue_remove(queue, sk);
 333        newsk = req->sk;
 334
 335        if (sk->sk_protocol == IPPROTO_TCP &&
 336            tcp_rsk(req)->tfo_listener) {
 337                spin_lock_bh(&queue->fastopenq.lock);
 338                if (tcp_rsk(req)->tfo_listener) {
 339                        /* We are still waiting for the final ACK from 3WHS
 340                         * so can't free req now. Instead, we set req->sk to
 341                         * NULL to signify that the child socket is taken
 342                         * so reqsk_fastopen_remove() will free the req
 343                         * when 3WHS finishes (or is aborted).
 344                         */
 345                        req->sk = NULL;
 346                        req = NULL;
 347                }
 348                spin_unlock_bh(&queue->fastopenq.lock);
 349        }
 350out:
 351        release_sock(sk);
 352        if (req)
 353                reqsk_put(req);
 354        return newsk;
 355out_err:
 356        newsk = NULL;
 357        req = NULL;
 358        *err = error;
 359        goto out;
 360}
 361EXPORT_SYMBOL(inet_csk_accept);
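/*
 * Illustrative sketch (not part of the original file): the socket layer
 * reaches inet_csk_accept() through sk->sk_prot->accept(), roughly as
 * below; on failure the error is reported through the third argument.
 */
static struct sock *__maybe_unused example_accept(struct sock *listener,
                                                  int flags)
{
        int err;
        struct sock *child = inet_csk_accept(listener, flags, &err);

        if (!child)
                pr_debug("accept failed: %d\n", err);
        return child;
}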
 362
 363/*
 364 * Using different timers for retransmit, delayed acks and probes.
 365 * We may wish to use just one timer, maintaining a list of expiry
 366 * jiffies, to optimize.
 367 */
 368void inet_csk_init_xmit_timers(struct sock *sk,
 369                               void (*retransmit_handler)(unsigned long),
 370                               void (*delack_handler)(unsigned long),
 371                               void (*keepalive_handler)(unsigned long))
 372{
 373        struct inet_connection_sock *icsk = inet_csk(sk);
 374
 375        setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
 376                        (unsigned long)sk);
 377        setup_timer(&icsk->icsk_delack_timer, delack_handler,
 378                        (unsigned long)sk);
 379        setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
 380        icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 381}
 382EXPORT_SYMBOL(inet_csk_init_xmit_timers);
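/*
 * Illustrative sketch (not part of the original file): a protocol installs
 * its three timer handlers once at socket-init time.  The handler names
 * below are hypothetical stand-ins for the protocol's real handlers.
 */
static void example_retransmit_timer(unsigned long data) { }
static void example_delack_timer(unsigned long data) { }
static void example_keepalive_timer(unsigned long data) { }

static void __maybe_unused example_init_timers(struct sock *sk)
{
        inet_csk_init_xmit_timers(sk, example_retransmit_timer,
                                  example_delack_timer,
                                  example_keepalive_timer);
}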
 383
 384void inet_csk_clear_xmit_timers(struct sock *sk)
 385{
 386        struct inet_connection_sock *icsk = inet_csk(sk);
 387
 388        icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
 389
 390        sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
 391        sk_stop_timer(sk, &icsk->icsk_delack_timer);
 392        sk_stop_timer(sk, &sk->sk_timer);
 393}
 394EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
 395
 396void inet_csk_delete_keepalive_timer(struct sock *sk)
 397{
 398        sk_stop_timer(sk, &sk->sk_timer);
 399}
 400EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
 401
 402void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
 403{
 404        sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
 405}
 406EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 407
 408struct dst_entry *inet_csk_route_req(const struct sock *sk,
 409                                     struct flowi4 *fl4,
 410                                     const struct request_sock *req)
 411{
 412        const struct inet_request_sock *ireq = inet_rsk(req);
 413        struct net *net = read_pnet(&ireq->ireq_net);
 414        struct ip_options_rcu *opt = ireq->opt;
 415        struct rtable *rt;
 416
 417        flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 418                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 419                           sk->sk_protocol, inet_sk_flowi_flags(sk),
 420                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 421                           ireq->ir_loc_addr, ireq->ir_rmt_port,
 422                           htons(ireq->ir_num), sk->sk_uid);
 423        security_req_classify_flow(req, flowi4_to_flowi(fl4));
 424        rt = ip_route_output_flow(net, fl4, sk);
 425        if (IS_ERR(rt))
 426                goto no_route;
 427        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 428                goto route_err;
 429        return &rt->dst;
 430
 431route_err:
 432        ip_rt_put(rt);
 433no_route:
 434        __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 435        return NULL;
 436}
 437EXPORT_SYMBOL_GPL(inet_csk_route_req);
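/*
 * Illustrative sketch (not part of the original file): a SYN-ACK
 * transmission path would route the reply for a pending request like this
 * before building the packet; a NULL return means there is no route.
 */
static struct dst_entry *__maybe_unused
example_route_synack(const struct sock *sk, const struct request_sock *req)
{
        struct flowi4 fl4;

        return inet_csk_route_req(sk, &fl4, req);
}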
 438
 439struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 440                                            struct sock *newsk,
 441                                            const struct request_sock *req)
 442{
 443        const struct inet_request_sock *ireq = inet_rsk(req);
 444        struct net *net = read_pnet(&ireq->ireq_net);
 445        struct inet_sock *newinet = inet_sk(newsk);
 446        struct ip_options_rcu *opt;
 447        struct flowi4 *fl4;
 448        struct rtable *rt;
 449
 450        fl4 = &newinet->cork.fl.u.ip4;
 451
 452        rcu_read_lock();
 453        opt = rcu_dereference(newinet->inet_opt);
 454        flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 455                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 456                           sk->sk_protocol, inet_sk_flowi_flags(sk),
 457                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 458                           ireq->ir_loc_addr, ireq->ir_rmt_port,
 459                           htons(ireq->ir_num), sk->sk_uid);
 460        security_req_classify_flow(req, flowi4_to_flowi(fl4));
 461        rt = ip_route_output_flow(net, fl4, sk);
 462        if (IS_ERR(rt))
 463                goto no_route;
 464        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 465                goto route_err;
 466        rcu_read_unlock();
 467        return &rt->dst;
 468
 469route_err:
 470        ip_rt_put(rt);
 471no_route:
 472        rcu_read_unlock();
 473        __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 474        return NULL;
 475}
 476EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
 477
 478#if IS_ENABLED(CONFIG_IPV6)
 479#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
 480#else
 481#define AF_INET_FAMILY(fam) true
 482#endif
 483
 484/* Decide when to expire the request and when to resend SYN-ACK */
 485static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
 486                                  const int max_retries,
 487                                  const u8 rskq_defer_accept,
 488                                  int *expire, int *resend)
 489{
 490        if (!rskq_defer_accept) {
 491                *expire = req->num_timeout >= thresh;
 492                *resend = 1;
 493                return;
 494        }
 495        *expire = req->num_timeout >= thresh &&
 496                  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
 497        /*
 498         * Do not resend while waiting for data after ACK; start to
 499         * resend at the end of the deferring period to give a last
 500         * chance for data or an ACK to create an established socket.
 501         */
 502        *resend = !inet_rsk(req)->acked ||
 503                  req->num_timeout >= rskq_defer_accept - 1;
 504}
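/*
 * Worked example (illustrative, not part of the original file): with
 * rskq_defer_accept = 3 and an already ACKed request, *resend stays 0
 * until req->num_timeout reaches 2 (defer_accept - 1), and *expire stays 0
 * until num_timeout reaches both thresh and max_retries, so SYN-ACK
 * retransmissions are suppressed while the listener waits for data.
 */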
 505
 506int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 507{
 508        int err = req->rsk_ops->rtx_syn_ack(parent, req);
 509
 510        if (!err)
 511                req->num_retrans++;
 512        return err;
 513}
 514EXPORT_SYMBOL(inet_rtx_syn_ack);
 515
 516/* return true if req was found in the ehash table */
 517static bool reqsk_queue_unlink(struct request_sock_queue *queue,
 518                               struct request_sock *req)
 519{
 520        struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
 521        bool found = false;
 522
 523        if (sk_hashed(req_to_sk(req))) {
 524                spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
 525
 526                spin_lock(lock);
 527                found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
 528                spin_unlock(lock);
 529        }
 530        if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
 531                reqsk_put(req);
 532        return found;
 533}
 534
 535void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
 536{
 537        if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
 538                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
 539                reqsk_put(req);
 540        }
 541}
 542EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
 543
 544void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
 545{
 546        inet_csk_reqsk_queue_drop(sk, req);
 547        reqsk_put(req);
 548}
 549EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
 550
 551static void reqsk_timer_handler(unsigned long data)
 552{
 553        struct request_sock *req = (struct request_sock *)data;
 554        struct sock *sk_listener = req->rsk_listener;
 555        struct net *net = sock_net(sk_listener);
 556        struct inet_connection_sock *icsk = inet_csk(sk_listener);
 557        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 558        int qlen, expire = 0, resend = 0;
 559        int max_retries, thresh;
 560        u8 defer_accept;
 561
 562        if (sk_state_load(sk_listener) != TCP_LISTEN)
 563                goto drop;
 564
 565        max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
 566        thresh = max_retries;
 567        /* Normally all the openreqs are young and become mature
 568         * (i.e. converted to an established socket) within the first
 569         * timeout.  If the synack was not acknowledged for 1 second,
 570         * it means one of the following: the synack was lost, the ack
 571         * was lost, the rtt is high, or nobody planned to ack at all
 572         * (i.e. a synflood).  When the server is a bit loaded, the
 573         * queue is populated with old open requests, reducing its
 574         * effective size.  When the server is heavily loaded, the
 575         * queue size drops to zero after several minutes of work.
 576         * This is not a synflood, it is normal operation.  The
 577         * solution is to prune entries that are too old, overriding
 578         * the normal timeout, when the situation becomes dangerous.
 579         *
 580         * Essentially, we reserve half of the room for young
 581         * embryos, and abort old ones without pity if they are
 582         * about to clog our table.
 583         */
 584        qlen = reqsk_queue_len(queue);
 585        if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
 586                int young = reqsk_queue_len_young(queue) << 1;
 587
 588                while (thresh > 2) {
 589                        if (qlen < young)
 590                                break;
 591                        thresh--;
 592                        young <<= 1;
 593                }
 594        }
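        /*
         * Worked example (illustrative, not part of the original file),
         * assuming the default of 5 SYN-ACK retries: with
         * sk_max_ack_backlog = 256, qlen = 200 (so 2 * qlen exceeds the
         * backlog and pruning kicks in) and 60 young requests
         * (young = 120 after the shift), the loop above lowers thresh
         * from 5 to 4 and then stops because qlen (200) is smaller than
         * young (240); old requests are then expired after 4 timeouts
         * instead of 5.
         */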
 595        defer_accept = READ_ONCE(queue->rskq_defer_accept);
 596        if (defer_accept)
 597                max_retries = defer_accept;
 598        syn_ack_recalc(req, thresh, max_retries, defer_accept,
 599                       &expire, &resend);
 600        req->rsk_ops->syn_ack_timeout(req);
 601        if (!expire &&
 602            (!resend ||
 603             !inet_rtx_syn_ack(sk_listener, req) ||
 604             inet_rsk(req)->acked)) {
 605                unsigned long timeo;
 606
 607                if (req->num_timeout++ == 0)
 608                        atomic_dec(&queue->young);
 609                timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
 610                mod_timer(&req->rsk_timer, jiffies + timeo);
 611                return;
 612        }
 613drop:
 614        inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
 615}
 616
 617static void reqsk_queue_hash_req(struct request_sock *req,
 618                                 unsigned long timeout)
 619{
 620        req->num_retrans = 0;
 621        req->num_timeout = 0;
 622        req->sk = NULL;
 623
 624        setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
 625                            (unsigned long)req);
 626        mod_timer(&req->rsk_timer, jiffies + timeout);
 627
 628        inet_ehash_insert(req_to_sk(req), NULL);
 629        /* before letting lookups find us, make sure all req fields
 630         * are committed to memory and refcnt initialized.
 631         */
 632        smp_wmb();
 633        atomic_set(&req->rsk_refcnt, 2 + 1);
 634}
 635
 636void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 637                                   unsigned long timeout)
 638{
 639        reqsk_queue_hash_req(req, timeout);
 640        inet_csk_reqsk_queue_added(sk);
 641}
 642EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
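/*
 * Illustrative sketch (not part of the original file): a SYN handler
 * hashes a freshly minted request sock with its initial retransmission
 * timeout, for example TCP_TIMEOUT_INIT (already used by the request
 * timer above).
 */
static void __maybe_unused example_queue_request(struct sock *sk,
                                                 struct request_sock *req)
{
        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}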
 643
 644/**
 645 *      inet_csk_clone_lock - clone an inet socket, and lock its clone
 646 *      @sk: the socket to clone
 647 *      @req: request_sock
 648 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 649 *
 650 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 651 */
 652struct sock *inet_csk_clone_lock(const struct sock *sk,
 653                                 const struct request_sock *req,
 654                                 const gfp_t priority)
 655{
 656        struct sock *newsk = sk_clone_lock(sk, priority);
 657
 658        if (newsk) {
 659                struct inet_connection_sock *newicsk = inet_csk(newsk);
 660
 661                newsk->sk_state = TCP_SYN_RECV;
 662                newicsk->icsk_bind_hash = NULL;
 663
 664                inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
 665                inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
 666                inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
 667                newsk->sk_write_space = sk_stream_write_space;
 668
 669                /* listeners have SOCK_RCU_FREE, not the children */
 670                sock_reset_flag(newsk, SOCK_RCU_FREE);
 671
 672                newsk->sk_mark = inet_rsk(req)->ir_mark;
 673                atomic64_set(&newsk->sk_cookie,
 674                             atomic64_read(&inet_rsk(req)->ir_cookie));
 675
 676                newicsk->icsk_retransmits = 0;
 677                newicsk->icsk_backoff     = 0;
 678                newicsk->icsk_probes_out  = 0;
 679
 680                /* Deinitialize accept_queue to trap illegal accesses. */
 681                memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
 682
 683                security_inet_csk_clone(newsk, req);
 684        }
 685        return newsk;
 686}
 687EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 688
 689/*
 690 * At this point, there should be no process reference to this
 691 * socket, and thus no user references at all.  Therefore we
 692 * can assume the socket waitqueue is inactive and nobody will
 693 * try to jump onto it.
 694 */
 695void inet_csk_destroy_sock(struct sock *sk)
 696{
 697        WARN_ON(sk->sk_state != TCP_CLOSE);
 698        WARN_ON(!sock_flag(sk, SOCK_DEAD));
 699
 700        /* It cannot be in hash table! */
 701        WARN_ON(!sk_unhashed(sk));
 702
 703        /* If inet_sk(sk)->inet_num is not 0, the socket must be bound */
 704        WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
 705
 706        sk->sk_prot->destroy(sk);
 707
 708        sk_stream_kill_queues(sk);
 709
 710        xfrm_sk_free_policy(sk);
 711
 712        sk_refcnt_debug_release(sk);
 713
 714        local_bh_disable();
 715        percpu_counter_dec(sk->sk_prot->orphan_count);
 716        local_bh_enable();
 717        sock_put(sk);
 718}
 719EXPORT_SYMBOL(inet_csk_destroy_sock);
 720
 721/* This function allows forcing the closure of a socket after the call to
 722 * tcp/dccp_create_openreq_child().
 723 */
 724void inet_csk_prepare_forced_close(struct sock *sk)
 725        __releases(&sk->sk_lock.slock)
 726{
 727        /* sk_clone_lock locked the socket and set refcnt to 2 */
 728        bh_unlock_sock(sk);
 729        sock_put(sk);
 730
 731        /* The below has to be done to allow calling inet_csk_destroy_sock */
 732        sock_set_flag(sk, SOCK_DEAD);
 733        percpu_counter_inc(sk->sk_prot->orphan_count);
 734        inet_sk(sk)->inet_num = 0;
 735}
 736EXPORT_SYMBOL(inet_csk_prepare_forced_close);
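/*
 * Illustrative sketch (not part of the original file): a typical TCP error
 * path after tcp_create_openreq_child(), as described above; tcp_done()
 * moves the child to TCP_CLOSE and, since the socket is now orphaned and
 * marked dead, ends up in inet_csk_destroy_sock().
 */
static void __maybe_unused example_abort_tcp_child(struct sock *child)
{
        inet_csk_prepare_forced_close(child);   /* unlocks, drops one ref */
        tcp_done(child);
}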
 737
 738int inet_csk_listen_start(struct sock *sk, int backlog)
 739{
 740        struct inet_connection_sock *icsk = inet_csk(sk);
 741        struct inet_sock *inet = inet_sk(sk);
 742        int err = -EADDRINUSE;
 743
 744        reqsk_queue_alloc(&icsk->icsk_accept_queue);
 745
 746        sk->sk_max_ack_backlog = backlog;
 747        sk->sk_ack_backlog = 0;
 748        inet_csk_delack_init(sk);
 749
 750        /* There is a race window here: we announce ourselves as
 751         * listening, but this transition has not yet been validated
 752         * by get_port().  That is OK, because the socket enters the
 753         * hash table only after validation is complete.
 754         */
 755        sk_state_store(sk, TCP_LISTEN);
 756        if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
 757                inet->inet_sport = htons(inet->inet_num);
 758
 759                sk_dst_reset(sk);
 760                err = sk->sk_prot->hash(sk);
 761
 762                if (likely(!err))
 763                        return 0;
 764        }
 765
 766        sk->sk_state = TCP_CLOSE;
 767        return err;
 768}
 769EXPORT_SYMBOL_GPL(inet_csk_listen_start);
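/*
 * Illustrative sketch (not part of the original file): the socket-layer
 * listen() path ends up calling this, roughly as below; on failure the
 * socket is left in TCP_CLOSE.
 */
static int __maybe_unused example_listen(struct sock *sk, int backlog)
{
        int err = inet_csk_listen_start(sk, backlog);

        if (err)
                pr_debug("listen failed: %d\n", err);
        return err;
}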
 770
 771static void inet_child_forget(struct sock *sk, struct request_sock *req,
 772                              struct sock *child)
 773{
 774        sk->sk_prot->disconnect(child, O_NONBLOCK);
 775
 776        sock_orphan(child);
 777
 778        percpu_counter_inc(sk->sk_prot->orphan_count);
 779
 780        if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
 781                BUG_ON(tcp_sk(child)->fastopen_rsk != req);
 782                BUG_ON(sk != req->rsk_listener);
 783
 784                /* Paranoid, to prevent a race condition if
 785                 * an inbound pkt destined for the child is
 786                 * blocked by the sock lock in tcp_v4_rcv().
 787                 * Also to satisfy an assertion in
 788                 * tcp_v4_destroy_sock().
 789                 */
 790                tcp_sk(child)->fastopen_rsk = NULL;
 791        }
 792        inet_csk_destroy_sock(child);
 793        reqsk_put(req);
 794}
 795
 796struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 797                                      struct request_sock *req,
 798                                      struct sock *child)
 799{
 800        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 801
 802        spin_lock(&queue->rskq_lock);
 803        if (unlikely(sk->sk_state != TCP_LISTEN)) {
 804                inet_child_forget(sk, req, child);
 805                child = NULL;
 806        } else {
 807                req->sk = child;
 808                req->dl_next = NULL;
 809                if (queue->rskq_accept_head == NULL)
 810                        queue->rskq_accept_head = req;
 811                else
 812                        queue->rskq_accept_tail->dl_next = req;
 813                queue->rskq_accept_tail = req;
 814                sk_acceptq_added(sk);
 815        }
 816        spin_unlock(&queue->rskq_lock);
 817        return child;
 818}
 819EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 820
 821struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 822                                         struct request_sock *req, bool own_req)
 823{
 824        if (own_req) {
 825                inet_csk_reqsk_queue_drop(sk, req);
 826                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
 827                if (inet_csk_reqsk_queue_add(sk, req, child))
 828                        return child;
 829        }
 830        /* Too bad, another child took ownership of the request, undo. */
 831        bh_unlock_sock(child);
 832        sock_put(child);
 833        return NULL;
 834}
 835EXPORT_SYMBOL(inet_csk_complete_hashdance);
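/*
 * Illustrative sketch (not part of the original file): a protocol's
 * syn_recv_sock() completion path typically ends with this call; own_req
 * says whether this context won the race to insert the child, and a NULL
 * return means another context already took ownership of the request.
 */
static struct sock *__maybe_unused
example_finish_hashdance(struct sock *sk, struct sock *child,
                         struct request_sock *req, bool own_req)
{
        return inet_csk_complete_hashdance(sk, child, req, own_req);
}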
 836
 837/*
 838 *      This routine closes sockets which have been at least partially
 839 *      opened, but not yet accepted.
 840 */
 841void inet_csk_listen_stop(struct sock *sk)
 842{
 843        struct inet_connection_sock *icsk = inet_csk(sk);
 844        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 845        struct request_sock *next, *req;
 846
 847        /* Following the specs, it would be better either to send a FIN
 848         * (and enter FIN-WAIT-1, i.e. a normal close)
 849         * or to send an active reset (abort).
 850         * Certainly, it is pretty dangerous during a synflood, but that
 851         * is a bad justification for our negligence 8)
 852         * To be honest, we are not able to implement either
 853         * of the variants now.                 --ANK
 854         */
 855        while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
 856                struct sock *child = req->sk;
 857
 858                local_bh_disable();
 859                bh_lock_sock(child);
 860                WARN_ON(sock_owned_by_user(child));
 861                sock_hold(child);
 862
 863                inet_child_forget(sk, req, child);
 864                bh_unlock_sock(child);
 865                local_bh_enable();
 866                sock_put(child);
 867
 868                cond_resched();
 869        }
 870        if (queue->fastopenq.rskq_rst_head) {
 871                /* Free all the reqs queued in rskq_rst_head. */
 872                spin_lock_bh(&queue->fastopenq.lock);
 873                req = queue->fastopenq.rskq_rst_head;
 874                queue->fastopenq.rskq_rst_head = NULL;
 875                spin_unlock_bh(&queue->fastopenq.lock);
 876                while (req != NULL) {
 877                        next = req->dl_next;
 878                        reqsk_put(req);
 879                        req = next;
 880                }
 881        }
 882        WARN_ON_ONCE(sk->sk_ack_backlog);
 883}
 884EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
 885
 886void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 887{
 888        struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 889        const struct inet_sock *inet = inet_sk(sk);
 890
 891        sin->sin_family         = AF_INET;
 892        sin->sin_addr.s_addr    = inet->inet_daddr;
 893        sin->sin_port           = inet->inet_dport;
 894}
 895EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
 896
 897#ifdef CONFIG_COMPAT
 898int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 899                               char __user *optval, int __user *optlen)
 900{
 901        const struct inet_connection_sock *icsk = inet_csk(sk);
 902
 903        if (icsk->icsk_af_ops->compat_getsockopt)
 904                return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
 905                                                            optval, optlen);
 906        return icsk->icsk_af_ops->getsockopt(sk, level, optname,
 907                                             optval, optlen);
 908}
 909EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
 910
 911int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 912                               char __user *optval, unsigned int optlen)
 913{
 914        const struct inet_connection_sock *icsk = inet_csk(sk);
 915
 916        if (icsk->icsk_af_ops->compat_setsockopt)
 917                return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
 918                                                            optval, optlen);
 919        return icsk->icsk_af_ops->setsockopt(sk, level, optname,
 920                                             optval, optlen);
 921}
 922EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 923#endif
 924
 925static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
 926{
 927        const struct inet_sock *inet = inet_sk(sk);
 928        const struct ip_options_rcu *inet_opt;
 929        __be32 daddr = inet->inet_daddr;
 930        struct flowi4 *fl4;
 931        struct rtable *rt;
 932
 933        rcu_read_lock();
 934        inet_opt = rcu_dereference(inet->inet_opt);
 935        if (inet_opt && inet_opt->opt.srr)
 936                daddr = inet_opt->opt.faddr;
 937        fl4 = &fl->u.ip4;
 938        rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
 939                                   inet->inet_saddr, inet->inet_dport,
 940                                   inet->inet_sport, sk->sk_protocol,
 941                                   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
 942        if (IS_ERR(rt))
 943                rt = NULL;
 944        if (rt)
 945                sk_setup_caps(sk, &rt->dst);
 946        rcu_read_unlock();
 947
 948        return &rt->dst;
 949}
 950
 951struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
 952{
 953        struct dst_entry *dst = __sk_dst_check(sk, 0);
 954        struct inet_sock *inet = inet_sk(sk);
 955
 956        if (!dst) {
 957                dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
 958                if (!dst)
 959                        goto out;
 960        }
 961        dst->ops->update_pmtu(dst, sk, NULL, mtu);
 962
 963        dst = __sk_dst_check(sk, 0);
 964        if (!dst)
 965                dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
 966out:
 967        return dst;
 968}
 969EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
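/*
 * Illustrative sketch (not part of the original file): an ICMP
 * "fragmentation needed" handler would refresh the cached route like this
 * before syncing its cached MSS to the new path MTU.
 */
static void __maybe_unused example_handle_frag_needed(struct sock *sk, u32 mtu)
{
        struct dst_entry *dst = inet_csk_update_pmtu(sk, mtu);

        if (!dst)
                return;         /* route is gone, nothing to update */
        /* a real protocol would now recompute its MSS from dst_mtu(dst) */
}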
 970