linux/net/netfilter/ipvs/ip_vs_conn.c
<<
>>
Prefs
   1/*
   2 * IPVS         An implementation of the IP virtual server support for the
   3 *              LINUX operating system.  IPVS is now implemented as a module
   4 *              over the Netfilter framework. IPVS can be used to build a
   5 *              high-performance and highly available server based on a
   6 *              cluster of servers.
   7 *
   8 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   9 *              Peter Kese <peter.kese@ijs.si>
  10 *              Julian Anastasov <ja@ssi.bg>
  11 *
  12 *              This program is free software; you can redistribute it and/or
  13 *              modify it under the terms of the GNU General Public License
  14 *              as published by the Free Software Foundation; either version
  15 *              2 of the License, or (at your option) any later version.
  16 *
  17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
   19 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
  20 *
  21 * Changes:
  22 *
  23 */
  24
  25#define KMSG_COMPONENT "IPVS"
  26#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  27
  28#include <linux/interrupt.h>
  29#include <linux/in.h>
  30#include <linux/inet.h>
  31#include <linux/net.h>
  32#include <linux/kernel.h>
  33#include <linux/module.h>
  34#include <linux/vmalloc.h>
  35#include <linux/proc_fs.h>              /* for proc_net_* */
  36#include <linux/slab.h>
  37#include <linux/seq_file.h>
  38#include <linux/jhash.h>
  39#include <linux/random.h>
  40
  41#include <net/net_namespace.h>
  42#include <net/ip_vs.h>
  43
  44
/* Fallback used when the build configuration does not define the value. */
   45#ifndef CONFIG_IP_VS_TAB_BITS
   46#define CONFIG_IP_VS_TAB_BITS   12
   47#endif
   48
   49/*
   50 * Connection hash size. The default is what was selected at compile time;
   51 * it is exposed as the "conn_tab_bits" module parameter with mode 0444.
   51 * NOTE(review): the name suggests the value is a bit count (table of
   51 * 1 << bits buckets); the sizing code is not in this part of the file.
   51*/
   52static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
   53module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
   54MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
   55
   56/* Table size, and the mask applied to hash values to get a bucket index */
   57int ip_vs_conn_tab_size __read_mostly;
   58static int ip_vs_conn_tab_mask __read_mostly;
   59
   60/*
   61 *  Connection hash table: used for the lookups of both input and output
   61 *  packets of IPVS. Buckets are indexed by the masked hash key.
   62 */
   63static struct hlist_head *ip_vs_conn_tab __read_mostly;
   64
   65/*  SLAB cache from which IPVS connection entries are freed/allocated */
   66static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
   67
   68/*  Number of connections that currently have no client port set */
   69static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
   70
   71/* Random value used as the seed of the IPVS connection hash */
   72static unsigned int ip_vs_conn_rnd __read_mostly;
  73
   74/*
   75 *  Fine locking granularity for the big connection hash table: the table
   75 *  is protected by an array of 1 << CT_LOCKARRAY_BITS locks, and a hash
   75 *  key selects its lock through CT_LOCKARRAY_MASK.
   76 */
   77#define CT_LOCKARRAY_BITS  5
   78#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
   79#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
   80
   81/* We need an address string length that works with or without IPv6.
   81 * NOTE(review): the non-IPv6 value (8+1) presumably covers an IPv4 address
   81 * printed as 8 hex digits plus a terminator - confirm against the users.
   81 */
   82#ifdef CONFIG_IP_VS_IPV6
   83#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
   84#else
   85#define IP_VS_ADDRSTRLEN (8+1)
   86#endif
   87
/* A spinlock aligned to SMP_CACHE_BYTES. */
   88struct ip_vs_aligned_lock
   89{
   90        spinlock_t      l;
   91} __attribute__((__aligned__(SMP_CACHE_BYTES)));
   92
   93/* Lock array for the conn table; indexed by (hash key & CT_LOCKARRAY_MASK) */
   94static struct ip_vs_aligned_lock
   95__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
  96
  97static inline void ct_write_lock_bh(unsigned int key)
  98{
  99        spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 100}
 101
 102static inline void ct_write_unlock_bh(unsigned int key)
 103{
 104        spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
 105}
 106
 107static void ip_vs_conn_expire(struct timer_list *t);
 108
 109/*
 110 *      Returns hash value for IPVS connection entry
 111 */
 112static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
 113                                       const union nf_inet_addr *addr,
 114                                       __be16 port)
 115{
 116#ifdef CONFIG_IP_VS_IPV6
 117        if (af == AF_INET6)
 118                return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
 119                                    (__force u32)port, proto, ip_vs_conn_rnd) ^
 120                        ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
 121#endif
 122        return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
 123                            ip_vs_conn_rnd) ^
 124                ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
 125}
 126
 127static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
 128                                             bool inverse)
 129{
 130        const union nf_inet_addr *addr;
 131        __be16 port;
 132
 133        if (p->pe_data && p->pe->hashkey_raw)
 134                return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
 135                        ip_vs_conn_tab_mask;
 136
 137        if (likely(!inverse)) {
 138                addr = p->caddr;
 139                port = p->cport;
 140        } else {
 141                addr = p->vaddr;
 142                port = p->vport;
 143        }
 144
 145        return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port);
 146}
 147
 148static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
 149{
 150        struct ip_vs_conn_param p;
 151
 152        ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
 153                              &cp->caddr, cp->cport, NULL, 0, &p);
 154
 155        if (cp->pe) {
 156                p.pe = cp->pe;
 157                p.pe_data = cp->pe_data;
 158                p.pe_data_len = cp->pe_data_len;
 159        }
 160
 161        return ip_vs_conn_hashkey_param(&p, false);
 162}
 163
 164/*
 165 *      Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
 166 *      returns bool success.
 167 */
 168static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 169{
 170        unsigned int hash;
 171        int ret;
 172
 173        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 174                return 0;
 175
 176        /* Hash by protocol, client address and port */
 177        hash = ip_vs_conn_hashkey_conn(cp);
 178
 179        ct_write_lock_bh(hash);
 180        spin_lock(&cp->lock);
 181
 182        if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
 183                cp->flags |= IP_VS_CONN_F_HASHED;
 184                refcount_inc(&cp->refcnt);
 185                hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
 186                ret = 1;
 187        } else {
 188                pr_err("%s(): request for already hashed, called from %pS\n",
 189                       __func__, __builtin_return_address(0));
 190                ret = 0;
 191        }
 192
 193        spin_unlock(&cp->lock);
 194        ct_write_unlock_bh(hash);
 195
 196        return ret;
 197}
 198
 199
 200/*
 201 *      UNhashes ip_vs_conn from ip_vs_conn_tab.
 202 *      returns bool success. Caller should hold conn reference.
 203 */
 204static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 205{
 206        unsigned int hash;
 207        int ret;
 208
 209        /* unhash it and decrease its reference counter */
 210        hash = ip_vs_conn_hashkey_conn(cp);
 211
 212        ct_write_lock_bh(hash);
 213        spin_lock(&cp->lock);
 214
 215        if (cp->flags & IP_VS_CONN_F_HASHED) {
 216                hlist_del_rcu(&cp->c_list);
 217                cp->flags &= ~IP_VS_CONN_F_HASHED;
 218                refcount_dec(&cp->refcnt);
 219                ret = 1;
 220        } else
 221                ret = 0;
 222
 223        spin_unlock(&cp->lock);
 224        ct_write_unlock_bh(hash);
 225
 226        return ret;
 227}
 228
 229/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
 230 * returns bool success.
 231 */
 232static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 233{
 234        unsigned int hash;
 235        bool ret = false;
 236
 237        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 238                return refcount_dec_if_one(&cp->refcnt);
 239
 240        hash = ip_vs_conn_hashkey_conn(cp);
 241
 242        ct_write_lock_bh(hash);
 243        spin_lock(&cp->lock);
 244
 245        if (cp->flags & IP_VS_CONN_F_HASHED) {
 246                /* Decrease refcnt and unlink conn only if we are last user */
 247                if (refcount_dec_if_one(&cp->refcnt)) {
 248                        hlist_del_rcu(&cp->c_list);
 249                        cp->flags &= ~IP_VS_CONN_F_HASHED;
 250                        ret = true;
 251                }
 252        }
 253
 254        spin_unlock(&cp->lock);
 255        ct_write_unlock_bh(hash);
 256
 257        return ret;
 258}
 259
 260
 261/*
 262 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 263 *  Called for pkts coming from OUTside-to-INside.
 264 *      p->caddr, p->cport: pkt source address (foreign host)
 265 *      p->vaddr, p->vport: pkt dest address (load balancer)
 266 */
 267static inline struct ip_vs_conn *
 268__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 269{
 270        unsigned int hash;
 271        struct ip_vs_conn *cp;
 272
 273        hash = ip_vs_conn_hashkey_param(p, false);
 274
 275        rcu_read_lock();
 276
 277        hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 278                if (p->cport == cp->cport && p->vport == cp->vport &&
 279                    cp->af == p->af &&
 280                    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 281                    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
 282                    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
 283                    p->protocol == cp->protocol &&
 284                    cp->ipvs == p->ipvs) {
 285                        if (!__ip_vs_conn_get(cp))
 286                                continue;
 287                        /* HIT */
 288                        rcu_read_unlock();
 289                        return cp;
 290                }
 291        }
 292
 293        rcu_read_unlock();
 294
 295        return NULL;
 296}
 297
 298struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 299{
 300        struct ip_vs_conn *cp;
 301
 302        cp = __ip_vs_conn_in_get(p);
 303        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
 304                struct ip_vs_conn_param cport_zero_p = *p;
 305                cport_zero_p.cport = 0;
 306                cp = __ip_vs_conn_in_get(&cport_zero_p);
 307        }
 308
 309        IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
 310                      ip_vs_proto_name(p->protocol),
 311                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
 312                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 313                      cp ? "hit" : "not hit");
 314
 315        return cp;
 316}
 317
 318static int
 319ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
 320                            int af, const struct sk_buff *skb,
 321                            const struct ip_vs_iphdr *iph,
 322                            struct ip_vs_conn_param *p)
 323{
 324        __be16 _ports[2], *pptr;
 325
 326        pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 327        if (pptr == NULL)
 328                return 1;
 329
 330        if (likely(!ip_vs_iph_inverse(iph)))
 331                ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
 332                                      pptr[0], &iph->daddr, pptr[1], p);
 333        else
 334                ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
 335                                      pptr[1], &iph->saddr, pptr[0], p);
 336        return 0;
 337}
 338
 339struct ip_vs_conn *
 340ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
 341                        const struct sk_buff *skb,
 342                        const struct ip_vs_iphdr *iph)
 343{
 344        struct ip_vs_conn_param p;
 345
 346        if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
 347                return NULL;
 348
 349        return ip_vs_conn_in_get(&p);
 350}
 351EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
 352
 353/* Get reference to connection template */
 354struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 355{
 356        unsigned int hash;
 357        struct ip_vs_conn *cp;
 358
 359        hash = ip_vs_conn_hashkey_param(p, false);
 360
 361        rcu_read_lock();
 362
 363        hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 364                if (unlikely(p->pe_data && p->pe->ct_match)) {
 365                        if (cp->ipvs != p->ipvs)
 366                                continue;
 367                        if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
 368                                if (__ip_vs_conn_get(cp))
 369                                        goto out;
 370                        }
 371                        continue;
 372                }
 373
 374                if (cp->af == p->af &&
 375                    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 376                    /* protocol should only be IPPROTO_IP if
 377                     * p->vaddr is a fwmark */
 378                    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
 379                                     p->af, p->vaddr, &cp->vaddr) &&
 380                    p->vport == cp->vport && p->cport == cp->cport &&
 381                    cp->flags & IP_VS_CONN_F_TEMPLATE &&
 382                    p->protocol == cp->protocol &&
 383                    cp->ipvs == p->ipvs) {
 384                        if (__ip_vs_conn_get(cp))
 385                                goto out;
 386                }
 387        }
 388        cp = NULL;
 389
 390  out:
 391        rcu_read_unlock();
 392
 393        IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
 394                      ip_vs_proto_name(p->protocol),
 395                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
 396                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 397                      cp ? "hit" : "not hit");
 398
 399        return cp;
 400}
 401
 402/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 403 * Called for pkts coming from inside-to-OUTside.
 404 *      p->caddr, p->cport: pkt source address (inside host)
 405 *      p->vaddr, p->vport: pkt dest address (foreign host) */
 406struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 407{
 408        unsigned int hash;
 409        struct ip_vs_conn *cp, *ret=NULL;
 410
 411        /*
 412         *      Check for "full" addressed entries
 413         */
 414        hash = ip_vs_conn_hashkey_param(p, true);
 415
 416        rcu_read_lock();
 417
 418        hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 419                if (p->vport == cp->cport && p->cport == cp->dport &&
 420                    cp->af == p->af &&
 421                    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
 422                    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
 423                    p->protocol == cp->protocol &&
 424                    cp->ipvs == p->ipvs) {
 425                        if (!__ip_vs_conn_get(cp))
 426                                continue;
 427                        /* HIT */
 428                        ret = cp;
 429                        break;
 430                }
 431        }
 432
 433        rcu_read_unlock();
 434
 435        IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
 436                      ip_vs_proto_name(p->protocol),
 437                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
 438                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 439                      ret ? "hit" : "not hit");
 440
 441        return ret;
 442}
 443
 444struct ip_vs_conn *
 445ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
 446                         const struct sk_buff *skb,
 447                         const struct ip_vs_iphdr *iph)
 448{
 449        struct ip_vs_conn_param p;
 450
 451        if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
 452                return NULL;
 453
 454        return ip_vs_conn_out_get(&p);
 455}
 456EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 457
 458/*
 459 *      Put back the conn and restart its timer with its timeout
 460 */
 461static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
 462{
 463        unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
 464                0 : cp->timeout;
 465        mod_timer(&cp->timer, jiffies+t);
 466
 467        __ip_vs_conn_put(cp);
 468}
 469
 470void ip_vs_conn_put(struct ip_vs_conn *cp)
 471{
 472        if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
 473            (refcount_read(&cp->refcnt) == 1) &&
 474            !timer_pending(&cp->timer))
 475                /* expire connection immediately */
 476                ip_vs_conn_expire(&cp->timer);
 477        else
 478                __ip_vs_conn_put_timer(cp);
 479}
 480
 481/*
 482 *      Fill a no_client_port connection with a client port number
 483 */
 484void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
 485{
 486        if (ip_vs_conn_unhash(cp)) {
 487                spin_lock_bh(&cp->lock);
 488                if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
 489                        atomic_dec(&ip_vs_conn_no_cport_cnt);
 490                        cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
 491                        cp->cport = cport;
 492                }
 493                spin_unlock_bh(&cp->lock);
 494
 495                /* hash on new dport */
 496                ip_vs_conn_hash(cp);
 497        }
 498}
 499
 500
 501/*
 502 *      Bind a connection entry with the corresponding packet_xmit.
 503 *      Called by ip_vs_conn_new.
 504 */
 505static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
 506{
 507        switch (IP_VS_FWD_METHOD(cp)) {
 508        case IP_VS_CONN_F_MASQ:
 509                cp->packet_xmit = ip_vs_nat_xmit;
 510                break;
 511
 512        case IP_VS_CONN_F_TUNNEL:
 513#ifdef CONFIG_IP_VS_IPV6
 514                if (cp->daf == AF_INET6)
 515                        cp->packet_xmit = ip_vs_tunnel_xmit_v6;
 516                else
 517#endif
 518                        cp->packet_xmit = ip_vs_tunnel_xmit;
 519                break;
 520
 521        case IP_VS_CONN_F_DROUTE:
 522                cp->packet_xmit = ip_vs_dr_xmit;
 523                break;
 524
 525        case IP_VS_CONN_F_LOCALNODE:
 526                cp->packet_xmit = ip_vs_null_xmit;
 527                break;
 528
 529        case IP_VS_CONN_F_BYPASS:
 530                cp->packet_xmit = ip_vs_bypass_xmit;
 531                break;
 532        }
 533}
 534
 535#ifdef CONFIG_IP_VS_IPV6
 536static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
 537{
 538        switch (IP_VS_FWD_METHOD(cp)) {
 539        case IP_VS_CONN_F_MASQ:
 540                cp->packet_xmit = ip_vs_nat_xmit_v6;
 541                break;
 542
 543        case IP_VS_CONN_F_TUNNEL:
 544                if (cp->daf == AF_INET6)
 545                        cp->packet_xmit = ip_vs_tunnel_xmit_v6;
 546                else
 547                        cp->packet_xmit = ip_vs_tunnel_xmit;
 548                break;
 549
 550        case IP_VS_CONN_F_DROUTE:
 551                cp->packet_xmit = ip_vs_dr_xmit_v6;
 552                break;
 553
 554        case IP_VS_CONN_F_LOCALNODE:
 555                cp->packet_xmit = ip_vs_null_xmit;
 556                break;
 557
 558        case IP_VS_CONN_F_BYPASS:
 559                cp->packet_xmit = ip_vs_bypass_xmit_v6;
 560                break;
 561        }
 562}
 563#endif
 564
 565
 566static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 567{
 568        return atomic_read(&dest->activeconns)
 569                + atomic_read(&dest->inactconns);
 570}
 571
 572/*
 573 *      Bind a connection entry with a virtual service destination
 574 *      Called just after a new connection entry is created.
 575 */
 576static inline void
 577ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 578{
 579        unsigned int conn_flags;
 580        __u32 flags;
 581
 582        /* if dest is NULL, then return directly */
 583        if (!dest)
 584                return;
 585
 586        /* Increase the refcnt counter of the dest */
 587        ip_vs_dest_hold(dest);
 588
 589        conn_flags = atomic_read(&dest->conn_flags);
 590        if (cp->protocol != IPPROTO_UDP)
 591                conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
 592        flags = cp->flags;
 593        /* Bind with the destination and its corresponding transmitter */
 594        if (flags & IP_VS_CONN_F_SYNC) {
 595                /* if the connection is not template and is created
 596                 * by sync, preserve the activity flag.
 597                 */
 598                if (!(flags & IP_VS_CONN_F_TEMPLATE))
 599                        conn_flags &= ~IP_VS_CONN_F_INACTIVE;
 600                /* connections inherit forwarding method from dest */
 601                flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
 602        }
 603        flags |= conn_flags;
 604        cp->flags = flags;
 605        cp->dest = dest;
 606
 607        IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
 608                      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 609                      "dest->refcnt:%d\n",
 610                      ip_vs_proto_name(cp->protocol),
 611                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
 612                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 613                      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
 614                      ip_vs_fwd_tag(cp), cp->state,
 615                      cp->flags, refcount_read(&cp->refcnt),
 616                      refcount_read(&dest->refcnt));
 617
 618        /* Update the connection counters */
 619        if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
 620                /* It is a normal connection, so modify the counters
 621                 * according to the flags, later the protocol can
 622                 * update them on state change
 623                 */
 624                if (!(flags & IP_VS_CONN_F_INACTIVE))
 625                        atomic_inc(&dest->activeconns);
 626                else
 627                        atomic_inc(&dest->inactconns);
 628        } else {
 629                /* It is a persistent connection/template, so increase
 630                   the persistent connection counter */
 631                atomic_inc(&dest->persistconns);
 632        }
 633
 634        if (dest->u_threshold != 0 &&
 635            ip_vs_dest_totalconns(dest) >= dest->u_threshold)
 636                dest->flags |= IP_VS_DEST_F_OVERLOAD;
 637}
 638
 639
 640/*
 641 * Check if there is a destination for the connection, if so
 642 * bind the connection to the destination.
 643 */
 644void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 645{
 646        struct ip_vs_dest *dest;
 647
 648        rcu_read_lock();
 649
 650        /* This function is only invoked by the synchronization code. We do
 651         * not currently support heterogeneous pools with synchronization,
 652         * so we can make the assumption that the svc_af is the same as the
 653         * dest_af
 654         */
 655        dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
 656                               cp->dport, &cp->vaddr, cp->vport,
 657                               cp->protocol, cp->fwmark, cp->flags);
 658        if (dest) {
 659                struct ip_vs_proto_data *pd;
 660
 661                spin_lock_bh(&cp->lock);
 662                if (cp->dest) {
 663                        spin_unlock_bh(&cp->lock);
 664                        rcu_read_unlock();
 665                        return;
 666                }
 667
 668                /* Applications work depending on the forwarding method
 669                 * but better to reassign them always when binding dest */
 670                if (cp->app)
 671                        ip_vs_unbind_app(cp);
 672
 673                ip_vs_bind_dest(cp, dest);
 674                spin_unlock_bh(&cp->lock);
 675
 676                /* Update its packet transmitter */
 677                cp->packet_xmit = NULL;
 678#ifdef CONFIG_IP_VS_IPV6
 679                if (cp->af == AF_INET6)
 680                        ip_vs_bind_xmit_v6(cp);
 681                else
 682#endif
 683                        ip_vs_bind_xmit(cp);
 684
 685                pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
 686                if (pd && atomic_read(&pd->appcnt))
 687                        ip_vs_bind_app(cp, pd->pp);
 688        }
 689        rcu_read_unlock();
 690}
 691
 692
 693/*
 694 *      Unbind a connection entry with its VS destination
 695 *      Called by the ip_vs_conn_expire function.
 696 */
 697static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 698{
 699        struct ip_vs_dest *dest = cp->dest;
 700
 701        if (!dest)
 702                return;
 703
 704        IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
 705                      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
 706                      "dest->refcnt:%d\n",
 707                      ip_vs_proto_name(cp->protocol),
 708                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
 709                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 710                      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
 711                      ip_vs_fwd_tag(cp), cp->state,
 712                      cp->flags, refcount_read(&cp->refcnt),
 713                      refcount_read(&dest->refcnt));
 714
 715        /* Update the connection counters */
 716        if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
 717                /* It is a normal connection, so decrease the inactconns
 718                   or activeconns counter */
 719                if (cp->flags & IP_VS_CONN_F_INACTIVE) {
 720                        atomic_dec(&dest->inactconns);
 721                } else {
 722                        atomic_dec(&dest->activeconns);
 723                }
 724        } else {
 725                /* It is a persistent connection/template, so decrease
 726                   the persistent connection counter */
 727                atomic_dec(&dest->persistconns);
 728        }
 729
 730        if (dest->l_threshold != 0) {
 731                if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
 732                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 733        } else if (dest->u_threshold != 0) {
 734                if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
 735                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 736        } else {
 737                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
 738                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 739        }
 740
 741        ip_vs_dest_put(dest);
 742}
 743
 744static int expire_quiescent_template(struct netns_ipvs *ipvs,
 745                                     struct ip_vs_dest *dest)
 746{
 747#ifdef CONFIG_SYSCTL
 748        return ipvs->sysctl_expire_quiescent_template &&
 749                (atomic_read(&dest->weight) == 0);
 750#else
 751        return 0;
 752#endif
 753}
 754
 755/*
 756 *      Checking if the destination of a connection template is available.
 757 *      If available, return 1, otherwise invalidate this connection
 758 *      template and return 0.
 759 */
 760int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
 761{
 762        struct ip_vs_dest *dest = ct->dest;
 763        struct netns_ipvs *ipvs = ct->ipvs;
 764
 765        /*
 766         * Checking the dest server status.
 767         */
 768        if ((dest == NULL) ||
 769            !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
 770            expire_quiescent_template(ipvs, dest) ||
 771            (cdest && (dest != cdest))) {
 772                IP_VS_DBG_BUF(9, "check_template: dest not available for "
 773                              "protocol %s s:%s:%d v:%s:%d "
 774                              "-> d:%s:%d\n",
 775                              ip_vs_proto_name(ct->protocol),
 776                              IP_VS_DBG_ADDR(ct->af, &ct->caddr),
 777                              ntohs(ct->cport),
 778                              IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
 779                              ntohs(ct->vport),
 780                              IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
 781                              ntohs(ct->dport));
 782
 783                /*
 784                 * Invalidate the connection template
 785                 */
 786                if (ct->vport != htons(0xffff)) {
 787                        if (ip_vs_conn_unhash(ct)) {
 788                                ct->dport = htons(0xffff);
 789                                ct->vport = htons(0xffff);
 790                                ct->cport = 0;
 791                                ip_vs_conn_hash(ct);
 792                        }
 793                }
 794
 795                /*
 796                 * Simply decrease the refcnt of the template,
 797                 * don't restart its timer.
 798                 */
 799                __ip_vs_conn_put(ct);
 800                return 0;
 801        }
 802        return 1;
 803}
 804
 805static void ip_vs_conn_rcu_free(struct rcu_head *head)
 806{
 807        struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
 808                                             rcu_head);
 809
 810        ip_vs_pe_put(cp->pe);
 811        kfree(cp->pe_data);
 812        kmem_cache_free(ip_vs_conn_cachep, cp);
 813}
 814
 815static void ip_vs_conn_expire(struct timer_list *t)
 816{
 817        struct ip_vs_conn *cp = from_timer(cp, t, timer);
 818        struct netns_ipvs *ipvs = cp->ipvs;
 819
 820        /*
 821         *      do I control anybody?
 822         */
 823        if (atomic_read(&cp->n_control))
 824                goto expire_later;
 825
 826        /* Unlink conn if not referenced anymore */
 827        if (likely(ip_vs_conn_unlink(cp))) {
 828                struct ip_vs_conn *ct = cp->control;
 829
 830                /* delete the timer if it is activated by other users */
 831                del_timer(&cp->timer);
 832
 833                /* does anybody control me? */
 834                if (ct) {
 835                        ip_vs_control_del(cp);
 836                        /* Drop CTL or non-assured TPL if not used anymore */
 837                        if (!cp->timeout && !atomic_read(&ct->n_control) &&
 838                            (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
 839                             !(ct->state & IP_VS_CTPL_S_ASSURED))) {
 840                                IP_VS_DBG(4, "drop controlling connection\n");
 841                                ct->timeout = 0;
 842                                ip_vs_conn_expire_now(ct);
 843                        }
 844                }
 845
 846                if ((cp->flags & IP_VS_CONN_F_NFCT) &&
 847                    !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
 848                        /* Do not access conntracks during subsys cleanup
 849                         * because nf_conntrack_find_get can not be used after
 850                         * conntrack cleanup for the net.
 851                         */
 852                        smp_rmb();
 853                        if (ipvs->enable)
 854                                ip_vs_conn_drop_conntrack(cp);
 855                }
 856
 857                if (unlikely(cp->app != NULL))
 858                        ip_vs_unbind_app(cp);
 859                ip_vs_unbind_dest(cp);
 860                if (cp->flags & IP_VS_CONN_F_NO_CPORT)
 861                        atomic_dec(&ip_vs_conn_no_cport_cnt);
 862                if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 863                        ip_vs_conn_rcu_free(&cp->rcu_head);
 864                else
 865                        call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
 866                atomic_dec(&ipvs->conn_count);
 867                return;
 868        }
 869
 870  expire_later:
 871        IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
 872                  refcount_read(&cp->refcnt),
 873                  atomic_read(&cp->n_control));
 874
 875        refcount_inc(&cp->refcnt);
 876        cp->timeout = 60*HZ;
 877
 878        if (ipvs->sync_state & IP_VS_STATE_MASTER)
 879                ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
 880
 881        __ip_vs_conn_put_timer(cp);
 882}
 883
 884/* Modify timer, so that it expires as soon as possible.
 885 * Can be called without reference only if under RCU lock.
 886 * We can have such chain of conns linked with ->control: DATA->CTL->TPL
 887 * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
 888 * - cp->timeout=0 indicates all conns from chain should be dropped but
 889 * TPL is not dropped if in assured state
 890 */
 891void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 892{
 893        /* Using mod_timer_pending will ensure the timer is not
 894         * modified after the final del_timer in ip_vs_conn_expire.
 895         */
 896        if (timer_pending(&cp->timer) &&
 897            time_after(cp->timer.expires, jiffies))
 898                mod_timer_pending(&cp->timer, jiffies);
 899}
 900
 901
 902/*
 903 *      Create a new connection entry and hash it into the ip_vs_conn_tab
 904 */
 905struct ip_vs_conn *
 906ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 907               const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
 908               struct ip_vs_dest *dest, __u32 fwmark)
 909{
 910        struct ip_vs_conn *cp;
 911        struct netns_ipvs *ipvs = p->ipvs;
 912        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
 913                                                           p->protocol);
 914
 915        cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
 916        if (cp == NULL) {
 917                IP_VS_ERR_RL("%s(): no memory\n", __func__);
 918                return NULL;
 919        }
 920
 921        INIT_HLIST_NODE(&cp->c_list);
 922        timer_setup(&cp->timer, ip_vs_conn_expire, 0);
 923        cp->ipvs           = ipvs;
 924        cp->af             = p->af;
 925        cp->daf            = dest_af;
 926        cp->protocol       = p->protocol;
 927        ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
 928        cp->cport          = p->cport;
 929        /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
 930        ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
 931                       &cp->vaddr, p->vaddr);
 932        cp->vport          = p->vport;
 933        ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
 934        cp->dport          = dport;
 935        cp->flags          = flags;
 936        cp->fwmark         = fwmark;
 937        if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
 938                ip_vs_pe_get(p->pe);
 939                cp->pe = p->pe;
 940                cp->pe_data = p->pe_data;
 941                cp->pe_data_len = p->pe_data_len;
 942        } else {
 943                cp->pe = NULL;
 944                cp->pe_data = NULL;
 945                cp->pe_data_len = 0;
 946        }
 947        spin_lock_init(&cp->lock);
 948
 949        /*
 950         * Set the entry is referenced by the current thread before hashing
 951         * it in the table, so that other thread run ip_vs_random_dropentry
 952         * but cannot drop this entry.
 953         */
 954        refcount_set(&cp->refcnt, 1);
 955
 956        cp->control = NULL;
 957        atomic_set(&cp->n_control, 0);
 958        atomic_set(&cp->in_pkts, 0);
 959
 960        cp->packet_xmit = NULL;
 961        cp->app = NULL;
 962        cp->app_data = NULL;
 963        /* reset struct ip_vs_seq */
 964        cp->in_seq.delta = 0;
 965        cp->out_seq.delta = 0;
 966
 967        atomic_inc(&ipvs->conn_count);
 968        if (flags & IP_VS_CONN_F_NO_CPORT)
 969                atomic_inc(&ip_vs_conn_no_cport_cnt);
 970
 971        /* Bind the connection with a destination server */
 972        cp->dest = NULL;
 973        ip_vs_bind_dest(cp, dest);
 974
 975        /* Set its state and timeout */
 976        cp->state = 0;
 977        cp->old_state = 0;
 978        cp->timeout = 3*HZ;
 979        cp->sync_endtime = jiffies & ~3UL;
 980
 981        /* Bind its packet transmitter */
 982#ifdef CONFIG_IP_VS_IPV6
 983        if (p->af == AF_INET6)
 984                ip_vs_bind_xmit_v6(cp);
 985        else
 986#endif
 987                ip_vs_bind_xmit(cp);
 988
 989        if (unlikely(pd && atomic_read(&pd->appcnt)))
 990                ip_vs_bind_app(cp, pd->pp);
 991
 992        /*
 993         * Allow conntrack to be preserved. By default, conntrack
 994         * is created and destroyed for every packet.
 995         * Sometimes keeping conntrack can be useful for
 996         * IP_VS_CONN_F_ONE_PACKET too.
 997         */
 998
 999        if (ip_vs_conntrack_enabled(ipvs))
1000                cp->flags |= IP_VS_CONN_F_NFCT;
1001
1002        /* Hash it in the ip_vs_conn_tab finally */
1003        ip_vs_conn_hash(cp);
1004
1005        return cp;
1006}
1007
1008/*
1009 *      /proc/net/ip_vs_conn entries
1010 */
1011#ifdef CONFIG_PROC_FS
1012struct ip_vs_iter_state {
1013        struct seq_net_private  p;
1014        struct hlist_head       *l;
1015};
1016
1017static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
1018{
1019        int idx;
1020        struct ip_vs_conn *cp;
1021        struct ip_vs_iter_state *iter = seq->private;
1022
1023        for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1024                hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1025                        /* __ip_vs_conn_get() is not needed by
1026                         * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
1027                         */
1028                        if (pos-- == 0) {
1029                                iter->l = &ip_vs_conn_tab[idx];
1030                                return cp;
1031                        }
1032                }
1033                cond_resched_rcu();
1034        }
1035
1036        return NULL;
1037}
1038
1039static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
1040        __acquires(RCU)
1041{
1042        struct ip_vs_iter_state *iter = seq->private;
1043
1044        iter->l = NULL;
1045        rcu_read_lock();
1046        return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
1047}
1048
1049static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1050{
1051        struct ip_vs_conn *cp = v;
1052        struct ip_vs_iter_state *iter = seq->private;
1053        struct hlist_node *e;
1054        struct hlist_head *l = iter->l;
1055        int idx;
1056
1057        ++*pos;
1058        if (v == SEQ_START_TOKEN)
1059                return ip_vs_conn_array(seq, 0);
1060
1061        /* more on same hash chain? */
1062        e = rcu_dereference(hlist_next_rcu(&cp->c_list));
1063        if (e)
1064                return hlist_entry(e, struct ip_vs_conn, c_list);
1065
1066        idx = l - ip_vs_conn_tab;
1067        while (++idx < ip_vs_conn_tab_size) {
1068                hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1069                        iter->l = &ip_vs_conn_tab[idx];
1070                        return cp;
1071                }
1072                cond_resched_rcu();
1073        }
1074        iter->l = NULL;
1075        return NULL;
1076}
1077
1078static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
1079        __releases(RCU)
1080{
1081        rcu_read_unlock();
1082}
1083
1084static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
1085{
1086
1087        if (v == SEQ_START_TOKEN)
1088                seq_puts(seq,
1089   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
1090        else {
1091                const struct ip_vs_conn *cp = v;
1092                struct net *net = seq_file_net(seq);
1093                char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
1094                size_t len = 0;
1095                char dbuf[IP_VS_ADDRSTRLEN];
1096
1097                if (!net_eq(cp->ipvs->net, net))
1098                        return 0;
1099                if (cp->pe_data) {
1100                        pe_data[0] = ' ';
1101                        len = strlen(cp->pe->name);
1102                        memcpy(pe_data + 1, cp->pe->name, len);
1103                        pe_data[len + 1] = ' ';
1104                        len += 2;
1105                        len += cp->pe->show_pe_data(cp, pe_data + len);
1106                }
1107                pe_data[len] = '\0';
1108
1109#ifdef CONFIG_IP_VS_IPV6
1110                if (cp->daf == AF_INET6)
1111                        snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
1112                else
1113#endif
1114                        snprintf(dbuf, sizeof(dbuf), "%08X",
1115                                 ntohl(cp->daddr.ip));
1116
1117#ifdef CONFIG_IP_VS_IPV6
1118                if (cp->af == AF_INET6)
1119                        seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
1120                                "%s %04X %-11s %7u%s\n",
1121                                ip_vs_proto_name(cp->protocol),
1122                                &cp->caddr.in6, ntohs(cp->cport),
1123                                &cp->vaddr.in6, ntohs(cp->vport),
1124                                dbuf, ntohs(cp->dport),
1125                                ip_vs_state_name(cp),
1126                                jiffies_delta_to_msecs(cp->timer.expires -
1127                                                       jiffies) / 1000,
1128                                pe_data);
1129                else
1130#endif
1131                        seq_printf(seq,
1132                                "%-3s %08X %04X %08X %04X"
1133                                " %s %04X %-11s %7u%s\n",
1134                                ip_vs_proto_name(cp->protocol),
1135                                ntohl(cp->caddr.ip), ntohs(cp->cport),
1136                                ntohl(cp->vaddr.ip), ntohs(cp->vport),
1137                                dbuf, ntohs(cp->dport),
1138                                ip_vs_state_name(cp),
1139                                jiffies_delta_to_msecs(cp->timer.expires -
1140                                                       jiffies) / 1000,
1141                                pe_data);
1142        }
1143        return 0;
1144}
1145
1146static const struct seq_operations ip_vs_conn_seq_ops = {
1147        .start = ip_vs_conn_seq_start,
1148        .next  = ip_vs_conn_seq_next,
1149        .stop  = ip_vs_conn_seq_stop,
1150        .show  = ip_vs_conn_seq_show,
1151};
1152
1153static const char *ip_vs_origin_name(unsigned int flags)
1154{
1155        if (flags & IP_VS_CONN_F_SYNC)
1156                return "SYNC";
1157        else
1158                return "LOCAL";
1159}
1160
1161static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
1162{
1163        char dbuf[IP_VS_ADDRSTRLEN];
1164
1165        if (v == SEQ_START_TOKEN)
1166                seq_puts(seq,
1167   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
1168        else {
1169                const struct ip_vs_conn *cp = v;
1170                struct net *net = seq_file_net(seq);
1171
1172                if (!net_eq(cp->ipvs->net, net))
1173                        return 0;
1174
1175#ifdef CONFIG_IP_VS_IPV6
1176                if (cp->daf == AF_INET6)
1177                        snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
1178                else
1179#endif
1180                        snprintf(dbuf, sizeof(dbuf), "%08X",
1181                                 ntohl(cp->daddr.ip));
1182
1183#ifdef CONFIG_IP_VS_IPV6
1184                if (cp->af == AF_INET6)
1185                        seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
1186                                "%s %04X %-11s %-6s %7u\n",
1187                                ip_vs_proto_name(cp->protocol),
1188                                &cp->caddr.in6, ntohs(cp->cport),
1189                                &cp->vaddr.in6, ntohs(cp->vport),
1190                                dbuf, ntohs(cp->dport),
1191                                ip_vs_state_name(cp),
1192                                ip_vs_origin_name(cp->flags),
1193                                jiffies_delta_to_msecs(cp->timer.expires -
1194                                                       jiffies) / 1000);
1195                else
1196#endif
1197                        seq_printf(seq,
1198                                "%-3s %08X %04X %08X %04X "
1199                                "%s %04X %-11s %-6s %7u\n",
1200                                ip_vs_proto_name(cp->protocol),
1201                                ntohl(cp->caddr.ip), ntohs(cp->cport),
1202                                ntohl(cp->vaddr.ip), ntohs(cp->vport),
1203                                dbuf, ntohs(cp->dport),
1204                                ip_vs_state_name(cp),
1205                                ip_vs_origin_name(cp->flags),
1206                                jiffies_delta_to_msecs(cp->timer.expires -
1207                                                       jiffies) / 1000);
1208        }
1209        return 0;
1210}
1211
1212static const struct seq_operations ip_vs_conn_sync_seq_ops = {
1213        .start = ip_vs_conn_seq_start,
1214        .next  = ip_vs_conn_seq_next,
1215        .stop  = ip_vs_conn_seq_stop,
1216        .show  = ip_vs_conn_sync_seq_show,
1217};
1218#endif
1219
1220
1221/* Randomly drop connection entries before running out of memory
1222 * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
1223 * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
1224 * - traffic for services not in OPS mode does not increase ct->in_pkts in
1225 * all cases, so it is not supported
1226 */
1227static inline int todrop_entry(struct ip_vs_conn *cp)
1228{
1229        /*
1230         * The drop rate array needs tuning for real environments.
1231         * Called from timer bh only => no locking
1232         */
1233        static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
1234        static char todrop_counter[9] = {0};
1235        int i;
1236
1237        /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
1238           This will leave enough time for normal connection to get
1239           through. */
1240        if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
1241                return 0;
1242
1243        /* Don't drop the entry if its number of incoming packets is not
1244           located in [0, 8] */
1245        i = atomic_read(&cp->in_pkts);
1246        if (i > 8 || i < 0) return 0;
1247
1248        if (!todrop_rate[i]) return 0;
1249        if (--todrop_counter[i] > 0) return 0;
1250
1251        todrop_counter[i] = todrop_rate[i];
1252        return 1;
1253}
1254
1255static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
1256{
1257        struct ip_vs_service *svc;
1258
1259        if (!cp->dest)
1260                return false;
1261        svc = rcu_dereference(cp->dest->svc);
1262        return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
1263}
1264
1265/* Called from keventd and must protect itself from softirqs */
1266void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
1267{
1268        int idx;
1269        struct ip_vs_conn *cp;
1270
1271        rcu_read_lock();
1272        /*
1273         * Randomly scan 1/32 of the whole table every second
1274         */
1275        for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
1276                unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
1277
1278                hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
1279                        if (cp->ipvs != ipvs)
1280                                continue;
1281                        if (atomic_read(&cp->n_control))
1282                                continue;
1283                        if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
1284                                /* connection template of OPS */
1285                                if (ip_vs_conn_ops_mode(cp))
1286                                        goto try_drop;
1287                                if (!(cp->state & IP_VS_CTPL_S_ASSURED))
1288                                        goto drop;
1289                                continue;
1290                        }
1291                        if (cp->protocol == IPPROTO_TCP) {
1292                                switch(cp->state) {
1293                                case IP_VS_TCP_S_SYN_RECV:
1294                                case IP_VS_TCP_S_SYNACK:
1295                                        break;
1296
1297                                case IP_VS_TCP_S_ESTABLISHED:
1298                                        if (todrop_entry(cp))
1299                                                break;
1300                                        continue;
1301
1302                                default:
1303                                        continue;
1304                                }
1305                        } else if (cp->protocol == IPPROTO_SCTP) {
1306                                switch (cp->state) {
1307                                case IP_VS_SCTP_S_INIT1:
1308                                case IP_VS_SCTP_S_INIT:
1309                                        break;
1310                                case IP_VS_SCTP_S_ESTABLISHED:
1311                                        if (todrop_entry(cp))
1312                                                break;
1313                                        continue;
1314                                default:
1315                                        continue;
1316                                }
1317                        } else {
1318try_drop:
1319                                if (!todrop_entry(cp))
1320                                        continue;
1321                        }
1322
1323drop:
1324                        IP_VS_DBG(4, "drop connection\n");
1325                        cp->timeout = 0;
1326                        ip_vs_conn_expire_now(cp);
1327                }
1328                cond_resched_rcu();
1329        }
1330        rcu_read_unlock();
1331}
1332
1333
1334/*
1335 *      Flush all the connection entries in the ip_vs_conn_tab
1336 */
1337static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
1338{
1339        int idx;
1340        struct ip_vs_conn *cp, *cp_c;
1341
1342flush_again:
1343        rcu_read_lock();
1344        for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1345
1346                hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
1347                        if (cp->ipvs != ipvs)
1348                                continue;
1349                        /* As timers are expired in LIFO order, restart
1350                         * the timer of controlling connection first, so
1351                         * that it is expired after us.
1352                         */
1353                        cp_c = cp->control;
1354                        /* cp->control is valid only with reference to cp */
1355                        if (cp_c && __ip_vs_conn_get(cp)) {
1356                                IP_VS_DBG(4, "del controlling connection\n");
1357                                ip_vs_conn_expire_now(cp_c);
1358                                __ip_vs_conn_put(cp);
1359                        }
1360                        IP_VS_DBG(4, "del connection\n");
1361                        ip_vs_conn_expire_now(cp);
1362                }
1363                cond_resched_rcu();
1364        }
1365        rcu_read_unlock();
1366
1367        /* the counter may be not NULL, because maybe some conn entries
1368           are run by slow timer handler or unhashed but still referred */
1369        if (atomic_read(&ipvs->conn_count) != 0) {
1370                schedule();
1371                goto flush_again;
1372        }
1373}
1374/*
1375 * per netns init and exit
1376 */
1377int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
1378{
1379        atomic_set(&ipvs->conn_count, 0);
1380
1381        proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
1382                        &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
1383        proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
1384                        &ip_vs_conn_sync_seq_ops,
1385                        sizeof(struct ip_vs_iter_state));
1386        return 0;
1387}
1388
1389void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
1390{
1391        /* flush all the connection entries first */
1392        ip_vs_conn_flush(ipvs);
1393        remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
1394        remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
1395}
1396
1397int __init ip_vs_conn_init(void)
1398{
1399        int idx;
1400
1401        /* Compute size and mask */
1402        ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
1403        ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
1404
1405        /*
1406         * Allocate the connection hash table and initialize its list heads
1407         */
1408        ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size,
1409                                            sizeof(*ip_vs_conn_tab)));
1410        if (!ip_vs_conn_tab)
1411                return -ENOMEM;
1412
1413        /* Allocate ip_vs_conn slab cache */
1414        ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
1415                                              sizeof(struct ip_vs_conn), 0,
1416                                              SLAB_HWCACHE_ALIGN, NULL);
1417        if (!ip_vs_conn_cachep) {
1418                vfree(ip_vs_conn_tab);
1419                return -ENOMEM;
1420        }
1421
1422        pr_info("Connection hash table configured "
1423                "(size=%d, memory=%ldKbytes)\n",
1424                ip_vs_conn_tab_size,
1425                (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
1426        IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
1427                  sizeof(struct ip_vs_conn));
1428
1429        for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
1430                INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
1431
1432        for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
1433                spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
1434        }
1435
1436        /* calculate the random value for connection hash */
1437        get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1438
1439        return 0;
1440}
1441
1442void ip_vs_conn_cleanup(void)
1443{
1444        /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
1445        rcu_barrier();
1446        /* Release the empty cache */
1447        kmem_cache_destroy(ip_vs_conn_cachep);
1448        vfree(ip_vs_conn_tab);
1449}
1450