linux/net/netfilter/nf_conntrack_core.c
   1/* Connection state tracking for netfilter.  This is separated from,
   2   but required by, the NAT layer; it can also be used by an iptables
   3   extension. */
   4
   5/* (C) 1999-2001 Paul `Rusty' Russell
   6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   8 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
   9 *
  10 * This program is free software; you can redistribute it and/or modify
  11 * it under the terms of the GNU General Public License version 2 as
  12 * published by the Free Software Foundation.
  13 */
  14
  15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17#include <linux/types.h>
  18#include <linux/netfilter.h>
  19#include <linux/module.h>
  20#include <linux/sched.h>
  21#include <linux/skbuff.h>
  22#include <linux/proc_fs.h>
  23#include <linux/vmalloc.h>
  24#include <linux/stddef.h>
  25#include <linux/slab.h>
  26#include <linux/random.h>
  27#include <linux/jhash.h>
  28#include <linux/err.h>
  29#include <linux/percpu.h>
  30#include <linux/moduleparam.h>
  31#include <linux/notifier.h>
  32#include <linux/kernel.h>
  33#include <linux/netdevice.h>
  34#include <linux/socket.h>
  35#include <linux/mm.h>
  36#include <linux/nsproxy.h>
  37#include <linux/rculist_nulls.h>
  38
  39#include <net/netfilter/nf_conntrack.h>
  40#include <net/netfilter/nf_conntrack_l3proto.h>
  41#include <net/netfilter/nf_conntrack_l4proto.h>
  42#include <net/netfilter/nf_conntrack_expect.h>
  43#include <net/netfilter/nf_conntrack_helper.h>
  44#include <net/netfilter/nf_conntrack_seqadj.h>
  45#include <net/netfilter/nf_conntrack_core.h>
  46#include <net/netfilter/nf_conntrack_extend.h>
  47#include <net/netfilter/nf_conntrack_acct.h>
  48#include <net/netfilter/nf_conntrack_ecache.h>
  49#include <net/netfilter/nf_conntrack_zones.h>
  50#include <net/netfilter/nf_conntrack_timestamp.h>
  51#include <net/netfilter/nf_conntrack_timeout.h>
  52#include <net/netfilter/nf_conntrack_labels.h>
  53#include <net/netfilter/nf_conntrack_synproxy.h>
  54#include <net/netfilter/nf_nat.h>
  55#include <net/netfilter/nf_nat_core.h>
  56#include <net/netfilter/nf_nat_helper.h>
  57#include <net/netns/hash.h>
  58
  59#define NF_CONNTRACK_VERSION    "0.5.0"
  60
  61int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
  62                                      enum nf_nat_manip_type manip,
  63                                      const struct nlattr *attr) __read_mostly;
  64EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
  65
  66__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
  67EXPORT_SYMBOL_GPL(nf_conntrack_locks);
  68
  69__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
  70EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
  71
  72struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
  73EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  74
  75static __read_mostly struct kmem_cache *nf_conntrack_cachep;
  76static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
  77static __read_mostly seqcount_t nf_conntrack_generation;
  78static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
  79static __read_mostly bool nf_conntrack_locks_all;
  80
  81void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
  82{
  83        spin_lock(lock);
  84        while (unlikely(nf_conntrack_locks_all)) {
  85                spin_unlock(lock);
  86
  87                /*
  88                 * Order the 'nf_conntrack_locks_all' load vs. the
  89                 * spin_unlock_wait() loads below, to ensure
  90                 * that 'nf_conntrack_locks_all_lock' is indeed held:
  91                 */
  92                smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
  93                spin_unlock_wait(&nf_conntrack_locks_all_lock);
  94                spin_lock(lock);
  95        }
  96}
  97EXPORT_SYMBOL_GPL(nf_conntrack_lock);
  98
  99static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 100{
 101        h1 %= CONNTRACK_LOCKS;
 102        h2 %= CONNTRACK_LOCKS;
 103        spin_unlock(&nf_conntrack_locks[h1]);
 104        if (h1 != h2)
 105                spin_unlock(&nf_conntrack_locks[h2]);
 106}
 107
 108/* return true if we need to recompute hashes (in case hash table was resized) */
 109static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
 110                                     unsigned int h2, unsigned int sequence)
 111{
 112        h1 %= CONNTRACK_LOCKS;
 113        h2 %= CONNTRACK_LOCKS;
 114        if (h1 <= h2) {
 115                nf_conntrack_lock(&nf_conntrack_locks[h1]);
 116                if (h1 != h2)
 117                        spin_lock_nested(&nf_conntrack_locks[h2],
 118                                         SINGLE_DEPTH_NESTING);
 119        } else {
 120                nf_conntrack_lock(&nf_conntrack_locks[h2]);
 121                spin_lock_nested(&nf_conntrack_locks[h1],
 122                                 SINGLE_DEPTH_NESTING);
 123        }
 124        if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 125                nf_conntrack_double_unlock(h1, h2);
 126                return true;
 127        }
 128        return false;
 129}
 130
 131static void nf_conntrack_all_lock(void)
 132{
 133        int i;
 134
 135        spin_lock(&nf_conntrack_locks_all_lock);
 136        nf_conntrack_locks_all = true;
 137
 138        /*
 139         * Order the above store of 'nf_conntrack_locks_all' against
 140         * the spin_unlock_wait() loads below, such that if
 141         * nf_conntrack_lock() observes 'nf_conntrack_locks_all'
 142         * we must observe nf_conntrack_locks[] held:
 143         */
 144        smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
 145
 146        for (i = 0; i < CONNTRACK_LOCKS; i++) {
 147                spin_unlock_wait(&nf_conntrack_locks[i]);
 148        }
 149}
 150
 151static void nf_conntrack_all_unlock(void)
 152{
 153        /*
 154         * All prior stores must be complete before we clear
 155         * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
 156         * might observe the false value but not the entire
 157         * critical section:
 158         */
 159        smp_store_release(&nf_conntrack_locks_all, false);
 160        spin_unlock(&nf_conntrack_locks_all_lock);
 161}
 162
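/*
 * Editorial sketch (not part of the upstream file): how the two-level
 * locking scheme above is typically paired.  A per-bucket writer takes
 * only its bucket lock through nf_conntrack_lock(); a table-wide
 * operation (a resize, for example) excludes all of them through
 * nf_conntrack_all_lock().  The *_example names are hypothetical.
 */
static void bucket_writer_example(unsigned int hash)
{
        spinlock_t *lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];

        nf_conntrack_lock(lockp);       /* spins while a table-wide lock is pending */
        /* ... modify the chain of this bucket ... */
        spin_unlock(lockp);
}

static void table_writer_example(void)
{
        nf_conntrack_all_lock();        /* excludes every bucket writer */
        /* ... operate on the whole table ... */
        nf_conntrack_all_unlock();
}
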
 163unsigned int nf_conntrack_htable_size __read_mostly;
 164EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 165
 166unsigned int nf_conntrack_max __read_mostly;
 167EXPORT_SYMBOL_GPL(nf_conntrack_max);
 168
 169DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 170EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 171
 172static unsigned int nf_conntrack_hash_rnd __read_mostly;
 173
 174static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 175                              const struct net *net)
 176{
 177        unsigned int n;
 178        u32 seed;
 179
 180        get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
 181
 182        /* The direction must be ignored, so we hash everything up to the
 183         * destination ports (which is a multiple of 4) and treat the last
 184         * three bytes manually.
 185         */
 186        seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
 187        n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
 188        return jhash2((u32 *)tuple, n, seed ^
 189                      (((__force __u16)tuple->dst.u.all << 16) |
 190                      tuple->dst.protonum));
 191}
 192
 193static u32 scale_hash(u32 hash)
 194{
 195        return reciprocal_scale(hash, nf_conntrack_htable_size);
 196}
 197
 198static u32 __hash_conntrack(const struct net *net,
 199                            const struct nf_conntrack_tuple *tuple,
 200                            unsigned int size)
 201{
 202        return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
 203}
 204
 205static u32 hash_conntrack(const struct net *net,
 206                          const struct nf_conntrack_tuple *tuple)
 207{
 208        return scale_hash(hash_conntrack_raw(tuple, net));
 209}
 210
 211bool
 212nf_ct_get_tuple(const struct sk_buff *skb,
 213                unsigned int nhoff,
 214                unsigned int dataoff,
 215                u_int16_t l3num,
 216                u_int8_t protonum,
 217                struct net *net,
 218                struct nf_conntrack_tuple *tuple,
 219                const struct nf_conntrack_l3proto *l3proto,
 220                const struct nf_conntrack_l4proto *l4proto)
 221{
 222        memset(tuple, 0, sizeof(*tuple));
 223
 224        tuple->src.l3num = l3num;
 225        if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
 226                return false;
 227
 228        tuple->dst.protonum = protonum;
 229        tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 230
 231        return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
 232}
 233EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 234
 235bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 236                       u_int16_t l3num,
 237                       struct net *net, struct nf_conntrack_tuple *tuple)
 238{
 239        struct nf_conntrack_l3proto *l3proto;
 240        struct nf_conntrack_l4proto *l4proto;
 241        unsigned int protoff;
 242        u_int8_t protonum;
 243        int ret;
 244
 245        rcu_read_lock();
 246
 247        l3proto = __nf_ct_l3proto_find(l3num);
 248        ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
 249        if (ret != NF_ACCEPT) {
 250                rcu_read_unlock();
 251                return false;
 252        }
 253
 254        l4proto = __nf_ct_l4proto_find(l3num, protonum);
 255
 256        ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
 257                              l3proto, l4proto);
 258
 259        rcu_read_unlock();
 260        return ret;
 261}
 262EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
 263
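/*
 * Editorial sketch (not part of the upstream file): pulling a tuple
 * straight out of a packet the way a NAT or logging helper might, using
 * nf_ct_get_tuplepr() above.  The function name is hypothetical;
 * NFPROTO_IPV4 and skb_network_offset() are standard kernel symbols.
 */
static bool tuple_of_skb_example(struct net *net, const struct sk_buff *skb,
                                 struct nf_conntrack_tuple *tuple)
{
        return nf_ct_get_tuplepr(skb, skb_network_offset(skb),
                                 NFPROTO_IPV4, net, tuple);
}
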
 264bool
 265nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 266                   const struct nf_conntrack_tuple *orig,
 267                   const struct nf_conntrack_l3proto *l3proto,
 268                   const struct nf_conntrack_l4proto *l4proto)
 269{
 270        memset(inverse, 0, sizeof(*inverse));
 271
 272        inverse->src.l3num = orig->src.l3num;
 273        if (l3proto->invert_tuple(inverse, orig) == 0)
 274                return false;
 275
 276        inverse->dst.dir = !orig->dst.dir;
 277
 278        inverse->dst.protonum = orig->dst.protonum;
 279        return l4proto->invert_tuple(inverse, orig);
 280}
 281EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 282
 283static void
 284clean_from_lists(struct nf_conn *ct)
 285{
 286        pr_debug("clean_from_lists(%p)\n", ct);
 287        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 288        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
 289
 290        /* Destroy all pending expectations */
 291        nf_ct_remove_expectations(ct);
 292}
 293
 294/* must be called with local_bh_disable */
 295static void nf_ct_add_to_dying_list(struct nf_conn *ct)
 296{
 297        struct ct_pcpu *pcpu;
 298
 299        /* add this conntrack to the (per cpu) dying list */
 300        ct->cpu = smp_processor_id();
 301        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 302
 303        spin_lock(&pcpu->lock);
 304        hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 305                             &pcpu->dying);
 306        spin_unlock(&pcpu->lock);
 307}
 308
 309/* must be called with local_bh_disable */
 310static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
 311{
 312        struct ct_pcpu *pcpu;
 313
 314        /* add this conntrack to the (per cpu) unconfirmed list */
 315        ct->cpu = smp_processor_id();
 316        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 317
 318        spin_lock(&pcpu->lock);
 319        hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 320                             &pcpu->unconfirmed);
 321        spin_unlock(&pcpu->lock);
 322}
 323
 324/* must be called with local_bh_disable */
 325static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 326{
 327        struct ct_pcpu *pcpu;
 328
 329        /* We overload the first tuple to link into the unconfirmed or dying list. */
 330        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 331
 332        spin_lock(&pcpu->lock);
 333        BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
 334        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 335        spin_unlock(&pcpu->lock);
 336}
 337
 338/* Released via destroy_conntrack() */
 339struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 340                                 const struct nf_conntrack_zone *zone,
 341                                 gfp_t flags)
 342{
 343        struct nf_conn *tmpl;
 344
 345        tmpl = kzalloc(sizeof(*tmpl), flags);
 346        if (tmpl == NULL)
 347                return NULL;
 348
 349        tmpl->status = IPS_TEMPLATE;
 350        write_pnet(&tmpl->ct_net, net);
 351        nf_ct_zone_add(tmpl, zone);
 352        atomic_set(&tmpl->ct_general.use, 0);
 353
 354        return tmpl;
 355}
 356EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 357
 358void nf_ct_tmpl_free(struct nf_conn *tmpl)
 359{
 360        nf_ct_ext_destroy(tmpl);
 361        nf_ct_ext_free(tmpl);
 362        kfree(tmpl);
 363}
 364EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
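
/*
 * Editorial sketch (not part of the upstream file): how a user such as the
 * xt_CT target might obtain and later release a conntrack template with the
 * two helpers above.  The function names are hypothetical; nf_ct_zone_dflt
 * is the default conntrack zone.
 */
static struct nf_conn *make_template_example(struct net *net)
{
        /* GFP_KERNEL: template setup runs from process context */
        return nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL);
}

static void drop_template_example(struct nf_conn *tmpl)
{
        if (tmpl)
                nf_ct_tmpl_free(tmpl);
}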
 365
 366static void
 367destroy_conntrack(struct nf_conntrack *nfct)
 368{
 369        struct nf_conn *ct = (struct nf_conn *)nfct;
 370        struct net *net = nf_ct_net(ct);
 371        struct nf_conntrack_l4proto *l4proto;
 372
 373        pr_debug("destroy_conntrack(%p)\n", ct);
 374        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 375        NF_CT_ASSERT(!timer_pending(&ct->timeout));
 376
 377        if (unlikely(nf_ct_is_template(ct))) {
 378                nf_ct_tmpl_free(ct);
 379                return;
 380        }
 381        rcu_read_lock();
 382        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 383        if (l4proto->destroy)
 384                l4proto->destroy(ct);
 385
 386        rcu_read_unlock();
 387
 388        local_bh_disable();
 389        /* Expectations will have been removed in clean_from_lists,
 390         * except TFTP can create an expectation on the first packet,
 391         * before connection is in the list, so we need to clean here,
 392         * too.
 393         */
 394        nf_ct_remove_expectations(ct);
 395
 396        nf_ct_del_from_dying_or_unconfirmed_list(ct);
 397
 398        NF_CT_STAT_INC(net, delete);
 399        local_bh_enable();
 400
 401        if (ct->master)
 402                nf_ct_put(ct->master);
 403
 404        pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 405        nf_conntrack_free(ct);
 406}
 407
 408static void nf_ct_delete_from_lists(struct nf_conn *ct)
 409{
 410        struct net *net = nf_ct_net(ct);
 411        unsigned int hash, reply_hash;
 412        unsigned int sequence;
 413
 414        nf_ct_helper_destroy(ct);
 415
 416        local_bh_disable();
 417        do {
 418                sequence = read_seqcount_begin(&nf_conntrack_generation);
 419                hash = hash_conntrack(net,
 420                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 421                reply_hash = hash_conntrack(net,
 422                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 423        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 424
 425        clean_from_lists(ct);
 426        nf_conntrack_double_unlock(hash, reply_hash);
 427
 428        nf_ct_add_to_dying_list(ct);
 429
 430        NF_CT_STAT_INC(net, delete_list);
 431        local_bh_enable();
 432}
 433
 434bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 435{
 436        struct nf_conn_tstamp *tstamp;
 437
 438        tstamp = nf_conn_tstamp_find(ct);
 439        if (tstamp && tstamp->stop == 0)
 440                tstamp->stop = ktime_get_real_ns();
 441
 442        if (nf_ct_is_dying(ct))
 443                goto delete;
 444
 445        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
 446                                    portid, report) < 0) {
 447                /* destroy event was not delivered */
 448                nf_ct_delete_from_lists(ct);
 449                nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
 450                return false;
 451        }
 452
 453        nf_conntrack_ecache_work(nf_ct_net(ct));
 454        set_bit(IPS_DYING_BIT, &ct->status);
 455 delete:
 456        nf_ct_delete_from_lists(ct);
 457        nf_ct_put(ct);
 458        return true;
 459}
 460EXPORT_SYMBOL_GPL(nf_ct_delete);
 461
 462static void death_by_timeout(unsigned long ul_conntrack)
 463{
 464        nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
 465}
 466
 467static inline bool
 468nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 469                const struct nf_conntrack_tuple *tuple,
 470                const struct nf_conntrack_zone *zone,
 471                const struct net *net)
 472{
 473        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 474
 475        /* A conntrack can be recreated with the equal tuple,
 476         * so we need to check that the conntrack is confirmed
 477         */
 478        return nf_ct_tuple_equal(tuple, &h->tuple) &&
 479               nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 480               nf_ct_is_confirmed(ct) &&
 481               net_eq(net, nf_ct_net(ct));
 482}
 483
 484/* must be called with rcu read lock held */
 485void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
 486{
 487        struct hlist_nulls_head *hptr;
 488        unsigned int sequence, hsz;
 489
 490        do {
 491                sequence = read_seqcount_begin(&nf_conntrack_generation);
 492                hsz = nf_conntrack_htable_size;
 493                hptr = nf_conntrack_hash;
 494        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 495
 496        *hash = hptr;
 497        *hsize = hsz;
 498}
 499EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
 500
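/*
 * Editorial sketch (not part of the upstream file): walking the table
 * through the snapshot interface above.  As the comment on
 * nf_conntrack_get_ht() requires, the caller holds rcu_read_lock().
 * The function name is hypothetical.
 */
static unsigned int count_entries_example(const struct net *net)
{
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        struct hlist_nulls_node *n;
        unsigned int i, hsize, cnt = 0;

        nf_conntrack_get_ht(&ct_hash, &hsize);
        for (i = 0; i < hsize; i++)
                hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode)
                        if (net_eq(nf_ct_net(nf_ct_tuplehash_to_ctrack(h)), net))
                                cnt++;
        return cnt;
}
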
 501/*
 502 * Warning :
 503 * - Caller must take a reference on returned object
 504 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 505 */
 506static struct nf_conntrack_tuple_hash *
 507____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 508                      const struct nf_conntrack_tuple *tuple, u32 hash)
 509{
 510        struct nf_conntrack_tuple_hash *h;
 511        struct hlist_nulls_head *ct_hash;
 512        struct hlist_nulls_node *n;
 513        unsigned int bucket, sequence;
 514
 515begin:
 516        do {
 517                sequence = read_seqcount_begin(&nf_conntrack_generation);
 518                bucket = scale_hash(hash);
 519                ct_hash = nf_conntrack_hash;
 520        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 521
 522        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
 523                if (nf_ct_key_equal(h, tuple, zone, net)) {
 524                        NF_CT_STAT_INC_ATOMIC(net, found);
 525                        return h;
 526                }
 527                NF_CT_STAT_INC_ATOMIC(net, searched);
 528        }
 529        /*
 530         * if the nulls value we got at the end of this lookup is
 531         * not the expected one, we must restart lookup.
 532         * We probably met an item that was moved to another chain.
 533         */
 534        if (get_nulls_value(n) != bucket) {
 535                NF_CT_STAT_INC_ATOMIC(net, search_restart);
 536                goto begin;
 537        }
 538
 539        return NULL;
 540}
 541
 542/* Find a connection corresponding to a tuple. */
 543static struct nf_conntrack_tuple_hash *
 544__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 545                        const struct nf_conntrack_tuple *tuple, u32 hash)
 546{
 547        struct nf_conntrack_tuple_hash *h;
 548        struct nf_conn *ct;
 549
 550        rcu_read_lock();
 551begin:
 552        h = ____nf_conntrack_find(net, zone, tuple, hash);
 553        if (h) {
 554                ct = nf_ct_tuplehash_to_ctrack(h);
 555                if (unlikely(nf_ct_is_dying(ct) ||
 556                             !atomic_inc_not_zero(&ct->ct_general.use)))
 557                        h = NULL;
 558                else {
 559                        if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
 560                                nf_ct_put(ct);
 561                                goto begin;
 562                        }
 563                }
 564        }
 565        rcu_read_unlock();
 566
 567        return h;
 568}
 569
 570struct nf_conntrack_tuple_hash *
 571nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 572                      const struct nf_conntrack_tuple *tuple)
 573{
 574        return __nf_conntrack_find_get(net, zone, tuple,
 575                                       hash_conntrack_raw(tuple, net));
 576}
 577EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 578
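/*
 * Editorial sketch (not part of the upstream file): a lookup through the
 * exported helper above.  nf_conntrack_find_get() returns the entry with
 * its refcount already taken, so the caller must drop it with nf_ct_put().
 * The function name is hypothetical; nf_ct_zone_dflt is the default zone.
 */
static bool tuple_is_known_example(struct net *net,
                                   const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_tuple_hash *h;

        h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, tuple);
        if (!h)
                return false;

        nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
        return true;
}
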
 579static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 580                                       unsigned int hash,
 581                                       unsigned int reply_hash)
 582{
 583        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 584                           &nf_conntrack_hash[hash]);
 585        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 586                           &nf_conntrack_hash[reply_hash]);
 587}
 588
 589int
 590nf_conntrack_hash_check_insert(struct nf_conn *ct)
 591{
 592        const struct nf_conntrack_zone *zone;
 593        struct net *net = nf_ct_net(ct);
 594        unsigned int hash, reply_hash;
 595        struct nf_conntrack_tuple_hash *h;
 596        struct hlist_nulls_node *n;
 597        unsigned int sequence;
 598
 599        zone = nf_ct_zone(ct);
 600
 601        local_bh_disable();
 602        do {
 603                sequence = read_seqcount_begin(&nf_conntrack_generation);
 604                hash = hash_conntrack(net,
 605                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 606                reply_hash = hash_conntrack(net,
 607                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 608        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 609
 610        /* See if there's one in the list already, including reverse */
 611        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 612                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 613                                    zone, net))
 614                        goto out;
 615
 616        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 617                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 618                                    zone, net))
 619                        goto out;
 620
 621        add_timer(&ct->timeout);
 622        smp_wmb();
 623        /* The caller holds a reference to this object */
 624        atomic_set(&ct->ct_general.use, 2);
 625        __nf_conntrack_hash_insert(ct, hash, reply_hash);
 626        nf_conntrack_double_unlock(hash, reply_hash);
 627        NF_CT_STAT_INC(net, insert);
 628        local_bh_enable();
 629        return 0;
 630
 631out:
 632        nf_conntrack_double_unlock(hash, reply_hash);
 633        NF_CT_STAT_INC(net, insert_failed);
 634        local_bh_enable();
 635        return -EEXIST;
 636}
 637EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 638
 639static inline void nf_ct_acct_update(struct nf_conn *ct,
 640                                     enum ip_conntrack_info ctinfo,
 641                                     unsigned int len)
 642{
 643        struct nf_conn_acct *acct;
 644
 645        acct = nf_conn_acct_find(ct);
 646        if (acct) {
 647                struct nf_conn_counter *counter = acct->counter;
 648
 649                atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
 650                atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
 651        }
 652}
 653
 654static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 655                             const struct nf_conn *loser_ct)
 656{
 657        struct nf_conn_acct *acct;
 658
 659        acct = nf_conn_acct_find(loser_ct);
 660        if (acct) {
 661                struct nf_conn_counter *counter = acct->counter;
 662                unsigned int bytes;
 663
 664                /* u32 should be fine since we must have seen one packet. */
 665                bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
 666                nf_ct_acct_update(ct, ctinfo, bytes);
 667        }
 668}
 669
 670/* Resolve race on insertion if this protocol allows this. */
 671static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 672                               enum ip_conntrack_info ctinfo,
 673                               struct nf_conntrack_tuple_hash *h)
 674{
 675        /* This is the conntrack entry already in hashes that won race. */
 676        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 677        struct nf_conntrack_l4proto *l4proto;
 678
 679        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 680        if (l4proto->allow_clash &&
 681            !nfct_nat(ct) &&
 682            !nf_ct_is_dying(ct) &&
 683            atomic_inc_not_zero(&ct->ct_general.use)) {
 684                nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
 685                nf_conntrack_put(skb->nfct);
 686                /* Assign conntrack already in hashes to this skbuff. Don't
 687                 * modify skb->nfctinfo to ensure consistent stateful filtering.
 688                 */
 689                skb->nfct = &ct->ct_general;
 690                return NF_ACCEPT;
 691        }
 692        NF_CT_STAT_INC(net, drop);
 693        return NF_DROP;
 694}
 695
 696/* Confirm a connection given skb; places it in hash table */
 697int
 698__nf_conntrack_confirm(struct sk_buff *skb)
 699{
 700        const struct nf_conntrack_zone *zone;
 701        unsigned int hash, reply_hash;
 702        struct nf_conntrack_tuple_hash *h;
 703        struct nf_conn *ct;
 704        struct nf_conn_help *help;
 705        struct nf_conn_tstamp *tstamp;
 706        struct hlist_nulls_node *n;
 707        enum ip_conntrack_info ctinfo;
 708        struct net *net;
 709        unsigned int sequence;
 710        int ret = NF_DROP;
 711
 712        ct = nf_ct_get(skb, &ctinfo);
 713        net = nf_ct_net(ct);
 714
 715        /* ipt_REJECT uses nf_conntrack_attach to attach related
 716           ICMP/TCP RST packets in other direction.  Actual packet
 717           which created connection will be IP_CT_NEW or for an
 718           expected connection, IP_CT_RELATED. */
 719        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 720                return NF_ACCEPT;
 721
 722        zone = nf_ct_zone(ct);
 723        local_bh_disable();
 724
 725        do {
 726                sequence = read_seqcount_begin(&nf_conntrack_generation);
 727                /* reuse the hash saved before */
 728                hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
 729                hash = scale_hash(hash);
 730                reply_hash = hash_conntrack(net,
 731                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 732
 733        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 734
 735        /* We're not in hash table, and we refuse to set up related
 736         * connections for unconfirmed conns.  But packet copies and
 737         * REJECT will give spurious warnings here.
 738         */
 739        /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 740
 741        /* No external references means no one else could have
 742         * confirmed us.
 743         */
 744        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 745        pr_debug("Confirming conntrack %p\n", ct);
 746        /* We have to check the DYING flag after unlink to prevent
 747         * a race against nf_ct_get_next_corpse() possibly called from
 748         * user context, else we insert an already 'dead' hash, blocking
 749         * further use of that particular connection -JM.
 750         */
 751        nf_ct_del_from_dying_or_unconfirmed_list(ct);
 752
 753        if (unlikely(nf_ct_is_dying(ct))) {
 754                nf_ct_add_to_dying_list(ct);
 755                goto dying;
 756        }
 757
 758        /* See if there's one in the list already, including reverse:
 759           NAT could have grabbed it without realizing, since we're
 760           not in the hash.  If there is, we lost race. */
 761        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 762                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 763                                    zone, net))
 764                        goto out;
 765
 766        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 767                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 768                                    zone, net))
 769                        goto out;
 770
 771        /* Timer relative to confirmation time, not original
 772           setting time, otherwise we'd get timer wrap in
 773           weird delay cases. */
 774        ct->timeout.expires += jiffies;
 775        add_timer(&ct->timeout);
 776        atomic_inc(&ct->ct_general.use);
 777        ct->status |= IPS_CONFIRMED;
 778
 779        /* set conntrack timestamp, if enabled. */
 780        tstamp = nf_conn_tstamp_find(ct);
 781        if (tstamp) {
 782                if (skb->tstamp.tv64 == 0)
 783                        __net_timestamp(skb);
 784
 785                tstamp->start = ktime_to_ns(skb->tstamp);
 786        }
 787        /* Since the lookup is lockless, hash insertion must be done after
 788         * starting the timer and setting the CONFIRMED bit. The RCU barriers
 789         * guarantee that no other CPU can find the conntrack before the above
 790         * stores are visible.
 791         */
 792        __nf_conntrack_hash_insert(ct, hash, reply_hash);
 793        nf_conntrack_double_unlock(hash, reply_hash);
 794        NF_CT_STAT_INC(net, insert);
 795        local_bh_enable();
 796
 797        help = nfct_help(ct);
 798        if (help && help->helper)
 799                nf_conntrack_event_cache(IPCT_HELPER, ct);
 800
 801        nf_conntrack_event_cache(master_ct(ct) ?
 802                                 IPCT_RELATED : IPCT_NEW, ct);
 803        return NF_ACCEPT;
 804
 805out:
 806        nf_ct_add_to_dying_list(ct);
 807        ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
 808dying:
 809        nf_conntrack_double_unlock(hash, reply_hash);
 810        NF_CT_STAT_INC(net, insert_failed);
 811        local_bh_enable();
 812        return ret;
 813}
 814EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 815
 816/* Returns true if a connection corresponds to the tuple (required
 817   for NAT). */
 818int
 819nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 820                         const struct nf_conn *ignored_conntrack)
 821{
 822        struct net *net = nf_ct_net(ignored_conntrack);
 823        const struct nf_conntrack_zone *zone;
 824        struct nf_conntrack_tuple_hash *h;
 825        struct hlist_nulls_head *ct_hash;
 826        unsigned int hash, sequence;
 827        struct hlist_nulls_node *n;
 828        struct nf_conn *ct;
 829
 830        zone = nf_ct_zone(ignored_conntrack);
 831
 832        rcu_read_lock();
 833        do {
 834                sequence = read_seqcount_begin(&nf_conntrack_generation);
 835                hash = hash_conntrack(net, tuple);
 836                ct_hash = nf_conntrack_hash;
 837        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 838
 839        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
 840                ct = nf_ct_tuplehash_to_ctrack(h);
 841                if (ct != ignored_conntrack &&
 842                    nf_ct_key_equal(h, tuple, zone, net)) {
 843                        NF_CT_STAT_INC_ATOMIC(net, found);
 844                        rcu_read_unlock();
 845                        return 1;
 846                }
 847                NF_CT_STAT_INC_ATOMIC(net, searched);
 848        }
 849        rcu_read_unlock();
 850
 851        return 0;
 852}
 853EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 854
 855#define NF_CT_EVICTION_RANGE    8
 856
 857/* There's a small race here where we may free a just-assured
 858   connection.  Too bad: we're in trouble anyway. */
 859static unsigned int early_drop_list(struct net *net,
 860                                    struct hlist_nulls_head *head)
 861{
 862        struct nf_conntrack_tuple_hash *h;
 863        struct hlist_nulls_node *n;
 864        unsigned int drops = 0;
 865        struct nf_conn *tmp;
 866
 867        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 868                tmp = nf_ct_tuplehash_to_ctrack(h);
 869
 870                if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
 871                    !net_eq(nf_ct_net(tmp), net) ||
 872                    nf_ct_is_dying(tmp))
 873                        continue;
 874
 875                if (!atomic_inc_not_zero(&tmp->ct_general.use))
 876                        continue;
 877
 878                /* kill only if still in same netns -- might have moved due to
 879                 * SLAB_DESTROY_BY_RCU rules.
 880                 *
 881                 * We steal the timer reference.  If that fails timer has
 882                 * already fired or someone else deleted it. Just drop ref
 883                 * and move to next entry.
 884                 */
 885                if (net_eq(nf_ct_net(tmp), net) &&
 886                    nf_ct_is_confirmed(tmp) &&
 887                    del_timer(&tmp->timeout) &&
 888                    nf_ct_delete(tmp, 0, 0))
 889                        drops++;
 890
 891                nf_ct_put(tmp);
 892        }
 893
 894        return drops;
 895}
 896
 897static noinline int early_drop(struct net *net, unsigned int _hash)
 898{
 899        unsigned int i;
 900
 901        for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
 902                struct hlist_nulls_head *ct_hash;
 903                unsigned hash, sequence, drops;
 904
 905                rcu_read_lock();
 906                do {
 907                        sequence = read_seqcount_begin(&nf_conntrack_generation);
 908                        hash = scale_hash(_hash++);
 909                        ct_hash = nf_conntrack_hash;
 910                } while (read_seqcount_retry(&nf_conntrack_generation, sequence));
 911
 912                drops = early_drop_list(net, &ct_hash[hash]);
 913                rcu_read_unlock();
 914
 915                if (drops) {
 916                        NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
 917                        return true;
 918                }
 919        }
 920
 921        return false;
 922}
 923
 924static struct nf_conn *
 925__nf_conntrack_alloc(struct net *net,
 926                     const struct nf_conntrack_zone *zone,
 927                     const struct nf_conntrack_tuple *orig,
 928                     const struct nf_conntrack_tuple *repl,
 929                     gfp_t gfp, u32 hash)
 930{
 931        struct nf_conn *ct;
 932
 933        /* We don't want any race condition at early drop stage */
 934        atomic_inc(&net->ct.count);
 935
 936        if (nf_conntrack_max &&
 937            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
 938                if (!early_drop(net, hash)) {
 939                        atomic_dec(&net->ct.count);
 940                        net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 941                        return ERR_PTR(-ENOMEM);
 942                }
 943        }
 944
 945        /*
 946         * Do not use kmem_cache_zalloc(), as this cache uses
 947         * SLAB_DESTROY_BY_RCU.
 948         */
 949        ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
 950        if (ct == NULL)
 951                goto out;
 952
 953        spin_lock_init(&ct->lock);
 954        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 955        ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
 956        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 957        /* save hash for reusing when confirming */
 958        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 959        ct->status = 0;
 960        /* Don't set timer yet: wait for confirmation */
 961        setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 962        write_pnet(&ct->ct_net, net);
 963        memset(&ct->__nfct_init_offset[0], 0,
 964               offsetof(struct nf_conn, proto) -
 965               offsetof(struct nf_conn, __nfct_init_offset[0]));
 966
 967        nf_ct_zone_add(ct, zone);
 968
 969        /* Because we use RCU lookups, we set ct_general.use to zero before
 970         * this is inserted in any list.
 971         */
 972        atomic_set(&ct->ct_general.use, 0);
 973        return ct;
 974out:
 975        atomic_dec(&net->ct.count);
 976        return ERR_PTR(-ENOMEM);
 977}
 978
 979struct nf_conn *nf_conntrack_alloc(struct net *net,
 980                                   const struct nf_conntrack_zone *zone,
 981                                   const struct nf_conntrack_tuple *orig,
 982                                   const struct nf_conntrack_tuple *repl,
 983                                   gfp_t gfp)
 984{
 985        return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
 986}
 987EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 988
 989void nf_conntrack_free(struct nf_conn *ct)
 990{
 991        struct net *net = nf_ct_net(ct);
 992
 993        /* A freed object has refcnt == 0, that's
 994         * the golden rule for SLAB_DESTROY_BY_RCU
 995         */
 996        NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
 997
 998        nf_ct_ext_destroy(ct);
 999        nf_ct_ext_free(ct);
1000        kmem_cache_free(nf_conntrack_cachep, ct);
1001        smp_mb__before_atomic();
1002        atomic_dec(&net->ct.count);
1003}
1004EXPORT_SYMBOL_GPL(nf_conntrack_free);
1005
1006
1007/* Allocate a new conntrack: we return -ENOMEM if classification
1008   failed due to stress.  Otherwise it really is unclassifiable. */
1009static struct nf_conntrack_tuple_hash *
1010init_conntrack(struct net *net, struct nf_conn *tmpl,
1011               const struct nf_conntrack_tuple *tuple,
1012               struct nf_conntrack_l3proto *l3proto,
1013               struct nf_conntrack_l4proto *l4proto,
1014               struct sk_buff *skb,
1015               unsigned int dataoff, u32 hash)
1016{
1017        struct nf_conn *ct;
1018        struct nf_conn_help *help;
1019        struct nf_conntrack_tuple repl_tuple;
1020        struct nf_conntrack_ecache *ecache;
1021        struct nf_conntrack_expect *exp = NULL;
1022        const struct nf_conntrack_zone *zone;
1023        struct nf_conn_timeout *timeout_ext;
1024        struct nf_conntrack_zone tmp;
1025        unsigned int *timeouts;
1026
1027        if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1028                pr_debug("Can't invert tuple.\n");
1029                return NULL;
1030        }
1031
1032        zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1033        ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1034                                  hash);
1035        if (IS_ERR(ct))
1036                return (struct nf_conntrack_tuple_hash *)ct;
1037
1038        if (!nf_ct_add_synproxy(ct, tmpl)) {
1039                nf_conntrack_free(ct);
1040                return ERR_PTR(-ENOMEM);
1041        }
1042
1043        timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1044        if (timeout_ext) {
1045                timeouts = nf_ct_timeout_data(timeout_ext);
1046                if (unlikely(!timeouts))
1047                        timeouts = l4proto->get_timeouts(net);
1048        } else {
1049                timeouts = l4proto->get_timeouts(net);
1050        }
1051
1052        if (!l4proto->new(ct, skb, dataoff, timeouts)) {
1053                nf_conntrack_free(ct);
1054                pr_debug("can't track with proto module\n");
1055                return NULL;
1056        }
1057
1058        if (timeout_ext)
1059                nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1060                                      GFP_ATOMIC);
1061
1062        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1063        nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1064        nf_ct_labels_ext_add(ct);
1065
1066        ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1067        nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1068                                 ecache ? ecache->expmask : 0,
1069                             GFP_ATOMIC);
1070
1071        local_bh_disable();
1072        if (net->ct.expect_count) {
1073                spin_lock(&nf_conntrack_expect_lock);
1074                exp = nf_ct_find_expectation(net, zone, tuple);
1075                if (exp) {
1076                        pr_debug("expectation arrives ct=%p exp=%p\n",
1077                                 ct, exp);
1078                        /* Welcome, Mr. Bond.  We've been expecting you... */
1079                        __set_bit(IPS_EXPECTED_BIT, &ct->status);
1080                        /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1081                        ct->master = exp->master;
1082                        if (exp->helper) {
1083                                help = nf_ct_helper_ext_add(ct, exp->helper,
1084                                                            GFP_ATOMIC);
1085                                if (help)
1086                                        rcu_assign_pointer(help->helper, exp->helper);
1087                        }
1088
1089#ifdef CONFIG_NF_CONNTRACK_MARK
1090                        ct->mark = exp->master->mark;
1091#endif
1092#ifdef CONFIG_NF_CONNTRACK_SECMARK
1093                        ct->secmark = exp->master->secmark;
1094#endif
1095                        NF_CT_STAT_INC(net, expect_new);
1096                }
1097                spin_unlock(&nf_conntrack_expect_lock);
1098        }
1099        if (!exp) {
1100                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1101                NF_CT_STAT_INC(net, new);
1102        }
1103
1104        /* Now it is inserted into the unconfirmed list, bump refcount */
1105        nf_conntrack_get(&ct->ct_general);
1106        nf_ct_add_to_unconfirmed_list(ct);
1107
1108        local_bh_enable();
1109
1110        if (exp) {
1111                if (exp->expectfn)
1112                        exp->expectfn(ct, exp);
1113                nf_ct_expect_put(exp);
1114        }
1115
1116        return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1117}
1118
1119/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1120static inline struct nf_conn *
1121resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1122                  struct sk_buff *skb,
1123                  unsigned int dataoff,
1124                  u_int16_t l3num,
1125                  u_int8_t protonum,
1126                  struct nf_conntrack_l3proto *l3proto,
1127                  struct nf_conntrack_l4proto *l4proto,
1128                  int *set_reply,
1129                  enum ip_conntrack_info *ctinfo)
1130{
1131        const struct nf_conntrack_zone *zone;
1132        struct nf_conntrack_tuple tuple;
1133        struct nf_conntrack_tuple_hash *h;
1134        struct nf_conntrack_zone tmp;
1135        struct nf_conn *ct;
1136        u32 hash;
1137
1138        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1139                             dataoff, l3num, protonum, net, &tuple, l3proto,
1140                             l4proto)) {
1141                pr_debug("Can't get tuple\n");
1142                return NULL;
1143        }
1144
1145        /* look for tuple match */
1146        zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1147        hash = hash_conntrack_raw(&tuple, net);
1148        h = __nf_conntrack_find_get(net, zone, &tuple, hash);
1149        if (!h) {
1150                h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1151                                   skb, dataoff, hash);
1152                if (!h)
1153                        return NULL;
1154                if (IS_ERR(h))
1155                        return (void *)h;
1156        }
1157        ct = nf_ct_tuplehash_to_ctrack(h);
1158
1159        /* It exists; we have (non-exclusive) reference. */
1160        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1161                *ctinfo = IP_CT_ESTABLISHED_REPLY;
1162                /* Please set reply bit if this packet OK */
1163                *set_reply = 1;
1164        } else {
1165                /* Once we've had two way comms, always ESTABLISHED. */
1166                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1167                        pr_debug("normal packet for %p\n", ct);
1168                        *ctinfo = IP_CT_ESTABLISHED;
1169                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1170                        pr_debug("related packet for %p\n", ct);
1171                        *ctinfo = IP_CT_RELATED;
1172                } else {
1173                        pr_debug("new packet for %p\n", ct);
1174                        *ctinfo = IP_CT_NEW;
1175                }
1176                *set_reply = 0;
1177        }
1178        skb->nfct = &ct->ct_general;
1179        skb->nfctinfo = *ctinfo;
1180        return ct;
1181}
1182
1183unsigned int
1184nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1185                struct sk_buff *skb)
1186{
1187        struct nf_conn *ct, *tmpl = NULL;
1188        enum ip_conntrack_info ctinfo;
1189        struct nf_conntrack_l3proto *l3proto;
1190        struct nf_conntrack_l4proto *l4proto;
1191        unsigned int *timeouts;
1192        unsigned int dataoff;
1193        u_int8_t protonum;
1194        int set_reply = 0;
1195        int ret;
1196
1197        if (skb->nfct) {
1198                /* Previously seen (loopback or untracked)?  Ignore. */
1199                tmpl = (struct nf_conn *)skb->nfct;
1200                if (!nf_ct_is_template(tmpl)) {
1201                        NF_CT_STAT_INC_ATOMIC(net, ignore);
1202                        return NF_ACCEPT;
1203                }
1204                skb->nfct = NULL;
1205        }
1206
1207        /* rcu_read_lock()ed by nf_hook_slow */
1208        l3proto = __nf_ct_l3proto_find(pf);
1209        ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1210                                   &dataoff, &protonum);
1211        if (ret <= 0) {
1212                pr_debug("not prepared to track yet or error occurred\n");
1213                NF_CT_STAT_INC_ATOMIC(net, error);
1214                NF_CT_STAT_INC_ATOMIC(net, invalid);
1215                ret = -ret;
1216                goto out;
1217        }
1218
1219        l4proto = __nf_ct_l4proto_find(pf, protonum);
1220
 1221        /* It may be a special packet, error, unclean...
 1222         * The inverse of the return code tells the netfilter
 1223         * core what to do with the packet. */
1224        if (l4proto->error != NULL) {
1225                ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
1226                                     pf, hooknum);
1227                if (ret <= 0) {
1228                        NF_CT_STAT_INC_ATOMIC(net, error);
1229                        NF_CT_STAT_INC_ATOMIC(net, invalid);
1230                        ret = -ret;
1231                        goto out;
1232                }
1233                /* ICMP[v6] protocol trackers may assign one conntrack. */
1234                if (skb->nfct)
1235                        goto out;
1236        }
1237
1238        ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1239                               l3proto, l4proto, &set_reply, &ctinfo);
1240        if (!ct) {
1241                /* Not valid part of a connection */
1242                NF_CT_STAT_INC_ATOMIC(net, invalid);
1243                ret = NF_ACCEPT;
1244                goto out;
1245        }
1246
1247        if (IS_ERR(ct)) {
1248                /* Too stressed to deal. */
1249                NF_CT_STAT_INC_ATOMIC(net, drop);
1250                ret = NF_DROP;
1251                goto out;
1252        }
1253
1254        NF_CT_ASSERT(skb->nfct);
1255
1256        /* Decide what timeout policy we want to apply to this flow. */
1257        timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
1258
1259        ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
1260        if (ret <= 0) {
1261                /* Invalid: inverse of the return code tells
1262                 * the netfilter core what to do */
1263                pr_debug("nf_conntrack_in: Can't track with proto module\n");
1264                nf_conntrack_put(skb->nfct);
1265                skb->nfct = NULL;
1266                NF_CT_STAT_INC_ATOMIC(net, invalid);
1267                if (ret == -NF_DROP)
1268                        NF_CT_STAT_INC_ATOMIC(net, drop);
1269                ret = -ret;
1270                goto out;
1271        }
1272
1273        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1274                nf_conntrack_event_cache(IPCT_REPLY, ct);
1275out:
1276        if (tmpl) {
1277                /* Special case: we have to repeat this hook, assign the
1278                 * template again to this packet. We assume that this packet
1279                 * has no conntrack assigned. This is used by nf_ct_tcp. */
1280                if (ret == NF_REPEAT)
1281                        skb->nfct = (struct nf_conntrack *)tmpl;
1282                else
1283                        nf_ct_put(tmpl);
1284        }
1285
1286        return ret;
1287}
1288EXPORT_SYMBOL_GPL(nf_conntrack_in);
1289
1290bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1291                          const struct nf_conntrack_tuple *orig)
1292{
1293        bool ret;
1294
1295        rcu_read_lock();
1296        ret = nf_ct_invert_tuple(inverse, orig,
1297                                 __nf_ct_l3proto_find(orig->src.l3num),
1298                                 __nf_ct_l4proto_find(orig->src.l3num,
1299                                                      orig->dst.protonum));
1300        rcu_read_unlock();
1301        return ret;
1302}
1303EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
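
/*
 * Editorial sketch (not part of the upstream file): inverting a tuple to
 * probe whether the reply direction is already tracked, combining the two
 * exported helpers.  The entry returned by nf_conntrack_find_get() carries
 * a reference the caller must drop.  The function name is hypothetical.
 */
static struct nf_conntrack_tuple_hash *
find_reply_example(struct net *net, const struct nf_conntrack_tuple *orig)
{
        struct nf_conntrack_tuple reply;

        if (!nf_ct_invert_tuplepr(&reply, orig))
                return NULL;

        return nf_conntrack_find_get(net, &nf_ct_zone_dflt, &reply);
}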
1304
1305/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1306   implicitly racy: see __nf_conntrack_confirm */
1307void nf_conntrack_alter_reply(struct nf_conn *ct,
1308                              const struct nf_conntrack_tuple *newreply)
1309{
1310        struct nf_conn_help *help = nfct_help(ct);
1311
1312        /* Should be unconfirmed, so not in hash table yet */
1313        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1314
1315        pr_debug("Altering reply tuple of %p to ", ct);
1316        nf_ct_dump_tuple(newreply);
1317
1318        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1319        if (ct->master || (help && !hlist_empty(&help->expectations)))
1320                return;
1321
1322        rcu_read_lock();
1323        __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1324        rcu_read_unlock();
1325}
1326EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1327
1328/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1329void __nf_ct_refresh_acct(struct nf_conn *ct,
1330                          enum ip_conntrack_info ctinfo,
1331                          const struct sk_buff *skb,
1332                          unsigned long extra_jiffies,
1333                          int do_acct)
1334{
1335        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1336        NF_CT_ASSERT(skb);
1337
1338        /* Only update if this is not a fixed timeout */
1339        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1340                goto acct;
1341
1342        /* If not in hash table, timer will not be active yet */
1343        if (!nf_ct_is_confirmed(ct)) {
1344                ct->timeout.expires = extra_jiffies;
1345        } else {
1346                unsigned long newtime = jiffies + extra_jiffies;
1347
1348                /* Only update the timeout if the new timeout is at least
1349                   HZ jiffies from the old timeout. Need del_timer for race
1350                   avoidance (may already be dying). */
1351                if (newtime - ct->timeout.expires >= HZ)
1352                        mod_timer_pending(&ct->timeout, newtime);
1353        }
1354
1355acct:
1356        if (do_acct)
1357                nf_ct_acct_update(ct, ctinfo, skb->len);
1358}
1359EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1360
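/*
 * Editorial sketch (not part of the upstream file): a protocol helper
 * extending a flow's lifetime after a valid packet, through the
 * __nf_ct_refresh_acct() entry point above.  The function name and the
 * 30 second value are hypothetical.
 */
static void seen_valid_packet_example(struct nf_conn *ct,
                                      enum ip_conntrack_info ctinfo,
                                      const struct sk_buff *skb)
{
        __nf_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);
}
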
1361bool __nf_ct_kill_acct(struct nf_conn *ct,
1362                       enum ip_conntrack_info ctinfo,
1363                       const struct sk_buff *skb,
1364                       int do_acct)
1365{
1366        if (do_acct)
1367                nf_ct_acct_update(ct, ctinfo, skb->len);
1368
1369        if (del_timer(&ct->timeout)) {
1370                ct->timeout.function((unsigned long)ct);
1371                return true;
1372        }
1373        return false;
1374}
1375EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
1376
1377#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1378
1379#include <linux/netfilter/nfnetlink.h>
1380#include <linux/netfilter/nfnetlink_conntrack.h>
1381#include <linux/mutex.h>
1382
1383/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1384 * in ip_conntrack_core, since we don't want the protocols to autoload
1385 * or depend on ctnetlink */
1386int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1387                               const struct nf_conntrack_tuple *tuple)
1388{
1389        if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1390            nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1391                goto nla_put_failure;
1392        return 0;
1393
1394nla_put_failure:
1395        return -1;
1396}
1397EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1398
1399const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1400        [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
1401        [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
1402};
1403EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1404
1405int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1406                               struct nf_conntrack_tuple *t)
1407{
1408        if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1409                return -EINVAL;
1410
1411        t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1412        t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1413
1414        return 0;
1415}
1416EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1417
1418int nf_ct_port_nlattr_tuple_size(void)
1419{
1420        return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1421}
1422EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
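
/*
 * Editorial sketch (not part of the upstream file): how a port-based
 * protocol tracker wires the generic helpers above into its
 * nf_conntrack_l4proto definition, mirroring what the TCP/UDP trackers do.
 * The variable name is hypothetical and the non-netlink callbacks are
 * omitted.
 */
static struct nf_conntrack_l4proto example_port_proto __read_mostly = {
        /* ... pkt_to_tuple, packet, new and the other callbacks omitted ... */
        .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
        .nlattr_tuple_size      = nf_ct_port_nlattr_tuple_size,
        .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
        .nla_policy             = nf_ct_port_nla_policy,
};
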
1423#endif
1424
1425/* Used by ipt_REJECT and ip6t_REJECT. */
1426static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1427{
1428        struct nf_conn *ct;
1429        enum ip_conntrack_info ctinfo;
1430
1431        /* This ICMP is in reverse direction to the packet which caused it */
1432        ct = nf_ct_get(skb, &ctinfo);
1433        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1434                ctinfo = IP_CT_RELATED_REPLY;
1435        else
1436                ctinfo = IP_CT_RELATED;
1437
1438        /* Attach to new skbuff, and increment count */
1439        nskb->nfct = &ct->ct_general;
1440        nskb->nfctinfo = ctinfo;
1441        nf_conntrack_get(nskb->nfct);
1442}
1443
1444/* Bring out ya dead! */
1445static struct nf_conn *
1446get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1447                void *data, unsigned int *bucket)
1448{
1449        struct nf_conntrack_tuple_hash *h;
1450        struct nf_conn *ct;
1451        struct hlist_nulls_node *n;
1452        int cpu;
1453        spinlock_t *lockp;
1454
1455        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1456                lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1457                local_bh_disable();
1458                nf_conntrack_lock(lockp);
1459                if (*bucket < nf_conntrack_htable_size) {
1460                        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
1461                                if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1462                                        continue;
1463                                ct = nf_ct_tuplehash_to_ctrack(h);
1464                                if (net_eq(nf_ct_net(ct), net) &&
1465                                    iter(ct, data))
1466                                        goto found;
1467                        }
1468                }
1469                spin_unlock(lockp);
1470                local_bh_enable();
1471                cond_resched();
1472        }
1473
1474        for_each_possible_cpu(cpu) {
1475                struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1476
1477                spin_lock_bh(&pcpu->lock);
1478                hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
1479                        ct = nf_ct_tuplehash_to_ctrack(h);
1480                        if (iter(ct, data))
1481                                set_bit(IPS_DYING_BIT, &ct->status);
1482                }
1483                spin_unlock_bh(&pcpu->lock);
1484                cond_resched();
1485        }
1486        return NULL;
1487found:
1488        atomic_inc(&ct->ct_general.use);
1489        spin_unlock(lockp);
1490        local_bh_enable();
1491        return ct;
1492}
1493
1494void nf_ct_iterate_cleanup(struct net *net,
1495                           int (*iter)(struct nf_conn *i, void *data),
1496                           void *data, u32 portid, int report)
1497{
1498        struct nf_conn *ct;
1499        unsigned int bucket = 0;
1500
1501        might_sleep();
1502
1503        if (atomic_read(&net->ct.count) == 0)
1504                return;
1505
1506        while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
1507                /* Time to push up daisies... */
1508                if (del_timer(&ct->timeout))
1509                        nf_ct_delete(ct, portid, report);
1510
1511                /* ... else the timer will get him soon. */
1512
1513                nf_ct_put(ct);
1514                cond_resched();
1515        }
1516}
1517EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
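/* A typical iter callback returns non-zero for entries that should be
 * killed.  For example, flushing every UDP flow in a netns could look
 * roughly like this (sketch; kill_udp is not an existing helper):
 *
 *	static int kill_udp(struct nf_conn *ct, void *data)
 *	{
 *		return nf_ct_protonum(ct) == IPPROTO_UDP;
 *	}
 *
 *	nf_ct_iterate_cleanup(net, kill_udp, NULL, 0, 0);
 *
 * portid/report only feed the ctnetlink destroy notification and can
 * be 0 for internal callers; the walk may sleep, so it must run in
 * process context.
 */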
1518
1519static int kill_all(struct nf_conn *i, void *data)
1520{
1521        return 1;
1522}
1523
1524void nf_ct_free_hashtable(void *hash, unsigned int size)
1525{
1526        if (is_vmalloc_addr(hash))
1527                vfree(hash);
1528        else
1529                free_pages((unsigned long)hash,
1530                           get_order(sizeof(struct hlist_head) * size));
1531}
1532EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1533
1534static int untrack_refs(void)
1535{
1536        int cnt = 0, cpu;
1537
1538        for_each_possible_cpu(cpu) {
1539                struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1540
1541                cnt += atomic_read(&ct->ct_general.use) - 1;
1542        }
1543        return cnt;
1544}
1545
1546void nf_conntrack_cleanup_start(void)
1547{
1548        RCU_INIT_POINTER(ip_ct_attach, NULL);
1549}
1550
1551void nf_conntrack_cleanup_end(void)
1552{
1553        RCU_INIT_POINTER(nf_ct_destroy, NULL);
1554        while (untrack_refs() > 0)
1555                schedule();
1556
1557        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1558
1559        nf_conntrack_proto_fini();
1560        nf_conntrack_seqadj_fini();
1561        nf_conntrack_labels_fini();
1562        nf_conntrack_helper_fini();
1563        nf_conntrack_timeout_fini();
1564        nf_conntrack_ecache_fini();
1565        nf_conntrack_tstamp_fini();
1566        nf_conntrack_acct_fini();
1567        nf_conntrack_expect_fini();
1568
1569        kmem_cache_destroy(nf_conntrack_cachep);
1570}
1571
1572/*
1573 * Mishearing the voices in his head, our hero wonders how he's
1574 * supposed to kill the mall.
1575 */
1576void nf_conntrack_cleanup_net(struct net *net)
1577{
1578        LIST_HEAD(single);
1579
1580        list_add(&net->exit_list, &single);
1581        nf_conntrack_cleanup_net_list(&single);
1582}
1583
1584void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
1585{
1586        int busy;
1587        struct net *net;
1588
1589        /*
1590         * This makes sure all current packets have passed through
1591         * the netfilter framework.  Roll on, two-stage module
1592         * delete...
1593         */
1594        synchronize_net();
1595i_see_dead_people:
1596        busy = 0;
1597        list_for_each_entry(net, net_exit_list, exit_list) {
1598                nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
1599                if (atomic_read(&net->ct.count) != 0)
1600                        busy = 1;
1601        }
1602        if (busy) {
1603                schedule();
1604                goto i_see_dead_people;
1605        }
1606
1607        list_for_each_entry(net, net_exit_list, exit_list) {
1608                nf_conntrack_proto_pernet_fini(net);
1609                nf_conntrack_helper_pernet_fini(net);
1610                nf_conntrack_ecache_pernet_fini(net);
1611                nf_conntrack_tstamp_pernet_fini(net);
1612                nf_conntrack_acct_pernet_fini(net);
1613                nf_conntrack_expect_pernet_fini(net);
1614                free_percpu(net->ct.stat);
1615                free_percpu(net->ct.pcpu_lists);
1616        }
1617}
1618
1619void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1620{
1621        struct hlist_nulls_head *hash;
1622        unsigned int nr_slots, i;
1623        size_t sz;
1624
1625        if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1626                return NULL;
1627
1628        BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1629        nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1630
1631        if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1632                return NULL;
1633
1634        sz = nr_slots * sizeof(struct hlist_nulls_head);
1635        hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1636                                        get_order(sz));
1637        if (!hash)
1638                hash = vzalloc(sz);
1639
1640        if (hash && nulls)
1641                for (i = 0; i < nr_slots; i++)
1642                        INIT_HLIST_NULLS_HEAD(&hash[i], i);
1643
1644        return hash;
1645}
1646EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
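/* The roundup above means *sizep always comes back as a whole number
 * of pages worth of buckets.  As a worked example, assuming 4 KiB
 * pages and an 8-byte hlist_nulls_head:
 *
 *	PAGE_SIZE / sizeof(struct hlist_nulls_head) = 4096 / 8 = 512
 *	a request for 1000 buckets is rounded up to 1024 slots, sz = 8 KiB
 *
 * so the __get_free_pages() path never wastes a partial page, and the
 * vzalloc() fallback is only used when the higher-order allocation
 * fails.
 */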
1647
1648int nf_conntrack_hash_resize(unsigned int hashsize)
1649{
1650        int i, bucket;
1651        unsigned int old_size;
1652        struct hlist_nulls_head *hash, *old_hash;
1653        struct nf_conntrack_tuple_hash *h;
1654        struct nf_conn *ct;
1655
1656        if (!hashsize)
1657                return -EINVAL;
1658
1659        hash = nf_ct_alloc_hashtable(&hashsize, 1);
1660        if (!hash)
1661                return -ENOMEM;
1662
1663        old_size = nf_conntrack_htable_size;
1664        if (old_size == hashsize) {
1665                nf_ct_free_hashtable(hash, hashsize);
1666                return 0;
1667        }
1668
1669        local_bh_disable();
1670        nf_conntrack_all_lock();
1671        write_seqcount_begin(&nf_conntrack_generation);
1672
1673        /* Lookups in the old hash might happen in parallel, which means we
1674         * might get false negatives during connection lookup. New connections
1675         * created because of a false negative won't make it into the hash,
1676         * though, since insertion requires taking the locks.
1677         */
1678
1679        for (i = 0; i < nf_conntrack_htable_size; i++) {
1680                while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
1681                        h = hlist_nulls_entry(nf_conntrack_hash[i].first,
1682                                              struct nf_conntrack_tuple_hash, hnnode);
1683                        ct = nf_ct_tuplehash_to_ctrack(h);
1684                        hlist_nulls_del_rcu(&h->hnnode);
1685                        bucket = __hash_conntrack(nf_ct_net(ct),
1686                                                  &h->tuple, hashsize);
1687                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1688                }
1689        }
1690        old_size = nf_conntrack_htable_size;
1691        old_hash = nf_conntrack_hash;
1692
1693        nf_conntrack_hash = hash;
1694        nf_conntrack_htable_size = hashsize;
1695
1696        write_seqcount_end(&nf_conntrack_generation);
1697        nf_conntrack_all_unlock();
1698        local_bh_enable();
1699
1700        synchronize_net();
1701        nf_ct_free_hashtable(old_hash, old_size);
1702        return 0;
1703}
1704
1705int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1706{
1707        unsigned int hashsize;
1708        int rc;
1709
1710        if (current->nsproxy->net_ns != &init_net)
1711                return -EOPNOTSUPP;
1712
1713        /* On boot, we can set this without any fancy locking. */
1714        if (!nf_conntrack_htable_size)
1715                return param_set_uint(val, kp);
1716
1717        rc = kstrtouint(val, 0, &hashsize);
1718        if (rc)
1719                return rc;
1720
1721        return nf_conntrack_hash_resize(hashsize);
1722}
1723EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1724
1725module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1726                  &nf_conntrack_htable_size, 0600);
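/* With the 0600 permission above, the hash can be resized at runtime
 * from init_net, e.g. (assuming the usual sysfs module parameter
 * location):
 *
 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * which reaches nf_conntrack_hash_resize() via
 * nf_conntrack_set_hashsize().  Boot-time setup instead goes through
 * the param_set_uint() fast path (e.g. nf_conntrack.hashsize= on the
 * kernel command line when built in) before the table exists.
 */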
1727
1728void nf_ct_untracked_status_or(unsigned long bits)
1729{
1730        int cpu;
1731
1732        for_each_possible_cpu(cpu)
1733                per_cpu(nf_conntrack_untracked, cpu).status |= bits;
1734}
1735EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
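/* The per-cpu untracked conntrack this operates on is handed out by
 * nf_ct_untracked_get() and attached to skbs that should bypass
 * tracking, roughly following the NOTRACK/CT target pattern (sketch):
 *
 *	skb->nfct = &nf_ct_untracked_get()->ct_general;
 *	skb->nfctinfo = IP_CT_NEW;
 *	nf_conntrack_get(skb->nfct);
 *
 * Setting status bits on every per-cpu instance keeps the fake entries
 * indistinguishable regardless of which CPU attached them.
 */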
1736
1737int nf_conntrack_init_start(void)
1738{
1739        int max_factor = 8;
1740        int ret = -ENOMEM;
1741        int i, cpu;
1742
1743        seqcount_init(&nf_conntrack_generation);
1744
1745        for (i = 0; i < CONNTRACK_LOCKS; i++)
1746                spin_lock_init(&nf_conntrack_locks[i]);
1747
1748        if (!nf_conntrack_htable_size) {
1749                /* Idea from tcp.c: use 1/16384 of memory.
1750                 * On i386: 32MB machine has 512 buckets.
1751                 * >= 1GB machines have 16384 buckets.
1752                 * >= 4GB machines have 65536 buckets.
1753                 */
1754                nf_conntrack_htable_size
1755                        = (((totalram_pages << PAGE_SHIFT) / 16384)
1756                           / sizeof(struct hlist_head));
1757                if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
1758                        nf_conntrack_htable_size = 65536;
1759                else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
1760                        nf_conntrack_htable_size = 16384;
1761                if (nf_conntrack_htable_size < 32)
1762                        nf_conntrack_htable_size = 32;
1763
1764                /* Use a maximum factor of four by default to get the same max
1765                 * as with the old struct list_heads. When a table size is given
1766                 * we use the old value of 8 to avoid reducing the maximum
1767                 * number of entries. */
1768                max_factor = 4;
1769        }
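        /* Worked example, assuming 4 KiB pages: a 2 GiB machine has
         * totalram_pages == 524288, so the formula above gives
         *
         *	(2 GiB / 16384) / 8 == 16384 buckets
         *
         * the ">= 1GB" branch picks the same value, and with
         * max_factor == 4 the default nf_conntrack_max below becomes
         * 65536 entries.
         */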
1770
1771        nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
1772        if (!nf_conntrack_hash)
1773                return -ENOMEM;
1774
1775        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1776
1777        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1778                                                sizeof(struct nf_conn), 0,
1779                                                SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1780        if (!nf_conntrack_cachep)
1781                goto err_cachep;
1782
1783        printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1784               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1785               nf_conntrack_max);
1786
1787        ret = nf_conntrack_expect_init();
1788        if (ret < 0)
1789                goto err_expect;
1790
1791        ret = nf_conntrack_acct_init();
1792        if (ret < 0)
1793                goto err_acct;
1794
1795        ret = nf_conntrack_tstamp_init();
1796        if (ret < 0)
1797                goto err_tstamp;
1798
1799        ret = nf_conntrack_ecache_init();
1800        if (ret < 0)
1801                goto err_ecache;
1802
1803        ret = nf_conntrack_timeout_init();
1804        if (ret < 0)
1805                goto err_timeout;
1806
1807        ret = nf_conntrack_helper_init();
1808        if (ret < 0)
1809                goto err_helper;
1810
1811        ret = nf_conntrack_labels_init();
1812        if (ret < 0)
1813                goto err_labels;
1814
1815        ret = nf_conntrack_seqadj_init();
1816        if (ret < 0)
1817                goto err_seqadj;
1818
1819        ret = nf_conntrack_proto_init();
1820        if (ret < 0)
1821                goto err_proto;
1822
1823        /* Set up fake conntrack: to never be deleted, not in any hashes */
1824        for_each_possible_cpu(cpu) {
1825                struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1826                write_pnet(&ct->ct_net, &init_net);
1827                atomic_set(&ct->ct_general.use, 1);
1828        }
1829        /*  - and make it look like a confirmed connection */
1830        nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1831        return 0;
1832
1833err_proto:
1834        nf_conntrack_seqadj_fini();
1835err_seqadj:
1836        nf_conntrack_labels_fini();
1837err_labels:
1838        nf_conntrack_helper_fini();
1839err_helper:
1840        nf_conntrack_timeout_fini();
1841err_timeout:
1842        nf_conntrack_ecache_fini();
1843err_ecache:
1844        nf_conntrack_tstamp_fini();
1845err_tstamp:
1846        nf_conntrack_acct_fini();
1847err_acct:
1848        nf_conntrack_expect_fini();
1849err_expect:
1850        kmem_cache_destroy(nf_conntrack_cachep);
1851err_cachep:
1852        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1853        return ret;
1854}
1855
1856void nf_conntrack_init_end(void)
1857{
1858        /* For use by REJECT target */
1859        RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
1860        RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
1861}
1862
1863/*
1864 * We need to use special "null" values, not used in the hash table
1865 */
1866#define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
1867#define DYING_NULLS_VAL         ((1<<30)+1)
1868#define TEMPLATE_NULLS_VAL      ((1<<30)+2)
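/* These values become the "nulls" marker that terminates the per-cpu
 * unconfirmed/dying/template lists, while hash chains use their bucket
 * number as the marker.  RCU readers rely on that marker to notice
 * that an entry moved to a different list under them; the lookup code
 * earlier in this file follows the usual pattern, roughly:
 *
 *	begin:
 *	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 *		if (nf_ct_key_equal(h, tuple, zone, net))
 *			return h;
 *	}
 *	if (get_nulls_value(n) != bucket)
 *		goto begin;	// chain changed under us, restart
 *
 * Keeping these special values disjoint from any possible bucket index
 * (they all have bit 30 set) is what makes that check reliable.
 */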
1869
1870int nf_conntrack_init_net(struct net *net)
1871{
1872        int ret = -ENOMEM;
1873        int cpu;
1874
1875        atomic_set(&net->ct.count, 0);
1876
1877        net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
1878        if (!net->ct.pcpu_lists)
1879                goto err_stat;
1880
1881        for_each_possible_cpu(cpu) {
1882                struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1883
1884                spin_lock_init(&pcpu->lock);
1885                INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
1886                INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
1887        }
1888
1889        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1890        if (!net->ct.stat)
1891                goto err_pcpu_lists;
1892
1893        ret = nf_conntrack_expect_pernet_init(net);
1894        if (ret < 0)
1895                goto err_expect;
1896        ret = nf_conntrack_acct_pernet_init(net);
1897        if (ret < 0)
1898                goto err_acct;
1899        ret = nf_conntrack_tstamp_pernet_init(net);
1900        if (ret < 0)
1901                goto err_tstamp;
1902        ret = nf_conntrack_ecache_pernet_init(net);
1903        if (ret < 0)
1904                goto err_ecache;
1905        ret = nf_conntrack_helper_pernet_init(net);
1906        if (ret < 0)
1907                goto err_helper;
1908        ret = nf_conntrack_proto_pernet_init(net);
1909        if (ret < 0)
1910                goto err_proto;
1911        return 0;
1912
1913err_proto:
1914        nf_conntrack_helper_pernet_fini(net);
1915err_helper:
1916        nf_conntrack_ecache_pernet_fini(net);
1917err_ecache:
1918        nf_conntrack_tstamp_pernet_fini(net);
1919err_tstamp:
1920        nf_conntrack_acct_pernet_fini(net);
1921err_acct:
1922        nf_conntrack_expect_pernet_fini(net);
1923err_expect:
1924        free_percpu(net->ct.stat);
1925err_pcpu_lists:
1926        free_percpu(net->ct.pcpu_lists);
1927err_stat:
1928        return ret;
1929}
1930