linux/net/netfilter/nf_conntrack_core.c
   1/* Connection state tracking for netfilter.  This is separated from,
   2   but required by, the NAT layer; it can also be used by an iptables
   3   extension. */
   4
   5/* (C) 1999-2001 Paul `Rusty' Russell
   6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of the GNU General Public License version 2 as
  11 * published by the Free Software Foundation.
  12 */
  13
  14#include <linux/types.h>
  15#include <linux/netfilter.h>
  16#include <linux/module.h>
  17#include <linux/skbuff.h>
  18#include <linux/proc_fs.h>
  19#include <linux/vmalloc.h>
  20#include <linux/stddef.h>
  21#include <linux/slab.h>
  22#include <linux/random.h>
  23#include <linux/jhash.h>
  24#include <linux/err.h>
  25#include <linux/percpu.h>
  26#include <linux/moduleparam.h>
  27#include <linux/notifier.h>
  28#include <linux/kernel.h>
  29#include <linux/netdevice.h>
  30#include <linux/socket.h>
  31#include <linux/mm.h>
  32
  33#include <net/netfilter/nf_conntrack.h>
  34#include <net/netfilter/nf_conntrack_l3proto.h>
  35#include <net/netfilter/nf_conntrack_l4proto.h>
  36#include <net/netfilter/nf_conntrack_expect.h>
  37#include <net/netfilter/nf_conntrack_helper.h>
  38#include <net/netfilter/nf_conntrack_core.h>
  39#include <net/netfilter/nf_conntrack_extend.h>
  40
  41#define NF_CONNTRACK_VERSION    "0.5.0"
  42
  43DEFINE_RWLOCK(nf_conntrack_lock);
  44EXPORT_SYMBOL_GPL(nf_conntrack_lock);
  45
  46/* nf_conntrack_standalone needs this */
  47atomic_t nf_conntrack_count = ATOMIC_INIT(0);
  48EXPORT_SYMBOL_GPL(nf_conntrack_count);
  49
  50unsigned int nf_conntrack_htable_size __read_mostly;
  51EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
  52
  53int nf_conntrack_max __read_mostly;
  54EXPORT_SYMBOL_GPL(nf_conntrack_max);
  55
  56struct hlist_head *nf_conntrack_hash __read_mostly;
  57EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  58
  59struct nf_conn nf_conntrack_untracked __read_mostly;
  60EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
  61
  62unsigned int nf_ct_log_invalid __read_mostly;
  63HLIST_HEAD(unconfirmed);
  64static int nf_conntrack_vmalloc __read_mostly;
  65static struct kmem_cache *nf_conntrack_cachep __read_mostly;
  66
  67DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
  68EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
  69
  70static int nf_conntrack_hash_rnd_initted;
  71static unsigned int nf_conntrack_hash_rnd;
  72
  73static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
  74                                  unsigned int size, unsigned int rnd)
  75{
  76        unsigned int a, b;
  77
  78        a = jhash2(tuple->src.u3.all, ARRAY_SIZE(tuple->src.u3.all),
  79                   (tuple->src.l3num << 16) | tuple->dst.protonum);
  80        b = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
  81                   ((__force __u16)tuple->src.u.all << 16) |
  82                    (__force __u16)tuple->dst.u.all);
  83
  84        return jhash_2words(a, b, rnd) % size;
  85}
  86
  87static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
  88{
  89        return __hash_conntrack(tuple, nf_conntrack_htable_size,
  90                                nf_conntrack_hash_rnd);
  91}
  92
  93int
  94nf_ct_get_tuple(const struct sk_buff *skb,
  95                unsigned int nhoff,
  96                unsigned int dataoff,
  97                u_int16_t l3num,
  98                u_int8_t protonum,
  99                struct nf_conntrack_tuple *tuple,
 100                const struct nf_conntrack_l3proto *l3proto,
 101                const struct nf_conntrack_l4proto *l4proto)
 102{
 103        NF_CT_TUPLE_U_BLANK(tuple);
 104
 105        tuple->src.l3num = l3num;
 106        if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
 107                return 0;
 108
 109        tuple->dst.protonum = protonum;
 110        tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 111
 112        return l4proto->pkt_to_tuple(skb, dataoff, tuple);
 113}
 114EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 115
 116int nf_ct_get_tuplepr(const struct sk_buff *skb,
 117                      unsigned int nhoff,
 118                      u_int16_t l3num,
 119                      struct nf_conntrack_tuple *tuple)
 120{
 121        struct nf_conntrack_l3proto *l3proto;
 122        struct nf_conntrack_l4proto *l4proto;
 123        unsigned int protoff;
 124        u_int8_t protonum;
 125        int ret;
 126
 127        rcu_read_lock();
 128
 129        l3proto = __nf_ct_l3proto_find(l3num);
 130        ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
 131        if (ret != NF_ACCEPT) {
 132                rcu_read_unlock();
 133                return 0;
 134        }
 135
 136        l4proto = __nf_ct_l4proto_find(l3num, protonum);
 137
 138        ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
 139                              l3proto, l4proto);
 140
 141        rcu_read_unlock();
 142        return ret;
 143}
 144EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
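/*
 * Illustrative sketch (not from the original file): a hypothetical caller
 * that only has an skb and the layer-3 family can use nf_ct_get_tuplepr()
 * to fill in a tuple; the l3/l4 protocol modules are looked up internally.
 * A return value of 0 means the packet could not be turned into a tuple.
 *
 *	struct nf_conntrack_tuple tuple;
 *
 *	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
 *			       PF_INET, &tuple))
 *		return;		(unparseable, give up)
 *	(tuple now describes the packet's original direction)
 */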
 145
 146int
 147nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 148                   const struct nf_conntrack_tuple *orig,
 149                   const struct nf_conntrack_l3proto *l3proto,
 150                   const struct nf_conntrack_l4proto *l4proto)
 151{
 152        NF_CT_TUPLE_U_BLANK(inverse);
 153
 154        inverse->src.l3num = orig->src.l3num;
 155        if (l3proto->invert_tuple(inverse, orig) == 0)
 156                return 0;
 157
 158        inverse->dst.dir = !orig->dst.dir;
 159
 160        inverse->dst.protonum = orig->dst.protonum;
 161        return l4proto->invert_tuple(inverse, orig);
 162}
 163EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 164
 165static void
 166clean_from_lists(struct nf_conn *ct)
 167{
 168        pr_debug("clean_from_lists(%p)\n", ct);
 169        hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 170        hlist_del(&ct->tuplehash[IP_CT_DIR_REPLY].hnode);
 171
 172        /* Destroy all pending expectations */
 173        nf_ct_remove_expectations(ct);
 174}
 175
 176static void
 177destroy_conntrack(struct nf_conntrack *nfct)
 178{
 179        struct nf_conn *ct = (struct nf_conn *)nfct;
 180        struct nf_conntrack_l4proto *l4proto;
 181
 182        pr_debug("destroy_conntrack(%p)\n", ct);
 183        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 184        NF_CT_ASSERT(!timer_pending(&ct->timeout));
 185
 186        nf_conntrack_event(IPCT_DESTROY, ct);
 187        set_bit(IPS_DYING_BIT, &ct->status);
 188
 189        /* To make sure we don't get any weird locking issues here:
 190         * destroy_conntrack() MUST NOT be called with a write lock
 191         * to nf_conntrack_lock!!! -HW */
 192        rcu_read_lock();
 193        l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
 194                                       ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 195        if (l4proto && l4proto->destroy)
 196                l4proto->destroy(ct);
 197
 198        nf_ct_ext_destroy(ct);
 199
 200        rcu_read_unlock();
 201
 202        write_lock_bh(&nf_conntrack_lock);
  203        /* Expectations will have been removed in clean_from_lists,
  204         * except TFTP can create an expectation on the first packet,
  205         * before the connection is in the list, so we need to clean here,
  206         * too. */
 207        nf_ct_remove_expectations(ct);
 208
 209        /* We overload first tuple to link into unconfirmed list. */
 210        if (!nf_ct_is_confirmed(ct)) {
 211                BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode));
 212                hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 213        }
 214
 215        NF_CT_STAT_INC(delete);
 216        write_unlock_bh(&nf_conntrack_lock);
 217
 218        if (ct->master)
 219                nf_ct_put(ct->master);
 220
 221        pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 222        nf_conntrack_free(ct);
 223}
 224
 225static void death_by_timeout(unsigned long ul_conntrack)
 226{
 227        struct nf_conn *ct = (void *)ul_conntrack;
 228        struct nf_conn_help *help = nfct_help(ct);
 229        struct nf_conntrack_helper *helper;
 230
 231        if (help) {
 232                rcu_read_lock();
 233                helper = rcu_dereference(help->helper);
 234                if (helper && helper->destroy)
 235                        helper->destroy(ct);
 236                rcu_read_unlock();
 237        }
 238
 239        write_lock_bh(&nf_conntrack_lock);
 240        /* Inside lock so preempt is disabled on module removal path.
 241         * Otherwise we can get spurious warnings. */
 242        NF_CT_STAT_INC(delete_list);
 243        clean_from_lists(ct);
 244        write_unlock_bh(&nf_conntrack_lock);
 245        nf_ct_put(ct);
 246}
 247
 248struct nf_conntrack_tuple_hash *
 249__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
 250                    const struct nf_conn *ignored_conntrack)
 251{
 252        struct nf_conntrack_tuple_hash *h;
 253        struct hlist_node *n;
 254        unsigned int hash = hash_conntrack(tuple);
 255
 256        hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) {
 257                if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
 258                    nf_ct_tuple_equal(tuple, &h->tuple)) {
 259                        NF_CT_STAT_INC(found);
 260                        return h;
 261                }
 262                NF_CT_STAT_INC(searched);
 263        }
 264
 265        return NULL;
 266}
 267EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 268
 269/* Find a connection corresponding to a tuple. */
 270struct nf_conntrack_tuple_hash *
 271nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple)
 272{
 273        struct nf_conntrack_tuple_hash *h;
 274
 275        read_lock_bh(&nf_conntrack_lock);
 276        h = __nf_conntrack_find(tuple, NULL);
 277        if (h)
 278                atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
 279        read_unlock_bh(&nf_conntrack_lock);
 280
 281        return h;
 282}
 283EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
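/*
 * Illustrative sketch: nf_conntrack_find_get() returns a tuple hash with
 * the conntrack's reference count already bumped, so a caller is expected
 * to convert it with nf_ct_tuplehash_to_ctrack() and drop the reference
 * with nf_ct_put() when done (both are used elsewhere in this file).
 *
 *	struct nf_conntrack_tuple_hash *h;
 *	struct nf_conn *ct;
 *
 *	h = nf_conntrack_find_get(&tuple);
 *	if (h) {
 *		ct = nf_ct_tuplehash_to_ctrack(h);
 *		(... inspect or update ct ...)
 *		nf_ct_put(ct);
 *	}
 */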
 284
 285static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 286                                       unsigned int hash,
 287                                       unsigned int repl_hash)
 288{
 289        hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
 290                       &nf_conntrack_hash[hash]);
 291        hlist_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnode,
 292                       &nf_conntrack_hash[repl_hash]);
 293}
 294
 295void nf_conntrack_hash_insert(struct nf_conn *ct)
 296{
 297        unsigned int hash, repl_hash;
 298
 299        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 300        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 301
 302        write_lock_bh(&nf_conntrack_lock);
 303        __nf_conntrack_hash_insert(ct, hash, repl_hash);
 304        write_unlock_bh(&nf_conntrack_lock);
 305}
 306EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
 307
 308/* Confirm a connection given skb; places it in hash table */
 309int
 310__nf_conntrack_confirm(struct sk_buff *skb)
 311{
 312        unsigned int hash, repl_hash;
 313        struct nf_conntrack_tuple_hash *h;
 314        struct nf_conn *ct;
 315        struct nf_conn_help *help;
 316        struct hlist_node *n;
 317        enum ip_conntrack_info ctinfo;
 318
 319        ct = nf_ct_get(skb, &ctinfo);
 320
 321        /* ipt_REJECT uses nf_conntrack_attach to attach related
  322           ICMP/TCP RST packets in the other direction.  The actual
  323           packet which created the connection will be IP_CT_NEW or,
  324           for an expected connection, IP_CT_RELATED. */
 325        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 326                return NF_ACCEPT;
 327
 328        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 329        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 330
  331        /* We're not in the hash table, and we refuse to set up related
 332           connections for unconfirmed conns.  But packet copies and
 333           REJECT will give spurious warnings here. */
 334        /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 335
  336        /* No external references means no one else could have
 337           confirmed us. */
 338        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 339        pr_debug("Confirming conntrack %p\n", ct);
 340
 341        write_lock_bh(&nf_conntrack_lock);
 342
 343        /* See if there's one in the list already, including reverse:
 344           NAT could have grabbed it without realizing, since we're
  345           not in the hash.  If there is, we lost the race. */
 346        hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode)
 347                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 348                                      &h->tuple))
 349                        goto out;
 350        hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode)
 351                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 352                                      &h->tuple))
 353                        goto out;
 354
 355        /* Remove from unconfirmed list */
 356        hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 357
 358        __nf_conntrack_hash_insert(ct, hash, repl_hash);
 359        /* Timer relative to confirmation time, not original
 360           setting time, otherwise we'd get timer wrap in
 361           weird delay cases. */
 362        ct->timeout.expires += jiffies;
 363        add_timer(&ct->timeout);
 364        atomic_inc(&ct->ct_general.use);
 365        set_bit(IPS_CONFIRMED_BIT, &ct->status);
 366        NF_CT_STAT_INC(insert);
 367        write_unlock_bh(&nf_conntrack_lock);
 368        help = nfct_help(ct);
 369        if (help && help->helper)
 370                nf_conntrack_event_cache(IPCT_HELPER, skb);
 371#ifdef CONFIG_NF_NAT_NEEDED
 372        if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
 373            test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
 374                nf_conntrack_event_cache(IPCT_NATINFO, skb);
 375#endif
 376        nf_conntrack_event_cache(master_ct(ct) ?
 377                                 IPCT_RELATED : IPCT_NEW, skb);
 378        return NF_ACCEPT;
 379
 380out:
 381        NF_CT_STAT_INC(insert_failed);
 382        write_unlock_bh(&nf_conntrack_lock);
 383        return NF_DROP;
 384}
 385EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 386
  387/* Returns true if a connection corresponds to the tuple (required
 388   for NAT). */
 389int
 390nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 391                         const struct nf_conn *ignored_conntrack)
 392{
 393        struct nf_conntrack_tuple_hash *h;
 394
 395        read_lock_bh(&nf_conntrack_lock);
 396        h = __nf_conntrack_find(tuple, ignored_conntrack);
 397        read_unlock_bh(&nf_conntrack_lock);
 398
 399        return h != NULL;
 400}
 401EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
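/*
 * Sketch of the intended use (assumption: this mirrors how the NAT core
 * probes candidate tuples; details may differ).  NAT builds a candidate
 * reply tuple, inverts it, and asks whether the resulting original-side
 * tuple is already owned by some other connection.  The helper name
 * below is hypothetical.
 *
 *	static int tuple_in_use(const struct nf_conntrack_tuple *candidate,
 *				const struct nf_conn *ignored)
 *	{
 *		struct nf_conntrack_tuple reply;
 *
 *		nf_ct_invert_tuplepr(&reply, candidate);
 *		return nf_conntrack_tuple_taken(&reply, ignored);
 *	}
 */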
 402
 403#define NF_CT_EVICTION_RANGE    8
 404
 405/* There's a small race here where we may free a just-assured
 406   connection.  Too bad: we're in trouble anyway. */
 407static int early_drop(unsigned int hash)
 408{
 409        /* Use oldest entry, which is roughly LRU */
 410        struct nf_conntrack_tuple_hash *h;
 411        struct nf_conn *ct = NULL, *tmp;
 412        struct hlist_node *n;
 413        unsigned int i, cnt = 0;
 414        int dropped = 0;
 415
 416        read_lock_bh(&nf_conntrack_lock);
 417        for (i = 0; i < nf_conntrack_htable_size; i++) {
 418                hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) {
 419                        tmp = nf_ct_tuplehash_to_ctrack(h);
 420                        if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
 421                                ct = tmp;
 422                        cnt++;
 423                }
 424                if (ct || cnt >= NF_CT_EVICTION_RANGE)
 425                        break;
 426                hash = (hash + 1) % nf_conntrack_htable_size;
 427        }
 428        if (ct)
 429                atomic_inc(&ct->ct_general.use);
 430        read_unlock_bh(&nf_conntrack_lock);
 431
 432        if (!ct)
 433                return dropped;
 434
 435        if (del_timer(&ct->timeout)) {
 436                death_by_timeout((unsigned long)ct);
 437                dropped = 1;
 438                NF_CT_STAT_INC_ATOMIC(early_drop);
 439        }
 440        nf_ct_put(ct);
 441        return dropped;
 442}
 443
 444struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 445                                   const struct nf_conntrack_tuple *repl)
 446{
 447        struct nf_conn *conntrack = NULL;
 448
 449        if (unlikely(!nf_conntrack_hash_rnd_initted)) {
 450                get_random_bytes(&nf_conntrack_hash_rnd, 4);
 451                nf_conntrack_hash_rnd_initted = 1;
 452        }
 453
 454        /* We don't want any race condition at early drop stage */
 455        atomic_inc(&nf_conntrack_count);
 456
 457        if (nf_conntrack_max
 458            && atomic_read(&nf_conntrack_count) > nf_conntrack_max) {
 459                unsigned int hash = hash_conntrack(orig);
 460                if (!early_drop(hash)) {
 461                        atomic_dec(&nf_conntrack_count);
 462                        if (net_ratelimit())
 463                                printk(KERN_WARNING
 464                                       "nf_conntrack: table full, dropping"
 465                                       " packet.\n");
 466                        return ERR_PTR(-ENOMEM);
 467                }
 468        }
 469
 470        conntrack = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC);
 471        if (conntrack == NULL) {
 472                pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
 473                atomic_dec(&nf_conntrack_count);
 474                return ERR_PTR(-ENOMEM);
 475        }
 476
 477        atomic_set(&conntrack->ct_general.use, 1);
 478        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 479        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 480        /* Don't set timer yet: wait for confirmation */
 481        setup_timer(&conntrack->timeout, death_by_timeout,
 482                    (unsigned long)conntrack);
 483
 484        return conntrack;
 485}
 486EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 487
 488void nf_conntrack_free(struct nf_conn *conntrack)
 489{
 490        nf_ct_ext_free(conntrack);
 491        kmem_cache_free(nf_conntrack_cachep, conntrack);
 492        atomic_dec(&nf_conntrack_count);
 493}
 494EXPORT_SYMBOL_GPL(nf_conntrack_free);
 495
 496/* Allocate a new conntrack: we return -ENOMEM if classification
 497   failed due to stress.  Otherwise it really is unclassifiable. */
 498static struct nf_conntrack_tuple_hash *
 499init_conntrack(const struct nf_conntrack_tuple *tuple,
 500               struct nf_conntrack_l3proto *l3proto,
 501               struct nf_conntrack_l4proto *l4proto,
 502               struct sk_buff *skb,
 503               unsigned int dataoff)
 504{
 505        struct nf_conn *conntrack;
 506        struct nf_conn_help *help;
 507        struct nf_conntrack_tuple repl_tuple;
 508        struct nf_conntrack_expect *exp;
 509
 510        if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
 511                pr_debug("Can't invert tuple.\n");
 512                return NULL;
 513        }
 514
 515        conntrack = nf_conntrack_alloc(tuple, &repl_tuple);
 516        if (conntrack == NULL || IS_ERR(conntrack)) {
 517                pr_debug("Can't allocate conntrack.\n");
 518                return (struct nf_conntrack_tuple_hash *)conntrack;
 519        }
 520
 521        if (!l4proto->new(conntrack, skb, dataoff)) {
 522                nf_conntrack_free(conntrack);
 523                pr_debug("init conntrack: can't track with proto module\n");
 524                return NULL;
 525        }
 526
 527        write_lock_bh(&nf_conntrack_lock);
 528        exp = nf_ct_find_expectation(tuple);
 529        if (exp) {
 530                pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
 531                         conntrack, exp);
 532                /* Welcome, Mr. Bond.  We've been expecting you... */
 533                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
 534                conntrack->master = exp->master;
 535                if (exp->helper) {
 536                        help = nf_ct_helper_ext_add(conntrack, GFP_ATOMIC);
 537                        if (help)
 538                                rcu_assign_pointer(help->helper, exp->helper);
 539                }
 540
 541#ifdef CONFIG_NF_CONNTRACK_MARK
 542                conntrack->mark = exp->master->mark;
 543#endif
 544#ifdef CONFIG_NF_CONNTRACK_SECMARK
 545                conntrack->secmark = exp->master->secmark;
 546#endif
 547                nf_conntrack_get(&conntrack->master->ct_general);
 548                NF_CT_STAT_INC(expect_new);
 549        } else {
 550                struct nf_conntrack_helper *helper;
 551
 552                helper = __nf_ct_helper_find(&repl_tuple);
 553                if (helper) {
 554                        help = nf_ct_helper_ext_add(conntrack, GFP_ATOMIC);
 555                        if (help)
 556                                rcu_assign_pointer(help->helper, helper);
 557                }
 558                NF_CT_STAT_INC(new);
 559        }
 560
 561        /* Overload tuple linked list to put us in unconfirmed list. */
 562        hlist_add_head(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
 563                       &unconfirmed);
 564
 565        write_unlock_bh(&nf_conntrack_lock);
 566
 567        if (exp) {
 568                if (exp->expectfn)
 569                        exp->expectfn(conntrack, exp);
 570                nf_ct_expect_put(exp);
 571        }
 572
 573        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 574}
 575
 576/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
 577static inline struct nf_conn *
 578resolve_normal_ct(struct sk_buff *skb,
 579                  unsigned int dataoff,
 580                  u_int16_t l3num,
 581                  u_int8_t protonum,
 582                  struct nf_conntrack_l3proto *l3proto,
 583                  struct nf_conntrack_l4proto *l4proto,
 584                  int *set_reply,
 585                  enum ip_conntrack_info *ctinfo)
 586{
 587        struct nf_conntrack_tuple tuple;
 588        struct nf_conntrack_tuple_hash *h;
 589        struct nf_conn *ct;
 590
 591        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 592                             dataoff, l3num, protonum, &tuple, l3proto,
 593                             l4proto)) {
 594                pr_debug("resolve_normal_ct: Can't get tuple\n");
 595                return NULL;
 596        }
 597
 598        /* look for tuple match */
 599        h = nf_conntrack_find_get(&tuple);
 600        if (!h) {
 601                h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff);
 602                if (!h)
 603                        return NULL;
 604                if (IS_ERR(h))
 605                        return (void *)h;
 606        }
 607        ct = nf_ct_tuplehash_to_ctrack(h);
 608
  609        /* It exists; we have a (non-exclusive) reference. */
 610        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
 611                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
  612                /* Caller should set the reply bit if this packet is OK */
 613                *set_reply = 1;
 614        } else {
  615                /* Once we've had two-way comms, always ESTABLISHED. */
 616                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
 617                        pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
 618                        *ctinfo = IP_CT_ESTABLISHED;
 619                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
 620                        pr_debug("nf_conntrack_in: related packet for %p\n",
 621                                 ct);
 622                        *ctinfo = IP_CT_RELATED;
 623                } else {
 624                        pr_debug("nf_conntrack_in: new packet for %p\n", ct);
 625                        *ctinfo = IP_CT_NEW;
 626                }
 627                *set_reply = 0;
 628        }
 629        skb->nfct = &ct->ct_general;
 630        skb->nfctinfo = *ctinfo;
 631        return ct;
 632}
 633
 634unsigned int
 635nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb)
 636{
 637        struct nf_conn *ct;
 638        enum ip_conntrack_info ctinfo;
 639        struct nf_conntrack_l3proto *l3proto;
 640        struct nf_conntrack_l4proto *l4proto;
 641        unsigned int dataoff;
 642        u_int8_t protonum;
 643        int set_reply = 0;
 644        int ret;
 645
 646        /* Previously seen (loopback or untracked)?  Ignore. */
 647        if (skb->nfct) {
 648                NF_CT_STAT_INC_ATOMIC(ignore);
 649                return NF_ACCEPT;
 650        }
 651
 652        /* rcu_read_lock()ed by nf_hook_slow */
 653        l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
 654        ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
 655                                   &dataoff, &protonum);
 656        if (ret <= 0) {
 657                pr_debug("not prepared to track yet or error occured\n");
 658                NF_CT_STAT_INC_ATOMIC(error);
 659                NF_CT_STAT_INC_ATOMIC(invalid);
 660                return -ret;
 661        }
 662
 663        l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum);
 664
  665        /* It may be a special packet: error, unclean...
  666         * the inverse of the return code tells the netfilter
 667         * core what to do with the packet. */
 668        if (l4proto->error != NULL &&
 669            (ret = l4proto->error(skb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
 670                NF_CT_STAT_INC_ATOMIC(error);
 671                NF_CT_STAT_INC_ATOMIC(invalid);
 672                return -ret;
 673        }
 674
 675        ct = resolve_normal_ct(skb, dataoff, pf, protonum, l3proto, l4proto,
 676                               &set_reply, &ctinfo);
 677        if (!ct) {
 678                /* Not valid part of a connection */
 679                NF_CT_STAT_INC_ATOMIC(invalid);
 680                return NF_ACCEPT;
 681        }
 682
 683        if (IS_ERR(ct)) {
 684                /* Too stressed to deal. */
 685                NF_CT_STAT_INC_ATOMIC(drop);
 686                return NF_DROP;
 687        }
 688
 689        NF_CT_ASSERT(skb->nfct);
 690
 691        ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
 692        if (ret < 0) {
 693                /* Invalid: inverse of the return code tells
 694                 * the netfilter core what to do */
 695                pr_debug("nf_conntrack_in: Can't track with proto module\n");
 696                nf_conntrack_put(skb->nfct);
 697                skb->nfct = NULL;
 698                NF_CT_STAT_INC_ATOMIC(invalid);
 699                return -ret;
 700        }
 701
 702        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
 703                nf_conntrack_event_cache(IPCT_STATUS, skb);
 704
 705        return ret;
 706}
 707EXPORT_SYMBOL_GPL(nf_conntrack_in);
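/*
 * Illustrative sketch: nf_conntrack_in() is not a netfilter hook itself;
 * the per-family glue (e.g. the IPv4 l3proto module) registers thin hook
 * wrappers that pass the protocol family and hook number through.  The
 * wrapper below is hypothetical and only shows the expected call shape.
 *
 *	static unsigned int my_conntrack_hook(unsigned int hooknum,
 *					      struct sk_buff *skb,
 *					      const struct net_device *in,
 *					      const struct net_device *out,
 *					      int (*okfn)(struct sk_buff *))
 *	{
 *		return nf_conntrack_in(PF_INET, hooknum, skb);
 *	}
 */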
 708
 709int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 710                         const struct nf_conntrack_tuple *orig)
 711{
 712        int ret;
 713
 714        rcu_read_lock();
 715        ret = nf_ct_invert_tuple(inverse, orig,
 716                                 __nf_ct_l3proto_find(orig->src.l3num),
 717                                 __nf_ct_l4proto_find(orig->src.l3num,
 718                                                      orig->dst.protonum));
 719        rcu_read_unlock();
 720        return ret;
 721}
 722EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
 723
 724/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
 725   implicitly racy: see __nf_conntrack_confirm */
 726void nf_conntrack_alter_reply(struct nf_conn *ct,
 727                              const struct nf_conntrack_tuple *newreply)
 728{
 729        struct nf_conn_help *help = nfct_help(ct);
 730        struct nf_conntrack_helper *helper;
 731
 732        write_lock_bh(&nf_conntrack_lock);
 733        /* Should be unconfirmed, so not in hash table yet */
 734        NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 735
 736        pr_debug("Altering reply tuple of %p to ", ct);
 737        NF_CT_DUMP_TUPLE(newreply);
 738
 739        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 740        if (ct->master || (help && help->expecting != 0))
 741                goto out;
 742
 743        helper = __nf_ct_helper_find(newreply);
 744        if (helper == NULL) {
 745                if (help)
 746                        rcu_assign_pointer(help->helper, NULL);
 747                goto out;
 748        }
 749
 750        if (help == NULL) {
 751                help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 752                if (help == NULL)
 753                        goto out;
 754        } else {
 755                memset(&help->help, 0, sizeof(help->help));
 756        }
 757
 758        rcu_assign_pointer(help->helper, helper);
 759out:
 760        write_unlock_bh(&nf_conntrack_lock);
 761}
 762EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
 763
 764/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
 765void __nf_ct_refresh_acct(struct nf_conn *ct,
 766                          enum ip_conntrack_info ctinfo,
 767                          const struct sk_buff *skb,
 768                          unsigned long extra_jiffies,
 769                          int do_acct)
 770{
 771        int event = 0;
 772
 773        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
 774        NF_CT_ASSERT(skb);
 775
 776        write_lock_bh(&nf_conntrack_lock);
 777
 778        /* Only update if this is not a fixed timeout */
 779        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
 780                write_unlock_bh(&nf_conntrack_lock);
 781                return;
 782        }
 783
 784        /* If not in hash table, timer will not be active yet */
 785        if (!nf_ct_is_confirmed(ct)) {
 786                ct->timeout.expires = extra_jiffies;
 787                event = IPCT_REFRESH;
 788        } else {
 789                unsigned long newtime = jiffies + extra_jiffies;
 790
 791                /* Only update the timeout if the new timeout is at least
 792                   HZ jiffies from the old timeout. Need del_timer for race
 793                   avoidance (may already be dying). */
 794                if (newtime - ct->timeout.expires >= HZ
 795                    && del_timer(&ct->timeout)) {
 796                        ct->timeout.expires = newtime;
 797                        add_timer(&ct->timeout);
 798                        event = IPCT_REFRESH;
 799                }
 800        }
 801
 802#ifdef CONFIG_NF_CT_ACCT
 803        if (do_acct) {
 804                ct->counters[CTINFO2DIR(ctinfo)].packets++;
 805                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
 806                        skb->len - skb_network_offset(skb);
 807
 808                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
 809                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
 810                        event |= IPCT_COUNTER_FILLING;
 811        }
 812#endif
 813
 814        write_unlock_bh(&nf_conntrack_lock);
 815
 816        /* must be unlocked when calling event cache */
 817        if (event)
 818                nf_conntrack_event_cache(event, skb);
 819}
 820EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
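/*
 * Illustrative sketch: protocol trackers do not normally call
 * __nf_ct_refresh_acct() directly; header wrappers (assumed here to be
 * nf_ct_refresh_acct()/nf_ct_refresh()) pass do_acct as 1 or 0.  A
 * typical timeout bump from an l4proto ->packet() handler looks like:
 *
 *	nf_ct_refresh_acct(ct, ctinfo, skb, my_proto_timeout);
 *
 * which extends the timer by my_proto_timeout jiffies and, when
 * CONFIG_NF_CT_ACCT is enabled, accounts the packet and its bytes.
 * (my_proto_timeout is a placeholder for the tracker's own constant.)
 */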
 821
 822#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 823
 824#include <linux/netfilter/nfnetlink.h>
 825#include <linux/netfilter/nfnetlink_conntrack.h>
 826#include <linux/mutex.h>
 827
  828/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
  829 * in nf_conntrack_core, since we don't want the protocols to autoload
 830 * or depend on ctnetlink */
 831int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
 832                               const struct nf_conntrack_tuple *tuple)
 833{
 834        NLA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
 835                &tuple->src.u.tcp.port);
 836        NLA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
 837                &tuple->dst.u.tcp.port);
 838        return 0;
 839
 840nla_put_failure:
 841        return -1;
 842}
 843EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
 844
 845const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
 846        [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
 847        [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
 848};
 849EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
 850
 851int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
 852                               struct nf_conntrack_tuple *t)
 853{
 854        if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
 855                return -EINVAL;
 856
 857        t->src.u.tcp.port = *(__be16 *)nla_data(tb[CTA_PROTO_SRC_PORT]);
 858        t->dst.u.tcp.port = *(__be16 *)nla_data(tb[CTA_PROTO_DST_PORT]);
 859
 860        return 0;
 861}
 862EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
 863#endif
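/*
 * Illustrative sketch: the port-based helpers above are meant to be
 * plugged into an l4proto's netlink callbacks rather than called from
 * ctnetlink directly.  Assuming the struct member names used by the
 * TCP/UDP trackers, the wiring looks roughly like:
 *
 *	struct nf_conntrack_l4proto my_l4proto = {
 *		.tuple_to_nlattr  = nf_ct_port_tuple_to_nlattr,
 *		.nlattr_to_tuple  = nf_ct_port_nlattr_to_tuple,
 *		.nla_policy       = nf_ct_port_nla_policy,
 *	};
 */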
 864
 865/* Used by ipt_REJECT and ip6t_REJECT. */
 866void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 867{
 868        struct nf_conn *ct;
 869        enum ip_conntrack_info ctinfo;
 870
  871        /* This ICMP is in the reverse direction to the packet which caused it */
 872        ct = nf_ct_get(skb, &ctinfo);
 873        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
 874                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
 875        else
 876                ctinfo = IP_CT_RELATED;
 877
 878        /* Attach to new skbuff, and increment count */
 879        nskb->nfct = &ct->ct_general;
 880        nskb->nfctinfo = ctinfo;
 881        nf_conntrack_get(nskb->nfct);
 882}
 883EXPORT_SYMBOL_GPL(__nf_conntrack_attach);
 884
 885static inline int
 886do_iter(const struct nf_conntrack_tuple_hash *i,
 887        int (*iter)(struct nf_conn *i, void *data),
 888        void *data)
 889{
 890        return iter(nf_ct_tuplehash_to_ctrack(i), data);
 891}
 892
 893/* Bring out ya dead! */
 894static struct nf_conn *
 895get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
 896                void *data, unsigned int *bucket)
 897{
 898        struct nf_conntrack_tuple_hash *h;
 899        struct nf_conn *ct;
 900        struct hlist_node *n;
 901
 902        write_lock_bh(&nf_conntrack_lock);
 903        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
 904                hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) {
 905                        ct = nf_ct_tuplehash_to_ctrack(h);
 906                        if (iter(ct, data))
 907                                goto found;
 908                }
 909        }
 910        hlist_for_each_entry(h, n, &unconfirmed, hnode) {
 911                ct = nf_ct_tuplehash_to_ctrack(h);
 912                if (iter(ct, data))
 913                        set_bit(IPS_DYING_BIT, &ct->status);
 914        }
 915        write_unlock_bh(&nf_conntrack_lock);
 916        return NULL;
 917found:
 918        atomic_inc(&ct->ct_general.use);
 919        write_unlock_bh(&nf_conntrack_lock);
 920        return ct;
 921}
 922
 923void
 924nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
 925{
 926        struct nf_conn *ct;
 927        unsigned int bucket = 0;
 928
 929        while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
  930                /* Time to push up daisies... */
 931                if (del_timer(&ct->timeout))
 932                        death_by_timeout((unsigned long)ct);
 933                /* ... else the timer will get him soon. */
 934
 935                nf_ct_put(ct);
 936        }
 937}
 938EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
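/*
 * Illustrative sketch: callers hand nf_ct_iterate_cleanup() a predicate;
 * every entry for which it returns nonzero is taken off its timer and
 * destroyed via death_by_timeout().  kill_all() below is the simplest
 * predicate; a hypothetical one that only flushes IPv4 entries could be:
 *
 *	static int kill_ipv4(struct nf_conn *i, void *data)
 *	{
 *		return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
 *		       AF_INET;
 *	}
 *
 *	nf_ct_iterate_cleanup(kill_ipv4, NULL);
 */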
 939
 940static int kill_all(struct nf_conn *i, void *data)
 941{
 942        return 1;
 943}
 944
 945void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, int size)
 946{
 947        if (vmalloced)
 948                vfree(hash);
 949        else
 950                free_pages((unsigned long)hash,
 951                           get_order(sizeof(struct hlist_head) * size));
 952}
 953EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
 954
 955void nf_conntrack_flush(void)
 956{
 957        nf_ct_iterate_cleanup(kill_all, NULL);
 958}
 959EXPORT_SYMBOL_GPL(nf_conntrack_flush);
 960
 961/* Mishearing the voices in his head, our hero wonders how he's
 962   supposed to kill the mall. */
 963void nf_conntrack_cleanup(void)
 964{
 965        rcu_assign_pointer(ip_ct_attach, NULL);
 966
 967        /* This makes sure all current packets have passed through
  968           the netfilter framework.  Roll on, two-stage module
 969           delete... */
 970        synchronize_net();
 971
 972        nf_ct_event_cache_flush();
 973 i_see_dead_people:
 974        nf_conntrack_flush();
 975        if (atomic_read(&nf_conntrack_count) != 0) {
 976                schedule();
 977                goto i_see_dead_people;
 978        }
 979        /* wait until all references to nf_conntrack_untracked are dropped */
 980        while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
 981                schedule();
 982
 983        rcu_assign_pointer(nf_ct_destroy, NULL);
 984
 985        kmem_cache_destroy(nf_conntrack_cachep);
 986        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
 987                             nf_conntrack_htable_size);
 988
 989        nf_conntrack_proto_fini();
 990        nf_conntrack_helper_fini();
 991        nf_conntrack_expect_fini();
 992}
 993
 994struct hlist_head *nf_ct_alloc_hashtable(int *sizep, int *vmalloced)
 995{
 996        struct hlist_head *hash;
 997        unsigned int size, i;
 998
 999        *vmalloced = 0;
1000
1001        size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head));
1002        hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN,
1003                                       get_order(sizeof(struct hlist_head)
1004                                                 * size));
1005        if (!hash) {
1006                *vmalloced = 1;
1007                printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1008                hash = vmalloc(sizeof(struct hlist_head) * size);
1009        }
1010
1011        if (hash)
1012                for (i = 0; i < size; i++)
1013                        INIT_HLIST_HEAD(&hash[i]);
1014
1015        return hash;
1016}
1017EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1018
1019int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1020{
1021        int i, bucket, hashsize, vmalloced;
1022        int old_vmalloced, old_size;
1023        int rnd;
1024        struct hlist_head *hash, *old_hash;
1025        struct nf_conntrack_tuple_hash *h;
1026
1027        /* On boot, we can set this without any fancy locking. */
1028        if (!nf_conntrack_htable_size)
1029                return param_set_uint(val, kp);
1030
1031        hashsize = simple_strtol(val, NULL, 0);
1032        if (!hashsize)
1033                return -EINVAL;
1034
1035        hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced);
1036        if (!hash)
1037                return -ENOMEM;
1038
 1039        /* We have to rehash for the new table anyway, so we can also
 1040         * use a new random seed */
1041        get_random_bytes(&rnd, 4);
1042
1043        write_lock_bh(&nf_conntrack_lock);
1044        for (i = 0; i < nf_conntrack_htable_size; i++) {
1045                while (!hlist_empty(&nf_conntrack_hash[i])) {
1046                        h = hlist_entry(nf_conntrack_hash[i].first,
1047                                        struct nf_conntrack_tuple_hash, hnode);
1048                        hlist_del(&h->hnode);
1049                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1050                        hlist_add_head(&h->hnode, &hash[bucket]);
1051                }
1052        }
1053        old_size = nf_conntrack_htable_size;
1054        old_vmalloced = nf_conntrack_vmalloc;
1055        old_hash = nf_conntrack_hash;
1056
1057        nf_conntrack_htable_size = hashsize;
1058        nf_conntrack_vmalloc = vmalloced;
1059        nf_conntrack_hash = hash;
1060        nf_conntrack_hash_rnd = rnd;
1061        write_unlock_bh(&nf_conntrack_lock);
1062
1063        nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
1064        return 0;
1065}
1066EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
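/*
 * Note: the module_param_call() below exposes this as the "hashsize"
 * parameter of the nf_conntrack module (mode 0600), so the table can be
 * resized at runtime, typically via sysfs (path assumed):
 *
 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * Resizing rehashes every entry under nf_conntrack_lock with a fresh
 * random seed, as implemented above.
 */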
1067
1068module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1069                  &nf_conntrack_htable_size, 0600);
1070
1071int __init nf_conntrack_init(void)
1072{
1073        int max_factor = 8;
1074        int ret;
1075
 1076        /* Idea from tcp.c: use 1/16384 of memory.  On i386, a 32MB
 1077         * machine has 512 buckets; >= 1GB machines have 16384 buckets. */
1078        if (!nf_conntrack_htable_size) {
1079                nf_conntrack_htable_size
1080                        = (((num_physpages << PAGE_SHIFT) / 16384)
1081                           / sizeof(struct hlist_head));
1082                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1083                        nf_conntrack_htable_size = 16384;
1084                if (nf_conntrack_htable_size < 32)
1085                        nf_conntrack_htable_size = 32;
1086
1087                /* Use a max. factor of four by default to get the same max as
1088                 * with the old struct list_heads. When a table size is given
1089                 * we use the old value of 8 to avoid reducing the max.
1090                 * entries. */
1091                max_factor = 4;
1092        }
1093        nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
1094                                                  &nf_conntrack_vmalloc);
1095        if (!nf_conntrack_hash) {
1096                printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1097                goto err_out;
1098        }
1099
1100        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1101
1102        printk("nf_conntrack version %s (%u buckets, %d max)\n",
1103               NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1104               nf_conntrack_max);
1105
1106        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1107                                                sizeof(struct nf_conn),
1108                                                0, 0, NULL);
1109        if (!nf_conntrack_cachep) {
1110                printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1111                goto err_free_hash;
1112        }
1113
1114        ret = nf_conntrack_proto_init();
1115        if (ret < 0)
1116                goto err_free_conntrack_slab;
1117
1118        ret = nf_conntrack_expect_init();
1119        if (ret < 0)
1120                goto out_fini_proto;
1121
1122        ret = nf_conntrack_helper_init();
1123        if (ret < 0)
1124                goto out_fini_expect;
1125
1126        /* For use by REJECT target */
1127        rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach);
1128        rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
1129
1130        /* Set up fake conntrack:
1131            - to never be deleted, not in any hashes */
1132        atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
 1133        /*  - and make it look like a confirmed connection */
1134        set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1135
1136        return ret;
1137
1138out_fini_expect:
1139        nf_conntrack_expect_fini();
1140out_fini_proto:
1141        nf_conntrack_proto_fini();
1142err_free_conntrack_slab:
1143        kmem_cache_destroy(nf_conntrack_cachep);
1144err_free_hash:
1145        nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
1146                             nf_conntrack_htable_size);
1147err_out:
1148        return -ENOMEM;
1149}
1150