linux/net/netfilter/nf_conntrack_core.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* Connection state tracking for netfilter.  This is separated from,
   3   but required by, the NAT layer; it can also be used by an iptables
   4   extension. */
   5
   6/* (C) 1999-2001 Paul `Rusty' Russell
   7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
  10 */
  11
  12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14#include <linux/types.h>
  15#include <linux/netfilter.h>
  16#include <linux/module.h>
  17#include <linux/sched.h>
  18#include <linux/skbuff.h>
  19#include <linux/proc_fs.h>
  20#include <linux/vmalloc.h>
  21#include <linux/stddef.h>
  22#include <linux/slab.h>
  23#include <linux/random.h>
  24#include <linux/jhash.h>
  25#include <linux/siphash.h>
  26#include <linux/err.h>
  27#include <linux/percpu.h>
  28#include <linux/moduleparam.h>
  29#include <linux/notifier.h>
  30#include <linux/kernel.h>
  31#include <linux/netdevice.h>
  32#include <linux/socket.h>
  33#include <linux/mm.h>
  34#include <linux/nsproxy.h>
  35#include <linux/rculist_nulls.h>
  36
  37#include <net/netfilter/nf_conntrack.h>
  38#include <net/netfilter/nf_conntrack_l4proto.h>
  39#include <net/netfilter/nf_conntrack_expect.h>
  40#include <net/netfilter/nf_conntrack_helper.h>
  41#include <net/netfilter/nf_conntrack_seqadj.h>
  42#include <net/netfilter/nf_conntrack_core.h>
  43#include <net/netfilter/nf_conntrack_extend.h>
  44#include <net/netfilter/nf_conntrack_acct.h>
  45#include <net/netfilter/nf_conntrack_ecache.h>
  46#include <net/netfilter/nf_conntrack_zones.h>
  47#include <net/netfilter/nf_conntrack_timestamp.h>
  48#include <net/netfilter/nf_conntrack_timeout.h>
  49#include <net/netfilter/nf_conntrack_labels.h>
  50#include <net/netfilter/nf_conntrack_synproxy.h>
  51#include <net/netfilter/nf_nat.h>
  52#include <net/netfilter/nf_nat_helper.h>
  53#include <net/netns/hash.h>
  54#include <net/ip.h>
  55
  56#include "nf_internals.h"
  57
  58__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
  59EXPORT_SYMBOL_GPL(nf_conntrack_locks);
  60
  61__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
  62EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
  63
  64struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
  65EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  66
  67struct conntrack_gc_work {
  68        struct delayed_work     dwork;
  69        u32                     last_bucket;
  70        bool                    exiting;
  71        bool                    early_drop;
  72        long                    next_gc_run;
  73};
  74
  75static __read_mostly struct kmem_cache *nf_conntrack_cachep;
  76static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
  78static __read_mostly bool nf_conntrack_locks_all;
  79
  80/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
  81#define GC_MAX_BUCKETS_DIV      128u
  82/* upper bound of full table scan */
  83#define GC_MAX_SCAN_JIFFIES     (16u * HZ)
  84/* desired ratio of entries found to be expired */
  85#define GC_EVICT_RATIO  50u
  86
  87static struct conntrack_gc_work conntrack_gc_work;
  88
  89void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
  90{
  91        /* 1) Acquire the lock */
  92        spin_lock(lock);
  93
  94        /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
  95         * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
  96         */
  97        if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
  98                return;
  99
 100        /* fast path failed, unlock */
 101        spin_unlock(lock);
 102
 103        /* Slow path 1) get global lock */
 104        spin_lock(&nf_conntrack_locks_all_lock);
 105
 106        /* Slow path 2) get the lock we want */
 107        spin_lock(lock);
 108
 109        /* Slow path 3) release the global lock */
 110        spin_unlock(&nf_conntrack_locks_all_lock);
 111}
 112EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 113
 114static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 115{
 116        h1 %= CONNTRACK_LOCKS;
 117        h2 %= CONNTRACK_LOCKS;
 118        spin_unlock(&nf_conntrack_locks[h1]);
 119        if (h1 != h2)
 120                spin_unlock(&nf_conntrack_locks[h2]);
 121}
 122
 123/* return true if we need to recompute hashes (in case hash table was resized) */
 124static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
 125                                     unsigned int h2, unsigned int sequence)
 126{
 127        h1 %= CONNTRACK_LOCKS;
 128        h2 %= CONNTRACK_LOCKS;
 129        if (h1 <= h2) {
 130                nf_conntrack_lock(&nf_conntrack_locks[h1]);
 131                if (h1 != h2)
 132                        spin_lock_nested(&nf_conntrack_locks[h2],
 133                                         SINGLE_DEPTH_NESTING);
 134        } else {
 135                nf_conntrack_lock(&nf_conntrack_locks[h2]);
 136                spin_lock_nested(&nf_conntrack_locks[h1],
 137                                 SINGLE_DEPTH_NESTING);
 138        }
 139        if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 140                nf_conntrack_double_unlock(h1, h2);
 141                return true;
 142        }
 143        return false;
 144}
 145
 146static void nf_conntrack_all_lock(void)
 147{
 148        int i;
 149
 150        spin_lock(&nf_conntrack_locks_all_lock);
 151
 152        nf_conntrack_locks_all = true;
 153
 154        for (i = 0; i < CONNTRACK_LOCKS; i++) {
 155                spin_lock(&nf_conntrack_locks[i]);
 156
 157                /* This spin_unlock provides the "release" to ensure that
 158                 * nf_conntrack_locks_all==true is visible to everyone that
 159                 * acquired spin_lock(&nf_conntrack_locks[]).
 160                 */
 161                spin_unlock(&nf_conntrack_locks[i]);
 162        }
 163}
 164
 165static void nf_conntrack_all_unlock(void)
 166{
 167        /* All prior stores must be complete before we clear
 168         * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
 169         * might observe the false value but not the entire
 170         * critical section.
 171         * It pairs with the smp_load_acquire() in nf_conntrack_lock()
 172         */
 173        smp_store_release(&nf_conntrack_locks_all, false);
 174        spin_unlock(&nf_conntrack_locks_all_lock);
 175}
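
    /* Summary of the scheme above (illustrative only): each hash bucket is
     * guarded by one of the CONNTRACK_LOCKS spinlocks, and the pair
     * nf_conntrack_locks_all / nf_conntrack_locks_all_lock lets the resize
     * path temporarily "freeze" every bucket lock.  A typical bucket update
     * therefore looks roughly like the sketch below, where 'h' stands for
     * whatever bucket index the caller computed:
     *
     *    nf_conntrack_lock(&nf_conntrack_locks[h % CONNTRACK_LOCKS]);
     *    ... modify the hlist_nulls chain of bucket h ...
     *    spin_unlock(&nf_conntrack_locks[h % CONNTRACK_LOCKS]);
     */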
 176
 177unsigned int nf_conntrack_htable_size __read_mostly;
 178EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 179
 180unsigned int nf_conntrack_max __read_mostly;
 181EXPORT_SYMBOL_GPL(nf_conntrack_max);
 182seqcount_t nf_conntrack_generation __read_mostly;
 183static unsigned int nf_conntrack_hash_rnd __read_mostly;
 184
 185static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 186                              const struct net *net)
 187{
 188        unsigned int n;
 189        u32 seed;
 190
 191        get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
 192
 193        /* The direction must be ignored, so we hash everything up to the
 194         * destination ports (which is a multiple of 4) and treat the last
 195         * three bytes manually.
 196         */
 197        seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
 198        n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
 199        return jhash2((u32 *)tuple, n, seed ^
 200                      (((__force __u16)tuple->dst.u.all << 16) |
 201                      tuple->dst.protonum));
 202}
 203
 204static u32 scale_hash(u32 hash)
 205{
 206        return reciprocal_scale(hash, nf_conntrack_htable_size);
 207}
 208
 209static u32 __hash_conntrack(const struct net *net,
 210                            const struct nf_conntrack_tuple *tuple,
 211                            unsigned int size)
 212{
 213        return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
 214}
 215
 216static u32 hash_conntrack(const struct net *net,
 217                          const struct nf_conntrack_tuple *tuple)
 218{
 219        return scale_hash(hash_conntrack_raw(tuple, net));
 220}
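
    /* Bucket selection sketch (illustrative): the 32-bit jhash2() value is
     * mapped onto [0, nf_conntrack_htable_size) with reciprocal_scale(),
     * i.e. bucket = ((u64)hash * size) >> 32, which avoids a modulo.
     * With a hypothetical hash of 0x80000000 and 16384 buckets this
     * selects bucket 8192.
     */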
 221
 222static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
 223                                  unsigned int dataoff,
 224                                  struct nf_conntrack_tuple *tuple)
 225{       struct {
 226                __be16 sport;
 227                __be16 dport;
 228        } _inet_hdr, *inet_hdr;
 229
 230        /* Actually only need first 4 bytes to get ports. */
 231        inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
 232        if (!inet_hdr)
 233                return false;
 234
 235        tuple->src.u.udp.port = inet_hdr->sport;
 236        tuple->dst.u.udp.port = inet_hdr->dport;
 237        return true;
 238}
 239
 240static bool
 241nf_ct_get_tuple(const struct sk_buff *skb,
 242                unsigned int nhoff,
 243                unsigned int dataoff,
 244                u_int16_t l3num,
 245                u_int8_t protonum,
 246                struct net *net,
 247                struct nf_conntrack_tuple *tuple)
 248{
 249        unsigned int size;
 250        const __be32 *ap;
 251        __be32 _addrs[8];
 252
 253        memset(tuple, 0, sizeof(*tuple));
 254
 255        tuple->src.l3num = l3num;
 256        switch (l3num) {
 257        case NFPROTO_IPV4:
 258                nhoff += offsetof(struct iphdr, saddr);
 259                size = 2 * sizeof(__be32);
 260                break;
 261        case NFPROTO_IPV6:
 262                nhoff += offsetof(struct ipv6hdr, saddr);
 263                size = sizeof(_addrs);
 264                break;
 265        default:
 266                return true;
 267        }
 268
 269        ap = skb_header_pointer(skb, nhoff, size, _addrs);
 270        if (!ap)
 271                return false;
 272
 273        switch (l3num) {
 274        case NFPROTO_IPV4:
 275                tuple->src.u3.ip = ap[0];
 276                tuple->dst.u3.ip = ap[1];
 277                break;
 278        case NFPROTO_IPV6:
 279                memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
 280                memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
 281                break;
 282        }
 283
 284        tuple->dst.protonum = protonum;
 285        tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 286
 287        switch (protonum) {
 288#if IS_ENABLED(CONFIG_IPV6)
 289        case IPPROTO_ICMPV6:
 290                return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
 291#endif
 292        case IPPROTO_ICMP:
 293                return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
 294#ifdef CONFIG_NF_CT_PROTO_GRE
 295        case IPPROTO_GRE:
 296                return gre_pkt_to_tuple(skb, dataoff, net, tuple);
 297#endif
 298        case IPPROTO_TCP:
 299        case IPPROTO_UDP: /* fallthrough */
 300                return nf_ct_get_tuple_ports(skb, dataoff, tuple);
 301#ifdef CONFIG_NF_CT_PROTO_UDPLITE
 302        case IPPROTO_UDPLITE:
 303                return nf_ct_get_tuple_ports(skb, dataoff, tuple);
 304#endif
 305#ifdef CONFIG_NF_CT_PROTO_SCTP
 306        case IPPROTO_SCTP:
 307                return nf_ct_get_tuple_ports(skb, dataoff, tuple);
 308#endif
 309#ifdef CONFIG_NF_CT_PROTO_DCCP
 310        case IPPROTO_DCCP:
 311                return nf_ct_get_tuple_ports(skb, dataoff, tuple);
 312#endif
 313        default:
 314                break;
 315        }
 316
 317        return true;
 318}
 319
 320static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 321                            u_int8_t *protonum)
 322{
 323        int dataoff = -1;
 324        const struct iphdr *iph;
 325        struct iphdr _iph;
 326
 327        iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
 328        if (!iph)
 329                return -1;
 330
 331        /* Conntrack defragments packets, we might still see fragments
 332         * inside ICMP packets though.
 333         */
 334        if (iph->frag_off & htons(IP_OFFSET))
 335                return -1;
 336
 337        dataoff = nhoff + (iph->ihl << 2);
 338        *protonum = iph->protocol;
 339
 340        /* Check bogus IP headers */
 341        if (dataoff > skb->len) {
 342                pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
 343                         nhoff, iph->ihl << 2, skb->len);
 344                return -1;
 345        }
 346        return dataoff;
 347}
 348
 349#if IS_ENABLED(CONFIG_IPV6)
 350static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 351                            u8 *protonum)
 352{
 353        int protoff = -1;
 354        unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
 355        __be16 frag_off;
 356        u8 nexthdr;
 357
 358        if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
 359                          &nexthdr, sizeof(nexthdr)) != 0) {
 360                pr_debug("can't get nexthdr\n");
 361                return -1;
 362        }
 363        protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
 364        /*
 365         * (protoff == skb->len) means the packet has no data, just
 366         * the IPv6 header and possibly extension headers, but it is tracked anyway
 367         */
 368        if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
 369                pr_debug("can't find proto in pkt\n");
 370                return -1;
 371        }
 372
 373        *protonum = nexthdr;
 374        return protoff;
 375}
 376#endif
 377
 378static int get_l4proto(const struct sk_buff *skb,
 379                       unsigned int nhoff, u8 pf, u8 *l4num)
 380{
 381        switch (pf) {
 382        case NFPROTO_IPV4:
 383                return ipv4_get_l4proto(skb, nhoff, l4num);
 384#if IS_ENABLED(CONFIG_IPV6)
 385        case NFPROTO_IPV6:
 386                return ipv6_get_l4proto(skb, nhoff, l4num);
 387#endif
 388        default:
 389                *l4num = 0;
 390                break;
 391        }
 392        return -1;
 393}
 394
 395bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 396                       u_int16_t l3num,
 397                       struct net *net, struct nf_conntrack_tuple *tuple)
 398{
 399        u8 protonum;
 400        int protoff;
 401
 402        protoff = get_l4proto(skb, nhoff, l3num, &protonum);
 403        if (protoff <= 0)
 404                return false;
 405
 406        return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
 407}
 408EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
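
    /* Typical use of the helper above (illustrative sketch; 'skb' and 'net'
     * are whatever the caller already holds):
     *
     *    struct nf_conntrack_tuple tuple;
     *
     *    if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
     *                           NFPROTO_IPV4, net, &tuple))
     *        return;
     *
     * On success 'tuple' holds the packet's addresses, ports and protocol.
     */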
 409
 410bool
 411nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 412                   const struct nf_conntrack_tuple *orig)
 413{
 414        memset(inverse, 0, sizeof(*inverse));
 415
 416        inverse->src.l3num = orig->src.l3num;
 417
 418        switch (orig->src.l3num) {
 419        case NFPROTO_IPV4:
 420                inverse->src.u3.ip = orig->dst.u3.ip;
 421                inverse->dst.u3.ip = orig->src.u3.ip;
 422                break;
 423        case NFPROTO_IPV6:
 424                inverse->src.u3.in6 = orig->dst.u3.in6;
 425                inverse->dst.u3.in6 = orig->src.u3.in6;
 426                break;
 427        default:
 428                break;
 429        }
 430
 431        inverse->dst.dir = !orig->dst.dir;
 432
 433        inverse->dst.protonum = orig->dst.protonum;
 434
 435        switch (orig->dst.protonum) {
 436        case IPPROTO_ICMP:
 437                return nf_conntrack_invert_icmp_tuple(inverse, orig);
 438#if IS_ENABLED(CONFIG_IPV6)
 439        case IPPROTO_ICMPV6:
 440                return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
 441#endif
 442        }
 443
 444        inverse->src.u.all = orig->dst.u.all;
 445        inverse->dst.u.all = orig->src.u.all;
 446        return true;
 447}
 448EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
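
    /* Effect of the inversion above on a UDP tuple (made-up addresses):
     * ORIGINAL 10.0.0.1:1234 -> 10.0.0.2:53 becomes 10.0.0.2:53 ->
     * 10.0.0.1:1234, with dst.protonum preserved and dst.dir flipped
     * to the opposite direction.
     */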
 449
 450/* Generate an almost-unique pseudo-id for a given conntrack.
 451 *
 452 * Intentionally doesn't re-use any of the seeds used for hash
 453 * table location; we assume the id gets exposed to userspace.
 454 *
 455 * Following nf_conn items do not change throughout lifetime
 456 * of the nf_conn:
 457 *
 458 * 1. nf_conn address
 459 * 2. nf_conn->master address (normally NULL)
 460 * 3. the associated net namespace
 461 * 4. the original direction tuple
 462 */
 463u32 nf_ct_get_id(const struct nf_conn *ct)
 464{
 465        static __read_mostly siphash_key_t ct_id_seed;
 466        unsigned long a, b, c, d;
 467
 468        net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
 469
 470        a = (unsigned long)ct;
 471        b = (unsigned long)ct->master;
 472        c = (unsigned long)nf_ct_net(ct);
 473        d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 474                                   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
 475                                   &ct_id_seed);
 476#ifdef CONFIG_64BIT
 477        return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
 478#else
 479        return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
 480#endif
 481}
 482EXPORT_SYMBOL_GPL(nf_ct_get_id);
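
    /* The id computed above is what gets reported to userspace (e.g. as the
     * conntrack id in ctnetlink dumps).  Because a dedicated siphash key is
     * used, the id is not meant to reveal kernel addresses or the hash-table
     * placement of the entry.  Rough usage sketch:
     *
     *    u32 id = nf_ct_get_id(ct);    stable for the lifetime of ct
     */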
 483
 484static void
 485clean_from_lists(struct nf_conn *ct)
 486{
 487        pr_debug("clean_from_lists(%p)\n", ct);
 488        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 489        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
 490
 491        /* Destroy all pending expectations */
 492        nf_ct_remove_expectations(ct);
 493}
 494
 495/* must be called with local_bh_disable */
 496static void nf_ct_add_to_dying_list(struct nf_conn *ct)
 497{
 498        struct ct_pcpu *pcpu;
 499
 500        /* add this conntrack to the (per cpu) dying list */
 501        ct->cpu = smp_processor_id();
 502        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 503
 504        spin_lock(&pcpu->lock);
 505        hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 506                             &pcpu->dying);
 507        spin_unlock(&pcpu->lock);
 508}
 509
 510/* must be called with local_bh_disable */
 511static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
 512{
 513        struct ct_pcpu *pcpu;
 514
 515        /* add this conntrack to the (per cpu) unconfirmed list */
 516        ct->cpu = smp_processor_id();
 517        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 518
 519        spin_lock(&pcpu->lock);
 520        hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 521                             &pcpu->unconfirmed);
 522        spin_unlock(&pcpu->lock);
 523}
 524
 525/* must be called with local_bh_disable */
 526static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 527{
 528        struct ct_pcpu *pcpu;
 529
 530        /* We overload the first tuple to link into the unconfirmed or dying list. */
 531        pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 532
 533        spin_lock(&pcpu->lock);
 534        BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
 535        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 536        spin_unlock(&pcpu->lock);
 537}
 538
 539#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
 540
 541/* Released via destroy_conntrack() */
 542struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 543                                 const struct nf_conntrack_zone *zone,
 544                                 gfp_t flags)
 545{
 546        struct nf_conn *tmpl, *p;
 547
 548        if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
 549                tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
 550                if (!tmpl)
 551                        return NULL;
 552
 553                p = tmpl;
 554                tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 555                if (tmpl != p) {
 556                        tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 557                        tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
 558                }
 559        } else {
 560                tmpl = kzalloc(sizeof(*tmpl), flags);
 561                if (!tmpl)
 562                        return NULL;
 563        }
 564
 565        tmpl->status = IPS_TEMPLATE;
 566        write_pnet(&tmpl->ct_net, net);
 567        nf_ct_zone_add(tmpl, zone);
 568        atomic_set(&tmpl->ct_general.use, 0);
 569
 570        return tmpl;
 571}
 572EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 573
 574void nf_ct_tmpl_free(struct nf_conn *tmpl)
 575{
 576        nf_ct_ext_destroy(tmpl);
 577        nf_ct_ext_free(tmpl);
 578
 579        if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
 580                kfree((char *)tmpl - tmpl->proto.tmpl_padto);
 581        else
 582                kfree(tmpl);
 583}
 584EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 585
 586static void destroy_gre_conntrack(struct nf_conn *ct)
 587{
 588#ifdef CONFIG_NF_CT_PROTO_GRE
 589        struct nf_conn *master = ct->master;
 590
 591        if (master)
 592                nf_ct_gre_keymap_destroy(master);
 593#endif
 594}
 595
 596static void
 597destroy_conntrack(struct nf_conntrack *nfct)
 598{
 599        struct nf_conn *ct = (struct nf_conn *)nfct;
 600
 601        pr_debug("destroy_conntrack(%p)\n", ct);
 602        WARN_ON(atomic_read(&nfct->use) != 0);
 603
 604        if (unlikely(nf_ct_is_template(ct))) {
 605                nf_ct_tmpl_free(ct);
 606                return;
 607        }
 608
 609        if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
 610                destroy_gre_conntrack(ct);
 611
 612        local_bh_disable();
 613        /* Expectations will have been removed in clean_from_lists,
 614         * except TFTP can create an expectation on the first packet,
 615         * before connection is in the list, so we need to clean here,
 616         * too.
 617         */
 618        nf_ct_remove_expectations(ct);
 619
 620        nf_ct_del_from_dying_or_unconfirmed_list(ct);
 621
 622        local_bh_enable();
 623
 624        if (ct->master)
 625                nf_ct_put(ct->master);
 626
 627        pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 628        nf_conntrack_free(ct);
 629}
 630
 631static void nf_ct_delete_from_lists(struct nf_conn *ct)
 632{
 633        struct net *net = nf_ct_net(ct);
 634        unsigned int hash, reply_hash;
 635        unsigned int sequence;
 636
 637        nf_ct_helper_destroy(ct);
 638
 639        local_bh_disable();
 640        do {
 641                sequence = read_seqcount_begin(&nf_conntrack_generation);
 642                hash = hash_conntrack(net,
 643                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 644                reply_hash = hash_conntrack(net,
 645                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 646        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 647
 648        clean_from_lists(ct);
 649        nf_conntrack_double_unlock(hash, reply_hash);
 650
 651        nf_ct_add_to_dying_list(ct);
 652
 653        local_bh_enable();
 654}
 655
 656bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 657{
 658        struct nf_conn_tstamp *tstamp;
 659
 660        if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
 661                return false;
 662
 663        tstamp = nf_conn_tstamp_find(ct);
 664        if (tstamp && tstamp->stop == 0)
 665                tstamp->stop = ktime_get_real_ns();
 666
 667        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
 668                                    portid, report) < 0) {
 669                /* destroy event was not delivered. nf_ct_put will
 670                 * be done by event cache worker on redelivery.
 671                 */
 672                nf_ct_delete_from_lists(ct);
 673                nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
 674                return false;
 675        }
 676
 677        nf_conntrack_ecache_work(nf_ct_net(ct));
 678        nf_ct_delete_from_lists(ct);
 679        nf_ct_put(ct);
 680        return true;
 681}
 682EXPORT_SYMBOL_GPL(nf_ct_delete);
 683
 684static inline bool
 685nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 686                const struct nf_conntrack_tuple *tuple,
 687                const struct nf_conntrack_zone *zone,
 688                const struct net *net)
 689{
 690        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 691
 692        /* A conntrack can be recreated with an equal tuple,
 693         * so we need to check that the conntrack is confirmed
 694         */
 695        return nf_ct_tuple_equal(tuple, &h->tuple) &&
 696               nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 697               nf_ct_is_confirmed(ct) &&
 698               net_eq(net, nf_ct_net(ct));
 699}
 700
 701static inline bool
 702nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
 703{
 704        return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 705                                 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
 706               nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
 707                                 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
 708               nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
 709               nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
 710               net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
 711}
 712
 713/* caller must hold rcu readlock and none of the nf_conntrack_locks */
 714static void nf_ct_gc_expired(struct nf_conn *ct)
 715{
 716        if (!atomic_inc_not_zero(&ct->ct_general.use))
 717                return;
 718
 719        if (nf_ct_should_gc(ct))
 720                nf_ct_kill(ct);
 721
 722        nf_ct_put(ct);
 723}
 724
 725/*
 726 * Warning :
 727 * - Caller must take a reference on returned object
 728 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 729 */
 730static struct nf_conntrack_tuple_hash *
 731____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 732                      const struct nf_conntrack_tuple *tuple, u32 hash)
 733{
 734        struct nf_conntrack_tuple_hash *h;
 735        struct hlist_nulls_head *ct_hash;
 736        struct hlist_nulls_node *n;
 737        unsigned int bucket, hsize;
 738
 739begin:
 740        nf_conntrack_get_ht(&ct_hash, &hsize);
 741        bucket = reciprocal_scale(hash, hsize);
 742
 743        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
 744                struct nf_conn *ct;
 745
 746                ct = nf_ct_tuplehash_to_ctrack(h);
 747                if (nf_ct_is_expired(ct)) {
 748                        nf_ct_gc_expired(ct);
 749                        continue;
 750                }
 751
 752                if (nf_ct_key_equal(h, tuple, zone, net))
 753                        return h;
 754        }
 755        /*
 756         * if the nulls value we got at the end of this lookup is
 757         * not the expected one, we must restart lookup.
 758         * We probably met an item that was moved to another chain.
 759         */
 760        if (get_nulls_value(n) != bucket) {
 761                NF_CT_STAT_INC_ATOMIC(net, search_restart);
 762                goto begin;
 763        }
 764
 765        return NULL;
 766}
 767
 768/* Find a connection corresponding to a tuple. */
 769static struct nf_conntrack_tuple_hash *
 770__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 771                        const struct nf_conntrack_tuple *tuple, u32 hash)
 772{
 773        struct nf_conntrack_tuple_hash *h;
 774        struct nf_conn *ct;
 775
 776        rcu_read_lock();
 777
 778        h = ____nf_conntrack_find(net, zone, tuple, hash);
 779        if (h) {
 780                /* We have a candidate that matches the tuple we're interested
 781                 * in, try to obtain a reference and re-check tuple
 782                 */
 783                ct = nf_ct_tuplehash_to_ctrack(h);
 784                if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
 785                        if (likely(nf_ct_key_equal(h, tuple, zone, net)))
 786                                goto found;
 787
 788                        /* TYPESAFE_BY_RCU recycled the candidate */
 789                        nf_ct_put(ct);
 790                }
 791
 792                h = NULL;
 793        }
 794found:
 795        rcu_read_unlock();
 796
 797        return h;
 798}
 799
 800struct nf_conntrack_tuple_hash *
 801nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 802                      const struct nf_conntrack_tuple *tuple)
 803{
 804        return __nf_conntrack_find_get(net, zone, tuple,
 805                                       hash_conntrack_raw(tuple, net));
 806}
 807EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
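
    /* Typical caller pattern for the lookup above (illustrative sketch):
     * the returned entry already has its refcount raised, so the caller
     * owns a reference and must drop it when done.
     *
     *    struct nf_conntrack_tuple_hash *h;
     *
     *    h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
     *    if (h) {
     *        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
     *        ... use ct ...
     *        nf_ct_put(ct);
     *    }
     */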
 808
 809static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 810                                       unsigned int hash,
 811                                       unsigned int reply_hash)
 812{
 813        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 814                           &nf_conntrack_hash[hash]);
 815        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 816                           &nf_conntrack_hash[reply_hash]);
 817}
 818
 819int
 820nf_conntrack_hash_check_insert(struct nf_conn *ct)
 821{
 822        const struct nf_conntrack_zone *zone;
 823        struct net *net = nf_ct_net(ct);
 824        unsigned int hash, reply_hash;
 825        struct nf_conntrack_tuple_hash *h;
 826        struct hlist_nulls_node *n;
 827        unsigned int sequence;
 828
 829        zone = nf_ct_zone(ct);
 830
 831        local_bh_disable();
 832        do {
 833                sequence = read_seqcount_begin(&nf_conntrack_generation);
 834                hash = hash_conntrack(net,
 835                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 836                reply_hash = hash_conntrack(net,
 837                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 838        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 839
 840        /* See if there's one in the list already, including reverse */
 841        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 842                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 843                                    zone, net))
 844                        goto out;
 845
 846        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 847                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 848                                    zone, net))
 849                        goto out;
 850
 851        smp_wmb();
 852        /* The caller holds a reference to this object */
 853        atomic_set(&ct->ct_general.use, 2);
 854        __nf_conntrack_hash_insert(ct, hash, reply_hash);
 855        nf_conntrack_double_unlock(hash, reply_hash);
 856        NF_CT_STAT_INC(net, insert);
 857        local_bh_enable();
 858        return 0;
 859
 860out:
 861        nf_conntrack_double_unlock(hash, reply_hash);
 862        NF_CT_STAT_INC(net, insert_failed);
 863        local_bh_enable();
 864        return -EEXIST;
 865}
 866EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 867
 868static inline void nf_ct_acct_update(struct nf_conn *ct,
 869                                     enum ip_conntrack_info ctinfo,
 870                                     unsigned int len)
 871{
 872        struct nf_conn_acct *acct;
 873
 874        acct = nf_conn_acct_find(ct);
 875        if (acct) {
 876                struct nf_conn_counter *counter = acct->counter;
 877
 878                atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
 879                atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
 880        }
 881}
 882
 883static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 884                             const struct nf_conn *loser_ct)
 885{
 886        struct nf_conn_acct *acct;
 887
 888        acct = nf_conn_acct_find(loser_ct);
 889        if (acct) {
 890                struct nf_conn_counter *counter = acct->counter;
 891                unsigned int bytes;
 892
 893                /* u32 should be fine since we must have seen one packet. */
 894                bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
 895                nf_ct_acct_update(ct, ctinfo, bytes);
 896        }
 897}
 898
 899/* Resolve race on insertion if this protocol allows this. */
 900static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 901                               enum ip_conntrack_info ctinfo,
 902                               struct nf_conntrack_tuple_hash *h)
 903{
 904        /* This is the conntrack entry already in hashes that won race. */
 905        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 906        const struct nf_conntrack_l4proto *l4proto;
 907        enum ip_conntrack_info oldinfo;
 908        struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
 909
 910        l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
 911        if (l4proto->allow_clash &&
 912            !nf_ct_is_dying(ct) &&
 913            atomic_inc_not_zero(&ct->ct_general.use)) {
 914                if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
 915                    nf_ct_match(ct, loser_ct)) {
 916                        nf_ct_acct_merge(ct, ctinfo, loser_ct);
 917                        nf_conntrack_put(&loser_ct->ct_general);
 918                        nf_ct_set(skb, ct, oldinfo);
 919                        return NF_ACCEPT;
 920                }
 921                nf_ct_put(ct);
 922        }
 923        NF_CT_STAT_INC(net, drop);
 924        return NF_DROP;
 925}
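
    /* Clash resolution in a nutshell (illustrative): two packets of the
     * same new flow are processed on different CPUs, each allocates an
     * unconfirmed conntrack, and only one wins the insertion race in
     * __nf_conntrack_confirm() below.  For protocols that set
     * ->allow_clash (e.g. UDP) the loser's skb is re-pointed at the
     * winning entry and accepted instead of being dropped.
     */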
 926
 927/* Confirm a connection given skb; places it in hash table */
 928int
 929__nf_conntrack_confirm(struct sk_buff *skb)
 930{
 931        const struct nf_conntrack_zone *zone;
 932        unsigned int hash, reply_hash;
 933        struct nf_conntrack_tuple_hash *h;
 934        struct nf_conn *ct;
 935        struct nf_conn_help *help;
 936        struct nf_conn_tstamp *tstamp;
 937        struct hlist_nulls_node *n;
 938        enum ip_conntrack_info ctinfo;
 939        struct net *net;
 940        unsigned int sequence;
 941        int ret = NF_DROP;
 942
 943        ct = nf_ct_get(skb, &ctinfo);
 944        net = nf_ct_net(ct);
 945
 946        /* ipt_REJECT uses nf_conntrack_attach to attach related
 947           ICMP/TCP RST packets in the other direction.  The actual packet
 948           which created the connection will be IP_CT_NEW or, for an
 949           expected connection, IP_CT_RELATED. */
 950        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 951                return NF_ACCEPT;
 952
 953        zone = nf_ct_zone(ct);
 954        local_bh_disable();
 955
 956        do {
 957                sequence = read_seqcount_begin(&nf_conntrack_generation);
 958                /* reuse the hash saved before */
 959                hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
 960                hash = scale_hash(hash);
 961                reply_hash = hash_conntrack(net,
 962                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 963
 964        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 965
 966        /* We're not in hash table, and we refuse to set up related
 967         * connections for unconfirmed conns.  But packet copies and
 968         * REJECT will give spurious warnings here.
 969         */
 970
 971        /* Another skb with the same unconfirmed conntrack may
 972         * win the race. This may happen for bridge (br_flood) or
 973         * broadcast/multicast packets, which are skb_clone()d while
 974         * the conntrack is still unconfirmed.
 975         */
 976        if (unlikely(nf_ct_is_confirmed(ct))) {
 977                WARN_ON_ONCE(1);
 978                nf_conntrack_double_unlock(hash, reply_hash);
 979                local_bh_enable();
 980                return NF_DROP;
 981        }
 982
 983        pr_debug("Confirming conntrack %p\n", ct);
 984        /* We have to check the DYING flag after unlink to prevent
 985         * a race against nf_ct_get_next_corpse() possibly called from
 986         * user context, else we insert an already 'dead' hash, blocking
 987         * further use of that particular connection -JM.
 988         */
 989        nf_ct_del_from_dying_or_unconfirmed_list(ct);
 990
 991        if (unlikely(nf_ct_is_dying(ct))) {
 992                nf_ct_add_to_dying_list(ct);
 993                goto dying;
 994        }
 995
 996        /* See if there's one in the list already, including reverse:
 997           NAT could have grabbed it without realizing, since we're
 998           not in the hash.  If there is, we lost the race. */
 999        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
1000                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1001                                    zone, net))
1002                        goto out;
1003
1004        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
1005                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1006                                    zone, net))
1007                        goto out;
1008
1009        /* Timer relative to confirmation time, not original
1010           setting time, otherwise we'd get timer wrap in
1011           weird delay cases. */
1012        ct->timeout += nfct_time_stamp;
1013        atomic_inc(&ct->ct_general.use);
1014        ct->status |= IPS_CONFIRMED;
1015
1016        /* set conntrack timestamp, if enabled. */
1017        tstamp = nf_conn_tstamp_find(ct);
1018        if (tstamp)
1019                tstamp->start = ktime_get_real_ns();
1020
1021        /* Since the lookup is lockless, hash insertion must be done after
1022         * starting the timer and setting the CONFIRMED bit. The RCU barriers
1023         * guarantee that no other CPU can find the conntrack before the above
1024         * stores are visible.
1025         */
1026        __nf_conntrack_hash_insert(ct, hash, reply_hash);
1027        nf_conntrack_double_unlock(hash, reply_hash);
1028        local_bh_enable();
1029
1030        help = nfct_help(ct);
1031        if (help && help->helper)
1032                nf_conntrack_event_cache(IPCT_HELPER, ct);
1033
1034        nf_conntrack_event_cache(master_ct(ct) ?
1035                                 IPCT_RELATED : IPCT_NEW, ct);
1036        return NF_ACCEPT;
1037
1038out:
1039        nf_ct_add_to_dying_list(ct);
1040        ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
1041dying:
1042        nf_conntrack_double_unlock(hash, reply_hash);
1043        NF_CT_STAT_INC(net, insert_failed);
1044        local_bh_enable();
1045        return ret;
1046}
1047EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
1048
1049/* Returns true if a connection corresponds to the tuple (required
1050   for NAT). */
1051int
1052nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1053                         const struct nf_conn *ignored_conntrack)
1054{
1055        struct net *net = nf_ct_net(ignored_conntrack);
1056        const struct nf_conntrack_zone *zone;
1057        struct nf_conntrack_tuple_hash *h;
1058        struct hlist_nulls_head *ct_hash;
1059        unsigned int hash, hsize;
1060        struct hlist_nulls_node *n;
1061        struct nf_conn *ct;
1062
1063        zone = nf_ct_zone(ignored_conntrack);
1064
1065        rcu_read_lock();
1066 begin:
1067        nf_conntrack_get_ht(&ct_hash, &hsize);
1068        hash = __hash_conntrack(net, tuple, hsize);
1069
1070        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
1071                ct = nf_ct_tuplehash_to_ctrack(h);
1072
1073                if (ct == ignored_conntrack)
1074                        continue;
1075
1076                if (nf_ct_is_expired(ct)) {
1077                        nf_ct_gc_expired(ct);
1078                        continue;
1079                }
1080
1081                if (nf_ct_key_equal(h, tuple, zone, net)) {
1082                        /* Tuple is taken already, so caller will need to find
1083                         * a new source port to use.
1084                         *
1085                         * Only exception:
1086                         * If the *original tuples* are identical, then both
1087                         * conntracks refer to the same flow.
1088                         * This is a rare situation, it can occur e.g. when
1089                         * more than one UDP packet is sent from same socket
1090                         * in different threads.
1091                         *
1092                         * Let nf_ct_resolve_clash() deal with this later.
1093                         */
1094                        if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1095                                              &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
1096                                continue;
1097
1098                        NF_CT_STAT_INC_ATOMIC(net, found);
1099                        rcu_read_unlock();
1100                        return 1;
1101                }
1102        }
1103
1104        if (get_nulls_value(n) != hash) {
1105                NF_CT_STAT_INC_ATOMIC(net, search_restart);
1106                goto begin;
1107        }
1108
1109        rcu_read_unlock();
1110
1111        return 0;
1112}
1113EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1114
1115#define NF_CT_EVICTION_RANGE    8
1116
1117/* There's a small race here where we may free a just-assured
1118   connection.  Too bad: we're in trouble anyway. */
1119static unsigned int early_drop_list(struct net *net,
1120                                    struct hlist_nulls_head *head)
1121{
1122        struct nf_conntrack_tuple_hash *h;
1123        struct hlist_nulls_node *n;
1124        unsigned int drops = 0;
1125        struct nf_conn *tmp;
1126
1127        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
1128                tmp = nf_ct_tuplehash_to_ctrack(h);
1129
1130                if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
1131                        continue;
1132
1133                if (nf_ct_is_expired(tmp)) {
1134                        nf_ct_gc_expired(tmp);
1135                        continue;
1136                }
1137
1138                if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
1139                    !net_eq(nf_ct_net(tmp), net) ||
1140                    nf_ct_is_dying(tmp))
1141                        continue;
1142
1143                if (!atomic_inc_not_zero(&tmp->ct_general.use))
1144                        continue;
1145
1146                /* kill only if still in same netns -- might have moved due to
1147                 * SLAB_TYPESAFE_BY_RCU rules.
1148                 *
1149                 * We steal the timer reference.  If that fails timer has
1150                 * already fired or someone else deleted it. Just drop ref
1151                 * and move to next entry.
1152                 */
1153                if (net_eq(nf_ct_net(tmp), net) &&
1154                    nf_ct_is_confirmed(tmp) &&
1155                    nf_ct_delete(tmp, 0, 0))
1156                        drops++;
1157
1158                nf_ct_put(tmp);
1159        }
1160
1161        return drops;
1162}
1163
1164static noinline int early_drop(struct net *net, unsigned int hash)
1165{
1166        unsigned int i, bucket;
1167
1168        for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1169                struct hlist_nulls_head *ct_hash;
1170                unsigned int hsize, drops;
1171
1172                rcu_read_lock();
1173                nf_conntrack_get_ht(&ct_hash, &hsize);
1174                if (!i)
1175                        bucket = reciprocal_scale(hash, hsize);
1176                else
1177                        bucket = (bucket + 1) % hsize;
1178
1179                drops = early_drop_list(net, &ct_hash[bucket]);
1180                rcu_read_unlock();
1181
1182                if (drops) {
1183                        NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1184                        return true;
1185                }
1186        }
1187
1188        return false;
1189}
1190
1191static bool gc_worker_skip_ct(const struct nf_conn *ct)
1192{
1193        return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1194}
1195
1196static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1197{
1198        const struct nf_conntrack_l4proto *l4proto;
1199
1200        if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1201                return true;
1202
1203        l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1204        if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1205                return true;
1206
1207        return false;
1208}
1209
1210#define DAY     (86400 * HZ)
1211
1212/* Set an arbitrary timeout large enough not to ever expire; this saves
1213 * us a check for the IPS_OFFLOAD_BIT from the packet path via
1214 * nf_ct_is_expired().
1215 */
1216static void nf_ct_offload_timeout(struct nf_conn *ct)
1217{
1218        if (nf_ct_expires(ct) < DAY / 2)
1219                ct->timeout = nfct_time_stamp + DAY;
1220}
1221
1222static void gc_worker(struct work_struct *work)
1223{
1224        unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
1225        unsigned int i, goal, buckets = 0, expired_count = 0;
1226        unsigned int nf_conntrack_max95 = 0;
1227        struct conntrack_gc_work *gc_work;
1228        unsigned int ratio, scanned = 0;
1229        unsigned long next_run;
1230
1231        gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1232
1233        goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
1234        i = gc_work->last_bucket;
1235        if (gc_work->early_drop)
1236                nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1237
1238        do {
1239                struct nf_conntrack_tuple_hash *h;
1240                struct hlist_nulls_head *ct_hash;
1241                struct hlist_nulls_node *n;
1242                unsigned int hashsz;
1243                struct nf_conn *tmp;
1244
1245                i++;
1246                rcu_read_lock();
1247
1248                nf_conntrack_get_ht(&ct_hash, &hashsz);
1249                if (i >= hashsz)
1250                        i = 0;
1251
1252                hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1253                        struct net *net;
1254
1255                        tmp = nf_ct_tuplehash_to_ctrack(h);
1256
1257                        scanned++;
1258                        if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1259                                nf_ct_offload_timeout(tmp);
1260                                continue;
1261                        }
1262
1263                        if (nf_ct_is_expired(tmp)) {
1264                                nf_ct_gc_expired(tmp);
1265                                expired_count++;
1266                                continue;
1267                        }
1268
1269                        if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1270                                continue;
1271
1272                        net = nf_ct_net(tmp);
1273                        if (atomic_read(&net->ct.count) < nf_conntrack_max95)
1274                                continue;
1275
1276                        /* need to take reference to avoid possible races */
1277                        if (!atomic_inc_not_zero(&tmp->ct_general.use))
1278                                continue;
1279
1280                        if (gc_worker_skip_ct(tmp)) {
1281                                nf_ct_put(tmp);
1282                                continue;
1283                        }
1284
1285                        if (gc_worker_can_early_drop(tmp))
1286                                nf_ct_kill(tmp);
1287
1288                        nf_ct_put(tmp);
1289                }
1290
1291                /* could check get_nulls_value() here and restart if ct
1292                 * was moved to another chain.  But given gc is best-effort
1293                 * we will just continue with next hash slot.
1294                 */
1295                rcu_read_unlock();
1296                cond_resched();
1297        } while (++buckets < goal);
1298
1299        if (gc_work->exiting)
1300                return;
1301
1302        /*
1303         * Eviction will normally happen from the packet path, and not
1304         * from this gc worker.
1305         *
1306         * This worker is only here to reap expired entries when the system
1307         * goes idle after a busy period.
1308         *
1309         * The heuristics below are supposed to balance conflicting goals:
1310         *
1311         * 1. Minimize time until we notice a stale entry
1312         * 2. Maximize scan intervals to not waste cycles
1313         *
1314         * Normally, expire ratio will be close to 0.
1315         *
1316         * As soon as a sizeable fraction of the entries has expired,
1317         * increase the scan frequency.
1318         */
1319        ratio = scanned ? expired_count * 100 / scanned : 0;
1320        if (ratio > GC_EVICT_RATIO) {
1321                gc_work->next_gc_run = min_interval;
1322        } else {
1323                unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
1324
1325                BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
1326
1327                gc_work->next_gc_run += min_interval;
1328                if (gc_work->next_gc_run > max)
1329                        gc_work->next_gc_run = max;
1330        }
1331
1332        next_run = gc_work->next_gc_run;
1333        gc_work->last_bucket = i;
1334        gc_work->early_drop = false;
1335        queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1336}
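
    /* Worked example of the rescheduling heuristic above (with HZ == 1000):
     * min_interval = HZ / GC_MAX_BUCKETS_DIV = ~7 jiffies, and the upper
     * bound is GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV = 125 jiffies.
     * If more than GC_EVICT_RATIO (50%) of the scanned entries were
     * expired, the next run starts after min_interval; otherwise the
     * interval grows by min_interval per run until it reaches the bound.
     */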
1337
1338static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1339{
1340        INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
1341        gc_work->next_gc_run = HZ;
1342        gc_work->exiting = false;
1343}
1344
1345static struct nf_conn *
1346__nf_conntrack_alloc(struct net *net,
1347                     const struct nf_conntrack_zone *zone,
1348                     const struct nf_conntrack_tuple *orig,
1349                     const struct nf_conntrack_tuple *repl,
1350                     gfp_t gfp, u32 hash)
1351{
1352        struct nf_conn *ct;
1353
1354        /* We don't want any race condition at early drop stage */
1355        atomic_inc(&net->ct.count);
1356
1357        if (nf_conntrack_max &&
1358            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1359                if (!early_drop(net, hash)) {
1360                        if (!conntrack_gc_work.early_drop)
1361                                conntrack_gc_work.early_drop = true;
1362                        atomic_dec(&net->ct.count);
1363                        net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1364                        return ERR_PTR(-ENOMEM);
1365                }
1366        }
1367
1368        /*
1369         * Do not use kmem_cache_zalloc(), as this cache uses
1370         * SLAB_TYPESAFE_BY_RCU.
1371         */
1372        ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1373        if (ct == NULL)
1374                goto out;
1375
1376        spin_lock_init(&ct->lock);
1377        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1378        ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1379        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1380        /* save hash for reusing when confirming */
1381        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1382        ct->status = 0;
1383        ct->timeout = 0;
1384        write_pnet(&ct->ct_net, net);
1385        memset(&ct->__nfct_init_offset[0], 0,
1386               offsetof(struct nf_conn, proto) -
1387               offsetof(struct nf_conn, __nfct_init_offset[0]));
1388
1389        nf_ct_zone_add(ct, zone);
1390
1391        /* Because we use RCU lookups, we set ct_general.use to zero before
1392         * this is inserted in any list.
1393         */
1394        atomic_set(&ct->ct_general.use, 0);
1395        return ct;
1396out:
1397        atomic_dec(&net->ct.count);
1398        return ERR_PTR(-ENOMEM);
1399}
1400
1401struct nf_conn *nf_conntrack_alloc(struct net *net,
1402                                   const struct nf_conntrack_zone *zone,
1403                                   const struct nf_conntrack_tuple *orig,
1404                                   const struct nf_conntrack_tuple *repl,
1405                                   gfp_t gfp)
1406{
1407        return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1408}
1409EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1410
1411void nf_conntrack_free(struct nf_conn *ct)
1412{
1413        struct net *net = nf_ct_net(ct);
1414
1415        /* A freed object has refcnt == 0, that's
1416         * the golden rule for SLAB_TYPESAFE_BY_RCU
1417         */
1418        WARN_ON(atomic_read(&ct->ct_general.use) != 0);
1419
1420        nf_ct_ext_destroy(ct);
1421        nf_ct_ext_free(ct);
1422        kmem_cache_free(nf_conntrack_cachep, ct);
1423        smp_mb__before_atomic();
1424        atomic_dec(&net->ct.count);
1425}
1426EXPORT_SYMBOL_GPL(nf_conntrack_free);
1427
1428
1429/* Allocate a new conntrack: we return -ENOMEM if classification
1430   failed due to stress.  Otherwise it really is unclassifiable. */
1431static noinline struct nf_conntrack_tuple_hash *
1432init_conntrack(struct net *net, struct nf_conn *tmpl,
1433               const struct nf_conntrack_tuple *tuple,
1434               struct sk_buff *skb,
1435               unsigned int dataoff, u32 hash)
1436{
1437        struct nf_conn *ct;
1438        struct nf_conn_help *help;
1439        struct nf_conntrack_tuple repl_tuple;
1440        struct nf_conntrack_ecache *ecache;
1441        struct nf_conntrack_expect *exp = NULL;
1442        const struct nf_conntrack_zone *zone;
1443        struct nf_conn_timeout *timeout_ext;
1444        struct nf_conntrack_zone tmp;
1445
1446        if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
1447                pr_debug("Can't invert tuple.\n");
1448                return NULL;
1449        }
1450
1451        zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1452        ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1453                                  hash);
1454        if (IS_ERR(ct))
1455                return (struct nf_conntrack_tuple_hash *)ct;
1456
1457        if (!nf_ct_add_synproxy(ct, tmpl)) {
1458                nf_conntrack_free(ct);
1459                return ERR_PTR(-ENOMEM);
1460        }
1461
1462        timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1463
1464        if (timeout_ext)
1465                nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1466                                      GFP_ATOMIC);
1467
1468        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1469        nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1470        nf_ct_labels_ext_add(ct);
1471
1472        ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1473        nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1474                                 ecache ? ecache->expmask : 0,
1475                             GFP_ATOMIC);
1476
1477        local_bh_disable();
1478        if (net->ct.expect_count) {
1479                spin_lock(&nf_conntrack_expect_lock);
1480                exp = nf_ct_find_expectation(net, zone, tuple);
1481                if (exp) {
1482                        pr_debug("expectation arrives ct=%p exp=%p\n",
1483                                 ct, exp);
1484                        /* Welcome, Mr. Bond.  We've been expecting you... */
1485                        __set_bit(IPS_EXPECTED_BIT, &ct->status);
1486                        /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1487                        ct->master = exp->master;
1488                        if (exp->helper) {
1489                                help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1490                                if (help)
1491                                        rcu_assign_pointer(help->helper, exp->helper);
1492                        }
1493
1494#ifdef CONFIG_NF_CONNTRACK_MARK
1495                        ct->mark = exp->master->mark;
1496#endif
1497#ifdef CONFIG_NF_CONNTRACK_SECMARK
1498                        ct->secmark = exp->master->secmark;
1499#endif
1500                        NF_CT_STAT_INC(net, expect_new);
1501                }
1502                spin_unlock(&nf_conntrack_expect_lock);
1503        }
1504        if (!exp)
1505                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1506
1507        /* Take a reference before adding to the unconfirmed list */
1508        nf_conntrack_get(&ct->ct_general);
1509        nf_ct_add_to_unconfirmed_list(ct);
1510
1511        local_bh_enable();
1512
1513        if (exp) {
1514                if (exp->expectfn)
1515                        exp->expectfn(ct, exp);
1516                nf_ct_expect_put(exp);
1517        }
1518
1519        return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1520}
1521
1522/* On success, returns 0 and sets skb->_nfct (conntrack pointer | ctinfo) */
1523static int
1524resolve_normal_ct(struct nf_conn *tmpl,
1525                  struct sk_buff *skb,
1526                  unsigned int dataoff,
1527                  u_int8_t protonum,
1528                  const struct nf_hook_state *state)
1529{
1530        const struct nf_conntrack_zone *zone;
1531        struct nf_conntrack_tuple tuple;
1532        struct nf_conntrack_tuple_hash *h;
1533        enum ip_conntrack_info ctinfo;
1534        struct nf_conntrack_zone tmp;
1535        struct nf_conn *ct;
1536        u32 hash;
1537
1538        if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1539                             dataoff, state->pf, protonum, state->net,
1540                             &tuple)) {
1541                pr_debug("Can't get tuple\n");
1542                return 0;
1543        }
1544
1545        /* look for tuple match */
1546        zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1547        hash = hash_conntrack_raw(&tuple, state->net);
1548        h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1549        if (!h) {
1550                h = init_conntrack(state->net, tmpl, &tuple,
1551                                   skb, dataoff, hash);
1552                if (!h)
1553                        return 0;
1554                if (IS_ERR(h))
1555                        return PTR_ERR(h);
1556        }
1557        ct = nf_ct_tuplehash_to_ctrack(h);
1558
1559        /* It exists; we have (non-exclusive) reference. */
1560        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1561                ctinfo = IP_CT_ESTABLISHED_REPLY;
1562        } else {
1563                /* Once we've had two way comms, always ESTABLISHED. */
1564                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1565                        pr_debug("normal packet for %p\n", ct);
1566                        ctinfo = IP_CT_ESTABLISHED;
1567                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1568                        pr_debug("related packet for %p\n", ct);
1569                        ctinfo = IP_CT_RELATED;
1570                } else {
1571                        pr_debug("new packet for %p\n", ct);
1572                        ctinfo = IP_CT_NEW;
1573                }
1574        }
1575        nf_ct_set(skb, ct, ctinfo);
1576        return 0;
1577}
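
    /* A minimal consumer sketch (hypothetical hook body, not taken from this
     * file): once resolve_normal_ct() has attached the conntrack, any later
     * hook can read back both the entry and the state chosen above:
     *
     *	enum ip_conntrack_info ctinfo;
     *	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
     *
     *	if (!ct)
     *		return NF_ACCEPT;			(untracked or invalid)
     *	if (ctinfo == IP_CT_NEW)
     *		...					(first packet of a flow)
     *	else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
     *		...					(reply direction)
     */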
1578
1579/*
1580 * icmp packets need special treatment to handle error messages that are
1581 * related to a connection.
1582 *
1583 * Callers need to check whether the skb has a conntrack assigned when this
1584 * helper returns; in that case the skb belongs to an already known connection.
1585 */
1586static unsigned int __cold
1587nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1588                         struct sk_buff *skb,
1589                         unsigned int dataoff,
1590                         u8 protonum,
1591                         const struct nf_hook_state *state)
1592{
1593        int ret;
1594
1595        if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1596                ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1597#if IS_ENABLED(CONFIG_IPV6)
1598        else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1599                ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1600#endif
1601        else
1602                return NF_ACCEPT;
1603
1604        if (ret <= 0) {
1605                NF_CT_STAT_INC_ATOMIC(state->net, error);
1606                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1607        }
1608
1609        return ret;
1610}
1611
1612static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1613                          enum ip_conntrack_info ctinfo)
1614{
1615        const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1616
1617        if (!timeout)
1618                timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1619
1620        nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1621        return NF_ACCEPT;
1622}
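
    /* The fallback used above is the per-netns default of the generic tracker,
     * normally tuned via the net.netfilter.nf_conntrack_generic_timeout
     * sysctl; a per-flow policy attached through the timeout extension
     * (e.g. by the CT target or nfnetlink_cttimeout) takes precedence because
     * nf_ct_timeout_lookup() is consulted first.
     */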
1623
1624/* Returns the verdict for the packet, or a negated verdict (<= 0) if invalid. */
1625static int nf_conntrack_handle_packet(struct nf_conn *ct,
1626                                      struct sk_buff *skb,
1627                                      unsigned int dataoff,
1628                                      enum ip_conntrack_info ctinfo,
1629                                      const struct nf_hook_state *state)
1630{
1631        switch (nf_ct_protonum(ct)) {
1632        case IPPROTO_TCP:
1633                return nf_conntrack_tcp_packet(ct, skb, dataoff,
1634                                               ctinfo, state);
1635        case IPPROTO_UDP:
1636                return nf_conntrack_udp_packet(ct, skb, dataoff,
1637                                               ctinfo, state);
1638        case IPPROTO_ICMP:
1639                return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1640#if IS_ENABLED(CONFIG_IPV6)
1641        case IPPROTO_ICMPV6:
1642                return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1643#endif
1644#ifdef CONFIG_NF_CT_PROTO_UDPLITE
1645        case IPPROTO_UDPLITE:
1646                return nf_conntrack_udplite_packet(ct, skb, dataoff,
1647                                                   ctinfo, state);
1648#endif
1649#ifdef CONFIG_NF_CT_PROTO_SCTP
1650        case IPPROTO_SCTP:
1651                return nf_conntrack_sctp_packet(ct, skb, dataoff,
1652                                                ctinfo, state);
1653#endif
1654#ifdef CONFIG_NF_CT_PROTO_DCCP
1655        case IPPROTO_DCCP:
1656                return nf_conntrack_dccp_packet(ct, skb, dataoff,
1657                                                ctinfo, state);
1658#endif
1659#ifdef CONFIG_NF_CT_PROTO_GRE
1660        case IPPROTO_GRE:
1661                return nf_conntrack_gre_packet(ct, skb, dataoff,
1662                                               ctinfo, state);
1663#endif
1664        }
1665
1666        return generic_packet(ct, skb, ctinfo);
1667}
1668
1669unsigned int
1670nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1671{
1672        enum ip_conntrack_info ctinfo;
1673        struct nf_conn *ct, *tmpl;
1674        u_int8_t protonum;
1675        int dataoff, ret;
1676
1677        tmpl = nf_ct_get(skb, &ctinfo);
1678        if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1679                /* Previously seen (loopback or untracked)?  Ignore. */
1680                if ((tmpl && !nf_ct_is_template(tmpl)) ||
1681                     ctinfo == IP_CT_UNTRACKED) {
1682                        NF_CT_STAT_INC_ATOMIC(state->net, ignore);
1683                        return NF_ACCEPT;
1684                }
1685                skb->_nfct = 0;
1686        }
1687
1688        /* rcu_read_lock()ed by nf_hook_thresh */
1689        dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1690        if (dataoff <= 0) {
1691                pr_debug("not prepared to track yet or error occurred\n");
1692                NF_CT_STAT_INC_ATOMIC(state->net, error);
1693                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1694                ret = NF_ACCEPT;
1695                goto out;
1696        }
1697
1698        if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1699                ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1700                                               protonum, state);
1701                if (ret <= 0) {
1702                        ret = -ret;
1703                        goto out;
1704                }
1705                /* ICMP[v6] protocol trackers may assign one conntrack. */
1706                if (skb->_nfct)
1707                        goto out;
1708        }
1709repeat:
1710        ret = resolve_normal_ct(tmpl, skb, dataoff,
1711                                protonum, state);
1712        if (ret < 0) {
1713                /* Too stressed to deal. */
1714                NF_CT_STAT_INC_ATOMIC(state->net, drop);
1715                ret = NF_DROP;
1716                goto out;
1717        }
1718
1719        ct = nf_ct_get(skb, &ctinfo);
1720        if (!ct) {
1721                /* Not valid part of a connection */
1722                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1723                ret = NF_ACCEPT;
1724                goto out;
1725        }
1726
1727        ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
1728        if (ret <= 0) {
1729                /* Invalid: the negated return code tells
1730                 * the netfilter core what to do */
1731                pr_debug("nf_conntrack_in: Can't track with proto module\n");
1732                nf_conntrack_put(&ct->ct_general);
1733                skb->_nfct = 0;
1734                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1735                if (ret == -NF_DROP)
1736                        NF_CT_STAT_INC_ATOMIC(state->net, drop);
1737                /* Special case: TCP tracker reports an attempt to reopen a
1738                 * closed/aborted connection. We have to go back and create a
1739                 * fresh conntrack.
1740                 */
1741                if (ret == -NF_REPEAT)
1742                        goto repeat;
1743                ret = -ret;
1744                goto out;
1745        }
1746
1747        if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
1748            !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1749                nf_conntrack_event_cache(IPCT_REPLY, ct);
1750out:
1751        if (tmpl)
1752                nf_ct_put(tmpl);
1753
1754        return ret;
1755}
1756EXPORT_SYMBOL_GPL(nf_conntrack_in);
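
    /* nf_conntrack_in() is not registered as a netfilter hook by itself; the
     * per-family PRE_ROUTING/LOCAL_OUT entry points registered outside this
     * file are thin wrappers of roughly this shape (sketch for illustration,
     * the name is hypothetical):
     *
     *	static unsigned int conntrack_in_hook(void *priv, struct sk_buff *skb,
     *					      const struct nf_hook_state *state)
     *	{
     *		return nf_conntrack_in(skb, state);
     *	}
     */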
1757
1758/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1759   implicitly racy: see __nf_conntrack_confirm */
1760void nf_conntrack_alter_reply(struct nf_conn *ct,
1761                              const struct nf_conntrack_tuple *newreply)
1762{
1763        struct nf_conn_help *help = nfct_help(ct);
1764
1765        /* Should be unconfirmed, so not in hash table yet */
1766        WARN_ON(nf_ct_is_confirmed(ct));
1767
1768        pr_debug("Altering reply tuple of %p to ", ct);
1769        nf_ct_dump_tuple(newreply);
1770
1771        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1772        if (ct->master || (help && !hlist_empty(&help->expectations)))
1773                return;
1774
1775        rcu_read_lock();
1776        __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1777        rcu_read_unlock();
1778}
1779EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1780
1781/* Refresh conntrack for this many jiffies and do accounting if do_acct is true */
1782void __nf_ct_refresh_acct(struct nf_conn *ct,
1783                          enum ip_conntrack_info ctinfo,
1784                          const struct sk_buff *skb,
1785                          u32 extra_jiffies,
1786                          bool do_acct)
1787{
1788        /* Only update if this is not a fixed timeout */
1789        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1790                goto acct;
1791
1792        /* Not yet confirmed: the timeout stays relative until confirm time */
1793        if (nf_ct_is_confirmed(ct))
1794                extra_jiffies += nfct_time_stamp;
1795
1796        if (ct->timeout != extra_jiffies)
1797                ct->timeout = extra_jiffies;
1798acct:
1799        if (do_acct)
1800                nf_ct_acct_update(ct, ctinfo, skb->len);
1801}
1802EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
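
    /* Protocol trackers normally reach this through the nf_ct_refresh_acct()
     * and nf_ct_refresh() wrappers rather than calling it directly;
     * generic_packet() above is the simplest example of that pattern.
     */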
1803
1804bool nf_ct_kill_acct(struct nf_conn *ct,
1805                     enum ip_conntrack_info ctinfo,
1806                     const struct sk_buff *skb)
1807{
1808        nf_ct_acct_update(ct, ctinfo, skb->len);
1809
1810        return nf_ct_delete(ct, 0, 0);
1811}
1812EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
1813
1814#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1815
1816#include <linux/netfilter/nfnetlink.h>
1817#include <linux/netfilter/nfnetlink_conntrack.h>
1818#include <linux/mutex.h>
1819
1820/* Generic function for tcp/udp/sctp/dccp and the like. */
1821int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1822                               const struct nf_conntrack_tuple *tuple)
1823{
1824        if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1825            nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1826                goto nla_put_failure;
1827        return 0;
1828
1829nla_put_failure:
1830        return -1;
1831}
1832EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1833
1834const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1835        [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
1836        [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
1837};
1838EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1839
1840int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1841                               struct nf_conntrack_tuple *t)
1842{
1843        if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1844                return -EINVAL;
1845
1846        t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1847        t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1848
1849        return 0;
1850}
1851EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1852
1853unsigned int nf_ct_port_nlattr_tuple_size(void)
1854{
1855        static unsigned int size __read_mostly;
1856
1857        if (!size)
1858                size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1859
1860        return size;
1861}
1862EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
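
    /* A port-based tracker wires these helpers into its struct
     * nf_conntrack_l4proto roughly as follows (sketch; the UDP tracker, for
     * instance, uses this set of callbacks):
     *
     *	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
     *	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
     *	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
     *	.nla_policy		= nf_ct_port_nla_policy,
     */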
1863#endif
1864
1865/* Used by ipt_REJECT and ip6t_REJECT. */
1866static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1867{
1868        struct nf_conn *ct;
1869        enum ip_conntrack_info ctinfo;
1870
1871        /* This ICMP is in reverse direction to the packet which caused it */
1872        ct = nf_ct_get(skb, &ctinfo);
1873        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1874                ctinfo = IP_CT_RELATED_REPLY;
1875        else
1876                ctinfo = IP_CT_RELATED;
1877
1878        /* Attach to new skbuff, and increment count */
1879        nf_ct_set(nskb, ct, ctinfo);
1880        nf_conntrack_get(skb_nfct(nskb));
1881}
1882
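    /* Re-resolve the conntrack attached to @skb: build a lookup tuple from the
     * packet, substituting the original-direction addresses/ports when the
     * attached (unconfirmed) entry already carries SRC/DST NAT status, and if
     * a confirmed entry for that tuple exists, switch the skb over to it and
     * replay the NAT transformations.  Exposed via the .update callback of
     * nf_conntrack_hook near the end of this file.
     */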
1883static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
1884{
1885        struct nf_conntrack_tuple_hash *h;
1886        struct nf_conntrack_tuple tuple;
1887        enum ip_conntrack_info ctinfo;
1888        struct nf_nat_hook *nat_hook;
1889        unsigned int status;
1890        struct nf_conn *ct;
1891        int dataoff;
1892        u16 l3num;
1893        u8 l4num;
1894
1895        ct = nf_ct_get(skb, &ctinfo);
1896        if (!ct || nf_ct_is_confirmed(ct))
1897                return 0;
1898
1899        l3num = nf_ct_l3num(ct);
1900
1901        dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
1902        if (dataoff <= 0)
1903                return -1;
1904
1905        if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
1906                             l4num, net, &tuple))
1907                return -1;
1908
1909        if (ct->status & IPS_SRC_NAT) {
1910                memcpy(tuple.src.u3.all,
1911                       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
1912                       sizeof(tuple.src.u3.all));
1913                tuple.src.u.all =
1914                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
1915        }
1916
1917        if (ct->status & IPS_DST_NAT) {
1918                memcpy(tuple.dst.u3.all,
1919                       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
1920                       sizeof(tuple.dst.u3.all));
1921                tuple.dst.u.all =
1922                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
1923        }
1924
1925        h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
1926        if (!h)
1927                return 0;
1928
1929        /* Store the status bits of the clashing conntrack so that NAT mangling
1930         * can be redone according to what has already been done to this packet.
1931         */
1932        status = ct->status;
1933
1934        nf_ct_put(ct);
1935        ct = nf_ct_tuplehash_to_ctrack(h);
1936        nf_ct_set(skb, ct, ctinfo);
1937
1938        nat_hook = rcu_dereference(nf_nat_hook);
1939        if (!nat_hook)
1940                return 0;
1941
1942        if (status & IPS_SRC_NAT &&
1943            nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
1944                                IP_CT_DIR_ORIGINAL) == NF_DROP)
1945                return -1;
1946
1947        if (status & IPS_DST_NAT &&
1948            nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
1949                                IP_CT_DIR_ORIGINAL) == NF_DROP)
1950                return -1;
1951
1952        return 0;
1953}
1954
1955static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
1956                                       const struct sk_buff *skb)
1957{
1958        const struct nf_conntrack_tuple *src_tuple;
1959        const struct nf_conntrack_tuple_hash *hash;
1960        struct nf_conntrack_tuple srctuple;
1961        enum ip_conntrack_info ctinfo;
1962        struct nf_conn *ct;
1963
1964        ct = nf_ct_get(skb, &ctinfo);
1965        if (ct) {
1966                src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
1967                memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
1968                return true;
1969        }
1970
1971        if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
1972                               NFPROTO_IPV4, dev_net(skb->dev),
1973                               &srctuple))
1974                return false;
1975
1976        hash = nf_conntrack_find_get(dev_net(skb->dev),
1977                                     &nf_ct_zone_dflt,
1978                                     &srctuple);
1979        if (!hash)
1980                return false;
1981
1982        ct = nf_ct_tuplehash_to_ctrack(hash);
1983        src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
1984        memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
1985        nf_ct_put(ct);
1986
1987        return true;
1988}
1989
1990/* Bring out ya dead! */
1991static struct nf_conn *
1992get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1993                void *data, unsigned int *bucket)
1994{
1995        struct nf_conntrack_tuple_hash *h;
1996        struct nf_conn *ct;
1997        struct hlist_nulls_node *n;
1998        spinlock_t *lockp;
1999
2000        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2001                lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2002                local_bh_disable();
2003                nf_conntrack_lock(lockp);
2004                if (*bucket < nf_conntrack_htable_size) {
2005                        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
2006                                if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
2007                                        continue;
2008                                ct = nf_ct_tuplehash_to_ctrack(h);
2009                                if (iter(ct, data))
2010                                        goto found;
2011                        }
2012                }
2013                spin_unlock(lockp);
2014                local_bh_enable();
2015                cond_resched();
2016        }
2017
2018        return NULL;
2019found:
2020        atomic_inc(&ct->ct_general.use);
2021        spin_unlock(lockp);
2022        local_bh_enable();
2023        return ct;
2024}
2025
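    /* Walk the whole hash table and hand every ORIGINAL-direction entry
     * accepted by @iter to nf_ct_delete().  If a resize bumps
     * nf_conntrack_generation while we are walking, restart from bucket 0 so
     * that entries moved to the new table are not missed.
     */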
2026static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2027                                  void *data, u32 portid, int report)
2028{
2029        unsigned int bucket = 0, sequence;
2030        struct nf_conn *ct;
2031
2032        might_sleep();
2033
2034        for (;;) {
2035                sequence = read_seqcount_begin(&nf_conntrack_generation);
2036
2037                while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
2038                        /* Time to push up daisies... */
2039
2040                        nf_ct_delete(ct, portid, report);
2041                        nf_ct_put(ct);
2042                        cond_resched();
2043                }
2044
2045                if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
2046                        break;
2047                bucket = 0;
2048        }
2049}
2050
2051struct iter_data {
2052        int (*iter)(struct nf_conn *i, void *data);
2053        void *data;
2054        struct net *net;
2055};
2056
2057static int iter_net_only(struct nf_conn *i, void *data)
2058{
2059        struct iter_data *d = data;
2060
2061        if (!net_eq(d->net, nf_ct_net(i)))
2062                return 0;
2063
2064        return d->iter(i, d->data);
2065}
2066
2067static void
2068__nf_ct_unconfirmed_destroy(struct net *net)
2069{
2070        int cpu;
2071
2072        for_each_possible_cpu(cpu) {
2073                struct nf_conntrack_tuple_hash *h;
2074                struct hlist_nulls_node *n;
2075                struct ct_pcpu *pcpu;
2076
2077                pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2078
2079                spin_lock_bh(&pcpu->lock);
2080                hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
2081                        struct nf_conn *ct;
2082
2083                        ct = nf_ct_tuplehash_to_ctrack(h);
2084
2085                        /* we cannot call iter() on the unconfirmed list; the
2086                         * owning cpu can reallocate ct->ext at any time.
2087                         */
2088                        set_bit(IPS_DYING_BIT, &ct->status);
2089                }
2090                spin_unlock_bh(&pcpu->lock);
2091                cond_resched();
2092        }
2093}
2094
2095void nf_ct_unconfirmed_destroy(struct net *net)
2096{
2097        might_sleep();
2098
2099        if (atomic_read(&net->ct.count) > 0) {
2100                __nf_ct_unconfirmed_destroy(net);
2101                nf_queue_nf_hook_drop(net);
2102                synchronize_net();
2103        }
2104}
2105EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
2106
2107void nf_ct_iterate_cleanup_net(struct net *net,
2108                               int (*iter)(struct nf_conn *i, void *data),
2109                               void *data, u32 portid, int report)
2110{
2111        struct iter_data d;
2112
2113        might_sleep();
2114
2115        if (atomic_read(&net->ct.count) == 0)
2116                return;
2117
2118        d.iter = iter;
2119        d.data = data;
2120        d.net = net;
2121
2122        nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
2123}
2124EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
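
    /* A minimal caller sketch (hypothetical helper, not taken from this file):
     * flush every entry in a netns that carries a given mark (requires
     * CONFIG_NF_CONNTRACK_MARK):
     *
     *	static int kill_by_mark(struct nf_conn *ct, void *data)
     *	{
     *		u32 mark = *(u32 *)data;
     *
     *		return ct->mark == mark;	(nonzero means: delete it)
     *	}
     *
     *	nf_ct_iterate_cleanup_net(net, kill_by_mark, &mark, 0, 0);
     */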
2125
2126/**
2127 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2128 * @iter: callback to invoke for each conntrack
2129 * @data: data to pass to @iter
2130 *
2131 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2132 * unconfirmed list as dying (so they will not be inserted into
2133 * main table).
2134 *
2135 * Can only be called in module exit path.
2136 */
2137void
2138nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2139{
2140        struct net *net;
2141
2142        down_read(&net_rwsem);
2143        for_each_net(net) {
2144                if (atomic_read(&net->ct.count) == 0)
2145                        continue;
2146                __nf_ct_unconfirmed_destroy(net);
2147                nf_queue_nf_hook_drop(net);
2148        }
2149        up_read(&net_rwsem);
2150
2151        /* Need to wait for the netns cleanup worker to finish, if it's
2152         * running -- it might have deleted a net namespace from
2153         * the global list, so our __nf_ct_unconfirmed_destroy() might
2154         * not have affected all namespaces.
2155         */
2156        net_ns_barrier();
2157
2158        /* A conntrack could have been unlinked from the unconfirmed list
2159         * before we grabbed the pcpu lock in __nf_ct_unconfirmed_destroy().
2160         * This makes sure it is inserted into the conntrack table.
2161         */
2162        synchronize_net();
2163
2164        nf_ct_iterate_cleanup(iter, data, 0, 0);
2165}
2166EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2167
2168static int kill_all(struct nf_conn *i, void *data)
2169{
2170        return net_eq(nf_ct_net(i), data);
2171}
2172
2173void nf_conntrack_cleanup_start(void)
2174{
2175        conntrack_gc_work.exiting = true;
2176        RCU_INIT_POINTER(ip_ct_attach, NULL);
2177}
2178
2179void nf_conntrack_cleanup_end(void)
2180{
2181        RCU_INIT_POINTER(nf_ct_hook, NULL);
2182        cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2183        kvfree(nf_conntrack_hash);
2184
2185        nf_conntrack_proto_fini();
2186        nf_conntrack_seqadj_fini();
2187        nf_conntrack_labels_fini();
2188        nf_conntrack_helper_fini();
2189        nf_conntrack_timeout_fini();
2190        nf_conntrack_ecache_fini();
2191        nf_conntrack_tstamp_fini();
2192        nf_conntrack_acct_fini();
2193        nf_conntrack_expect_fini();
2194
2195        kmem_cache_destroy(nf_conntrack_cachep);
2196}
2197
2198/*
2199 * Mishearing the voices in his head, our hero wonders how he's
2200 * supposed to kill the mall.
2201 */
2202void nf_conntrack_cleanup_net(struct net *net)
2203{
2204        LIST_HEAD(single);
2205
2206        list_add(&net->exit_list, &single);
2207        nf_conntrack_cleanup_net_list(&single);
2208}
2209
2210void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2211{
2212        int busy;
2213        struct net *net;
2214
2215        /*
2216         * This makes sure all current packets have passed through
2217         *  netfilter framework.  Roll on, two-stage module
2218         *  delete...
2219         */
2220        synchronize_net();
2221i_see_dead_people:
2222        busy = 0;
2223        list_for_each_entry(net, net_exit_list, exit_list) {
2224                nf_ct_iterate_cleanup(kill_all, net, 0, 0);
2225                if (atomic_read(&net->ct.count) != 0)
2226                        busy = 1;
2227        }
2228        if (busy) {
2229                schedule();
2230                goto i_see_dead_people;
2231        }
2232
2233        list_for_each_entry(net, net_exit_list, exit_list) {
2234                nf_conntrack_proto_pernet_fini(net);
2235                nf_conntrack_ecache_pernet_fini(net);
2236                nf_conntrack_expect_pernet_fini(net);
2237                free_percpu(net->ct.stat);
2238                free_percpu(net->ct.pcpu_lists);
2239        }
2240}
2241
2242void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
2243{
2244        struct hlist_nulls_head *hash;
2245        unsigned int nr_slots, i;
2246
2247        if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
2248                return NULL;
2249
2250        BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
2251        nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
2252
2253        hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
2254                              GFP_KERNEL | __GFP_ZERO);
2255
2256        if (hash && nulls)
2257                for (i = 0; i < nr_slots; i++)
2258                        INIT_HLIST_NULLS_HEAD(&hash[i], i);
2259
2260        return hash;
2261}
2262EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
2263
2264int nf_conntrack_hash_resize(unsigned int hashsize)
2265{
2266        int i, bucket;
2267        unsigned int old_size;
2268        struct hlist_nulls_head *hash, *old_hash;
2269        struct nf_conntrack_tuple_hash *h;
2270        struct nf_conn *ct;
2271
2272        if (!hashsize)
2273                return -EINVAL;
2274
2275        hash = nf_ct_alloc_hashtable(&hashsize, 1);
2276        if (!hash)
2277                return -ENOMEM;
2278
2279        old_size = nf_conntrack_htable_size;
2280        if (old_size == hashsize) {
2281                kvfree(hash);
2282                return 0;
2283        }
2284
2285        local_bh_disable();
2286        nf_conntrack_all_lock();
2287        write_seqcount_begin(&nf_conntrack_generation);
2288
2289        /* Lookups in the old hash might happen in parallel, which means we
2290         * might get false negatives during connection lookup. New connections
2291         * created because of a false negative won't make it into the hash
2292         * though since that requires taking the locks.
2293         */
2294
2295        for (i = 0; i < nf_conntrack_htable_size; i++) {
2296                while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
2297                        h = hlist_nulls_entry(nf_conntrack_hash[i].first,
2298                                              struct nf_conntrack_tuple_hash, hnnode);
2299                        ct = nf_ct_tuplehash_to_ctrack(h);
2300                        hlist_nulls_del_rcu(&h->hnnode);
2301                        bucket = __hash_conntrack(nf_ct_net(ct),
2302                                                  &h->tuple, hashsize);
2303                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
2304                }
2305        }
2306        old_size = nf_conntrack_htable_size;
2307        old_hash = nf_conntrack_hash;
2308
2309        nf_conntrack_hash = hash;
2310        nf_conntrack_htable_size = hashsize;
2311
2312        write_seqcount_end(&nf_conntrack_generation);
2313        nf_conntrack_all_unlock();
2314        local_bh_enable();
2315
2316        synchronize_net();
2317        kvfree(old_hash);
2318        return 0;
2319}
2320
2321int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2322{
2323        unsigned int hashsize;
2324        int rc;
2325
2326        if (current->nsproxy->net_ns != &init_net)
2327                return -EOPNOTSUPP;
2328
2329        /* On boot, we can set this without any fancy locking. */
2330        if (!nf_conntrack_hash)
2331                return param_set_uint(val, kp);
2332
2333        rc = kstrtouint(val, 0, &hashsize);
2334        if (rc)
2335                return rc;
2336
2337        return nf_conntrack_hash_resize(hashsize);
2338}
2339EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
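
    /* This is the handler behind the nf_conntrack "hashsize" module parameter,
     * so the table can be resized at runtime from the init netns, e.g.:
     *
     *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize
     *
     * which ends up in nf_conntrack_hash_resize() above.
     */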
2340
2341static __always_inline unsigned int total_extension_size(void)
2342{
2343        /* remember to add new extensions below */
2344        BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
2345
2346        return sizeof(struct nf_ct_ext) +
2347               sizeof(struct nf_conn_help)
2348#if IS_ENABLED(CONFIG_NF_NAT)
2349                + sizeof(struct nf_conn_nat)
2350#endif
2351                + sizeof(struct nf_conn_seqadj)
2352                + sizeof(struct nf_conn_acct)
2353#ifdef CONFIG_NF_CONNTRACK_EVENTS
2354                + sizeof(struct nf_conntrack_ecache)
2355#endif
2356#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
2357                + sizeof(struct nf_conn_tstamp)
2358#endif
2359#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
2360                + sizeof(struct nf_conn_timeout)
2361#endif
2362#ifdef CONFIG_NF_CONNTRACK_LABELS
2363                + sizeof(struct nf_conn_labels)
2364#endif
2365#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
2366                + sizeof(struct nf_conn_synproxy)
2367#endif
2368        ;
2369}
2370
2371int nf_conntrack_init_start(void)
2372{
2373        unsigned long nr_pages = totalram_pages();
2374        int max_factor = 8;
2375        int ret = -ENOMEM;
2376        int i;
2377
2378        /* struct nf_ct_ext uses u8 to store offsets/size */
2379        BUILD_BUG_ON(total_extension_size() > 255u);
2380
2381        seqcount_init(&nf_conntrack_generation);
2382
2383        for (i = 0; i < CONNTRACK_LOCKS; i++)
2384                spin_lock_init(&nf_conntrack_locks[i]);
2385
2386        if (!nf_conntrack_htable_size) {
2387                /* Idea from tcp.c: use 1/16384 of memory.
2388                 * On i386: 32MB machine has 512 buckets.
2389                 * >= 1GB machines have 16384 buckets.
2390                 * >= 4GB machines have 65536 buckets.
2391                 */
2392                nf_conntrack_htable_size
2393                        = (((nr_pages << PAGE_SHIFT) / 16384)
2394                           / sizeof(struct hlist_head));
2395                if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2396                        nf_conntrack_htable_size = 65536;
2397                else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2398                        nf_conntrack_htable_size = 16384;
2399                if (nf_conntrack_htable_size < 32)
2400                        nf_conntrack_htable_size = 32;
2401
2402                /* Use a max. factor of four by default to get the same max as
2403                 * with the old struct list_heads. When a table size is given
2404                 * we use the old value of 8 to avoid reducing the max.
2405                 * entries. */
2406                max_factor = 4;
2407        }
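
            /* Worked example of the heuristic above: with 4 KiB pages,
             * 8-byte hlist heads (64-bit) and 256 MiB of RAM, 256 MiB / 16384
             * gives 16 KiB of bucket space, i.e. 2048 buckets; machines above
             * 1 GiB and 4 GiB are clamped to 16384 and 65536 buckets, and
             * nf_conntrack_max below becomes max_factor times the bucket count.
             */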
2408
2409        nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2410        if (!nf_conntrack_hash)
2411                return -ENOMEM;
2412
2413        nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2414
2415        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2416                                                sizeof(struct nf_conn),
2417                                                NFCT_INFOMASK + 1,
2418                                                SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2419        if (!nf_conntrack_cachep)
2420                goto err_cachep;
2421
2422        ret = nf_conntrack_expect_init();
2423        if (ret < 0)
2424                goto err_expect;
2425
2426        ret = nf_conntrack_acct_init();
2427        if (ret < 0)
2428                goto err_acct;
2429
2430        ret = nf_conntrack_tstamp_init();
2431        if (ret < 0)
2432                goto err_tstamp;
2433
2434        ret = nf_conntrack_ecache_init();
2435        if (ret < 0)
2436                goto err_ecache;
2437
2438        ret = nf_conntrack_timeout_init();
2439        if (ret < 0)
2440                goto err_timeout;
2441
2442        ret = nf_conntrack_helper_init();
2443        if (ret < 0)
2444                goto err_helper;
2445
2446        ret = nf_conntrack_labels_init();
2447        if (ret < 0)
2448                goto err_labels;
2449
2450        ret = nf_conntrack_seqadj_init();
2451        if (ret < 0)
2452                goto err_seqadj;
2453
2454        ret = nf_conntrack_proto_init();
2455        if (ret < 0)
2456                goto err_proto;
2457
2458        conntrack_gc_work_init(&conntrack_gc_work);
2459        queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2460
2461        return 0;
2462
2463err_proto:
2464        nf_conntrack_seqadj_fini();
2465err_seqadj:
2466        nf_conntrack_labels_fini();
2467err_labels:
2468        nf_conntrack_helper_fini();
2469err_helper:
2470        nf_conntrack_timeout_fini();
2471err_timeout:
2472        nf_conntrack_ecache_fini();
2473err_ecache:
2474        nf_conntrack_tstamp_fini();
2475err_tstamp:
2476        nf_conntrack_acct_fini();
2477err_acct:
2478        nf_conntrack_expect_fini();
2479err_expect:
2480        kmem_cache_destroy(nf_conntrack_cachep);
2481err_cachep:
2482        kvfree(nf_conntrack_hash);
2483        return ret;
2484}
2485
2486static struct nf_ct_hook nf_conntrack_hook = {
2487        .update         = nf_conntrack_update,
2488        .destroy        = destroy_conntrack,
2489        .get_tuple_skb  = nf_conntrack_get_tuple_skb,
2490};
2491
2492void nf_conntrack_init_end(void)
2493{
2494        /* For use by REJECT target */
2495        RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
2496        RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2497}
2498
2499/*
2500 * We need to use special "null" values, not used in hash table
2501 */
2502#define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
2503#define DYING_NULLS_VAL         ((1<<30)+1)
2504#define TEMPLATE_NULLS_VAL      ((1<<30)+2)
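
    /* These constants become the "nulls" marker that terminates each
     * hlist_nulls chain (see the INIT_HLIST_NULLS_HEAD() calls below and the
     * per-bucket values set up in nf_ct_alloc_hashtable()).  An RCU lookup
     * that walks off the end of a chain compares the marker with the bucket
     * it started from and restarts if the entry it followed was moved to
     * another chain; see the rculist_nulls document under Documentation/RCU/.
     */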
2505
2506int nf_conntrack_init_net(struct net *net)
2507{
2508        int ret = -ENOMEM;
2509        int cpu;
2510
2511        BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2512        BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2513        atomic_set(&net->ct.count, 0);
2514
2515        net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
2516        if (!net->ct.pcpu_lists)
2517                goto err_stat;
2518
2519        for_each_possible_cpu(cpu) {
2520                struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2521
2522                spin_lock_init(&pcpu->lock);
2523                INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
2524                INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
2525        }
2526
2527        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2528        if (!net->ct.stat)
2529                goto err_pcpu_lists;
2530
2531        ret = nf_conntrack_expect_pernet_init(net);
2532        if (ret < 0)
2533                goto err_expect;
2534
2535        nf_conntrack_acct_pernet_init(net);
2536        nf_conntrack_tstamp_pernet_init(net);
2537        nf_conntrack_ecache_pernet_init(net);
2538        nf_conntrack_helper_pernet_init(net);
2539        nf_conntrack_proto_pernet_init(net);
2540
2541        return 0;
2542
2543err_expect:
2544        free_percpu(net->ct.stat);
2545err_pcpu_lists:
2546        free_percpu(net->ct.pcpu_lists);
2547err_stat:
2548        return ret;
2549}
2550