linux/net/ipv4/netfilter/nf_nat_core.c
   1/* NAT for netfilter; shared with compatibility layer. */
   2
   3/* (C) 1999-2001 Paul `Rusty' Russell
   4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 as
   8 * published by the Free Software Foundation.
   9 */
  10
  11#include <linux/module.h>
  12#include <linux/types.h>
  13#include <linux/timer.h>
  14#include <linux/skbuff.h>
  15#include <net/checksum.h>
  16#include <net/icmp.h>
  17#include <net/ip.h>
  18#include <net/tcp.h>  /* For tcp_prot in getorigdst */
  19#include <linux/icmp.h>
  20#include <linux/udp.h>
  21#include <linux/jhash.h>
  22
  23#include <linux/netfilter_ipv4.h>
  24#include <net/netfilter/nf_conntrack.h>
  25#include <net/netfilter/nf_conntrack_core.h>
  26#include <net/netfilter/nf_nat.h>
  27#include <net/netfilter/nf_nat_protocol.h>
  28#include <net/netfilter/nf_nat_core.h>
  29#include <net/netfilter/nf_nat_helper.h>
  30#include <net/netfilter/nf_conntrack_helper.h>
  31#include <net/netfilter/nf_conntrack_l3proto.h>
  32#include <net/netfilter/nf_conntrack_l4proto.h>
  33
  34static DEFINE_SPINLOCK(nf_nat_lock);
  35
  36static struct nf_conntrack_l3proto *l3proto __read_mostly;
  37
  38/* Calculated at init based on memory size */
  39static unsigned int nf_nat_htable_size __read_mostly;
  40
  41#define MAX_IP_NAT_PROTO 256
  42static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
  43                                                __read_mostly;
  44
  45static inline const struct nf_nat_protocol *
  46__nf_nat_proto_find(u_int8_t protonum)
  47{
  48        return rcu_dereference(nf_nat_protos[protonum]);
  49}
  50
  51const struct nf_nat_protocol *
  52nf_nat_proto_find_get(u_int8_t protonum)
  53{
  54        const struct nf_nat_protocol *p;
  55
  56        rcu_read_lock();
  57        p = __nf_nat_proto_find(protonum);
  58        if (!try_module_get(p->me))
  59                p = &nf_nat_unknown_protocol;
  60        rcu_read_unlock();
  61
  62        return p;
  63}
  64EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
  65
  66void
  67nf_nat_proto_put(const struct nf_nat_protocol *p)
  68{
  69        module_put(p->me);
  70}
  71EXPORT_SYMBOL_GPL(nf_nat_proto_put);
  72
  73/* We keep an extra hash for each conntrack, for fast searching. */
  74static inline unsigned int
  75hash_by_src(const struct nf_conntrack_tuple *tuple)
  76{
  77        unsigned int hash;
  78
  79        /* Original src, to ensure we map it consistently if possible. */
  80        hash = jhash_3words((__force u32)tuple->src.u3.ip,
  81                            (__force u32)tuple->src.u.all,
  82                            tuple->dst.protonum, 0);
  83        return ((u64)hash * nf_nat_htable_size) >> 32;
  84}
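/* Annotation (not part of the original file): the final return above maps the
 * 32-bit jhash result onto [0, nf_nat_htable_size) with a multiply-and-shift
 * instead of a modulo.  For example, hash = 0xC0000000 (3/4 of the 32-bit
 * range) and nf_nat_htable_size = 16384 give ((u64)hash * 16384) >> 32 ==
 * 12288, i.e. the bucket 3/4 of the way through the table.  A stand-alone
 * sketch of the same arithmetic:
 *
 *	static unsigned int scale_to_buckets(unsigned int hash32,
 *					     unsigned int nbuckets)
 *	{
 *		// high 32 bits of the 64-bit product pick the bucket
 *		return ((unsigned long long)hash32 * nbuckets) >> 32;
 *	}
 */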
  85
  86/* Is this tuple already taken? (not by us) */
  87int
  88nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
  89                  const struct nf_conn *ignored_conntrack)
  90{
  91        /* Conntrack doesn't keep track of outgoing tuples; only
  92           incoming ones.  NAT means they don't have a fixed mapping,
  93           so we invert the tuple and look for the incoming reply.
  94
  95           We could keep a separate hash if this proves too slow. */
  96        struct nf_conntrack_tuple reply;
  97
  98        nf_ct_invert_tuplepr(&reply, tuple);
  99        return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
 100}
 101EXPORT_SYMBOL(nf_nat_used_tuple);
 102
 103/* If we source-map this tuple so the reply looks like reply_tuple, will
 104 * that meet the constraints of range? */
 105static int
 106in_range(const struct nf_conntrack_tuple *tuple,
 107         const struct nf_nat_range *range)
 108{
 109        const struct nf_nat_protocol *proto;
 110        int ret = 0;
 111
 112        /* If we are supposed to map IPs, then we must be in the
 113           range specified, otherwise let this drag us onto a new src IP. */
 114        if (range->flags & IP_NAT_RANGE_MAP_IPS) {
 115                if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
 116                    ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
 117                        return 0;
 118        }
 119
 120        rcu_read_lock();
 121        proto = __nf_nat_proto_find(tuple->dst.protonum);
 122        if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
 123            proto->in_range(tuple, IP_NAT_MANIP_SRC,
 124                            &range->min, &range->max))
 125                ret = 1;
 126        rcu_read_unlock();
 127
 128        return ret;
 129}
 130
 131static inline int
 132same_src(const struct nf_conn *ct,
 133         const struct nf_conntrack_tuple *tuple)
 134{
 135        const struct nf_conntrack_tuple *t;
 136
 137        t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 138        return (t->dst.protonum == tuple->dst.protonum &&
 139                t->src.u3.ip == tuple->src.u3.ip &&
 140                t->src.u.all == tuple->src.u.all);
 141}
 142
 143/* Only called for SRC manip */
 144static int
 145find_appropriate_src(struct net *net,
 146                     const struct nf_conntrack_tuple *tuple,
 147                     struct nf_conntrack_tuple *result,
 148                     const struct nf_nat_range *range)
 149{
 150        unsigned int h = hash_by_src(tuple);
 151        const struct nf_conn_nat *nat;
 152        const struct nf_conn *ct;
 153        const struct hlist_node *n;
 154
 155        rcu_read_lock();
 156        hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
 157                ct = nat->ct;
 158                if (same_src(ct, tuple)) {
 159                        /* Copy source part from reply tuple. */
 160                        nf_ct_invert_tuplepr(result,
 161                                       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 162                        result->dst = tuple->dst;
 163
 164                        if (in_range(result, range)) {
 165                                rcu_read_unlock();
 166                                return 1;
 167                        }
 168                }
 169        }
 170        rcu_read_unlock();
 171        return 0;
 172}
 173
 174/* For [FUTURE] fragmentation handling, we want the least-used
 175   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 176   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 177   1-65535, we don't do pro-rata allocation based on ports; we choose
 178   the ip with the lowest src-ip/dst-ip/proto usage.
 179*/
 180static void
 181find_best_ips_proto(struct nf_conntrack_tuple *tuple,
 182                    const struct nf_nat_range *range,
 183                    const struct nf_conn *ct,
 184                    enum nf_nat_manip_type maniptype)
 185{
 186        __be32 *var_ipp;
 187        /* Host order */
 188        u_int32_t minip, maxip, j;
 189
 190        /* No IP mapping?  Do nothing. */
 191        if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
 192                return;
 193
 194        if (maniptype == IP_NAT_MANIP_SRC)
 195                var_ipp = &tuple->src.u3.ip;
 196        else
 197                var_ipp = &tuple->dst.u3.ip;
 198
 199        /* Fast path: only one choice. */
 200        if (range->min_ip == range->max_ip) {
 201                *var_ipp = range->min_ip;
 202                return;
 203        }
 204
 205        /* Hashing source and destination IPs gives a fairly even
 206         * spread in practice (if there are a small number of IPs
 207         * involved, there usually aren't that many connections
 208         * anyway).  The consistency means that servers see the same
 209         * client coming from the same IP (some Internet Banking sites
 210         * like this), even across reboots. */
 211        minip = ntohl(range->min_ip);
 212        maxip = ntohl(range->max_ip);
 213        j = jhash_2words((__force u32)tuple->src.u3.ip,
 214                         range->flags & IP_NAT_RANGE_PERSISTENT ?
 215                                0 : (__force u32)tuple->dst.u3.ip, 0);
 216        j = ((u64)j * (maxip - minip + 1)) >> 32;
 217        *var_ipp = htonl(minip + j);
 218}
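/* Annotation (not part of the original file): with a range of, say,
 * 10.0.0.1 - 10.0.0.4 (maxip - minip + 1 == 4 addresses), the code above
 * computes j = jhash_2words(src, dst_or_0, 0) and then scales it with
 * ((u64)j * 4) >> 32, so a given client always lands on the same external
 * address.  With IP_NAT_RANGE_PERSISTENT the destination is replaced by 0
 * in the hash, so the choice no longer depends on whom the client talks to.
 * A sketch of the same selection (hash32() is a stand-in for jhash_2words()):
 *
 *	u32 pick_ip(u32 minip, u32 maxip, u32 src, u32 dst, int persistent)
 *	{
 *		u32 j = hash32(src, persistent ? 0 : dst);
 *
 *		return minip + (((u64)j * (maxip - minip + 1)) >> 32);
 *	}
 */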
 219
 220/* Manipulate the tuple into the range given.  For NF_INET_POST_ROUTING,
 221 * we change the source to map into the range.  For NF_INET_PRE_ROUTING
 222 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 223 * range.  It might not be possible to get a unique tuple, but we try.
 224 * At worst (or if we race), we will end up with a final duplicate in
 225 * __ip_conntrack_confirm and drop the packet. */
 226static void
 227get_unique_tuple(struct nf_conntrack_tuple *tuple,
 228                 const struct nf_conntrack_tuple *orig_tuple,
 229                 const struct nf_nat_range *range,
 230                 struct nf_conn *ct,
 231                 enum nf_nat_manip_type maniptype)
 232{
 233        struct net *net = nf_ct_net(ct);
 234        const struct nf_nat_protocol *proto;
 235
 236        /* 1) If this srcip/proto/src-proto-part is currently mapped,
 237           and that same mapping gives a unique tuple within the given
 238           range, use that.
 239
 240           This is only required for source (i.e. NAT/masq) mappings.
 241           So far, we don't do local source mappings, so multiple
 242           manips are not an issue.  */
 243        if (maniptype == IP_NAT_MANIP_SRC &&
 244            !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
 245                if (find_appropriate_src(net, orig_tuple, tuple, range)) {
 246                        pr_debug("get_unique_tuple: Found current src map\n");
 247                        if (!nf_nat_used_tuple(tuple, ct))
 248                                return;
 249                }
 250        }
 251
 252        /* 2) Select the least-used IP/proto combination in the given
 253           range. */
 254        *tuple = *orig_tuple;
 255        find_best_ips_proto(tuple, range, ct, maniptype);
 256
 257        /* 3) The per-protocol part of the manip is made to map into
 258           the range to make a unique tuple. */
 259
 260        rcu_read_lock();
 261        proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 262
 263        /* Change protocol info to have some randomization */
 264        if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
 265                proto->unique_tuple(tuple, range, maniptype, ct);
 266                goto out;
 267        }
 268
 269        /* Only bother mapping if it's not already in range and unique */
 270        if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
 271             proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
 272            !nf_nat_used_tuple(tuple, ct))
 273                goto out;
 274
 275        /* Last chance: get the protocol to try to obtain a unique tuple. */
 276        proto->unique_tuple(tuple, range, maniptype, ct);
 277out:
 278        rcu_read_unlock();
 279}
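/* Annotation (not part of the original file): concretely, for an SNAT
 * mapping to 1.2.3.4 with ports 1000-2000, step 1 reuses an existing
 * binding of the same internal source if that still yields an unused tuple,
 * step 2 sets the source address to 1.2.3.4, and step 3 asks the
 * per-protocol module (TCP/UDP) for a free port within 1000-2000 if the
 * current one is out of range or already taken. */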
 280
 281unsigned int
 282nf_nat_setup_info(struct nf_conn *ct,
 283                  const struct nf_nat_range *range,
 284                  enum nf_nat_manip_type maniptype)
 285{
 286        struct net *net = nf_ct_net(ct);
 287        struct nf_conntrack_tuple curr_tuple, new_tuple;
 288        struct nf_conn_nat *nat;
 289        int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
 290
 291        /* A NAT helper or ctnetlink may also set up the binding */
 292        nat = nfct_nat(ct);
 293        if (!nat) {
 294                nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
 295                if (nat == NULL) {
 296                        pr_debug("failed to add NAT extension\n");
 297                        return NF_ACCEPT;
 298                }
 299        }
 300
 301        NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
 302                     maniptype == IP_NAT_MANIP_DST);
 303        BUG_ON(nf_nat_initialized(ct, maniptype));
 304
 305        /* What we've got will look like inverse of reply. Normally
 306           this is what is in the conntrack, except for prior
 307           manipulations (future optimization: if num_manips == 0,
 308           orig_tp =
 309           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
 310        nf_ct_invert_tuplepr(&curr_tuple,
 311                             &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 312
 313        get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
 314
 315        if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
 316                struct nf_conntrack_tuple reply;
 317
 318                /* Alter conntrack table so will recognize replies. */
 319                nf_ct_invert_tuplepr(&reply, &new_tuple);
 320                nf_conntrack_alter_reply(ct, &reply);
 321
 322                /* Non-atomic: we own this at the moment. */
 323                if (maniptype == IP_NAT_MANIP_SRC)
 324                        ct->status |= IPS_SRC_NAT;
 325                else
 326                        ct->status |= IPS_DST_NAT;
 327        }
 328
 329        /* Place in source hash if this is the first time. */
 330        if (have_to_hash) {
 331                unsigned int srchash;
 332
 333                srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 334                spin_lock_bh(&nf_nat_lock);
 335                /* nf_conntrack_alter_reply might re-allocate the extension area */
 336                nat = nfct_nat(ct);
 337                nat->ct = ct;
 338                hlist_add_head_rcu(&nat->bysource,
 339                                   &net->ipv4.nat_bysource[srchash]);
 340                spin_unlock_bh(&nf_nat_lock);
 341        }
 342
 343        /* It's done. */
 344        if (maniptype == IP_NAT_MANIP_DST)
 345                set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
 346        else
 347                set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
 348
 349        return NF_ACCEPT;
 350}
 351EXPORT_SYMBOL(nf_nat_setup_info);
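/* Annotation (not part of the original file): a rough sketch of how an
 * iptables SNAT-style target is expected to drive nf_nat_setup_info() from
 * NF_INET_POST_ROUTING.  The real target glue lives elsewhere (nf_nat_rule.c
 * in this tree) and differs in detail; the function name below is made up:
 *
 *	static unsigned int snat_target_sketch(struct sk_buff *skb,
 *					       const struct nf_nat_range *range)
 *	{
 *		enum ip_conntrack_info ctinfo;
 *		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 *
 *		if (ct == NULL)			// untracked packet
 *			return NF_ACCEPT;
 *		// binds the connection once; later packets hit nf_nat_packet()
 *		return nf_nat_setup_info(ct, range, IP_NAT_MANIP_SRC);
 *	}
 */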
 352
 353/* Returns true if succeeded. */
 354static bool
 355manip_pkt(u_int16_t proto,
 356          struct sk_buff *skb,
 357          unsigned int iphdroff,
 358          const struct nf_conntrack_tuple *target,
 359          enum nf_nat_manip_type maniptype)
 360{
 361        struct iphdr *iph;
 362        const struct nf_nat_protocol *p;
 363
 364        if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
 365                return false;
 366
 367        iph = (void *)skb->data + iphdroff;
 368
 369        /* Manipulate the protocol part. */
 370
 371        /* rcu_read_lock()ed by nf_hook_slow */
 372        p = __nf_nat_proto_find(proto);
 373        if (!p->manip_pkt(skb, iphdroff, target, maniptype))
 374                return false;
 375
 376        iph = (void *)skb->data + iphdroff;
 377
 378        if (maniptype == IP_NAT_MANIP_SRC) {
 379                csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
 380                iph->saddr = target->src.u3.ip;
 381        } else {
 382                csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
 383                iph->daddr = target->dst.u3.ip;
 384        }
 385        return true;
 386}
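/* Annotation (not part of the original file): csum_replace4() above performs
 * an incremental header-checksum update in the spirit of RFC 1624, roughly
 * new_check = ~(~old_check + ~old_addr + new_addr) folded to 16 bits, so only
 * the changed address word has to be revisited rather than summing the whole
 * IP header again. */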
 387
 388/* Do packet manipulations according to nf_nat_setup_info. */
 389unsigned int nf_nat_packet(struct nf_conn *ct,
 390                           enum ip_conntrack_info ctinfo,
 391                           unsigned int hooknum,
 392                           struct sk_buff *skb)
 393{
 394        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 395        unsigned long statusbit;
 396        enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
 397
 398        if (mtype == IP_NAT_MANIP_SRC)
 399                statusbit = IPS_SRC_NAT;
 400        else
 401                statusbit = IPS_DST_NAT;
 402
 403        /* Invert if this is reply dir. */
 404        if (dir == IP_CT_DIR_REPLY)
 405                statusbit ^= IPS_NAT_MASK;
 406
 407        /* Non-atomic: these bits don't change. */
 408        if (ct->status & statusbit) {
 409                struct nf_conntrack_tuple target;
 410
 411                /* We are aiming to look like inverse of other direction. */
 412                nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
 413
 414                if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
 415                        return NF_DROP;
 416        }
 417        return NF_ACCEPT;
 418}
 419EXPORT_SYMBOL_GPL(nf_nat_packet);
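/* Annotation (not part of the original file): the statusbit XOR above is the
 * heart of the reply-direction handling.  A connection that was source-NATed
 * (IPS_SRC_NAT) needs its destination rewritten on reply packets, and vice
 * versa, so in IP_CT_DIR_REPLY the wanted bit is simply flipped within
 * IPS_NAT_MASK (IPS_SRC_NAT | IPS_DST_NAT) before testing ct->status. */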
 420
 421/* Dir is the direction the ICMP packet is coming from (opposite to the packet it contains) */
 422int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 423                                  enum ip_conntrack_info ctinfo,
 424                                  unsigned int hooknum,
 425                                  struct sk_buff *skb)
 426{
 427        struct {
 428                struct icmphdr icmp;
 429                struct iphdr ip;
 430        } *inside;
 431        const struct nf_conntrack_l4proto *l4proto;
 432        struct nf_conntrack_tuple inner, target;
 433        int hdrlen = ip_hdrlen(skb);
 434        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 435        unsigned long statusbit;
 436        enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
 437
 438        if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
 439                return 0;
 440
 441        inside = (void *)skb->data + ip_hdrlen(skb);
 442
 443        /* We're actually going to mangle it beyond trivial checksum
 444           adjustment, so make sure the current checksum is correct. */
 445        if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
 446                return 0;
 447
 448        /* Must be RELATED */
 449        NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
 450                     skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
 451
 452        /* Redirects on non-null nats must be dropped, else they'll
 453           start talking to each other without our translation, and be
 454           confused... --RR */
 455        if (inside->icmp.type == ICMP_REDIRECT) {
 456                /* If NAT isn't finished yet, assume it will be non-null and drop. */
 457                if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
 458                        return 0;
 459
 460                if (ct->status & IPS_NAT_MASK)
 461                        return 0;
 462        }
 463
 464        pr_debug("icmp_reply_translation: translating error %p manip %u "
 465                 "dir %s\n", skb, manip,
 466                 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
 467
 468        /* rcu_read_lock()ed by nf_hook_slow */
 469        l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
 470
 471        if (!nf_ct_get_tuple(skb,
 472                             ip_hdrlen(skb) + sizeof(struct icmphdr),
 473                             (ip_hdrlen(skb) +
 474                              sizeof(struct icmphdr) + inside->ip.ihl * 4),
 475                             (u_int16_t)AF_INET,
 476                             inside->ip.protocol,
 477                             &inner, l3proto, l4proto))
 478                return 0;
 479
 480        /* Change inner back to look like incoming packet.  We do the
 481           opposite manip on this hook to normal, because it might not
 482           pass all hooks (locally-generated ICMP).  Consider incoming
 483           packet: PREROUTING (DST manip), routing produces ICMP, goes
 484           through POSTROUTING (which must correct the DST manip). */
 485        if (!manip_pkt(inside->ip.protocol, skb,
 486                       ip_hdrlen(skb) + sizeof(inside->icmp),
 487                       &ct->tuplehash[!dir].tuple,
 488                       !manip))
 489                return 0;
 490
 491        if (skb->ip_summed != CHECKSUM_PARTIAL) {
 492                /* Reload "inside"; manip_pkt may have moved skb->data. */
 493                inside = (void *)skb->data + ip_hdrlen(skb);
 494                inside->icmp.checksum = 0;
 495                inside->icmp.checksum =
 496                        csum_fold(skb_checksum(skb, hdrlen,
 497                                               skb->len - hdrlen, 0));
 498        }
 499
 500        /* Change outer to look like the reply to an incoming packet
 501         * (proto 0 means don't invert per-proto part). */
 502        if (manip == IP_NAT_MANIP_SRC)
 503                statusbit = IPS_SRC_NAT;
 504        else
 505                statusbit = IPS_DST_NAT;
 506
 507        /* Invert if this is reply dir. */
 508        if (dir == IP_CT_DIR_REPLY)
 509                statusbit ^= IPS_NAT_MASK;
 510
 511        if (ct->status & statusbit) {
 512                nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
 513                if (!manip_pkt(0, skb, 0, &target, manip))
 514                        return 0;
 515        }
 516
 517        return 1;
 518}
 519EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
 520
 521/* Protocol registration. */
 522int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
 523{
 524        int ret = 0;
 525
 526        spin_lock_bh(&nf_nat_lock);
 527        if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
 528                ret = -EBUSY;
 529                goto out;
 530        }
 531        rcu_assign_pointer(nf_nat_protos[proto->protonum], proto);
 532 out:
 533        spin_unlock_bh(&nf_nat_lock);
 534        return ret;
 535}
 536EXPORT_SYMBOL(nf_nat_protocol_register);
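/* Annotation (not part of the original file): a loadable L4 NAT protocol
 * (the in-tree DCCP/SCTP/UDP-Lite modules follow this pattern) registers
 * itself roughly as sketched below.  IPPROTO_FOO and the foo_* handlers are
 * placeholders; real modules typically reuse the common helpers from
 * nf_nat_proto_common.c for in_range/unique_tuple:
 *
 *	static const struct nf_nat_protocol nf_nat_protocol_foo = {
 *		.protonum	= IPPROTO_FOO,
 *		.me		= THIS_MODULE,
 *		.manip_pkt	= foo_manip_pkt,	// rewrite ports + checksum
 *		.in_range	= foo_in_range,
 *		.unique_tuple	= foo_unique_tuple,	// pick a free port
 *	};
 *
 *	static int __init nf_nat_proto_foo_init(void)
 *	{
 *		return nf_nat_protocol_register(&nf_nat_protocol_foo);
 *	}
 */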
 537
 538/* No one stores the protocol anywhere; simply delete it. */
 539void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
 540{
 541        spin_lock_bh(&nf_nat_lock);
 542        rcu_assign_pointer(nf_nat_protos[proto->protonum],
 543                           &nf_nat_unknown_protocol);
 544        spin_unlock_bh(&nf_nat_lock);
 545        synchronize_rcu();
 546}
 547EXPORT_SYMBOL(nf_nat_protocol_unregister);
 548
 549/* No one is using the conntrack by the time this is called. */
 550static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 551{
 552        struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
 553
 554        if (nat == NULL || nat->ct == NULL)
 555                return;
 556
 557        NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
 558
 559        spin_lock_bh(&nf_nat_lock);
 560        hlist_del_rcu(&nat->bysource);
 561        spin_unlock_bh(&nf_nat_lock);
 562}
 563
 564static void nf_nat_move_storage(void *new, void *old)
 565{
 566        struct nf_conn_nat *new_nat = new;
 567        struct nf_conn_nat *old_nat = old;
 568        struct nf_conn *ct = old_nat->ct;
 569
 570        if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
 571                return;
 572
 573        spin_lock_bh(&nf_nat_lock);
 574        new_nat->ct = ct;
 575        hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
 576        spin_unlock_bh(&nf_nat_lock);
 577}
 578
 579static struct nf_ct_ext_type nat_extend __read_mostly = {
 580        .len            = sizeof(struct nf_conn_nat),
 581        .align          = __alignof__(struct nf_conn_nat),
 582        .destroy        = nf_nat_cleanup_conntrack,
 583        .move           = nf_nat_move_storage,
 584        .id             = NF_CT_EXT_NAT,
 585        .flags          = NF_CT_EXT_F_PREALLOC,
 586};
 587
 588#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 589
 590#include <linux/netfilter/nfnetlink.h>
 591#include <linux/netfilter/nfnetlink_conntrack.h>
 592
 593static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
 594        [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
 595        [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
 596};
 597
 598static int nfnetlink_parse_nat_proto(struct nlattr *attr,
 599                                     const struct nf_conn *ct,
 600                                     struct nf_nat_range *range)
 601{
 602        struct nlattr *tb[CTA_PROTONAT_MAX+1];
 603        const struct nf_nat_protocol *npt;
 604        int err;
 605
 606        err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
 607        if (err < 0)
 608                return err;
 609
 610        npt = nf_nat_proto_find_get(nf_ct_protonum(ct));
 611        if (npt->nlattr_to_range)
 612                err = npt->nlattr_to_range(tb, range);
 613        nf_nat_proto_put(npt);
 614        return err;
 615}
 616
 617static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
 618        [CTA_NAT_MINIP]         = { .type = NLA_U32 },
 619        [CTA_NAT_MAXIP]         = { .type = NLA_U32 },
 620};
 621
 622static int
 623nfnetlink_parse_nat(const struct nlattr *nat,
 624                    const struct nf_conn *ct, struct nf_nat_range *range)
 625{
 626        struct nlattr *tb[CTA_NAT_MAX+1];
 627        int err;
 628
 629        memset(range, 0, sizeof(*range));
 630
 631        err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
 632        if (err < 0)
 633                return err;
 634
 635        if (tb[CTA_NAT_MINIP])
 636                range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
 637
 638        if (!tb[CTA_NAT_MAXIP])
 639                range->max_ip = range->min_ip;
 640        else
 641                range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
 642
 643        if (range->min_ip)
 644                range->flags |= IP_NAT_RANGE_MAP_IPS;
 645
 646        if (!tb[CTA_NAT_PROTO])
 647                return 0;
 648
 649        err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
 650        if (err < 0)
 651                return err;
 652
 653        return 0;
 654}
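/* Annotation (not part of the original file): the nest parsed above is
 * expected to look roughly like this on the wire (layout as implied by the
 * policies above; the enclosing attribute is typically the CTA_NAT_SRC or
 * CTA_NAT_DST nest handed in by ctnetlink):
 *
 *	CTA_NAT_MINIP  (u32, network byte order)
 *	CTA_NAT_MAXIP  (u32, optional; defaults to MINIP)
 *	CTA_NAT_PROTO  (nested)
 *	    CTA_PROTONAT_PORT_MIN  (u16)
 *	    CTA_PROTONAT_PORT_MAX  (u16)
 */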
 655
 656static int
 657nfnetlink_parse_nat_setup(struct nf_conn *ct,
 658                          enum nf_nat_manip_type manip,
 659                          const struct nlattr *attr)
 660{
 661        struct nf_nat_range range;
 662
 663        if (nfnetlink_parse_nat(attr, ct, &range) < 0)
 664                return -EINVAL;
 665        if (nf_nat_initialized(ct, manip))
 666                return -EEXIST;
 667
 668        return nf_nat_setup_info(ct, &range, manip);
 669}
 670#else
 671static int
 672nfnetlink_parse_nat_setup(struct nf_conn *ct,
 673                          enum nf_nat_manip_type manip,
 674                          const struct nlattr *attr)
 675{
 676        return -EOPNOTSUPP;
 677}
 678#endif
 679
 680static int __net_init nf_nat_net_init(struct net *net)
 681{
 682        net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
 683                                                      &net->ipv4.nat_vmalloced, 0);
 684        if (!net->ipv4.nat_bysource)
 685                return -ENOMEM;
 686        return 0;
 687}
 688
 689/* Clear NAT section of all conntracks, in case we're loaded again. */
 690static int clean_nat(struct nf_conn *i, void *data)
 691{
 692        struct nf_conn_nat *nat = nfct_nat(i);
 693
 694        if (!nat)
 695                return 0;
 696        memset(nat, 0, sizeof(*nat));
 697        i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
 698        return 0;
 699}
 700
 701static void __net_exit nf_nat_net_exit(struct net *net)
 702{
 703        nf_ct_iterate_cleanup(net, &clean_nat, NULL);
 704        synchronize_rcu();
 705        nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
 706                             nf_nat_htable_size);
 707}
 708
 709static struct pernet_operations nf_nat_net_ops = {
 710        .init = nf_nat_net_init,
 711        .exit = nf_nat_net_exit,
 712};
 713
 714static int __init nf_nat_init(void)
 715{
 716        size_t i;
 717        int ret;
 718
 719        need_ipv4_conntrack();
 720
 721        ret = nf_ct_extend_register(&nat_extend);
 722        if (ret < 0) {
 723                printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 724                return ret;
 725        }
 726
 727        /* Leave them the same for the moment. */
 728        nf_nat_htable_size = nf_conntrack_htable_size;
 729
 730        ret = register_pernet_subsys(&nf_nat_net_ops);
 731        if (ret < 0)
 732                goto cleanup_extend;
 733
 734        /* Sew in builtin protocols. */
 735        spin_lock_bh(&nf_nat_lock);
 736        for (i = 0; i < MAX_IP_NAT_PROTO; i++)
 737                rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol);
 738        rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
 739        rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
 740        rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
 741        spin_unlock_bh(&nf_nat_lock);
 742
 743        /* Initialize fake conntrack so that NAT will skip it */
 744        nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
 745
 746        l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
 747
 748        BUG_ON(nf_nat_seq_adjust_hook != NULL);
 749        rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
 750        BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
 751        rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
 752                           nfnetlink_parse_nat_setup);
 753        BUG_ON(nf_ct_nat_offset != NULL);
 754        rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
 755        return 0;
 756
 757 cleanup_extend:
 758        nf_ct_extend_unregister(&nat_extend);
 759        return ret;
 760}
 761
 762static void __exit nf_nat_cleanup(void)
 763{
 764        unregister_pernet_subsys(&nf_nat_net_ops);
 765        nf_ct_l3proto_put(l3proto);
 766        nf_ct_extend_unregister(&nat_extend);
 767        rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
 768        rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
 769        rcu_assign_pointer(nf_ct_nat_offset, NULL);
 770        synchronize_net();
 771}
 772
 773MODULE_LICENSE("GPL");
 774MODULE_ALIAS("nf-nat-ipv4");
 775
 776module_init(nf_nat_init);
 777module_exit(nf_nat_cleanup);
 778