linux/net/netfilter/nf_conncount.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * count the number of connections matching an arbitrary key.
 *
 * (C) 2017 Red Hat GmbH
 * Author: Florian Westphal <fw@strlen.de>
 *
 * split from xt_connlimit.c:
 *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
 *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
 *              only ignore TIME_WAIT or gone connections
 *   (C) CC Computer Consultants GmbH, 2007
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

#define CONNCOUNT_SLOTS         256U

#define CONNCOUNT_GC_MAX_NODES  8
#define MAX_KEYLEN              5

/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
        struct list_head                node;
        struct nf_conntrack_tuple       tuple;
        struct nf_conntrack_zone        zone;
        int                             cpu;
        u32                             jiffies32;
};

struct nf_conncount_rb {
        struct rb_node node;
        struct nf_conncount_list list;
        u32 key[MAX_KEYLEN];
        struct rcu_head rcu_head;
};

static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;

struct nf_conncount_data {
        unsigned int keylen;
        struct rb_root root[CONNCOUNT_SLOTS];
        struct net *net;
        struct work_struct gc_work;
        unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
        unsigned int gc_tree;
};
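/*
 * Layout of the structures above: nf_conncount_data holds CONNCOUNT_SLOTS
 * rb-trees, and a key is hashed (jhash2) to pick its slot.  Each tree node
 * (nf_conncount_rb) stores one key plus a list of nf_conncount_tuple
 * entries, i.e. the conntrack tuples currently counted against that key.
 * Nodes whose list becomes empty are pruned either inline (insert_tree)
 * or from the gc_work worker, and freed via RCU so the lockless lookup
 * in count_tree() stays safe.
 */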

static u_int32_t conncount_rnd __read_mostly;
static struct kmem_cache *conncount_rb_cachep __read_mostly;
static struct kmem_cache *conncount_conn_cachep __read_mostly;

static inline bool already_closed(const struct nf_conn *conn)
{
        if (nf_ct_protonum(conn) == IPPROTO_TCP)
                return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
                       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
        else
                return false;
}

static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
{
        return memcmp(a, b, klen * sizeof(u32));
}

static void conn_free(struct nf_conncount_list *list,
                      struct nf_conncount_tuple *conn)
{
        lockdep_assert_held(&list->list_lock);

        list->count--;
        list_del(&conn->node);

        kmem_cache_free(conncount_conn_cachep, conn);
}

static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,
              struct nf_conncount_tuple *conn)
{
        const struct nf_conntrack_tuple_hash *found;
        unsigned long a, b;
        int cpu = raw_smp_processor_id();
        u32 age;

        found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
        if (found)
                return found;
        b = conn->jiffies32;
        a = (u32)jiffies;

        /* conn might have been added just before by another cpu and
         * might still be unconfirmed.  In this case, nf_conntrack_find()
         * returns no result.  Thus only evict if this cpu added the
         * stale entry or if the entry is older than two jiffies.
         */
        age = a - b;
        if (conn->cpu == cpu || age >= 2) {
                conn_free(list, conn);
                return ERR_PTR(-ENOENT);
        }

        return ERR_PTR(-EAGAIN);
}

static int __nf_conncount_add(struct net *net,
                              struct nf_conncount_list *list,
                              const struct nf_conntrack_tuple *tuple,
                              const struct nf_conntrack_zone *zone)
{
        const struct nf_conntrack_tuple_hash *found;
        struct nf_conncount_tuple *conn, *conn_n;
        struct nf_conn *found_ct;
        unsigned int collect = 0;

        /* check the saved connections */
        list_for_each_entry_safe(conn, conn_n, &list->head, node) {
                if (collect > CONNCOUNT_GC_MAX_NODES)
                        break;

                found = find_or_evict(net, list, conn);
                if (IS_ERR(found)) {
                        /* Not found, but might be about to be confirmed */
                        if (PTR_ERR(found) == -EAGAIN) {
                                if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
                                    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
                                    nf_ct_zone_id(zone, zone->dir))
                                        return 0; /* already exists */
                        } else {
                                collect++;
                        }
                        continue;
                }

                found_ct = nf_ct_tuplehash_to_ctrack(found);

                if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
                    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
                        /*
                         * We should not see tuples twice unless someone hooks
                         * this into a table without "-p tcp --syn".
                         *
                         * Attempt to avoid a re-add in this case.
                         */
                        nf_ct_put(found_ct);
                        return 0;
                } else if (already_closed(found_ct)) {
                        /*
                         * we do not care about connections which are
                         * closed already -> ditch it
                         */
                        nf_ct_put(found_ct);
                        conn_free(list, conn);
                        collect++;
                        continue;
                }

                nf_ct_put(found_ct);
        }

        if (WARN_ON_ONCE(list->count > INT_MAX))
                return -EOVERFLOW;

        conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
        if (conn == NULL)
                return -ENOMEM;

        conn->tuple = *tuple;
        conn->zone = *zone;
        conn->cpu = raw_smp_processor_id();
        conn->jiffies32 = (u32)jiffies;
        list_add_tail(&conn->node, &list->head);
        list->count++;
        return 0;
}

int nf_conncount_add(struct net *net,
                     struct nf_conncount_list *list,
                     const struct nf_conntrack_tuple *tuple,
                     const struct nf_conntrack_zone *zone)
{
        int ret;

        /* check the saved connections */
        spin_lock_bh(&list->list_lock);
        ret = __nf_conncount_add(net, list, tuple, zone);
        spin_unlock_bh(&list->list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);

void nf_conncount_list_init(struct nf_conncount_list *list)
{
        spin_lock_init(&list->list_lock);
        INIT_LIST_HEAD(&list->head);
        list->count = 0;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
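/*
 * Minimal usage sketch of the bare list API, roughly the pattern used by
 * callers such as nft_connlimit (names and error handling here are
 * illustrative, not copied from that code):
 *
 *      struct nf_conncount_list list;
 *
 *      nf_conncount_list_init(&list);
 *
 *      // per packet, with BH disabled:
 *      if (nf_conncount_add(net, &list, tuple, zone))
 *              return;                 // e.g. drop: out of memory
 *      if (list.count > limit)
 *              ...                     // over the configured limit
 *
 *      // periodically, with BH disabled:
 *      nf_conncount_gc_list(net, &list);
 *
 *      // on teardown, once no other users remain:
 *      nf_conncount_cache_free(&list);
 */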

/* Return true if the list is empty. Must be called with BH disabled. */
bool nf_conncount_gc_list(struct net *net,
                          struct nf_conncount_list *list)
{
        const struct nf_conntrack_tuple_hash *found;
        struct nf_conncount_tuple *conn, *conn_n;
        struct nf_conn *found_ct;
        unsigned int collected = 0;
        bool ret = false;

        /* don't bother if other cpu is already doing GC */
        if (!spin_trylock(&list->list_lock))
                return false;

        list_for_each_entry_safe(conn, conn_n, &list->head, node) {
                found = find_or_evict(net, list, conn);
                if (IS_ERR(found)) {
                        if (PTR_ERR(found) == -ENOENT)
                                collected++;
                        continue;
                }

                found_ct = nf_ct_tuplehash_to_ctrack(found);
                if (already_closed(found_ct)) {
                        /*
                         * we do not care about connections which are
                         * closed already -> ditch it
                         */
                        nf_ct_put(found_ct);
                        conn_free(list, conn);
                        collected++;
                        continue;
                }

                nf_ct_put(found_ct);
                if (collected > CONNCOUNT_GC_MAX_NODES)
                        break;
        }

        if (!list->count)
                ret = true;
        spin_unlock(&list->list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_gc_list);

static void __tree_nodes_free(struct rcu_head *h)
{
        struct nf_conncount_rb *rbconn;

        rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
        kmem_cache_free(conncount_rb_cachep, rbconn);
}

/* caller must hold tree nf_conncount_locks[] lock */
static void tree_nodes_free(struct rb_root *root,
                            struct nf_conncount_rb *gc_nodes[],
                            unsigned int gc_count)
{
        struct nf_conncount_rb *rbconn;

        while (gc_count) {
                rbconn = gc_nodes[--gc_count];
                spin_lock(&rbconn->list.list_lock);
                if (!rbconn->list.count) {
                        rb_erase(&rbconn->node, root);
                        call_rcu(&rbconn->rcu_head, __tree_nodes_free);
                }
                spin_unlock(&rbconn->list.list_lock);
        }
}
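/*
 * Note that tree_nodes_free() re-checks list.count under the node's
 * list_lock before erasing it: count_tree() adds new tuples to a node
 * under the same lock and skips nodes whose count already dropped to
 * zero, so a node is never erased while a tuple is being added to it.
 * The kfree itself is deferred via call_rcu() because count_tree() walks
 * the tree without taking the per-slot nf_conncount_locks[] lock.
 */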

static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
{
        set_bit(tree, data->pending_trees);
        schedule_work(&data->gc_work);
}

static unsigned int
insert_tree(struct net *net,
            struct nf_conncount_data *data,
            struct rb_root *root,
            unsigned int hash,
            const u32 *key,
            const struct nf_conntrack_tuple *tuple,
            const struct nf_conntrack_zone *zone)
{
        struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
        struct rb_node **rbnode, *parent;
        struct nf_conncount_rb *rbconn;
        struct nf_conncount_tuple *conn;
        unsigned int count = 0, gc_count = 0;
        u8 keylen = data->keylen;
        bool do_gc = true;

        spin_lock_bh(&nf_conncount_locks[hash]);
restart:
        parent = NULL;
        rbnode = &(root->rb_node);
        while (*rbnode) {
                int diff;
                rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);

                parent = *rbnode;
                diff = key_diff(key, rbconn->key, keylen);
                if (diff < 0) {
                        rbnode = &((*rbnode)->rb_left);
                } else if (diff > 0) {
                        rbnode = &((*rbnode)->rb_right);
                } else {
                        int ret;

                        ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
                        if (ret)
                                count = 0; /* hotdrop */
                        else
                                count = rbconn->list.count;
                        tree_nodes_free(root, gc_nodes, gc_count);
                        goto out_unlock;
                }

                if (gc_count >= ARRAY_SIZE(gc_nodes))
                        continue;

                if (do_gc && nf_conncount_gc_list(net, &rbconn->list))
                        gc_nodes[gc_count++] = rbconn;
        }

        if (gc_count) {
                tree_nodes_free(root, gc_nodes, gc_count);
                schedule_gc_worker(data, hash);
                gc_count = 0;
                do_gc = false;
                goto restart;
        }

        /* expected case: no match, insert a new node */
        rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
        if (rbconn == NULL)
                goto out_unlock;

        conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
        if (conn == NULL) {
                kmem_cache_free(conncount_rb_cachep, rbconn);
                goto out_unlock;
        }

        conn->tuple = *tuple;
        conn->zone = *zone;
        memcpy(rbconn->key, key, sizeof(u32) * keylen);

        nf_conncount_list_init(&rbconn->list);
        list_add(&conn->node, &rbconn->list.head);
        count = 1;
        rbconn->list.count = count;

        rb_link_node_rcu(&rbconn->node, parent, rbnode);
        rb_insert_color(&rbconn->node, root);
out_unlock:
        spin_unlock_bh(&nf_conncount_locks[hash]);
        return count;
}

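/*
 * Fast path: walk the slot's rb-tree under RCU only, without taking
 * nf_conncount_locks[].  If a tuple is to be inserted and no node matches
 * the key (or the matching node is about to be freed), fall back to
 * insert_tree(), which takes the per-slot spinlock and re-walks the tree.
 */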
static unsigned int
count_tree(struct net *net,
           struct nf_conncount_data *data,
           const u32 *key,
           const struct nf_conntrack_tuple *tuple,
           const struct nf_conntrack_zone *zone)
{
        struct rb_root *root;
        struct rb_node *parent;
        struct nf_conncount_rb *rbconn;
        unsigned int hash;
        u8 keylen = data->keylen;

        hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
        root = &data->root[hash];

        parent = rcu_dereference_raw(root->rb_node);
        while (parent) {
                int diff;

                rbconn = rb_entry(parent, struct nf_conncount_rb, node);

                diff = key_diff(key, rbconn->key, keylen);
                if (diff < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                } else if (diff > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                } else {
                        int ret;

                        if (!tuple) {
                                nf_conncount_gc_list(net, &rbconn->list);
                                return rbconn->list.count;
                        }

                        spin_lock_bh(&rbconn->list.list_lock);
                        /* Node might be about to be free'd.
                         * We need to defer to insert_tree() in this case.
                         */
                        if (rbconn->list.count == 0) {
                                spin_unlock_bh(&rbconn->list.list_lock);
                                break;
                        }

                        /* same source network -> be counted! */
                        ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
                        spin_unlock_bh(&rbconn->list.list_lock);
                        if (ret)
                                return 0; /* hotdrop */
                        else
                                return rbconn->list.count;
                }
        }

        if (!tuple)
                return 0;

        return insert_tree(net, data, root, hash, key, tuple, zone);
}

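/*
 * The GC worker handles one slot per invocation: it first garbage-collects
 * each node's list under RCU only, then, if enough lists became empty,
 * retakes the slot lock to prune the empty nodes, and finally reschedules
 * itself if other slots are still marked in pending_trees.
 */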
static void tree_gc_worker(struct work_struct *work)
{
        struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
        struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
        struct rb_root *root;
        struct rb_node *node;
        unsigned int tree, next_tree, gc_count = 0;

        tree = data->gc_tree % CONNCOUNT_SLOTS;
        root = &data->root[tree];

        local_bh_disable();
        rcu_read_lock();
        for (node = rb_first(root); node != NULL; node = rb_next(node)) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);
                if (nf_conncount_gc_list(data->net, &rbconn->list))
                        gc_count++;
        }
        rcu_read_unlock();
        local_bh_enable();

        cond_resched();

        spin_lock_bh(&nf_conncount_locks[tree]);
        if (gc_count < ARRAY_SIZE(gc_nodes))
                goto next; /* do not bother */

        gc_count = 0;
        node = rb_first(root);
        while (node != NULL) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);
                node = rb_next(node);

                if (rbconn->list.count > 0)
                        continue;

                gc_nodes[gc_count++] = rbconn;
                if (gc_count >= ARRAY_SIZE(gc_nodes)) {
                        tree_nodes_free(root, gc_nodes, gc_count);
                        gc_count = 0;
                }
        }

        tree_nodes_free(root, gc_nodes, gc_count);
next:
        clear_bit(tree, data->pending_trees);

        next_tree = (tree + 1) % CONNCOUNT_SLOTS;
        next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);

        if (next_tree < CONNCOUNT_SLOTS) {
                data->gc_tree = next_tree;
                schedule_work(work);
        }

        spin_unlock_bh(&nf_conncount_locks[tree]);
}

/* Count and return number of conntrack entries in 'net' with particular 'key'.
 * If 'tuple' is not null, insert it into the accounting data structure.
 * Call with RCU read lock.
 */
unsigned int nf_conncount_count(struct net *net,
                                struct nf_conncount_data *data,
                                const u32 *key,
                                const struct nf_conntrack_tuple *tuple,
                                const struct nf_conntrack_zone *zone)
{
        return count_tree(net, data, key, tuple, zone);
}
EXPORT_SYMBOL_GPL(nf_conncount_count);
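/*
 * Minimal usage sketch of the keyed API, roughly the pattern used by
 * callers such as xt_connlimit (key layout and limit handling here are
 * illustrative, not copied from that code):
 *
 *      // setup, e.g. from a checkentry/init hook:
 *      data = nf_conncount_init(net, family, 4 * sizeof(u32));
 *      if (IS_ERR(data))
 *              return PTR_ERR(data);
 *
 *      // per packet, under rcu_read_lock():
 *      u32 key[MAX_KEYLEN];
 *      // ... fill key[] with e.g. the masked source address ...
 *      count = nf_conncount_count(net, data, key, tuple, zone);
 *      if (count == 0)
 *              ...                     // add failed, treat as drop
 *      else if (count > limit)
 *              ...                     // over the configured limit
 *
 *      // teardown:
 *      nf_conncount_destroy(net, family, data);
 */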

struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
                                            unsigned int keylen)
{
        struct nf_conncount_data *data;
        int ret, i;

        if (keylen % sizeof(u32) ||
            keylen / sizeof(u32) > MAX_KEYLEN ||
            keylen == 0)
                return ERR_PTR(-EINVAL);

        net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));

        data = kmalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return ERR_PTR(-ENOMEM);

        ret = nf_ct_netns_get(net, family);
        if (ret < 0) {
                kfree(data);
                return ERR_PTR(ret);
        }

        for (i = 0; i < ARRAY_SIZE(data->root); ++i)
                data->root[i] = RB_ROOT;

        data->keylen = keylen / sizeof(u32);
        data->net = net;
        INIT_WORK(&data->gc_work, tree_gc_worker);

        return data;
}
EXPORT_SYMBOL_GPL(nf_conncount_init);

void nf_conncount_cache_free(struct nf_conncount_list *list)
{
        struct nf_conncount_tuple *conn, *conn_n;

        list_for_each_entry_safe(conn, conn_n, &list->head, node)
                kmem_cache_free(conncount_conn_cachep, conn);
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);

static void destroy_tree(struct rb_root *r)
{
        struct nf_conncount_rb *rbconn;
        struct rb_node *node;

        while ((node = rb_first(r)) != NULL) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);

                rb_erase(node, r);

                nf_conncount_cache_free(&rbconn->list);

                kmem_cache_free(conncount_rb_cachep, rbconn);
        }
}

void nf_conncount_destroy(struct net *net, unsigned int family,
                          struct nf_conncount_data *data)
{
        unsigned int i;

        cancel_work_sync(&data->gc_work);
        nf_ct_netns_put(net, family);

        for (i = 0; i < ARRAY_SIZE(data->root); ++i)
                destroy_tree(&data->root[i]);

        kfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);

static int __init nf_conncount_modinit(void)
{
        int i;

        for (i = 0; i < CONNCOUNT_SLOTS; ++i)
                spin_lock_init(&nf_conncount_locks[i]);

        conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
                                           sizeof(struct nf_conncount_tuple),
                                           0, 0, NULL);
        if (!conncount_conn_cachep)
                return -ENOMEM;

        conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
                                           sizeof(struct nf_conncount_rb),
                                           0, 0, NULL);
        if (!conncount_rb_cachep) {
                kmem_cache_destroy(conncount_conn_cachep);
                return -ENOMEM;
        }

        return 0;
}

static void __exit nf_conncount_modexit(void)
{
        kmem_cache_destroy(conncount_conn_cachep);
        kmem_cache_destroy(conncount_rb_cachep);
}

module_init(nf_conncount_modinit);
module_exit(nf_conncount_modexit);
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
MODULE_LICENSE("GPL");