/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>

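/*
 * Per-set private data: the tree itself, a rwlock serialising writers
 * against locked readers, a seqcount that lets lockless readers detect
 * concurrent modifications, and deferred work for garbage collection of
 * expired elements.
 */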
struct nft_rbtree {
        struct rb_root          root;
        rwlock_t                lock;
        seqcount_t              count;
        struct delayed_work     gc_work;
};

struct nft_rbtree_elem {
        struct rb_node          node;
        struct nft_set_ext      ext;
};

static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
        return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
               (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
        return !nft_rbtree_interval_end(rbe);
}

static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
                             const struct nft_rbtree_elem *interval)
{
        return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

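/*
 * Lockless descent under RCU. Returns false early whenever the seqcount
 * shows a concurrent writer; the caller re-checks the sequence number and
 * retries under the read lock. For interval sets, the closest element with
 * a smaller key is remembered as the candidate interval match.
 */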
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
                                const u32 *key, const struct nft_set_ext **ext,
                                unsigned int seq)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        const struct nft_rbtree_elem *rbe, *interval = NULL;
        u8 genmask = nft_genmask_cur(net);
        const struct rb_node *parent;
        const void *this;
        int d;

        parent = rcu_dereference_raw(priv->root.rb_node);
        while (parent != NULL) {
                if (read_seqcount_retry(&priv->count, seq))
                        return false;

                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                this = nft_set_ext_key(&rbe->ext);
                d = memcmp(this, key, set->klen);
                if (d < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        if (interval &&
                            nft_rbtree_equal(set, this, interval) &&
                            nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(interval))
                                continue;
                        interval = rbe;
                } else if (d > 0)
                        parent = rcu_dereference_raw(parent->rb_right);
                else {
                        if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = rcu_dereference_raw(parent->rb_left);
                                continue;
                        }

                        if (nft_set_elem_expired(&rbe->ext))
                                return false;

                        if (nft_rbtree_interval_end(rbe)) {
                                if (nft_set_is_anonymous(set))
                                        return false;
                                parent = rcu_dereference_raw(parent->rb_left);
                                interval = NULL;
                                continue;
                        }

                        *ext = &rbe->ext;
                        return true;
                }
        }

        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
            nft_set_elem_active(&interval->ext, genmask) &&
            !nft_set_elem_expired(&interval->ext) &&
            nft_rbtree_interval_start(interval)) {
                *ext = &interval->ext;
                return true;
        }

        return false;
}

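/*
 * Fast path: attempt a lockless lookup first. If it failed and raced
 * with a writer, retry under the read lock, which excludes tree updates.
 */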
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
                              const u32 *key, const struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        unsigned int seq = read_seqcount_begin(&priv->count);
        bool ret;

        ret = __nft_rbtree_lookup(net, set, key, ext, seq);
        if (ret || !read_seqcount_retry(&priv->count, seq))
                return ret;

        read_lock_bh(&priv->lock);
        seq = read_seqcount_begin(&priv->count);
        ret = __nft_rbtree_lookup(net, set, key, ext, seq);
        read_unlock_bh(&priv->lock);

        return ret;
}

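/*
 * Same descent as __nft_rbtree_lookup(), but returns the element itself
 * and matches against the NFT_SET_ELEM_INTERVAL_END flag requested by
 * the caller, so both start and end elements can be retrieved.
 */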
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
                             const u32 *key, struct nft_rbtree_elem **elem,
                             unsigned int seq, unsigned int flags, u8 genmask)
{
        struct nft_rbtree_elem *rbe, *interval = NULL;
        struct nft_rbtree *priv = nft_set_priv(set);
        const struct rb_node *parent;
        const void *this;
        int d;

        parent = rcu_dereference_raw(priv->root.rb_node);
        while (parent != NULL) {
                if (read_seqcount_retry(&priv->count, seq))
                        return false;

                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                this = nft_set_ext_key(&rbe->ext);
                d = memcmp(this, key, set->klen);
                if (d < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        if (!(flags & NFT_SET_ELEM_INTERVAL_END))
                                interval = rbe;
                } else if (d > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                        if (flags & NFT_SET_ELEM_INTERVAL_END)
                                interval = rbe;
                } else {
                        if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = rcu_dereference_raw(parent->rb_left);
                                continue;
                        }

                        if (nft_set_elem_expired(&rbe->ext))
                                return false;

                        if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
                            (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
                            (flags & NFT_SET_ELEM_INTERVAL_END)) {
                                *elem = rbe;
                                return true;
                        }

                        if (nft_rbtree_interval_end(rbe))
                                interval = NULL;

                        parent = rcu_dereference_raw(parent->rb_left);
                }
        }

        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
            nft_set_elem_active(&interval->ext, genmask) &&
            !nft_set_elem_expired(&interval->ext) &&
            ((!nft_rbtree_interval_end(interval) &&
              !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
             (nft_rbtree_interval_end(interval) &&
              (flags & NFT_SET_ELEM_INTERVAL_END)))) {
                *elem = interval;
                return true;
        }

        return false;
}

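/*
 * As with lookup: try the lockless walk first and fall back to the
 * read lock if a concurrent writer was detected.
 */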
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
                            const struct nft_set_elem *elem, unsigned int flags)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        unsigned int seq = read_seqcount_begin(&priv->count);
        struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
        const u32 *key = (const u32 *)&elem->key.val;
        u8 genmask = nft_genmask_cur(net);
        bool ret;

        ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
        if (ret || !read_seqcount_retry(&priv->count, seq))
                return rbe;

        read_lock_bh(&priv->lock);
        seq = read_seqcount_begin(&priv->count);
        ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
        if (!ret)
                rbe = ERR_PTR(-ENOENT);
        read_unlock_bh(&priv->lock);

        return rbe;
}

static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
                               struct nft_rbtree_elem *new,
                               struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        struct nft_rbtree_elem *rbe;
        struct rb_node *parent, **p;
        bool overlap = false;
        int d;

        /* Detect overlaps as we descend the tree. Set the flag in these cases:
         *
         * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
         * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
         * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
         *
         * and clear it later on, as we eventually reach the points indicated by
         * '?' above, in the cases described below. We'll always meet these
         * later, locally, due to tree ordering, and overlaps for the intervals
         * that are the closest together are always evaluated last.
         *
         * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
         * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
         * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
         *            '--' no nodes falling in this range
         * b4.          >|_ _   !  (insert start before existing start)
         *
         * Case a3. resolves to b3.:
         * - if the inserted start element is the leftmost, because the '0'
         *   element in the tree serves as end element
         * - otherwise, if an existing end is found immediately to the left. If
         *   there are existing nodes in between, we need to further descend the
         *   tree before we can conclude the new start isn't causing an overlap
         *
         * or to b4., which, preceded by a3., means we already traversed one or
         * more existing intervals entirely, from the right.
         *
         * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
         * in that order.
         *
         * The flag is also cleared in two special cases:
         *
         * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
         * b6. |__ _ >|!__ _ _   (insert end right after existing start)
         *
         * which always happen as last step and imply that no further
         * overlapping is possible.
         */

        parent = NULL;
        p = &priv->root.rb_node;
        while (*p != NULL) {
                parent = *p;
                rbe = rb_entry(parent, struct nft_rbtree_elem, node);
                d = memcmp(nft_set_ext_key(&rbe->ext),
                           nft_set_ext_key(&new->ext),
                           set->klen);
                if (d < 0) {
                        p = &parent->rb_left;

                        if (nft_rbtree_interval_start(new)) {
                                if (nft_rbtree_interval_end(rbe) &&
                                    nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext) && !*p)
                                        overlap = false;
                        } else {
                                overlap = nft_rbtree_interval_end(rbe) &&
                                          nft_set_elem_active(&rbe->ext,
                                                              genmask) &&
                                          !nft_set_elem_expired(&rbe->ext);
                        }
                } else if (d > 0) {
                        p = &parent->rb_right;

                        if (nft_rbtree_interval_end(new)) {
                                overlap = nft_rbtree_interval_end(rbe) &&
                                          nft_set_elem_active(&rbe->ext,
                                                              genmask) &&
                                          !nft_set_elem_expired(&rbe->ext);
                        } else if (nft_set_elem_active(&rbe->ext, genmask) &&
                                   !nft_set_elem_expired(&rbe->ext)) {
                                overlap = nft_rbtree_interval_end(rbe);
                        }
                } else {
                        if (nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(new)) {
                                p = &parent->rb_left;

                                if (nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext))
                                        overlap = false;
                        } else if (nft_rbtree_interval_start(rbe) &&
                                   nft_rbtree_interval_end(new)) {
                                p = &parent->rb_right;

                                if (nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext))
                                        overlap = false;
                        } else if (nft_set_elem_active(&rbe->ext, genmask) &&
                                   !nft_set_elem_expired(&rbe->ext)) {
                                *ext = &rbe->ext;
                                return -EEXIST;
                        } else {
                                p = &parent->rb_left;
                        }
                }
        }

        if (overlap)
                return -ENOTEMPTY;

        rb_link_node_rcu(&new->node, parent, p);
        rb_insert_color(&new->node, &priv->root);
        return 0;
}

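/*
 * Writers hold both the rwlock and a seqcount write section, keeping the
 * tree consistent for locked readers and letting lockless readers detect
 * the update and retry.
 */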
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
                             const struct nft_set_elem *elem,
                             struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe = elem->priv;
        int err;

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        err = __nft_rbtree_insert(net, set, rbe, ext);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);

        return err;
}

static void nft_rbtree_remove(const struct net *net,
                              const struct nft_set *set,
                              const struct nft_set_elem *elem)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe = elem->priv;

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        rb_erase(&rbe->node, &priv->root);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);
}

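/*
 * Commit path: flip the element's generation mask so it becomes visible
 * in the next generation, and clear the busy bit taken at deactivation.
 */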
static void nft_rbtree_activate(const struct net *net,
                                const struct nft_set *set,
                                const struct nft_set_elem *elem)
{
        struct nft_rbtree_elem *rbe = elem->priv;

        nft_set_elem_change_active(net, set, &rbe->ext);
        nft_set_elem_clear_busy(&rbe->ext);
}

static bool nft_rbtree_flush(const struct net *net,
                             const struct nft_set *set, void *priv)
{
        struct nft_rbtree_elem *rbe = priv;

        if (!nft_set_elem_mark_busy(&rbe->ext) ||
            !nft_is_active(net, &rbe->ext)) {
                nft_set_elem_change_active(net, set, &rbe->ext);
                return true;
        }
        return false;
}

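/*
 * Find the tree element matching @elem and mark it inactive in the next
 * generation. Start and end elements share a key, so the interval-end
 * flag is used to pick the right one.
 */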
static void *nft_rbtree_deactivate(const struct net *net,
                                   const struct nft_set *set,
                                   const struct nft_set_elem *elem)
{
        const struct nft_rbtree *priv = nft_set_priv(set);
        const struct rb_node *parent = priv->root.rb_node;
        struct nft_rbtree_elem *rbe, *this = elem->priv;
        u8 genmask = nft_genmask_next(net);
        int d;

        while (parent != NULL) {
                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
                           set->klen);
                if (d < 0)
                        parent = parent->rb_left;
                else if (d > 0)
                        parent = parent->rb_right;
                else {
                        if (nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(this)) {
                                parent = parent->rb_left;
                                continue;
                        } else if (nft_rbtree_interval_start(rbe) &&
                                   nft_rbtree_interval_end(this)) {
                                parent = parent->rb_right;
                                continue;
                        } else if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = parent->rb_left;
                                continue;
                        }
                        nft_rbtree_flush(net, set, rbe);
                        return rbe;
                }
        }
        return NULL;
}

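/*
 * Iterate over all active, unexpired elements in tree order under the
 * read lock, honouring the iterator's skip count.
 */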
static void nft_rbtree_walk(const struct nft_ctx *ctx,
                            struct nft_set *set,
                            struct nft_set_iter *iter)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe;
        struct nft_set_elem elem;
        struct rb_node *node;

        read_lock_bh(&priv->lock);
        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
                rbe = rb_entry(node, struct nft_rbtree_elem, node);

                if (iter->count < iter->skip)
                        goto cont;
                if (nft_set_elem_expired(&rbe->ext))
                        goto cont;
                if (!nft_set_elem_active(&rbe->ext, iter->genmask))
                        goto cont;

                elem.priv = rbe;

                iter->err = iter->fn(ctx, set, iter, &elem);
                if (iter->err < 0) {
                        read_unlock_bh(&priv->lock);
                        return;
                }
cont:
                iter->count++;
        }
        read_unlock_bh(&priv->lock);
}

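/*
 * Garbage collection of expired intervals. Erasing the current node would
 * invalidate rb_next(), so the expired start element is queued in rbe_prev
 * and only erased once the iterator has moved past it; the matching end
 * element lies behind the iterator and can be erased immediately.
 */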
static void nft_rbtree_gc(struct work_struct *work)
{
        struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
        struct nft_set_gc_batch *gcb = NULL;
        struct nft_rbtree *priv;
        struct rb_node *node;
        struct nft_set *set;

        priv = container_of(work, struct nft_rbtree, gc_work.work);
        set  = nft_set_container_of(priv);

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
                rbe = rb_entry(node, struct nft_rbtree_elem, node);

                if (nft_rbtree_interval_end(rbe)) {
                        rbe_end = rbe;
                        continue;
                }
                if (!nft_set_elem_expired(&rbe->ext))
                        continue;
                if (nft_set_elem_mark_busy(&rbe->ext))
                        continue;

                if (rbe_prev) {
                        rb_erase(&rbe_prev->node, &priv->root);
                        rbe_prev = NULL;
                }
                gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
                if (!gcb)
                        break;

                atomic_dec(&set->nelems);
                nft_set_gc_batch_add(gcb, rbe);
                rbe_prev = rbe;

                if (rbe_end) {
                        atomic_dec(&set->nelems);
                        nft_set_gc_batch_add(gcb, rbe_end);
                        rb_erase(&rbe_end->node, &priv->root);
                        rbe_end = NULL;
                }
                node = rb_next(node);
                if (!node)
                        break;
        }
        if (rbe_prev)
                rb_erase(&rbe_prev->node, &priv->root);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);

        nft_set_gc_batch_complete(gcb);

        queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                           nft_set_gc_interval(set));
}

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
                               const struct nft_set_desc *desc)
{
        return sizeof(struct nft_rbtree);
}

static int nft_rbtree_init(const struct nft_set *set,
                           const struct nft_set_desc *desc,
                           const struct nlattr * const nla[])
{
        struct nft_rbtree *priv = nft_set_priv(set);

        rwlock_init(&priv->lock);
        seqcount_init(&priv->count);
        priv->root = RB_ROOT;

        INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
        if (set->flags & NFT_SET_TIMEOUT)
                queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                                   nft_set_gc_interval(set));

        return 0;
}

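/*
 * Tear down the set: stop pending gc work, wait for in-flight RCU
 * callbacks with rcu_barrier(), then free every remaining element.
 */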
static void nft_rbtree_destroy(const struct nft_set *set)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe;
        struct rb_node *node;

        cancel_delayed_work_sync(&priv->gc_work);
        rcu_barrier();
        while ((node = priv->root.rb_node) != NULL) {
                rb_erase(node, &priv->root);
                rbe = rb_entry(node, struct nft_rbtree_elem, node);
                nft_set_elem_destroy(set, rbe, true);
        }
}

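/*
 * Lookup is a tree descent, O(log n); memory grows linearly with the
 * number of elements. Concatenated (multi-field) keys are not supported.
 */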
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
                                struct nft_set_estimate *est)
{
        if (desc->field_count > 1)
                return false;

        if (desc->size)
                est->size = sizeof(struct nft_rbtree) +
                            desc->size * sizeof(struct nft_rbtree_elem);
        else
                est->size = ~0;

        est->lookup = NFT_SET_CLASS_O_LOG_N;
        est->space  = NFT_SET_CLASS_O_N;

        return true;
}

struct nft_set_type nft_set_rbtree_type __read_mostly = {
        .owner          = THIS_MODULE,
        .features       = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
        .ops            = {
                .privsize       = nft_rbtree_privsize,
                .elemsize       = offsetof(struct nft_rbtree_elem, ext),
                .estimate       = nft_rbtree_estimate,
                .init           = nft_rbtree_init,
                .destroy        = nft_rbtree_destroy,
                .insert         = nft_rbtree_insert,
                .remove         = nft_rbtree_remove,
                .deactivate     = nft_rbtree_deactivate,
                .flush          = nft_rbtree_flush,
                .activate       = nft_rbtree_activate,
                .lookup         = nft_rbtree_lookup,
                .walk           = nft_rbtree_walk,
                .get            = nft_rbtree_get,
        },
};