/* linux/net/netfilter/nft_set_rbtree.c */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

/* Per-set private data: an rb-tree of elements, written under @lock and
 * read either locklessly (validated through @count) or under @lock.
 */
struct nft_rbtree {
	struct rb_root		root;	/* tree of nft_rbtree_elem nodes */
	rwlock_t		lock;	/* serializes writers vs. locked readers */
	seqcount_t		count;	/* lets lockless lookups detect writers */
	struct delayed_work	gc_work; /* periodic expired-element collection */
};
  24
/* One set element: rb-tree linkage plus the generic nftables element
 * extension area (key, flags, expiration, ...) accessed via nft_set_ext_*().
 */
struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};
  29
  30static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
  31{
  32        return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
  33               (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
  34}
  35
/* Any element that is not an interval end is treated as an interval start. */
static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
	return !nft_rbtree_interval_end(rbe);
}
  40
  41static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
  42                             const struct nft_rbtree_elem *interval)
  43{
  44        return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
  45}
  46
/* Lockless tree walk for a data-path lookup.
 *
 * Descends the tree comparing @key against element keys; tracks the
 * closest smaller element in @interval so that, if no exact match is
 * found, an enclosing interval start can still match. The walk uses
 * rcu_dereference_raw() and is validated against concurrent writers via
 * read_seqcount_retry(@seq); on retry the caller falls back to the
 * locked slow path.
 *
 * Returns true and sets *@ext on a match, false otherwise (including
 * when a seqcount retry is detected mid-walk).
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Abort if a writer invalidated this walk. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* If this end element closes the interval opened by
			 * the candidate we already hold (same key), keep the
			 * start element as the candidate.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match: skip elements not active in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				/* Anonymous sets hold exact matches only:
				 * hitting an end element means no match.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: an active, unexpired interval start whose key is
	 * the closest below @key encloses it.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
 109
/* Data-path lookup entry point.
 *
 * First attempts a lockless walk validated by the seqcount. If that walk
 * was invalidated by a concurrent writer (and did not already succeed),
 * retry once under the read lock, which excludes writers for the
 * duration of the walk.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	/* Slow path: no writer can run concurrently, so the walk is stable. */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
 128
/* Control-plane element retrieval (GETSETELEM).
 *
 * Like __nft_rbtree_lookup() but returns the element itself and honours
 * @flags: with NFT_SET_ELEM_INTERVAL_END set the caller wants the end
 * element of the matching interval, otherwise the start element. The
 * walk is lockless and validated against writers via @seq.
 *
 * Returns true and sets *@elem on a match, false otherwise.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Abort if a writer invalidated this walk. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Closest key below @key: candidate start element. */
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			/* Closest key above @key: candidate end element. */
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Exact match when the element's end/start role agrees
			 * with what the caller asked for (elements without a
			 * FLAGS extension count as start elements).
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: fall back to the tracked candidate if its role
	 * matches the requested one and it is active and unexpired.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
 192
/* .get callback: look up the element matching @elem's key and @flags.
 *
 * Uses the same lockless-first / locked-retry scheme as
 * nft_rbtree_lookup(). Returns the element or ERR_PTR(-ENOENT).
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	/* Slow path under the read lock: writers are excluded. */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
 216
/* Insert @new into the tree, detecting interval overlaps on the way down.
 *
 * Called with the write lock and write seqcount held. Returns 0 on
 * success, -EEXIST if an active element with the same key and role
 * already exists (with *@ext set to it), or -ENOTEMPTY if the new
 * element would overlap an existing interval.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	bool overlap = false;
	int d;

	/* Detect overlaps as we descend the tree. Set the flag in these cases:
	 *
	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
	 *
	 * and clear it later on, as we eventually reach the points indicated by
	 * '?' above, in the cases described below. We'll always meet these
	 * later, locally, due to tree ordering, and overlaps for the intervals
	 * that are the closest together are always evaluated last.
	 *
	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end)
	 *
	 * Case a3. resolves to b3.:
	 * - if the inserted start element is the leftmost, because the '0'
	 *   element in the tree serves as end element
	 * - otherwise, if an existing end is found. Note that end elements are
	 *   always inserted after corresponding start elements.
	 *
	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
	 * in that order.
	 *
	 * The flag is also cleared in two special cases:
	 *
	 * b4. |__ _ _!|<_ _ _   (insert start right before existing end)
	 * b5. |__ _ >|!__ _ _   (insert end right after existing start)
	 *
	 * which always happen as last step and imply that no further
	 * overlapping is possible.
	 */

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			/* New key is smaller: go left. */
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				/* Case b3: existing active end before the new
				 * start clears any pending overlap.
				 */
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else {
				/* Case a1: new end lands before an existing
				 * active end.
				 */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);
			}
		} else if (d > 0) {
			/* New key is larger: go right. */
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				/* Case a2: new end lands after an existing
				 * active end.
				 */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);
			} else if (nft_rbtree_interval_end(rbe) &&
				   nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Case a3: new start after an existing active
				 * end.
				 */
				overlap = true;
			}
		} else {
			/* Same key: roles decide placement. */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				/* Case b4: start right before existing end. */
				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				/* Case b5: end right after existing start. */
				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Same key, same role, active: duplicate. */
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				p = &parent->rb_left;
			}
		}
	}

	if (overlap)
		return -ENOTEMPTY;

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
 328
 329static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 330                             const struct nft_set_elem *elem,
 331                             struct nft_set_ext **ext)
 332{
 333        struct nft_rbtree *priv = nft_set_priv(set);
 334        struct nft_rbtree_elem *rbe = elem->priv;
 335        int err;
 336
 337        write_lock_bh(&priv->lock);
 338        write_seqcount_begin(&priv->count);
 339        err = __nft_rbtree_insert(net, set, rbe, ext);
 340        write_seqcount_end(&priv->count);
 341        write_unlock_bh(&priv->lock);
 342
 343        return err;
 344}
 345
/* .remove callback: unlink the element from the tree under the write
 * lock, bumping the seqcount so in-flight lockless lookups retry.
 * Freeing the element is the caller's responsibility.
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}
 359
/* .activate callback: flip the element's generation bit so it becomes
 * visible, then clear its busy bit (presumably set while the transaction
 * was pending — semantics live in nft_set_elem_mark_busy()'s definition).
 */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}
 369
 370static bool nft_rbtree_flush(const struct net *net,
 371                             const struct nft_set *set, void *priv)
 372{
 373        struct nft_rbtree_elem *rbe = priv;
 374
 375        if (!nft_set_elem_mark_busy(&rbe->ext) ||
 376            !nft_is_active(net, &rbe->ext)) {
 377                nft_set_elem_change_active(net, set, &rbe->ext);
 378                return true;
 379        }
 380        return false;
 381}
 382
/* .deactivate callback: find the tree element matching @elem (same key
 * AND same start/end role), mark it inactive via nft_rbtree_flush(), and
 * return it; NULL if no active matching element exists. Runs under the
 * transaction mutex, so the plain (non-RCU) walk is safe here —
 * no writer can run concurrently.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
					   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key but mismatched role: end elements sort
			 * left of start elements with the same key, so keep
			 * descending on the appropriate side.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Skip elements already inactive in the next
				 * generation.
				 */
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
 421
 422static void nft_rbtree_walk(const struct nft_ctx *ctx,
 423                            struct nft_set *set,
 424                            struct nft_set_iter *iter)
 425{
 426        struct nft_rbtree *priv = nft_set_priv(set);
 427        struct nft_rbtree_elem *rbe;
 428        struct nft_set_elem elem;
 429        struct rb_node *node;
 430
 431        read_lock_bh(&priv->lock);
 432        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
 433                rbe = rb_entry(node, struct nft_rbtree_elem, node);
 434
 435                if (iter->count < iter->skip)
 436                        goto cont;
 437                if (nft_set_elem_expired(&rbe->ext))
 438                        goto cont;
 439                if (!nft_set_elem_active(&rbe->ext, iter->genmask))
 440                        goto cont;
 441
 442                elem.priv = rbe;
 443
 444                iter->err = iter->fn(ctx, set, iter, &elem);
 445                if (iter->err < 0) {
 446                        read_unlock_bh(&priv->lock);
 447                        return;
 448                }
 449cont:
 450                iter->count++;
 451        }
 452        read_unlock_bh(&priv->lock);
 453}
 454
/* Garbage-collection work function: scan the whole tree and batch-free
 * expired, non-busy elements (and the end element paired with an expired
 * start). Runs with the write lock and write seqcount held so lockless
 * lookups retry rather than observe a half-updated tree.
 *
 * Erasing the current start element is deferred via @rbe_prev until the
 * next iteration (or loop exit) so that rb_next() is never called on a
 * node that has already been removed from the tree.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the most recent end element; it is collected
		 * together with its expired start element below.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erase the start element batched on the previous round. */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* Skip the node following the collected start element. */
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	/* Re-arm the periodic GC. */
	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
 512
/* .privsize callback: size of the per-set private area to allocate. */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
 518
 519static int nft_rbtree_init(const struct nft_set *set,
 520                           const struct nft_set_desc *desc,
 521                           const struct nlattr * const nla[])
 522{
 523        struct nft_rbtree *priv = nft_set_priv(set);
 524
 525        rwlock_init(&priv->lock);
 526        seqcount_init(&priv->count);
 527        priv->root = RB_ROOT;
 528
 529        INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
 530        if (set->flags & NFT_SET_TIMEOUT)
 531                queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
 532                                   nft_set_gc_interval(set));
 533
 534        return 0;
 535}
 536
 537static void nft_rbtree_destroy(const struct nft_set *set)
 538{
 539        struct nft_rbtree *priv = nft_set_priv(set);
 540        struct nft_rbtree_elem *rbe;
 541        struct rb_node *node;
 542
 543        cancel_delayed_work_sync(&priv->gc_work);
 544        rcu_barrier();
 545        while ((node = priv->root.rb_node) != NULL) {
 546                rb_erase(node, &priv->root);
 547                rbe = rb_entry(node, struct nft_rbtree_elem, node);
 548                nft_set_elem_destroy(set, rbe, true);
 549        }
 550}
 551
 552static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 553                                struct nft_set_estimate *est)
 554{
 555        if (desc->field_count > 1)
 556                return false;
 557
 558        if (desc->size)
 559                est->size = sizeof(struct nft_rbtree) +
 560                            desc->size * sizeof(struct nft_rbtree_elem);
 561        else
 562                est->size = ~0;
 563
 564        est->lookup = NFT_SET_CLASS_O_LOG_N;
 565        est->space  = NFT_SET_CLASS_O_N;
 566
 567        return true;
 568}
 569
/* rbtree set backend registration: interval-capable with O(log n)
 * lookup; elements embed their nft_set_ext at the given offset.
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};
 588