linux/net/netfilter/nft_set_rbtree.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

struct nft_rbtree {
	struct rb_root		root;
	rwlock_t		lock;
	seqcount_rwlock_t	count;
	struct delayed_work	gc_work;
};

struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};
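
/*
 * Concurrency overview: control-plane updates take priv->lock for
 * writing and wrap every tree mutation in the priv->count seqcount.
 * The packet-path lookup first walks the tree locklessly under RCU,
 * rechecking priv->count as it descends, and only retries under
 * read_lock_bh() if a concurrent writer invalidated the lockless walk
 * (see nft_rbtree_lookup() below).
 */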

static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
	return !nft_rbtree_interval_end(rbe);
}

static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
			     const struct nft_rbtree_elem *interval)
{
	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}

static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
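
/*
 * Worked example (illustrative): an interval is stored as two elements,
 * a start element and one carrying NFT_SET_ELEM_INTERVAL_END.  For a key
 * inside the interval, the walk above ends with the start element as the
 * closest key below it, so the final interval check matches.  For a key
 * past the interval, the closest element below it is the end element,
 * the nft_rbtree_interval_start() test fails, and the lookup misses.
 */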

static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}

static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
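
/*
 * Note: nft_rbtree_get() is the control-plane element fetch.  Passing
 * NFT_SET_ELEM_INTERVAL_END in @flags asks for the end element of an
 * interval rather than the start element, which is why __nft_rbtree_get()
 * tracks candidate intervals from opposite directions depending on the
 * flag.
 */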

static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	bool overlap = false, dup_end_left = false, dup_end_right = false;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	/* Detect overlaps as we descend the tree. Set the flag in these cases:
	 *
	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
	 *
	 * and clear it later on, as we eventually reach the points indicated by
	 * '?' above, in the cases described below. We'll always meet these
	 * later, locally, due to tree ordering, and overlaps for the intervals
	 * that are the closest together are always evaluated last.
	 *
	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
	 *            '--' no nodes falling in this range
	 * b4.          >|_ _   !  (insert start before existing start)
	 *
	 * Case a3. resolves to b3.:
	 * - if the inserted start element is the leftmost, because the '0'
	 *   element in the tree serves as end element
	 * - otherwise, if an existing end is found immediately to the left. If
	 *   there are existing nodes in between, we need to further descend the
	 *   tree before we can conclude the new start isn't causing an overlap
	 *
	 * or to b4., which, preceded by a3., means we already traversed one or
	 * more existing intervals entirely, from the right.
	 *
	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
	 * in that order.
	 *
	 * The flag is also cleared in two special cases:
	 *
	 * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
	 * b6. |__ _ >|!__ _ _   (insert end right after existing start)
	 *
	 * which always happen as the last step and imply that no further
	 * overlapping is possible.
	 *
	 * Another special case comes from the fact that start elements matching
	 * an already existing start element are allowed: insertion is not
	 * performed but we return -EEXIST in that case, and the error will be
	 * cleared by the caller if NLM_F_EXCL is not present in the request.
	 * This way, a request to insert an exact overlap isn't reported to
	 * userspace as an error unless desired.
	 *
	 * However, if the new start matches a pre-existing start, but the new
	 * end element doesn't match the corresponding pre-existing end element,
	 * we need to report a partial overlap. This is a local condition that
	 * can be noticed without the need for a tracking flag, by checking for
	 * a local duplicated end for a corresponding start, from left and
	 * right, separately.
	 */
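
	/* Concrete illustration (ours, assuming the usual start/end element
	 * encoding): with 10-20 already in the set,
	 *  - re-adding 10-20 matches the existing start element exactly and
	 *    returns -EEXIST, which the caller drops unless NLM_F_EXCL was
	 *    given;
	 *  - adding 15-25 partially overlaps: the descent sets 'overlap'
	 *    without any clearing case firing, so insertion fails with
	 *    -ENOTEMPTY.
	 */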

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				if (dup_end_left && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_right = true;
					continue;
				}
			}
		} else if (d > 0) {
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				if (dup_end_right && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_left = true;
					continue;
				}
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				p = &parent->rb_left;
			}
		}

		dup_end_left = dup_end_right = false;
	}

	if (overlap)
		return -ENOTEMPTY;

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}

static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}

static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}

static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}

static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}

static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}

static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;
		if (nft_set_elem_expired(&rbe->ext))
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}

static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
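
/*
 * Example of a set that exercises this GC (sketch in nft(8) syntax; the
 * table and set names are made up):
 *
 *	nft add set inet filter myset \
 *		'{ type ipv4_addr; flags interval,timeout; timeout 2m; }'
 *
 * With NFT_SET_TIMEOUT set, nft_rbtree_init() below queues gc_work, and
 * each pass reaps expired start elements together with their end
 * elements.
 */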

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}

static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}

static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}

static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	if (desc->field_count > 1)
		return false;

	if (desc->size)
		est->size = sizeof(struct nft_rbtree) +
			    desc->size * sizeof(struct nft_rbtree_elem);
	else
		est->size = ~0;

	est->lookup = NFT_SET_CLASS_O_LOG_N;
	est->space  = NFT_SET_CLASS_O_N;

	return true;
}

const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};
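
/*
 * Backend selection note: nf_tables chooses among set backends using the
 * estimates above; the rbtree advertises O(log n) lookups and O(n) space
 * and supports NFT_SET_INTERVAL, so it typically serves interval sets,
 * while plain value sets usually go to the hash backends.
 */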