/* linux/net/netfilter/nft_set_rbtree.c */
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
   4 *
   5 * Development of this code funded by Astaro AG (http://www.astaro.com/)
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/init.h>
  10#include <linux/module.h>
  11#include <linux/list.h>
  12#include <linux/rbtree.h>
  13#include <linux/netlink.h>
  14#include <linux/netfilter.h>
  15#include <linux/netfilter/nf_tables.h>
  16#include <net/netfilter/nf_tables.h>
  17
/* Per-set private data: an rb-tree of elements, protected by a rwlock
 * for writers plus a seqcount so lockless readers can detect concurrent
 * updates and retry (see __nft_rbtree_lookup()).
 */
struct nft_rbtree {
	struct rb_root		root;	/* tree of nft_rbtree_elem nodes */
	rwlock_t		lock;	/* serializes writers; readers take it only on seqcount retry */
	seqcount_t		count;	/* write section wraps every tree mutation */
	struct delayed_work	gc_work; /* periodic expiry collection (NFT_SET_TIMEOUT sets) */
};
  24
/* One set element: rb-tree linkage plus the variable-size extension
 * area (key, flags, timeout, ...) accessed via the nft_set_ext helpers.
 */
struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};
  29
  30static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
  31{
  32        return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
  33               (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
  34}
  35
  36static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
  37                             const struct nft_rbtree_elem *interval)
  38{
  39        return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
  40}
  41
/* Lockless lookup walk, run under RCU.  @seq is the seqcount snapshot
 * taken by the caller; the walk bails out with false as soon as a
 * concurrent writer is detected, and the caller retries (possibly under
 * the read lock, see nft_rbtree_lookup()).
 *
 * An exact key match returns the element unless it is an interval-end
 * marker.  Otherwise, for interval sets, the last interval element
 * recorded on the way down ("interval") is returned if active and not
 * an end marker.
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* A writer raced with us: abort, caller retries. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Adjacent intervals may share the same key: keep the
			 * already-recorded interval start instead of
			 * overwriting it with the previous interval's end
			 * marker.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact match, but skip elements not active in the
			 * current generation and keep descending.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}
			/* Hitting an end marker exactly means no match. */
			if (nft_rbtree_interval_end(rbe))
				goto out;

			*ext = &rbe->ext;
			return true;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_interval_end(interval)) {
		*ext = &interval->ext;
		return true;
	}
out:
	return false;
}
  94
/* ->lookup() callback.  Fast path: lockless walk validated against the
 * seqcount.  If the walk lost a race with a writer, retry once under
 * the read lock, which excludes writers entirely.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	/* Raced with an update: retry with writers locked out. */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
 113
 114static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
 115                             const u32 *key, struct nft_rbtree_elem **elem,
 116                             unsigned int seq, unsigned int flags, u8 genmask)
 117{
 118        struct nft_rbtree_elem *rbe, *interval = NULL;
 119        struct nft_rbtree *priv = nft_set_priv(set);
 120        const struct rb_node *parent;
 121        const void *this;
 122        int d;
 123
 124        parent = rcu_dereference_raw(priv->root.rb_node);
 125        while (parent != NULL) {
 126                if (read_seqcount_retry(&priv->count, seq))
 127                        return false;
 128
 129                rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 130
 131                this = nft_set_ext_key(&rbe->ext);
 132                d = memcmp(this, key, set->klen);
 133                if (d < 0) {
 134                        parent = rcu_dereference_raw(parent->rb_left);
 135                        if (!(flags & NFT_SET_ELEM_INTERVAL_END))
 136                                interval = rbe;
 137                } else if (d > 0) {
 138                        parent = rcu_dereference_raw(parent->rb_right);
 139                        if (flags & NFT_SET_ELEM_INTERVAL_END)
 140                                interval = rbe;
 141                } else {
 142                        if (!nft_set_elem_active(&rbe->ext, genmask))
 143                                parent = rcu_dereference_raw(parent->rb_left);
 144
 145                        if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
 146                            (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
 147                            (flags & NFT_SET_ELEM_INTERVAL_END)) {
 148                                *elem = rbe;
 149                                return true;
 150                        }
 151                        return false;
 152                }
 153        }
 154
 155        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
 156            nft_set_elem_active(&interval->ext, genmask) &&
 157            ((!nft_rbtree_interval_end(interval) &&
 158              !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
 159             (nft_rbtree_interval_end(interval) &&
 160              (flags & NFT_SET_ELEM_INTERVAL_END)))) {
 161                *elem = interval;
 162                return true;
 163        }
 164
 165        return false;
 166}
 167
/* ->get() callback: find the element matching @elem's key for the
 * netlink get/dump path.  Same lockless-then-locked retry scheme as
 * nft_rbtree_lookup().  Returns the element or ERR_PTR(-ENOENT).
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	/* Raced with an update: retry with writers locked out. */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
 191
/* Insert @new into the tree.  Caller holds the write lock and is inside
 * the seqcount write section (see nft_rbtree_insert()).
 *
 * Note the comparison direction: the walk descends left when the
 * existing key is smaller than the new one.  For duplicate keys,
 * start and end elements of adjacent intervals are kept apart so both
 * can coexist.  Returns -EEXIST (with *ext set to the clashing
 * element's extension) if an active element of the same key and same
 * start/end kind already exists.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else {
			/* Same key: a new interval start goes left of an
			 * existing end marker, a new end marker goes right of
			 * an existing start, so adjacent intervals sharing a
			 * boundary key can coexist.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(new)) {
				p = &parent->rb_left;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;
			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
				/* Active duplicate of the same kind. */
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive duplicate: keep descending. */
				p = &parent->rb_left;
			}
		}
	}
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
 233
 234static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 235                             const struct nft_set_elem *elem,
 236                             struct nft_set_ext **ext)
 237{
 238        struct nft_rbtree *priv = nft_set_priv(set);
 239        struct nft_rbtree_elem *rbe = elem->priv;
 240        int err;
 241
 242        write_lock_bh(&priv->lock);
 243        write_seqcount_begin(&priv->count);
 244        err = __nft_rbtree_insert(net, set, rbe, ext);
 245        write_seqcount_end(&priv->count);
 246        write_unlock_bh(&priv->lock);
 247
 248        return err;
 249}
 250
 251static void nft_rbtree_remove(const struct net *net,
 252                              const struct nft_set *set,
 253                              const struct nft_set_elem *elem)
 254{
 255        struct nft_rbtree *priv = nft_set_priv(set);
 256        struct nft_rbtree_elem *rbe = elem->priv;
 257
 258        write_lock_bh(&priv->lock);
 259        write_seqcount_begin(&priv->count);
 260        rb_erase(&rbe->node, &priv->root);
 261        write_seqcount_end(&priv->count);
 262        write_unlock_bh(&priv->lock);
 263}
 264
 265static void nft_rbtree_activate(const struct net *net,
 266                                const struct nft_set *set,
 267                                const struct nft_set_elem *elem)
 268{
 269        struct nft_rbtree_elem *rbe = elem->priv;
 270
 271        nft_set_elem_change_active(net, set, &rbe->ext);
 272        nft_set_elem_clear_busy(&rbe->ext);
 273}
 274
 275static bool nft_rbtree_flush(const struct net *net,
 276                             const struct nft_set *set, void *priv)
 277{
 278        struct nft_rbtree_elem *rbe = priv;
 279
 280        if (!nft_set_elem_mark_busy(&rbe->ext) ||
 281            !nft_is_active(net, &rbe->ext)) {
 282                nft_set_elem_change_active(net, set, &rbe->ext);
 283                return true;
 284        }
 285        return false;
 286}
 287
/* ->deactivate() callback: locate the tree element matching @elem in
 * both key and interval start/end kind, deactivate it for the next
 * generation via nft_rbtree_flush(), and return it (or NULL if not
 * found).
 *
 * NOTE(review): walks the tree without taking priv->lock — presumably
 * serialized by the nf_tables transaction path; confirm against the
 * callers.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
					   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key but mismatched start/end kind, or not
			 * active in the next generation: keep descending in
			 * the direction the insert path would have taken.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(this)) {
				parent = parent->rb_left;
				continue;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
 326
 327static void nft_rbtree_walk(const struct nft_ctx *ctx,
 328                            struct nft_set *set,
 329                            struct nft_set_iter *iter)
 330{
 331        struct nft_rbtree *priv = nft_set_priv(set);
 332        struct nft_rbtree_elem *rbe;
 333        struct nft_set_elem elem;
 334        struct rb_node *node;
 335
 336        read_lock_bh(&priv->lock);
 337        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
 338                rbe = rb_entry(node, struct nft_rbtree_elem, node);
 339
 340                if (iter->count < iter->skip)
 341                        goto cont;
 342                if (!nft_set_elem_active(&rbe->ext, iter->genmask))
 343                        goto cont;
 344
 345                elem.priv = rbe;
 346
 347                iter->err = iter->fn(ctx, set, iter, &elem);
 348                if (iter->err < 0) {
 349                        read_unlock_bh(&priv->lock);
 350                        return;
 351                }
 352cont:
 353                iter->count++;
 354        }
 355        read_unlock_bh(&priv->lock);
 356}
 357
/* Periodic garbage collection for sets with element timeouts.  Runs
 * from a workqueue.  Under the writer lock, walk the tree in order and
 * move expired, non-busy interval-start elements (plus the most
 * recently seen end marker, if any) into a GC batch for deferred
 * freeing.  Erasing the start element itself is postponed via rbe_prev
 * until the walk has advanced past it.  Reschedules itself at the
 * set's GC interval.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the latest end marker; it is only collected
		 * together with an expired start element below.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Busy elements are in use elsewhere; leave them alone. */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Deferred erase of the previously collected element. */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* NOTE(review): combined with the for-loop increment this
		 * advances two nodes after a collection — verify the node
		 * right after an expired element is intentionally skipped.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	/* Erase the last collected element, if its erase is still pending. */
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
 415
 416static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
 417                               const struct nft_set_desc *desc)
 418{
 419        return sizeof(struct nft_rbtree);
 420}
 421
/* ->init() callback: initialize the lock, seqcount and an empty tree,
 * and start the GC worker for sets with element timeouts.  Always
 * succeeds.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_init(&priv->count);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}
 439
/* ->destroy() callback: stop the GC worker, wait out pending RCU
 * callbacks (rcu_barrier), then dismantle the tree, destroying every
 * remaining element.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}
 454
 455static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 456                                struct nft_set_estimate *est)
 457{
 458        if (desc->size)
 459                est->size = sizeof(struct nft_rbtree) +
 460                            desc->size * sizeof(struct nft_rbtree_elem);
 461        else
 462                est->size = ~0;
 463
 464        est->lookup = NFT_SET_CLASS_O_LOG_N;
 465        est->space  = NFT_SET_CLASS_O_N;
 466
 467        return true;
 468}
 469
/* Registration of the rbtree set backend: supports interval sets, maps,
 * object maps and element timeouts.
 */
struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};
 489