1
2
3
4
5
6
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/rbtree.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_core.h>
17
/* Per-set private data.
 *
 * Writers (insert/remove/GC) serialize on @lock and wrap every tree
 * mutation in @count updates; lockless readers sample @count before a
 * walk and retry under the read lock if it changed mid-walk.
 */
struct nft_rbtree {
	struct rb_root root;		/* tree of nft_rbtree_elem, keyed by element key */
	rwlock_t lock;			/* serializes writers; readers take it only on retry */
	seqcount_rwlock_t count;	/* mutation counter validating lockless walks */
	struct delayed_work gc_work;	/* periodic expired-element garbage collection */
};
24
/* One set element: rbtree linkage plus the generic extension area
 * (key, flags, ...) accessed through the nft_set_ext_* helpers.
 */
struct nft_rbtree_elem {
	struct rb_node node;	/* linkage in nft_rbtree.root */
	struct nft_set_ext ext;	/* element extensions (key, flags, expiry, ...) */
};
29
30static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
31{
32 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
33 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
34}
35
36static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
37{
38 return !nft_rbtree_interval_end(rbe);
39}
40
41static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
42 const struct nft_rbtree_elem *interval)
43{
44 return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
45}
46
/* Lockless rbtree walk looking for an exact match on @key or, for
 * interval sets, the closest interval-start element covering it.
 *
 * Runs without the read lock: every iteration revalidates @seq against
 * the tree's seqcount and returns false if a writer interfered, letting
 * the caller retry under the read lock. On success *ext is set to the
 * matching element's extension area.
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* A concurrent writer touched the tree: abort this walk. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Adjacent intervals: if this end element has the
			 * same key as the start candidate already held,
			 * keep the start candidate instead of replacing it.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match: skip elements not visible in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* A key matching an interval end is a miss; named
			 * sets keep scanning for a covering interval.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets, the recorded start element
	 * (if active and not expired) covers @key.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
109
110INDIRECT_CALLABLE_SCOPE
111bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
112 const u32 *key, const struct nft_set_ext **ext)
113{
114 struct nft_rbtree *priv = nft_set_priv(set);
115 unsigned int seq = read_seqcount_begin(&priv->count);
116 bool ret;
117
118 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
119 if (ret || !read_seqcount_retry(&priv->count, seq))
120 return ret;
121
122 read_lock_bh(&priv->lock);
123 seq = read_seqcount_begin(&priv->count);
124 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
125 read_unlock_bh(&priv->lock);
126
127 return ret;
128}
129
/* Tree walk backing the ->get() operation: find the element matching
 * @key whose start/end type agrees with NFT_SET_ELEM_INTERVAL_END in
 * @flags.
 *
 * Like __nft_rbtree_lookup() this runs locklessly, revalidating @seq at
 * every step and returning false so the caller can retry under the read
 * lock if a writer interfered. On success *elem points to the element.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Writer activity invalidates this lockless walk. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Track the closest candidate matching the type
			 * requested in @flags.
			 */
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			/* Exact key match: skip generations not visible. */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Accept when the element's start/end flag matches
			 * the request; elements without a FLAGS extension
			 * match either request.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			/* Same key but wrong type: resume the descent. */
			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: fall back to the recorded candidate of the
	 * requested type, if it is visible and not expired.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
193
194static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
195 const struct nft_set_elem *elem, unsigned int flags)
196{
197 struct nft_rbtree *priv = nft_set_priv(set);
198 unsigned int seq = read_seqcount_begin(&priv->count);
199 struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
200 const u32 *key = (const u32 *)&elem->key.val;
201 u8 genmask = nft_genmask_cur(net);
202 bool ret;
203
204 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
205 if (ret || !read_seqcount_retry(&priv->count, seq))
206 return rbe;
207
208 read_lock_bh(&priv->lock);
209 seq = read_seqcount_begin(&priv->count);
210 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
211 if (!ret)
212 rbe = ERR_PTR(-ENOENT);
213 read_unlock_bh(&priv->lock);
214
215 return rbe;
216}
217
/* Insert @new into the tree.
 *
 * Return: 0 on success; -EEXIST (with *ext set to the clashing element)
 * if an active, unexpired element of the same key and type already
 * exists; -ENOTEMPTY if the new element would overlap an existing
 * active interval.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	bool overlap = false, dup_end_left = false, dup_end_right = false;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	/* Overlap detection happens while descending towards the insertion
	 * point: @overlap holds a tentative verdict and may be flipped
	 * several times as the descent narrows the position down; only its
	 * value once a leaf slot is reached is acted upon.
	 *
	 * Only elements active in the next generation and not expired take
	 * part in the detection, so re-adding elements deleted in the same
	 * transaction is not rejected.
	 *
	 * @dup_end_left / @dup_end_right handle duplicate interval-end
	 * elements sharing one key: they record that the previous step
	 * skipped over an active end element while placing another end
	 * element, which is rejected with -ENOTEMPTY once a leaf slot is
	 * reached on the corresponding side.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				/* About to land right next to an active end
				 * element (left slot empty): that interval
				 * closes here, so clear any earlier verdict.
				 */
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				/* Placing an end element after skipping a
				 * duplicate active end: reject outright.
				 */
				if (dup_end_left && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				/* Skipped past an active end while placing
				 * an end element: remember it and keep the
				 * dup flags intact for the next step.
				 */
				if (overlap) {
					dup_end_right = true;
					continue;
				}
			}
		} else if (d > 0) {
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				if (dup_end_right && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_left = true;
					continue;
				}
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* A new start descending right of an active
				 * end element falls inside that interval.
				 */
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			/* Same key: an incoming start descends left of an
			 * existing end, an incoming end descends right of
			 * an existing start; an exact same-type match
			 * against an active element is a duplicate.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive or expired duplicate: descend. */
				p = &parent->rb_left;
			}
		}

		/* Any step that did not "continue" above ends a run of
		 * duplicate end elements.
		 */
		dup_end_left = dup_end_right = false;
	}

	if (overlap)
		return -ENOTEMPTY;

	/* Publish the node; rb_link_node_rcu() makes the linkage safe for
	 * concurrent lockless readers.
	 */
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
366
367static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
368 const struct nft_set_elem *elem,
369 struct nft_set_ext **ext)
370{
371 struct nft_rbtree *priv = nft_set_priv(set);
372 struct nft_rbtree_elem *rbe = elem->priv;
373 int err;
374
375 write_lock_bh(&priv->lock);
376 write_seqcount_begin(&priv->count);
377 err = __nft_rbtree_insert(net, set, rbe, ext);
378 write_seqcount_end(&priv->count);
379 write_unlock_bh(&priv->lock);
380
381 return err;
382}
383
384static void nft_rbtree_remove(const struct net *net,
385 const struct nft_set *set,
386 const struct nft_set_elem *elem)
387{
388 struct nft_rbtree *priv = nft_set_priv(set);
389 struct nft_rbtree_elem *rbe = elem->priv;
390
391 write_lock_bh(&priv->lock);
392 write_seqcount_begin(&priv->count);
393 rb_erase(&rbe->node, &priv->root);
394 write_seqcount_end(&priv->count);
395 write_unlock_bh(&priv->lock);
396}
397
398static void nft_rbtree_activate(const struct net *net,
399 const struct nft_set *set,
400 const struct nft_set_elem *elem)
401{
402 struct nft_rbtree_elem *rbe = elem->priv;
403
404 nft_set_elem_change_active(net, set, &rbe->ext);
405 nft_set_elem_clear_busy(&rbe->ext);
406}
407
408static bool nft_rbtree_flush(const struct net *net,
409 const struct nft_set *set, void *priv)
410{
411 struct nft_rbtree_elem *rbe = priv;
412
413 if (!nft_set_elem_mark_busy(&rbe->ext) ||
414 !nft_is_active(net, &rbe->ext)) {
415 nft_set_elem_change_active(net, set, &rbe->ext);
416 return true;
417 }
418 return false;
419}
420
/* ->deactivate() operation: locate the element matching @elem (key and
 * start/end type) and mark it inactive in the next generation.
 *
 * Returns the element found, or NULL if no visible element of the same
 * key and type exists.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key, different start/end type: keep
			 * descending in the direction insertion uses for
			 * this start/end combination.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Not visible in the next generation. */
				parent = parent->rb_left;
				continue;
			}
			/* Found: flip it inactive via the flush helper. */
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
459
460static void nft_rbtree_walk(const struct nft_ctx *ctx,
461 struct nft_set *set,
462 struct nft_set_iter *iter)
463{
464 struct nft_rbtree *priv = nft_set_priv(set);
465 struct nft_rbtree_elem *rbe;
466 struct nft_set_elem elem;
467 struct rb_node *node;
468
469 read_lock_bh(&priv->lock);
470 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
471 rbe = rb_entry(node, struct nft_rbtree_elem, node);
472
473 if (iter->count < iter->skip)
474 goto cont;
475 if (nft_set_elem_expired(&rbe->ext))
476 goto cont;
477 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
478 goto cont;
479
480 elem.priv = rbe;
481
482 iter->err = iter->fn(ctx, set, iter, &elem);
483 if (iter->err < 0) {
484 read_unlock_bh(&priv->lock);
485 return;
486 }
487cont:
488 iter->count++;
489 }
490 read_unlock_bh(&priv->lock);
491}
492
/* Periodic garbage-collection work: remove expired elements.
 *
 * Runs from a workqueue. Mutates the tree under the writer lock and
 * seqcount so lockless readers retry, batches expired start elements
 * together with their preceding end element, hands the batch over for
 * release, and re-queues itself.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the latest end element; it is only collected
		 * together with an expired start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Skip busy elements. NOTE(review): this presumably
		 * test-and-sets the busy bit, skipping elements referenced
		 * by an in-flight transaction — confirm against
		 * nft_set_elem_mark_busy().
		 */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erasing the previously batched start element was
		 * deferred so the iteration above stayed on a valid node.
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* Advance one extra node past the element just batched. */
		node = rb_next(node);
		if (!node)
			break;
	}
	/* Erase the last deferred element, if any. */
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	/* An expired catchall element, if any, joins the same batch. */
	rbe = nft_set_catchall_gc(set);
	if (rbe) {
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (gcb)
			nft_set_gc_batch_add(gcb, rbe);
	}
	nft_set_gc_batch_complete(gcb);

	/* Re-arm for the next GC interval. */
	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
556
557static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
558 const struct nft_set_desc *desc)
559{
560 return sizeof(struct nft_rbtree);
561}
562
/* ->init() operation: set up the per-set private data — empty tree,
 * writer lock and the seqcount used by lockless readers — and start
 * periodic GC for sets with element timeouts. Always returns 0.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	/* Ties the seqcount to the rwlock that writers hold. */
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}
580
/* ->destroy() operation: stop GC, wait out pending RCU grace periods,
 * then free every remaining element.
 *
 * NOTE(review): the tree is torn down without taking priv->lock,
 * presumably because the set is no longer reachable by lookups at this
 * point — confirm against the caller.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}
595
596static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
597 struct nft_set_estimate *est)
598{
599 if (desc->field_count > 1)
600 return false;
601
602 if (desc->size)
603 est->size = sizeof(struct nft_rbtree) +
604 desc->size * sizeof(struct nft_rbtree_elem);
605 else
606 est->size = ~0;
607
608 est->lookup = NFT_SET_CLASS_O_LOG_N;
609 est->space = NFT_SET_CLASS_O_N;
610
611 return true;
612}
613
/* rbtree set backend: tree ordered by element key, providing interval
 * set support with O(log n) lookups (see nft_rbtree_estimate()).
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops = {
		.privsize = nft_rbtree_privsize,
		.elemsize = offsetof(struct nft_rbtree_elem, ext),
		.estimate = nft_rbtree_estimate,
		.init = nft_rbtree_init,
		.destroy = nft_rbtree_destroy,
		/* Transaction path (serialized on priv->lock). */
		.insert = nft_rbtree_insert,
		.remove = nft_rbtree_remove,
		.deactivate = nft_rbtree_deactivate,
		.flush = nft_rbtree_flush,
		.activate = nft_rbtree_activate,
		/* Datapath lookup runs locklessly with seqcount retry. */
		.lookup = nft_rbtree_lookup,
		.walk = nft_rbtree_walk,
		.get = nft_rbtree_get,
	},
};
632