1
2
3
4
5
6
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/rbtree.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_core.h>
17
/* Per-set private data of the rbtree set backend.
 *
 * Lookups run locklessly under RCU and validate against @count; writers
 * (insert/remove/GC) serialize on @lock and wrap their updates in the
 * seqcount so racing lockless readers detect interference and retry.
 */
struct nft_rbtree {
	struct rb_root root;		/* tree of nft_rbtree_elem, ordered by set key */
	rwlock_t lock;			/* writer serialization; reader fallback path */
	seqcount_rwlock_t count;	/* write-in-progress detection for lockless readers */
	struct delayed_work gc_work;	/* periodic collection of expired elements */
};
24
/* One set element: rbtree linkage plus the generic nftables element
 * extension area (key, flags, expiration, ...).
 */
struct nft_rbtree_elem {
	struct rb_node node;		/* linkage in nft_rbtree::root */
	struct nft_set_ext ext;		/* key and per-element extensions */
};
29
30static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
31{
32 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
33 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
34}
35
36static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
37{
38 return !nft_rbtree_interval_end(rbe);
39}
40
41static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
42 const struct nft_rbtree_elem *interval)
43{
44 return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
45}
46
/* Lockless lookup of @key under RCU, racing against writers.
 *
 * @seq is the read_seqcount snapshot taken by the caller; the walk aborts
 * and returns false as soon as a concurrent writer is detected, leaving the
 * caller to retry (nft_rbtree_lookup() falls back to priv->lock for that).
 *
 * Note the tree ordering is reversed from the usual convention: when the
 * node key compares smaller than @key (d < 0) the walk descends *left*,
 * i.e. larger keys live in left subtrees.
 *
 * Returns true with *ext pointing at the matching element's extensions,
 * or false on miss / writer interference (*ext untouched).
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Bail out if a writer updated the tree since @seq. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			/* Node key below @key: remember it as the closest
			 * candidate below the key so far, but don't let an
			 * interval end shadow an already-recorded start
			 * element carrying the same key.
			 */
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match. */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Inactive in this generation; a matching
				 * element may still exist further down.
				 */
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				/* Landed on an interval end: a miss for
				 * anonymous sets; otherwise keep searching
				 * for a start element with the same key.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: an interval set still matches if the closest
	 * element below @key is an active, unexpired interval start.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
109
110static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
111 const u32 *key, const struct nft_set_ext **ext)
112{
113 struct nft_rbtree *priv = nft_set_priv(set);
114 unsigned int seq = read_seqcount_begin(&priv->count);
115 bool ret;
116
117 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
118 if (ret || !read_seqcount_retry(&priv->count, seq))
119 return ret;
120
121 read_lock_bh(&priv->lock);
122 seq = read_seqcount_begin(&priv->count);
123 ret = __nft_rbtree_lookup(net, set, key, ext, seq);
124 read_unlock_bh(&priv->lock);
125
126 return ret;
127}
128
/* Tree walk backing the get() operation: find the element with @key whose
 * kind -- interval start vs. end -- matches the NFT_SET_ELEM_INTERVAL_END
 * bit in @flags.
 *
 * Runs locklessly against the @seq snapshot like __nft_rbtree_lookup();
 * returns false if a writer interfered, in which case the caller retries.
 *
 * On an exact-key miss, @interval tracks the closest element of the
 * requested kind, which is handed back for interval sets.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Abort if a writer touched the tree since @seq. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			/* Larger keys live left; rbe is a candidate only
			 * when a start element was requested.
			 */
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			/* Smaller keys live right; rbe is a candidate only
			 * when an end element was requested.
			 */
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Exact hit when rbe's kind agrees with the kind in
			 * @flags; elements without a flags extension match
			 * either kind.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			/* Wrong kind at this key: an end element here closes
			 * the range below, so drop the candidate; then keep
			 * descending left.
			 */
			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* Exact-key miss: fall back to the closest candidate, provided it
	 * is live and of the kind requested in @flags.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
192
193static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
194 const struct nft_set_elem *elem, unsigned int flags)
195{
196 struct nft_rbtree *priv = nft_set_priv(set);
197 unsigned int seq = read_seqcount_begin(&priv->count);
198 struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
199 const u32 *key = (const u32 *)&elem->key.val;
200 u8 genmask = nft_genmask_cur(net);
201 bool ret;
202
203 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
204 if (ret || !read_seqcount_retry(&priv->count, seq))
205 return rbe;
206
207 read_lock_bh(&priv->lock);
208 seq = read_seqcount_begin(&priv->count);
209 ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
210 if (!ret)
211 rbe = ERR_PTR(-ENOENT);
212 read_unlock_bh(&priv->lock);
213
214 return rbe;
215}
216
/* Insert @new into the tree for the next generation, rejecting elements
 * that would (partially) overlap an existing, active, unexpired interval.
 *
 * Larger keys live in left subtrees (d < 0 descends left).  A start and
 * an end element may share the same key; the d == 0 branch steers them
 * apart so they coexist in a well-defined order.
 *
 * Returns 0 on success, -EEXIST (with *ext set to the clashing element's
 * extensions) for a genuine duplicate, or -ENOTEMPTY for an overlap.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	bool overlap = false, dup_end_left = false, dup_end_right = false;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	/* Overlap detection, performed while descending to the insertion
	 * point:
	 *
	 * - @overlap is set tentatively whenever the node just passed
	 *   suggests @new would land inside an existing interval, and
	 *   cleared again when a closer node proves the ranges disjoint.
	 *   Its value on loop exit decides the -ENOTEMPTY return below.
	 *
	 * - @dup_end_left / @dup_end_right remember that an active
	 *   duplicate end element was passed while descending with an end
	 *   element in hand; reaching a leaf on the opposite side with the
	 *   flag still set rejects the insertion immediately.
	 *   NOTE(review): per upstream this guards against node orderings
	 *   produced by rbtree rotations -- confirm against the mainline
	 *   comment block when re-syncing this file.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			/* rbe's key is smaller: descend left (larger keys). */
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				/* New start directly above an active end at
				 * a leaf: the range below is closed, so any
				 * tentative overlap is disproved.
				 */
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				if (dup_end_left && !*p)
					return -ENOTEMPTY;

				/* New end above an existing active end: the
				 * new element would close inside an existing
				 * interval unless disproved further down.
				 */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_right = true;
					continue;
				}
			}
		} else if (d > 0) {
			/* rbe's key is larger: descend right (smaller keys). */
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				if (dup_end_right && !*p)
					return -ENOTEMPTY;

				/* New end below an existing active end:
				 * potential overlap until disproved.
				 */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_left = true;
					continue;
				}
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* New start below an active element: it
				 * opens inside an existing interval only if
				 * that element is an interval end.
				 */
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			/* Same key: a start and an end may coexist; steer
			 * the new element to the side where its kind sorts.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Same key, same kind, still live: genuine
				 * duplicate -- report the clashing element.
				 */
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive or expired duplicate: step past
				 * it and keep descending.
				 */
				p = &parent->rb_left;
			}
		}

		/* This iteration did not extend a run of duplicate end
		 * elements (no continue above): reset the markers.
		 */
		dup_end_left = dup_end_right = false;
	}

	if (overlap)
		return -ENOTEMPTY;

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
365
366static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
367 const struct nft_set_elem *elem,
368 struct nft_set_ext **ext)
369{
370 struct nft_rbtree *priv = nft_set_priv(set);
371 struct nft_rbtree_elem *rbe = elem->priv;
372 int err;
373
374 write_lock_bh(&priv->lock);
375 write_seqcount_begin(&priv->count);
376 err = __nft_rbtree_insert(net, set, rbe, ext);
377 write_seqcount_end(&priv->count);
378 write_unlock_bh(&priv->lock);
379
380 return err;
381}
382
383static void nft_rbtree_remove(const struct net *net,
384 const struct nft_set *set,
385 const struct nft_set_elem *elem)
386{
387 struct nft_rbtree *priv = nft_set_priv(set);
388 struct nft_rbtree_elem *rbe = elem->priv;
389
390 write_lock_bh(&priv->lock);
391 write_seqcount_begin(&priv->count);
392 rb_erase(&rbe->node, &priv->root);
393 write_seqcount_end(&priv->count);
394 write_unlock_bh(&priv->lock);
395}
396
/* Commit one element of a transaction: flip its generation state to
 * active and drop the busy marker taken while the update was pending.
 */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}
406
407static bool nft_rbtree_flush(const struct net *net,
408 const struct nft_set *set, void *priv)
409{
410 struct nft_rbtree_elem *rbe = priv;
411
412 if (!nft_set_elem_mark_busy(&rbe->ext) ||
413 !nft_is_active(net, &rbe->ext)) {
414 nft_set_elem_change_active(net, set, &rbe->ext);
415 return true;
416 }
417 return false;
418}
419
/* Find the tree element matching @elem -- same key AND same kind (interval
 * start vs. end) -- and mark it inactive via nft_rbtree_flush().
 *
 * Returns the element found, or NULL.  The walk takes no lock;
 * NOTE(review): appears to rely on caller-side serialization of
 * transaction updates -- confirm against the nf_tables transaction path.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key, wrong kind: continue on the side where
			 * an element of the wanted kind sorts (mirrors the
			 * d == 0 steering in __nft_rbtree_insert()).
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Inactive duplicate: keep searching. */
				parent = parent->rb_left;
				continue;
			}
			/* Flip the live match inactive (unless it is busy
			 * and owned elsewhere -- see nft_rbtree_flush()).
			 */
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
458
459static void nft_rbtree_walk(const struct nft_ctx *ctx,
460 struct nft_set *set,
461 struct nft_set_iter *iter)
462{
463 struct nft_rbtree *priv = nft_set_priv(set);
464 struct nft_rbtree_elem *rbe;
465 struct nft_set_elem elem;
466 struct rb_node *node;
467
468 read_lock_bh(&priv->lock);
469 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
470 rbe = rb_entry(node, struct nft_rbtree_elem, node);
471
472 if (iter->count < iter->skip)
473 goto cont;
474 if (nft_set_elem_expired(&rbe->ext))
475 goto cont;
476 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
477 goto cont;
478
479 elem.priv = rbe;
480
481 iter->err = iter->fn(ctx, set, iter, &elem);
482 if (iter->err < 0) {
483 read_unlock_bh(&priv->lock);
484 return;
485 }
486cont:
487 iter->count++;
488 }
489 read_unlock_bh(&priv->lock);
490}
491
/* Periodic garbage collection of expired elements, run from @gc_work.
 *
 * Takes the writer lock and bumps the seqcount so lockless readers retry
 * rather than walking a tree being pruned.
 *
 * End elements are never reaped on their own: the most recently seen end
 * (@rbe_end) is collected together with the expired start element that
 * follows it in the walk.  Erasing the current node would invalidate the
 * rb_next() iteration, so removal of the start element is deferred by one
 * round through @rbe_prev.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (nft_rbtree_interval_end(rbe)) {
			/* Remember the end; it is only collected together
			 * with its expired start element below.
			 */
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Busy bit already set: element is in use elsewhere
		 * (presumably a pending transaction) -- leave it for the
		 * next GC round.
		 */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erase the start element queued on the previous round. */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* Advance one extra node: combined with the loop increment
		 * this skips the successor of the element just queued.
		 * NOTE(review): mirrors the upstream GC loop -- confirm the
		 * skipped node cannot itself be an expired element.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	/* Flush the last deferred erase before releasing the lock. */
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	/* Re-arm for the next collection interval. */
	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
549
/* Size of the backend-private area allocated alongside the set; the
 * netlink attributes and set description do not influence it.
 */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
555
/* Initialize the backend-private area of a new set: locking, an empty
 * tree, and -- for timeout-capable sets only -- the periodic GC work.
 * Always returns 0.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	/* Writer lock paired with the seqcount for lockless readers. */
	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	/* Only sets whose elements can expire need garbage collection. */
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}
573
574static void nft_rbtree_destroy(const struct nft_set *set)
575{
576 struct nft_rbtree *priv = nft_set_priv(set);
577 struct nft_rbtree_elem *rbe;
578 struct rb_node *node;
579
580 cancel_delayed_work_sync(&priv->gc_work);
581 rcu_barrier();
582 while ((node = priv->root.rb_node) != NULL) {
583 rb_erase(node, &priv->root);
584 rbe = rb_entry(node, struct nft_rbtree_elem, node);
585 nft_set_elem_destroy(set, rbe, true);
586 }
587}
588
589static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
590 struct nft_set_estimate *est)
591{
592 if (desc->field_count > 1)
593 return false;
594
595 if (desc->size)
596 est->size = sizeof(struct nft_rbtree) +
597 desc->size * sizeof(struct nft_rbtree_elem);
598 else
599 est->size = ~0;
600
601 est->lookup = NFT_SET_CLASS_O_LOG_N;
602 est->space = NFT_SET_CLASS_O_N;
603
604 return true;
605}
606
/* rbtree-backed set type: supports intervals, maps, objects and timeouts,
 * with O(log n) lookup and O(n) space (see nft_rbtree_estimate()).
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops = {
		.privsize = nft_rbtree_privsize,
		.elemsize = offsetof(struct nft_rbtree_elem, ext),
		.estimate = nft_rbtree_estimate,
		.init = nft_rbtree_init,
		.destroy = nft_rbtree_destroy,
		.insert = nft_rbtree_insert,
		.remove = nft_rbtree_remove,
		.deactivate = nft_rbtree_deactivate,
		.flush = nft_rbtree_flush,
		.activate = nft_rbtree_activate,
		.lookup = nft_rbtree_lookup,
		.walk = nft_rbtree_walk,
		.get = nft_rbtree_get,
	},
};
625