1
2
3
4
5
6
7
8
9
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/list.h>
15#include <linux/rbtree.h>
16#include <linux/netlink.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter/nf_tables.h>
19#include <net/netfilter/nf_tables.h>
20
/* Per-set private data: an rb-tree holding the set elements.
 *
 * Writers serialize on @lock and bump @count around every tree
 * modification; lookups first walk locklessly under RCU using @count
 * (seqcount) to detect concurrent writers, and retry under @lock if a
 * writer was observed.
 */
struct nft_rbtree {
	struct rb_root		root;		/* element tree */
	rwlock_t		lock;		/* writer / fallback-reader lock */
	seqcount_t		count;		/* write-in-progress detector */
	struct delayed_work	gc_work;	/* periodic expiry collection */
};
27
/* One set element: tree linkage plus the generic nf_tables element
 * extension area (key, flags, expiration, ...).
 */
struct nft_rbtree_elem {
	struct rb_node		node;	/* linkage into nft_rbtree.root */
	struct nft_set_ext	ext;	/* key and per-element extensions */
};
32
33static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
34{
35 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
36 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
37}
38
39static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
40{
41 return !nft_rbtree_interval_end(rbe);
42}
43
44static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
45 const struct nft_rbtree_elem *interval)
46{
47 return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
48}
49
/* Lockless lookup walk, called under RCU with @seq being the caller's
 * seqcount snapshot.  Bails out (returning false) as soon as a
 * concurrent writer is detected, so the caller can retry under the
 * read lock.
 *
 * For interval sets an exact key match is not required: @interval
 * tracks the last active start element seen with a key below the
 * lookup key, which matches if no end element shadows it.
 *
 * Returns true and sets *ext on a match; false on miss, expiry, or
 * when a retry is needed.
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* A writer raced with us: abort, caller retries locked. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Don't replace a candidate start element with an
			 * end element carrying the same key.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match; skip elements not active in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				/* Anonymous sets: a key equal to an end
				 * element is an outright miss.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets, fall back to the closest
	 * active, unexpired start element found during the descent.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
112
/* .lookup callback (datapath): first attempt a lockless walk; if it
 * misses AND the seqcount shows a writer interfered, repeat the walk
 * under the read lock where the result is authoritative.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	/* A hit is always valid; a miss is only valid if no writer ran. */
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
131
/* Element retrieval walk used by the netlink GET path: like lookup,
 * but returns the element itself and honours @flags —
 * NFT_SET_ELEM_INTERVAL_END selects end elements instead of start
 * elements.  Lockless; returns false also when @seq shows a concurrent
 * writer, in which case the caller retries under the read lock.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* A writer raced with us: abort, caller retries locked. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Track start candidates only when looking for a
			 * start element.
			 */
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			/* Track end candidates when looking for an end. */
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Exact key match: accept if the element's interval
			 * role (start/end) matches what @flags asks for.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: fall back to the tracked candidate, provided it
	 * is active, unexpired and of the requested interval role.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
195
/* .get callback: two-phase scheme mirroring nft_rbtree_lookup() — try
 * locklessly, and only if a concurrent writer spoiled the walk, repeat
 * it under the read lock.  Returns the element or ERR_PTR(-ENOENT).
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	/* A hit is always valid; a miss is only valid if no writer ran. */
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
219
/* Insert @new into the tree, detecting overlaps with existing active,
 * unexpired intervals while descending.
 *
 * Returns 0 on success, -EEXIST with *ext set to the clashing element
 * if an active element with the same key and interval role already
 * exists, or -ENOTEMPTY if the new element would overlap an existing
 * interval.  Caller holds the write lock and seqcount.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	bool overlap = false;
	int d;

	/* Detect overlaps as we descend the tree. Set the flag in these cases:
	 *
	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
	 *
	 * and clear it later on, as we eventually reach the points indicated by
	 * '?' above, in the cases described below. We'll always meet these
	 * later, locally, due to tree ordering, and overlaps for the intervals
	 * that are the closest together are always evaluated last.
	 *
	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
	 *            '--' no nodes falling in this range
	 * b4.          >|_ _   !  (insert start before existing start)
	 *
	 * Case a3. resolves to b3.:
	 * - if the inserted start element is the leftmost, because the '0'
	 *   element in the tree serves as end element
	 * - otherwise, if an existing end is found immediately to the left. If
	 *   there are existing nodes in between, we need to further descend the
	 *   tree before we can conclude the new start isn't causing an overlap
	 *
	 * or to b4., which, preceded by a3., means we already traversed one or
	 * more existing intervals entirely, from the right.
	 *
	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
	 * in that order.
	 *
	 * The flag is also cleared in two special cases:
	 *
	 * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
	 * b6. |__ _ >|!__ _ _   (insert end right after existing start)
	 *
	 * which always happen as last step and imply that no further
	 * overlapping is possible.
	 */

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				/* Case b3: clear the flag only when landing
				 * as a leaf right of an active end element.
				 */
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				/* Cases a1/b1: inserting an end element. */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);
			}
		} else if (d > 0) {
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				/* Cases a2/b2: inserting an end element. */
				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Cases a3/b4: inserting a start element. */
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			/* Same key: disambiguate by interval role. */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				/* Case b5: start right before existing end. */
				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				/* Case b6: end right after existing start. */
				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Same key, same role, active: duplicate. */
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive/expired duplicate: descend past. */
				p = &parent->rb_left;
			}
		}
	}

	if (overlap)
		return -ENOTEMPTY;

	/* RCU-safe link so lockless readers see a consistent tree. */
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
336
/* .insert callback: serialize against other writers with the rwlock and
 * bump the seqcount around the modification so that in-flight lockless
 * lookups detect the change and retry.
 */
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}
353
/* .remove callback: unlink the element from the tree under the write
 * lock/seqcount.  The element itself is freed elsewhere by the core.
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}
367
/* .activate callback: toggle the element's generation bit so it becomes
 * visible, and drop the busy bit taken on the deactivation path.
 */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}
377
/* .flush callback: try to deactivate one element.
 *
 * nft_set_elem_mark_busy() appears to be a test-and-set of the busy
 * bit returning its prior state (NOTE(review): confirm against
 * nf_tables core).  If the element was not already busy, or is no
 * longer active anyway, flip its generation bit and report success.
 */
static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}
390
/* .deactivate callback: find the element matching @elem (same key AND
 * same interval role) that is active in the next generation, mark it
 * inactive via nft_rbtree_flush(), and return it; NULL if not found.
 * Runs from the transaction path, so no extra locking here.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key, different interval role: keep looking
			 * on the side where the matching role sorts.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Skip elements from other generations. */
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
429
/* .walk callback: in-order iteration over all live elements, invoking
 * iter->fn on each.  Honours iter->skip for netlink dump resumption and
 * stops early if the callback reports an error (left in iter->err).
 */
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Resume point for interrupted dumps. */
		if (iter->count < iter->skip)
			goto cont;
		/* Skip expired and generation-inactive elements. */
		if (nft_set_elem_expired(&rbe->ext))
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}
462
/* Periodic garbage collection work: walk the tree under the write
 * lock/seqcount, batching expired non-busy start elements together with
 * their most recently seen end element, then re-arm itself at the set's
 * GC interval.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the latest end element; it is collected below
		 * together with its expired start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Element is in use elsewhere (busy bit set): leave it. */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Deferred erase: the previous victim is unlinked only now,
		 * when the iterator no longer points at it.
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;	/* batch allocation failed: try again later */

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* Advance one extra node before the loop's own rb_next();
		 * NOTE(review): skips the element following the one just
		 * queued — confirm intended pairing with end elements.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	/* Unlink the last queued victim, if any. */
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	/* Hand the batch over to the core for (RCU-deferred) freeing. */
	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
520
521static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
522 const struct nft_set_desc *desc)
523{
524 return sizeof(struct nft_rbtree);
525}
526
/* .init callback: set up locks, the empty tree and the GC worker; the
 * worker is only armed for sets with element timeouts.  Always succeeds.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_init(&priv->count);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}
544
/* .destroy callback: stop GC, wait out RCU readers, then free every
 * remaining element.  rb_erase() before nft_set_elem_destroy() is fine:
 * rb_entry() is just container_of() and does not dereference the node.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}
559
560static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
561 struct nft_set_estimate *est)
562{
563 if (desc->field_count > 1)
564 return false;
565
566 if (desc->size)
567 est->size = sizeof(struct nft_rbtree) +
568 desc->size * sizeof(struct nft_rbtree_elem);
569 else
570 est->size = ~0;
571
572 est->lookup = NFT_SET_CLASS_O_LOG_N;
573 est->space = NFT_SET_CLASS_O_N;
574
575 return true;
576}
577
/* Backend registration for the rbtree set type: supports interval,
 * map, object and timeout sets.  The core picks this backend based on
 * nft_rbtree_estimate().
 */
struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};
597