// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

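/*
 * Per-set private data. Lookups from the packet path run locklessly and
 * validate their view of the tree against ->count, a seqcount bumped
 * around every modification; a lookup that races with a writer retries
 * under ->lock taken as a reader. All updates serialize on ->lock as
 * writers. ->gc_work periodically reaps expired elements for sets with
 * NFT_SET_TIMEOUT.
 */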
struct nft_rbtree {
        struct rb_root          root;
        rwlock_t                lock;
        seqcount_t              count;
        struct delayed_work     gc_work;
};

struct nft_rbtree_elem {
        struct rb_node          node;
        struct nft_set_ext      ext;
};

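/*
 * An interval is stored as two elements: the start element carries the
 * data, while the closing element has NFT_SET_ELEM_INTERVAL_END set in
 * its extension flags. These helpers test which endpoint an element is.
 */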
static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
        return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
               (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
        return !nft_rbtree_interval_end(rbe);
}

static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
                             const struct nft_rbtree_elem *interval)
{
        return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

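/*
 * Lockless tree descent: every child pointer is re-read under RCU and the
 * caller's seqcount snapshot is rechecked at each step, bailing out with
 * false as soon as a concurrent writer is detected. While descending, the
 * closest element with a key below the lookup key is remembered; if no
 * exact match is found, that candidate still matches when it is an
 * active, unexpired interval start, i.e. the key falls inside a range.
 */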
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
                                const u32 *key, const struct nft_set_ext **ext,
                                unsigned int seq)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        const struct nft_rbtree_elem *rbe, *interval = NULL;
        u8 genmask = nft_genmask_cur(net);
        const struct rb_node *parent;
        const void *this;
        int d;

        parent = rcu_dereference_raw(priv->root.rb_node);
        while (parent != NULL) {
                if (read_seqcount_retry(&priv->count, seq))
                        return false;

                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                this = nft_set_ext_key(&rbe->ext);
                d = memcmp(this, key, set->klen);
                if (d < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        if (interval &&
                            nft_rbtree_equal(set, this, interval) &&
                            nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(interval))
                                continue;
                        interval = rbe;
                } else if (d > 0)
                        parent = rcu_dereference_raw(parent->rb_right);
                else {
                        if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = rcu_dereference_raw(parent->rb_left);
                                continue;
                        }

                        if (nft_set_elem_expired(&rbe->ext))
                                return false;

                        if (nft_rbtree_interval_end(rbe)) {
                                if (nft_set_is_anonymous(set))
                                        return false;
                                parent = rcu_dereference_raw(parent->rb_left);
                                interval = NULL;
                                continue;
                        }

                        *ext = &rbe->ext;
                        return true;
                }
        }

        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
            nft_set_elem_active(&interval->ext, genmask) &&
            !nft_set_elem_expired(&interval->ext) &&
            nft_rbtree_interval_start(interval)) {
                *ext = &interval->ext;
                return true;
        }

        return false;
}

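/*
 * Fast path first: a lockless walk is valid if the seqcount did not move.
 * Only a lookup that raced with an update is retried under the read lock,
 * where writers are excluded and the walk can no longer be invalidated.
 */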
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
                              const u32 *key, const struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        unsigned int seq = read_seqcount_begin(&priv->count);
        bool ret;

        ret = __nft_rbtree_lookup(net, set, key, ext, seq);
        if (ret || !read_seqcount_retry(&priv->count, seq))
                return ret;

        read_lock_bh(&priv->lock);
        seq = read_seqcount_begin(&priv->count);
        ret = __nft_rbtree_lookup(net, set, key, ext, seq);
        read_unlock_bh(&priv->lock);

        return ret;
}

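/*
 * Control-plane counterpart of __nft_rbtree_lookup(): the
 * NFT_SET_ELEM_INTERVAL_END bit in @flags selects which endpoint of an
 * interval to fetch, and the element itself is returned rather than just
 * its extension.
 */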
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
                             const u32 *key, struct nft_rbtree_elem **elem,
                             unsigned int seq, unsigned int flags, u8 genmask)
{
        struct nft_rbtree_elem *rbe, *interval = NULL;
        struct nft_rbtree *priv = nft_set_priv(set);
        const struct rb_node *parent;
        const void *this;
        int d;

        parent = rcu_dereference_raw(priv->root.rb_node);
        while (parent != NULL) {
                if (read_seqcount_retry(&priv->count, seq))
                        return false;

                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                this = nft_set_ext_key(&rbe->ext);
                d = memcmp(this, key, set->klen);
                if (d < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        if (!(flags & NFT_SET_ELEM_INTERVAL_END))
                                interval = rbe;
                } else if (d > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                        if (flags & NFT_SET_ELEM_INTERVAL_END)
                                interval = rbe;
                } else {
                        if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = rcu_dereference_raw(parent->rb_left);
                                continue;
                        }

                        if (nft_set_elem_expired(&rbe->ext))
                                return false;

                        if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
                            (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
                            (flags & NFT_SET_ELEM_INTERVAL_END)) {
                                *elem = rbe;
                                return true;
                        }

                        if (nft_rbtree_interval_end(rbe))
                                interval = NULL;

                        parent = rcu_dereference_raw(parent->rb_left);
                }
        }

        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
            nft_set_elem_active(&interval->ext, genmask) &&
            !nft_set_elem_expired(&interval->ext) &&
            ((!nft_rbtree_interval_end(interval) &&
              !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
             (nft_rbtree_interval_end(interval) &&
              (flags & NFT_SET_ELEM_INTERVAL_END)))) {
                *elem = interval;
                return true;
        }

        return false;
}

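/* Element retrieval for the netlink GET path, using the same
 * lockless-then-locked retry scheme as nft_rbtree_lookup().
 */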
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
                            const struct nft_set_elem *elem, unsigned int flags)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        unsigned int seq = read_seqcount_begin(&priv->count);
        struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
        const u32 *key = (const u32 *)&elem->key.val;
        u8 genmask = nft_genmask_cur(net);
        bool ret;

        ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
        if (ret || !read_seqcount_retry(&priv->count, seq))
                return rbe;

        read_lock_bh(&priv->lock);
        seq = read_seqcount_begin(&priv->count);
        ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
        if (!ret)
                rbe = ERR_PTR(-ENOENT);
        read_unlock_bh(&priv->lock);

        return rbe;
}

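/*
 * Insert helper, called with the write lock held. Returns -EEXIST when an
 * active element with the same key and endpoint type already exists (the
 * core may ignore this unless NLM_F_EXCL was requested), and -ENOTEMPTY
 * when the new element would partially overlap an existing interval.
 */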
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
                               struct nft_rbtree_elem *new,
                               struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        struct nft_rbtree_elem *rbe;
        struct rb_node *parent, **p;
        bool overlap = false;
        int d;

        /* Detect partial overlaps as we descend the tree. The overlap flag is
         * raised whenever the element being inserted would land inside a
         * foreign interval, i.e. next to an existing, active and unexpired
         * endpoint of the wrong kind, and lowered again once an endpoint
         * proving the contrary is found closer to the insertion point. Due to
         * the tree ordering, endpoints belonging to the intervals closest to
         * the new element are always visited last, so the flag's final value
         * tells whether a partial overlap exists.
         *
         * An exact match on both key and endpoint type is not treated as an
         * overlap: it resolves to -EEXIST below, which the core can suppress
         * when NLM_F_EXCL is not requested.
         */
        parent = NULL;
        p = &priv->root.rb_node;
        while (*p != NULL) {
                parent = *p;
                rbe = rb_entry(parent, struct nft_rbtree_elem, node);
                d = memcmp(nft_set_ext_key(&rbe->ext),
                           nft_set_ext_key(&new->ext),
                           set->klen);
                if (d < 0) {
                        p = &parent->rb_left;

                        if (nft_rbtree_interval_start(new)) {
                                if (nft_rbtree_interval_end(rbe) &&
                                    nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext))
                                        overlap = false;
                        } else {
                                overlap = nft_rbtree_interval_end(rbe) &&
                                          nft_set_elem_active(&rbe->ext,
                                                              genmask) &&
                                          !nft_set_elem_expired(&rbe->ext);
                        }
                } else if (d > 0) {
                        p = &parent->rb_right;

                        if (nft_rbtree_interval_end(new)) {
                                overlap = nft_rbtree_interval_end(rbe) &&
                                          nft_set_elem_active(&rbe->ext,
                                                              genmask) &&
                                          !nft_set_elem_expired(&rbe->ext);
                        } else if (nft_rbtree_interval_end(rbe) &&
                                   nft_set_elem_active(&rbe->ext, genmask) &&
                                   !nft_set_elem_expired(&rbe->ext)) {
                                overlap = true;
                        }
                } else {
                        if (nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(new)) {
                                p = &parent->rb_left;

                                if (nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext))
                                        overlap = false;
                        } else if (nft_rbtree_interval_start(rbe) &&
                                   nft_rbtree_interval_end(new)) {
                                p = &parent->rb_right;

                                if (nft_set_elem_active(&rbe->ext, genmask) &&
                                    !nft_set_elem_expired(&rbe->ext))
                                        overlap = false;
                        } else if (nft_set_elem_active(&rbe->ext, genmask) &&
                                   !nft_set_elem_expired(&rbe->ext)) {
                                *ext = &rbe->ext;
                                return -EEXIST;
                        } else {
                                p = &parent->rb_left;
                        }
                }
        }

        if (overlap)
                return -ENOTEMPTY;

        rb_link_node_rcu(&new->node, parent, p);
        rb_insert_color(&new->node, &priv->root);
        return 0;
}

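/* Updates run under the write lock; bumping the seqcount forces any
 * concurrent lockless lookup to retry on the locked slow path.
 */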
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
                             const struct nft_set_elem *elem,
                             struct nft_set_ext **ext)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe = elem->priv;
        int err;

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        err = __nft_rbtree_insert(net, set, rbe, ext);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);

        return err;
}

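/* Unlink an element from the tree; the node itself stays valid for
 * concurrent RCU readers and is freed later by the caller.
 */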
static void nft_rbtree_remove(const struct net *net,
                              const struct nft_set *set,
                              const struct nft_set_elem *elem)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe = elem->priv;

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        rb_erase(&rbe->node, &priv->root);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);
}

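/* Commit path: flip the element's generation bit so it becomes active,
 * and clear any busy mark taken at deactivation or GC time.
 */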
static void nft_rbtree_activate(const struct net *net,
                                const struct nft_set *set,
                                const struct nft_set_elem *elem)
{
        struct nft_rbtree_elem *rbe = elem->priv;

        nft_set_elem_change_active(net, set, &rbe->ext);
        nft_set_elem_clear_busy(&rbe->ext);
}

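/* Try to deactivate an element: succeeds if the element was not already
 * marked busy (e.g. claimed by GC) or is no longer active in the current
 * generation. Used for set flushes and from nft_rbtree_deactivate().
 */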
static bool nft_rbtree_flush(const struct net *net,
                             const struct nft_set *set, void *priv)
{
        struct nft_rbtree_elem *rbe = priv;

        if (!nft_set_elem_mark_busy(&rbe->ext) ||
            !nft_is_active(net, &rbe->ext)) {
                nft_set_elem_change_active(net, set, &rbe->ext);
                return true;
        }
        return false;
}

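/* Find the tree node backing @elem, stepping over the opposite endpoint
 * type on key ties so a start only matches a start and an end only an
 * end, then deactivate it for the next generation.
 */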
static void *nft_rbtree_deactivate(const struct net *net,
                                   const struct nft_set *set,
                                   const struct nft_set_elem *elem)
{
        const struct nft_rbtree *priv = nft_set_priv(set);
        const struct rb_node *parent = priv->root.rb_node;
        struct nft_rbtree_elem *rbe, *this = elem->priv;
        u8 genmask = nft_genmask_next(net);
        int d;

        while (parent != NULL) {
                rbe = rb_entry(parent, struct nft_rbtree_elem, node);

                d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
                           set->klen);
                if (d < 0)
                        parent = parent->rb_left;
                else if (d > 0)
                        parent = parent->rb_right;
                else {
                        if (nft_rbtree_interval_end(rbe) &&
                            nft_rbtree_interval_start(this)) {
                                parent = parent->rb_left;
                                continue;
                        } else if (nft_rbtree_interval_start(rbe) &&
                                   nft_rbtree_interval_end(this)) {
                                parent = parent->rb_right;
                                continue;
                        } else if (!nft_set_elem_active(&rbe->ext, genmask)) {
                                parent = parent->rb_left;
                                continue;
                        }
                        nft_rbtree_flush(net, set, rbe);
                        return rbe;
                }
        }
        return NULL;
}

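/* In-order walk over all active, unexpired elements for dumps and
 * flushes; iter->skip lets an interrupted netlink dump resume where it
 * left off.
 */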
static void nft_rbtree_walk(const struct nft_ctx *ctx,
                            struct nft_set *set,
                            struct nft_set_iter *iter)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe;
        struct nft_set_elem elem;
        struct rb_node *node;

        read_lock_bh(&priv->lock);
        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
                rbe = rb_entry(node, struct nft_rbtree_elem, node);

                if (iter->count < iter->skip)
                        goto cont;
                if (nft_set_elem_expired(&rbe->ext))
                        goto cont;
                if (!nft_set_elem_active(&rbe->ext, iter->genmask))
                        goto cont;

                elem.priv = rbe;

                iter->err = iter->fn(ctx, set, iter, &elem);
                if (iter->err < 0) {
                        read_unlock_bh(&priv->lock);
                        return;
                }
cont:
                iter->count++;
        }
        read_unlock_bh(&priv->lock);
}

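/*
 * Deferred GC for timed-out elements. The tree is keyed so that the walk
 * visits an interval's end element just before its start: the end is
 * stashed in rbe_end and reaped together with an expired start. Erasing
 * the start is deferred by one iteration (rbe_prev) so that rb_next() is
 * never called on a node already removed from the tree.
 */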
static void nft_rbtree_gc(struct work_struct *work)
{
        struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
        struct nft_set_gc_batch *gcb = NULL;
        struct nft_rbtree *priv;
        struct rb_node *node;
        struct nft_set *set;

        priv = container_of(work, struct nft_rbtree, gc_work.work);
        set = nft_set_container_of(priv);

        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
                rbe = rb_entry(node, struct nft_rbtree_elem, node);

                if (nft_rbtree_interval_end(rbe)) {
                        rbe_end = rbe;
                        continue;
                }
                if (!nft_set_elem_expired(&rbe->ext))
                        continue;
                if (nft_set_elem_mark_busy(&rbe->ext))
                        continue;

                if (rbe_prev) {
                        rb_erase(&rbe_prev->node, &priv->root);
                        rbe_prev = NULL;
                }
                gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
                if (!gcb)
                        break;

                atomic_dec(&set->nelems);
                nft_set_gc_batch_add(gcb, rbe);
                rbe_prev = rbe;

                if (rbe_end) {
                        atomic_dec(&set->nelems);
                        nft_set_gc_batch_add(gcb, rbe_end);
                        rb_erase(&rbe_end->node, &priv->root);
                        rbe_end = NULL;
                }
                node = rb_next(node);
                if (!node)
                        break;
        }
        if (rbe_prev)
                rb_erase(&rbe_prev->node, &priv->root);
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);

        nft_set_gc_batch_complete(gcb);

        queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                           nft_set_gc_interval(set));
}

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
                               const struct nft_set_desc *desc)
{
        return sizeof(struct nft_rbtree);
}

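/* A fresh set starts out as an empty tree; the GC worker is only
 * scheduled for sets that can hold expiring elements.
 */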
static int nft_rbtree_init(const struct nft_set *set,
                           const struct nft_set_desc *desc,
                           const struct nlattr * const nla[])
{
        struct nft_rbtree *priv = nft_set_priv(set);

        rwlock_init(&priv->lock);
        seqcount_init(&priv->count);
        priv->root = RB_ROOT;

        INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
        if (set->flags & NFT_SET_TIMEOUT)
                queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                                   nft_set_gc_interval(set));

        return 0;
}

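/* Teardown: stop the GC worker first and let pending RCU callbacks drain
 * via rcu_barrier(), after which no lookups can still be in flight and
 * the tree can be dismantled without locking.
 */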
static void nft_rbtree_destroy(const struct nft_set *set)
{
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe;
        struct rb_node *node;

        cancel_delayed_work_sync(&priv->gc_work);
        rcu_barrier();
        while ((node = priv->root.rb_node) != NULL) {
                rb_erase(node, &priv->root);
                rbe = rb_entry(node, struct nft_rbtree_elem, node);
                nft_set_elem_destroy(set, rbe, true);
        }
}

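/* Advertise cost to the set backend selector: O(log n) lookups and O(n)
 * memory, with no support for concatenated ranges (field_count > 1).
 */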
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
                                struct nft_set_estimate *est)
{
        if (desc->field_count > 1)
                return false;

        if (desc->size)
                est->size = sizeof(struct nft_rbtree) +
                            desc->size * sizeof(struct nft_rbtree_elem);
        else
                est->size = ~0;

        est->lookup = NFT_SET_CLASS_O_LOG_N;
        est->space  = NFT_SET_CLASS_O_N;

        return true;
}

const struct nft_set_type nft_set_rbtree_type = {
        .features       = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
        .ops            = {
                .privsize       = nft_rbtree_privsize,
                .elemsize       = offsetof(struct nft_rbtree_elem, ext),
                .estimate       = nft_rbtree_estimate,
                .init           = nft_rbtree_init,
                .destroy        = nft_rbtree_destroy,
                .insert         = nft_rbtree_insert,
                .remove         = nft_rbtree_remove,
                .deactivate     = nft_rbtree_deactivate,
                .flush          = nft_rbtree_flush,
                .activate       = nft_rbtree_activate,
                .lookup         = nft_rbtree_lookup,
                .walk           = nft_rbtree_walk,
                .get            = nft_rbtree_get,
        },
};