// SPDX-License-Identifier: GPL-2.0-only
/*
 * nf_tables rbtree set backend: interval-capable set type with
 * lockless (seqcount-validated) lookups and periodic garbage
 * collection of expired elements.
 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/rbtree.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables.h>
17
/* Per-set private data, obtained via nft_set_priv(). */
struct nft_rbtree {
	struct rb_root root;		/* tree of nft_rbtree_elem nodes */
	rwlock_t lock;			/* serializes writers and slow-path readers */
	seqcount_t count;		/* lets lockless readers detect writers */
	struct delayed_work gc_work;	/* periodic collection of expired elements */
};
24
/* One set element: rbtree linkage plus the generic extension area. */
struct nft_rbtree_elem {
	struct rb_node node;		/* linkage in nft_rbtree.root */
	struct nft_set_ext ext;		/* key/flags/... extensions; ops.elemsize is
					 * offsetof(struct nft_rbtree_elem, ext) */
};
29
30static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
31{
32 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
33 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
34}
35
36static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
37 const struct nft_rbtree_elem *interval)
38{
39 return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
40}
41
/*
 * Lockless rbtree descent looking up @key.
 *
 * Comparison convention (shared with __nft_rbtree_insert()):
 * d = memcmp(node_key, key) and d < 0 descends to the LEFT child, so
 * keys greater than a node's key live in its left subtree.
 *
 * Returns true and sets *ext on a match.  Returns false on a miss, and
 * also as soon as read_seqcount_retry() detects a concurrent writer —
 * the caller cannot distinguish the two and must retry under
 * priv->lock (see nft_rbtree_lookup()).
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Bail out as soon as a writer invalidates this walk. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Adjacent intervals: if this node is an interval
			 * end with the same key as the recorded interval
			 * start candidate, keep the start as candidate.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact match: skip elements not active in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}
			/* Hitting an interval-end key exactly means @key
			 * lies outside any interval: no match.
			 */
			if (nft_rbtree_interval_end(rbe))
				goto out;

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets, the closest preceding active
	 * interval-start element covers @key.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_interval_end(interval)) {
		*ext = &interval->ext;
		return true;
	}
out:
	return false;
}
94
/*
 * .lookup callback (packet path).  Fast path: lockless walk validated
 * by the seqcount.  If that walk raced with a writer, retry under the
 * read lock, which excludes writers.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	/* Slow path: priv->lock keeps writers out for the second walk. */
	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
113
114static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
115 const u32 *key, struct nft_rbtree_elem **elem,
116 unsigned int seq, unsigned int flags, u8 genmask)
117{
118 struct nft_rbtree_elem *rbe, *interval = NULL;
119 struct nft_rbtree *priv = nft_set_priv(set);
120 const struct rb_node *parent;
121 const void *this;
122 int d;
123
124 parent = rcu_dereference_raw(priv->root.rb_node);
125 while (parent != NULL) {
126 if (read_seqcount_retry(&priv->count, seq))
127 return false;
128
129 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
130
131 this = nft_set_ext_key(&rbe->ext);
132 d = memcmp(this, key, set->klen);
133 if (d < 0) {
134 parent = rcu_dereference_raw(parent->rb_left);
135 if (!(flags & NFT_SET_ELEM_INTERVAL_END))
136 interval = rbe;
137 } else if (d > 0) {
138 parent = rcu_dereference_raw(parent->rb_right);
139 if (flags & NFT_SET_ELEM_INTERVAL_END)
140 interval = rbe;
141 } else {
142 if (!nft_set_elem_active(&rbe->ext, genmask))
143 parent = rcu_dereference_raw(parent->rb_left);
144
145 if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
146 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
147 (flags & NFT_SET_ELEM_INTERVAL_END)) {
148 *elem = rbe;
149 return true;
150 }
151 return false;
152 }
153 }
154
155 if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
156 nft_set_elem_active(&interval->ext, genmask) &&
157 ((!nft_rbtree_interval_end(interval) &&
158 !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
159 (nft_rbtree_interval_end(interval) &&
160 (flags & NFT_SET_ELEM_INTERVAL_END)))) {
161 *elem = interval;
162 return true;
163 }
164
165 return false;
166}
167
/*
 * .get callback: fetch the element for @elem's key, selecting the
 * start or end element through NFT_SET_ELEM_INTERVAL_END in @flags.
 *
 * Same two-stage scheme as nft_rbtree_lookup(): lockless attempt
 * first, retry under the read lock on a seqcount race.  Returns the
 * element or ERR_PTR(-ENOENT) (rbe's initial value doubles as the
 * not-found result of the fast path).
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
191
/*
 * Insert @new into the tree; the caller holds the writer locks.
 *
 * Keys compare with memcmp(); note the reversed convention — d < 0
 * (existing key smaller than new) descends LEFT.  Elements with equal
 * keys are disambiguated by their interval-end flag so both bounds of
 * an interval may carry the same key.  Returns 0 on success, or
 * -EEXIST with *ext set to the clashing element's extensions if an
 * active element of the same key and flavour already exists.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);	/* insertion targets the next generation */
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else {
			/* Same key: route start vs. end elements to
			 * different sides so they can coexist.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(new)) {
				p = &parent->rb_left;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;
			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
				/* Active duplicate of the same flavour. */
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				/* Inactive duplicate: keep descending. */
				p = &parent->rb_left;
			}
		}
	}
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
233
/*
 * .insert callback: wrap __nft_rbtree_insert() in the write lock and a
 * seqcount write section so concurrent lockless lookups retry.
 * Returns 0 or -EEXIST (with *ext set by the inner helper).
 */
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}
250
251static void nft_rbtree_remove(const struct net *net,
252 const struct nft_set *set,
253 const struct nft_set_elem *elem)
254{
255 struct nft_rbtree *priv = nft_set_priv(set);
256 struct nft_rbtree_elem *rbe = elem->priv;
257
258 write_lock_bh(&priv->lock);
259 write_seqcount_begin(&priv->count);
260 rb_erase(&rbe->node, &priv->root);
261 write_seqcount_end(&priv->count);
262 write_unlock_bh(&priv->lock);
263}
264
265static void nft_rbtree_activate(const struct net *net,
266 const struct nft_set *set,
267 const struct nft_set_elem *elem)
268{
269 struct nft_rbtree_elem *rbe = elem->priv;
270
271 nft_set_elem_change_active(net, set, &rbe->ext);
272 nft_set_elem_clear_busy(&rbe->ext);
273}
274
275static bool nft_rbtree_flush(const struct net *net,
276 const struct nft_set *set, void *priv)
277{
278 struct nft_rbtree_elem *rbe = priv;
279
280 if (!nft_set_elem_mark_busy(&rbe->ext) ||
281 !nft_is_active(net, &rbe->ext)) {
282 nft_set_elem_change_active(net, set, &rbe->ext);
283 return true;
284 }
285 return false;
286}
287
/*
 * .deactivate callback: find the tree element matching @elem (same key
 * and same interval-end flavour), deactivate it for the next
 * generation and return it; NULL if no active match exists.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Equal keys: steer towards the element whose
			 * interval-end flag matches @this, mirroring the
			 * ordering used by __nft_rbtree_insert().
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(this)) {
				parent = parent->rb_left;
				continue;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				/* Already inactive in the next generation. */
				parent = parent->rb_left;
				continue;
			}
			/* Found: toggle it inactive (unless busy). */
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
326
327static void nft_rbtree_walk(const struct nft_ctx *ctx,
328 struct nft_set *set,
329 struct nft_set_iter *iter)
330{
331 struct nft_rbtree *priv = nft_set_priv(set);
332 struct nft_rbtree_elem *rbe;
333 struct nft_set_elem elem;
334 struct rb_node *node;
335
336 read_lock_bh(&priv->lock);
337 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
338 rbe = rb_entry(node, struct nft_rbtree_elem, node);
339
340 if (iter->count < iter->skip)
341 goto cont;
342 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
343 goto cont;
344
345 elem.priv = rbe;
346
347 iter->err = iter->fn(ctx, set, iter, &elem);
348 if (iter->err < 0) {
349 read_unlock_bh(&priv->lock);
350 return;
351 }
352cont:
353 iter->count++;
354 }
355 read_unlock_bh(&priv->lock);
356}
357
/*
 * Periodic garbage collection for sets with timeouts: walk the tree
 * under the writer locks, batch expired (and not busy) interval-start
 * elements together with the preceding end element, unlink them, and
 * pass the batch to nft_set_gc_batch_complete().  Re-arms itself at
 * the set's GC interval.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the most recent end element; it is only
		 * collected together with an expired start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		/* Busy elements are skipped this round. */
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erasing the current node would invalidate rb_next(),
		 * so the previous victim is unlinked now and the current
		 * one deferred via rbe_prev (final erase after the loop).
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* NOTE(review): node advances here AND in the loop header,
		 * stepping past the element after an expired one — confirm
		 * against upstream history that no element is skipped.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
415
/*
 * .privsize callback: bytes of per-set private storage to allocate —
 * just struct nft_rbtree; elements are allocated separately.
 */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
421
422static int nft_rbtree_init(const struct nft_set *set,
423 const struct nft_set_desc *desc,
424 const struct nlattr * const nla[])
425{
426 struct nft_rbtree *priv = nft_set_priv(set);
427
428 rwlock_init(&priv->lock);
429 seqcount_init(&priv->count);
430 priv->root = RB_ROOT;
431
432 INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
433 if (set->flags & NFT_SET_TIMEOUT)
434 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
435 nft_set_gc_interval(set));
436
437 return 0;
438}
439
/*
 * .destroy callback: tear down the whole set.  Stops the GC worker
 * first, then waits out RCU (presumably for in-flight callbacks from
 * GC batches — confirm), and finally unlinks and destroys every
 * remaining element.  rb_erase() runs before rb_entry() since the
 * latter only computes a pointer offset from @node.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}
454
455static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
456 struct nft_set_estimate *est)
457{
458 if (desc->size)
459 est->size = sizeof(struct nft_rbtree) +
460 desc->size * sizeof(struct nft_rbtree_elem);
461 else
462 est->size = ~0;
463
464 est->lookup = NFT_SET_CLASS_O_LOG_N;
465 est->space = NFT_SET_CLASS_O_N;
466
467 return true;
468}
469
/*
 * rbtree set type registration: O(log n) lookup, O(n) space,
 * supporting intervals, maps, object maps and timeouts.
 */
struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};
489