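/* Device maps (BPF_MAP_TYPE_DEVMAP) are array-like maps whose values are
 * net_device ifindexes. They back the bpf_redirect_map() XDP helper: an XDP
 * program picks an egress device by map index, the frame is placed on a
 * per-CPU bulk queue for that device, and the batch is transmitted via
 * ndo_xdp_xmit() when the driver flushes at the end of its NAPI poll.
 */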
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>

#define DEV_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

#define DEV_MAP_BULK_SIZE 16
struct bpf_dtab_netdev;

struct xdp_bulk_queue {
	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct net_device *dev_rx;
	struct bpf_dtab_netdev *obj;
	unsigned int count;
};

struct bpf_dtab_netdev {
	struct net_device *dev; /* must be first member, due to tracepoint */
	struct bpf_dtab *dtab;
	unsigned int bit;
	struct xdp_bulk_queue __percpu *bulkq;
	struct rcu_head rcu;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map;
	struct list_head __percpu *flush_list;
	struct list_head list;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err, cpu;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
	 * verifier prevents writes from the BPF side
	 */
	attr->map_flags |= BPF_F_RDONLY_PROG;

	dtab = kzalloc(sizeof(*dtab), GFP_USER);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&dtab->map, attr);

	/* make sure page count doesn't overflow */
	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
	cost += sizeof(struct list_head) * num_possible_cpus();

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&dtab->map.memory, cost);
	if (err)
		goto free_dtab;

	err = -ENOMEM;

	dtab->flush_list = alloc_percpu(struct list_head);
	if (!dtab->flush_list)
		goto free_charge;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->netdev_map)
		goto free_percpu;

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;

free_percpu:
	free_percpu(dtab->flush_list);
free_charge:
	bpf_map_charge_finish(&dtab->map.memory);
free_dtab:
	kfree(dtab);
	return ERR_PTR(err);
}

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i, cpu;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete. The rcu critical section only guarantees
	 * no further reads against netdev_map. It does __not__ ensure pending
	 * flush operations (if any) are complete.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	bpf_clear_redirect_map(map);
	synchronize_rcu();

	/* Make sure prior __dev_map_entry_free() have completed. */
	rcu_barrier();

	/* To ensure all pending flush operations have completed, wait for the
	 * flush list to empty on _all_ cpus. Because the above
	 * synchronize_rcu() ensures the map is disconnected from the program,
	 * we can assume no new items will be added to the list.
	 */
	for_each_online_cpu(cpu) {
		struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);

		while (!list_empty(flush_list))
			cond_resched();
	}

	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev;

		dev = dtab->netdev_map[i];
		if (!dev)
			continue;

		free_percpu(dev->bulkq);
		dev_put(dev->dev);
		kfree(dev);
	}

	free_percpu(dtab->flush_list);
	bpf_map_area_free(dtab->netdev_map);
	kfree(dtab);
}

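/* Iteration helper for the map syscall interface: a NULL or out-of-range key
 * restarts iteration at index 0, and asking for the key after the last slot
 * returns -ENOENT.
 */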
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

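/* Transmit everything queued in @bq via the device's ndo_xdp_xmit(). If the
 * driver returns an error, none of the frames were sent, so they are all
 * freed here and counted as drops. The queue is emptied and unlinked from the
 * per-CPU flush list in either case.
 */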
static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
		       bool in_napi_ctx)
{
	struct bpf_dtab_netdev *obj = bq->obj;
	struct net_device *dev = obj->dev;
	int sent = 0, drops = 0, err = 0;
	int i;

	if (unlikely(!bq->count))
		return 0;

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}

	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
	if (sent < 0) {
		err = sent;
		sent = 0;
		goto error;
	}
	drops = bq->count - sent;
out:
	bq->count = 0;

	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
			      sent, drops, bq->dev_rx, dev, err);
	bq->dev_rx = NULL;
	__list_del_clearprev(&bq->flush_node);
	return 0;
error:
	/* If ndo_xdp_xmit fails with an errno, no frames have been
	 * xmit'ed, so it is our responsibility to free them all here.
	 */
	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		/* RX path under NAPI protection, can return frames faster */
		if (likely(in_napi_ctx))
			xdp_return_frame_rx_napi(xdpf);
		else
			xdp_return_frame(xdpf);
		drops++;
	}
	goto out;
}

/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the flush list
 * is empty before completing to ensure all flush operations have completed.
 */
void __dev_map_flush(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
	struct xdp_bulk_queue *bq, *tmp;

	rcu_read_lock();
	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
		bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
	rcu_read_unlock();
}

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete
 * and/or update happens in parallel here, a dev_put won't happen until after
 * reading the ifindex.
 */
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *obj;

	if (key >= map->max_entries)
		return NULL;

	obj = READ_ONCE(dtab->netdev_map[key]);
	return obj;
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
		      struct net_device *dev_rx)
{
	struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);

	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0, true);

	/* Ingress dev_rx will be the same for all xdp_frame's in the bulk
	 * queue, because bq is stored per CPU and must be flushed from the
	 * net_device driver's NAPI handler before it returns.
	 */
	if (!bq->dev_rx)
		bq->dev_rx = dev_rx;

	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);

	return 0;
}

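/* Fast-path enqueue after a successful bpf_redirect_map(): convert the
 * xdp_buff into an xdp_frame and append it to this CPU's bulk queue for the
 * destination device. Actual transmission happens when the queue is flushed.
 */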
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
		    struct net_device *dev_rx)
{
	struct net_device *dev = dst->dev;
	struct xdp_frame *xdpf;
	int err;

	if (!dev->netdev_ops->ndo_xdp_xmit)
		return -EOPNOTSUPP;

	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
	if (unlikely(err))
		return err;

	xdpf = convert_to_xdp_frame(xdp);
	if (unlikely(!xdpf))
		return -EOVERFLOW;

	return bq_enqueue(dst, xdpf, dev_rx);
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog)
{
	int err;

	err = xdp_ok_fwd_dev(dst->dev, skb->len);
	if (unlikely(err))
		return err;
	skb->dev = dst->dev;
	generic_xdp_tx(skb, xdp_prog);

	return 0;
}

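/* Lookup (from both the syscall and the BPF program side) returns a pointer
 * to the stored device's ifindex. Writes through that pointer are prevented
 * by the BPF_F_RDONLY_PROG flag forced on in dev_map_alloc().
 */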
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
	struct net_device *dev = obj ? obj->dev : NULL;

	return dev ? &dev->ifindex : NULL;
}

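/* Flush any frames still queued for this entry on any CPU before its memory
 * and device reference are released.
 */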
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
		struct xdp_bulk_queue *bq;
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			bq = per_cpu_ptr(dev->bulkq, cpu);
			bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
		}
		rcu_read_unlock();
	}
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_map_flush_old(dev);
	free_percpu(dev->bulkq);
	dev_put(dev->dev);
	kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use call_rcu() here to ensure any rcu critical sections have
	 * completed, but this does not guarantee a flush has happened yet,
	 * because the driver-side rcu_read_lock/unlock only protects the
	 * running XDP program. For pending flush operations the dev and ctx
	 * are stored per CPU, and additionally the driver tear down ensures
	 * all soft irqs are complete before removing the net device once
	 * dev_put drops the last reference.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}

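/* Update is only allowed from the syscall side. The value is an ifindex;
 * ifindex 0 clears the slot. BPF_NOEXIST is rejected, so BPF_ANY and
 * BPF_EXIST both simply replace whatever the slot held before.
 */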
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct net *net = current->nsproxy->net_ns;
	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 ifindex = *(u32 *)value;
	struct xdp_bulk_queue *bq;
	u32 i = *(u32 *)key;
	int cpu;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		dev = NULL;
	} else {
		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
		if (!dev)
			return -ENOMEM;

		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
						sizeof(void *), gfp);
		if (!dev->bulkq) {
			kfree(dev);
			return -ENOMEM;
		}

		for_each_possible_cpu(cpu) {
			bq = per_cpu_ptr(dev->bulkq, cpu);
			bq->obj = dev;
		}

		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			free_percpu(dev->bulkq);
			kfree(dev);
			return -EINVAL;
		}

		dev->bit = i;
		dev->dtab = dtab;
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed.
	 * Remember that the driver-side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}

const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
	.map_check_btf = map_check_no_btf,
};

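/* When a net_device is unregistered, purge it from every devmap so a stale
 * entry can never hand back a device that is going away.
 */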
static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i;

	switch (event) {
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = READ_ONCE(dtab->netdev_map[i]);
				if (!dev || netdev != dev->dev)
					continue;
				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
				if (dev == odev)
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
		     offsetof(struct _bpf_dtab_netdev, dev));
	register_netdevice_notifier(&dev_map_notifier);
	return 0;
}

subsys_initcall(dev_map_init);