// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allows a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
	return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
	if (umem->need_wakeup & XDP_WAKEUP_RX)
		return;

	umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (umem->need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
	if (!(umem->need_wakeup & XDP_WAKEUP_RX))
		return;

	umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	if (!(umem->need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
	return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);

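/* If a buffer crosses a page boundary, we need to do two memcpy's, one for
 * each page. This is only required in copy mode.
 */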
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
			     u32 len, u32 metalen)
{
	void *to_buf = xdp_umem_get_data(umem, addr);

	addr = xsk_umem_add_offset_to_addr(addr);
	if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
		void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
		u64 page_start = addr & ~(PAGE_SIZE - 1);
		u64 first_len = PAGE_SIZE - (addr - page_start);

		memcpy(to_buf, from_buf, first_len + metalen);
		memcpy(next_pg_addr, from_buf + first_len, len - first_len);

		return;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	u64 offset = xs->umem->headroom;
	u64 addr, memcpy_addr;
	void *from_buf;
	u32 metalen;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);

	offset += metalen;
	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
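		/* Matches smp_wmb() in bind(). */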
		smp_rmb();
		return true;
	}
	return false;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	u64 offset = xs->umem->headroom;
	void *buffer;
	u64 addr;
	int err;

	spin_lock_bh(&xs->rx_lock);

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		err = -ENOSPC;
		goto out_drop;
	}

	addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);

	addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (err)
		goto out_drop;

	xskq_discard_addr(xs->umem->fq);
	xskq_produce_flush_desc(xs->rx);

	spin_unlock_bh(&xs->rx_lock);

	xs->sk.sk_data_ready(&xs->sk);
	return 0;

out_drop:
	xs->rx_dropped++;
out_unlock:
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, desc, umem))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
			goto out;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
					       XDP_WAKEUP_TX);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);

		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
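			/* SKB was consumed by dev_direct_xmit(); do not free it here */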
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	struct xdp_umem *umem;

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	dev = xs->dev;
	umem = xs->umem;

	if (umem->need_wakeup) {
		if (dev->netdev_ops->ndo_xsk_wakeup)
			dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
							umem->need_wakeup);
		else
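			/* Poll needs to drive Tx also in copy mode */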
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

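	/* Make sure queue is ready before it can be seen by others */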
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

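	/* Wait for driver to stop using the xdp socket. */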
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
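	/* This function removes the current XDP socket from all the
	 * maps it resides in. Two locks are involved: each map has its
	 * own lock, and the socket has a map_list_lock protecting its
	 * map_list. To avoid having to hold both at once, grab a
	 * reference to the map while only holding map_list_lock (see
	 * xsk_get_map_list_entry()), drop that lock, and then ask the
	 * map itself to delete the socket from the entry. Repeat until
	 * the list is empty.
	 */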
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

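/* Check whether the umem pages are contiguous. In zero-copy mode the DMA
 * addresses are compared; in all other modes the kernel virtual addresses
 * are used. The result is stored in the low bits of each page's addr.
 */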
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
	struct xdp_umem_page *pgs = umem->pages;
	int i, is_contig;

	for (i = 0; i < umem->npgs - 1; i++) {
		is_contig = (flags & XDP_ZEROCOPY) ?
			(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
			(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
		pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
	}
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
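			/* Cannot specify flags for shared sockets. */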
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
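			/* This socket already has its own umem. */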
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}
		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
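		/* This xsk has its own umem. */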
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;

		xsk_check_page_contiguity(xs->umem, flags);
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
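		/* Matches the smp_rmb() in xsk_is_bound(): make sure the
		 * socket is fully initialized before its state is set to
		 * XSK_BOUND.
		 */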
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

struct xdp_umem_reg_v1 {
	__u64 addr;
	__u64 len;
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
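			/* Tx needs to be explicitly woken up the first time */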
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_user(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

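		/* Make sure umem is ready before it can be seen by others */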
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
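			/* struct xdp_ring_offset is identical to
			 * struct xdp_ring_offset_v1 except for the flags
			 * field added at the end, so reuse the v1 helpers
			 * and fill in the flags offsets separately.
			 */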
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

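		/* Matches the smp_wmb() in XDP_UMEM_REG */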
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

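	/* Matches the smp_wmb() in xsk_init_queue */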
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

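				/* Clear device references in umem. */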
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name = "XDP",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family = PF_XDP,
	.owner = THIS_MODULE,
	.release = xsk_release,
	.bind = xsk_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = sock_no_getname,
	.poll = xsk_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = xsk_setsockopt,
	.getsockopt = xsk_getsockopt,
	.sendmsg = xsk_sendmsg,
	.recvmsg = sock_no_recvmsg,
	.mmap = xsk_mmap,
	.sendpage = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner = THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);