#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
249
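/* Transmit the skb directly on the device's own TX queue, bypassing the
 * qdisc layer (the PACKET_QDISC_BYPASS path). Frames that cannot be sent
 * are freed here and accounted to dev->tx_dropped on the drop path.
 */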
250static int packet_direct_xmit(struct sk_buff *skb)
251{
252 struct net_device *dev = skb->dev;
253 struct sk_buff *orig_skb = skb;
254 struct netdev_queue *txq;
255 int ret = NETDEV_TX_BUSY;
256
257 if (unlikely(!netif_running(dev) ||
258 !netif_carrier_ok(dev)))
259 goto drop;
260
261 skb = validate_xmit_skb_list(skb, dev);
262 if (skb != orig_skb)
263 goto drop;
264
265 txq = skb_get_tx_queue(dev, skb);
266
267 local_bh_disable();
268
269 HARD_TX_LOCK(dev, txq, smp_processor_id());
270 if (!netif_xmit_frozen_or_drv_stopped(txq))
271 ret = netdev_start_xmit(skb, dev, txq, false);
272 HARD_TX_UNLOCK(dev, txq);
273
274 local_bh_enable();
275
276 if (!dev_xmit_complete(ret))
277 kfree_skb(skb);
278
279 return ret;
280drop:
281 atomic_long_inc(&dev->tx_dropped);
282 kfree_skb_list(skb);
283 return NET_XMIT_DROP;
284}
285
286static struct net_device *packet_cached_dev_get(struct packet_sock *po)
287{
288 struct net_device *dev;
289
290 rcu_read_lock();
291 dev = rcu_dereference(po->cached_dev);
292 if (likely(dev))
293 dev_hold(dev);
294 rcu_read_unlock();
295
296 return dev;
297}
298
299static void packet_cached_dev_assign(struct packet_sock *po,
300 struct net_device *dev)
301{
302 rcu_assign_pointer(po->cached_dev, dev);
303}
304
305static void packet_cached_dev_reset(struct packet_sock *po)
306{
307 RCU_INIT_POINTER(po->cached_dev, NULL);
308}
309
310static bool packet_use_direct_xmit(const struct packet_sock *po)
311{
312 return po->xmit == packet_direct_xmit;
313}
314
315static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
316{
317 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
318}
319
320static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
321{
322 const struct net_device_ops *ops = dev->netdev_ops;
323 u16 queue_index;
324
325 if (ops->ndo_select_queue) {
326 queue_index = ops->ndo_select_queue(dev, skb, NULL,
327 __packet_pick_tx_queue);
328 queue_index = netdev_cap_txqueue(dev, queue_index);
329 } else {
330 queue_index = __packet_pick_tx_queue(dev, skb);
331 }
332
333 skb_set_queue_mapping(skb, queue_index);
334}
335
/* register_prot_hook must be invoked with the po->bind_lock held.
 * It attaches the socket's protocol hook (or links it into its fanout
 * group), takes a reference on the sock and marks the socket as running.
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* __unregister_prot_hook is the reverse operation and must likewise be
 * called with the bind_lock held. If @sync is true the lock is dropped
 * around synchronize_net() so that no receive path can still be running
 * on the hook once we return.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
389
390static inline struct page * __pure pgv_to_page(void *addr)
391{
392 if (is_vmalloc_addr(addr))
393 return vmalloc_to_page(addr);
394 return virt_to_page(addr);
395}
396
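/* The frame status word is the synchronization point between the kernel
 * and the mmap()ed user-space ring: __packet_set_status publishes a frame
 * together with a write barrier, __packet_get_status reads it behind a
 * read barrier. TPACKET_V3 frames are not handled here; their status
 * lives in the block descriptor instead.
 */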
397static void __packet_set_status(struct packet_sock *po, void *frame, int status)
398{
399 union tpacket_uhdr h;
400
401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
404 h.h1->tp_status = status;
405 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
406 break;
407 case TPACKET_V2:
408 h.h2->tp_status = status;
409 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
410 break;
411 case TPACKET_V3:
412 default:
413 WARN(1, "TPACKET version not supported.\n");
414 BUG();
415 }
416
417 smp_wmb();
418}
419
420static int __packet_get_status(struct packet_sock *po, void *frame)
421{
422 union tpacket_uhdr h;
423
424 smp_rmb();
425
426 h.raw = frame;
427 switch (po->tp_version) {
428 case TPACKET_V1:
429 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
430 return h.h1->tp_status;
431 case TPACKET_V2:
432 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
433 return h.h2->tp_status;
434 case TPACKET_V3:
435 default:
436 WARN(1, "TPACKET version not supported.\n");
437 BUG();
438 return 0;
439 }
440}
441
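/* Pick the timestamp to report for a frame: a raw hardware stamp if the
 * socket asked for one and the driver provided it, otherwise the software
 * stamp carried in skb->tstamp. Returns the matching TP_STATUS_TS_* flag,
 * or 0 if no usable timestamp exists.
 */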
442static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
443 unsigned int flags)
444{
445 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
446
447 if (shhwtstamps &&
448 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
449 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
450 return TP_STATUS_TS_RAW_HARDWARE;
451
452 if (ktime_to_timespec_cond(skb->tstamp, ts))
453 return TP_STATUS_TS_SOFTWARE;
454
455 return 0;
456}
457
458static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
459 struct sk_buff *skb)
460{
461 union tpacket_uhdr h;
462 struct timespec ts;
463 __u32 ts_status;
464
465 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
466 return 0;
467
468 h.raw = frame;
469 switch (po->tp_version) {
470 case TPACKET_V1:
471 h.h1->tp_sec = ts.tv_sec;
472 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
473 break;
474 case TPACKET_V2:
475 h.h2->tp_sec = ts.tv_sec;
476 h.h2->tp_nsec = ts.tv_nsec;
477 break;
478 case TPACKET_V3:
479 default:
480 WARN(1, "TPACKET version not supported.\n");
481 BUG();
482 }
483
484
485 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
486 smp_wmb();
487
488 return ts_status;
489}
490
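/* Map a frame index in the V1/V2 ring onto its kernel virtual address.
 * Returns NULL unless the frame currently carries the requested status,
 * so callers can test for a free (TP_STATUS_KERNEL) or a user-owned frame
 * while looking it up.
 */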
491static void *packet_lookup_frame(struct packet_sock *po,
492 struct packet_ring_buffer *rb,
493 unsigned int position,
494 int status)
495{
496 unsigned int pg_vec_pos, frame_offset;
497 union tpacket_uhdr h;
498
499 pg_vec_pos = position / rb->frames_per_block;
500 frame_offset = position % rb->frames_per_block;
501
502 h.raw = rb->pg_vec[pg_vec_pos].buffer +
503 (frame_offset * rb->frame_size);
504
505 if (status != __packet_get_status(po, h.raw))
506 return NULL;
507
508 return h.raw;
509}
510
511static void *packet_current_frame(struct packet_sock *po,
512 struct packet_ring_buffer *rb,
513 int status)
514{
515 return packet_lookup_frame(po, rb, rb->head, status);
516}
517
518static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
519{
520 del_timer_sync(&pkc->retire_blk_timer);
521}
522
523static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
524 struct sk_buff_head *rb_queue)
525{
526 struct tpacket_kbdq_core *pkc;
527
528 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
529
530 spin_lock_bh(&rb_queue->lock);
531 pkc->delete_blk_timer = 1;
532 spin_unlock_bh(&rb_queue->lock);
533
534 prb_del_retire_blk_timer(pkc);
535}
536
537static void prb_init_blk_timer(struct packet_sock *po,
538 struct tpacket_kbdq_core *pkc,
539 void (*func) (unsigned long))
540{
541 init_timer(&pkc->retire_blk_timer);
542 pkc->retire_blk_timer.data = (long)po;
543 pkc->retire_blk_timer.function = func;
544 pkc->retire_blk_timer.expires = jiffies;
545}
546
547static void prb_setup_retire_blk_timer(struct packet_sock *po)
548{
549 struct tpacket_kbdq_core *pkc;
550
551 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
552 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
553}
554
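/* Compute the default block-retire timeout for a TPACKET_V3 ring from the
 * link speed and the block size: roughly the time it takes to fill one
 * block at line rate, with a floor of DEFAULT_PRB_RETIRE_TOV for slow or
 * unknown links.
 */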
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/* Below 1Gbps (or with an unknown speed) the default
		 * timeout is good enough; above that, scale 1 msec per Gbit.
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
596
597static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
598 union tpacket_req_u *req_u)
599{
600 p1->feature_req_word = req_u->req3.tp_feature_req_word;
601}
602
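/* Initialise the per-ring TPACKET_V3 block-dispatch state: remember the
 * block geometry from the user's request, compute the retire timeout,
 * arm the retire timer and open the first block for writing.
 */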
603static void init_prb_bdqc(struct packet_sock *po,
604 struct packet_ring_buffer *rb,
605 struct pgv *pg_vec,
606 union tpacket_req_u *req_u)
607{
608 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
609 struct tpacket_block_desc *pbd;
610
611 memset(p1, 0x0, sizeof(*p1));
612
613 p1->knxt_seq_num = 1;
614 p1->pkbdq = pg_vec;
615 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
616 p1->pkblk_start = pg_vec[0].buffer;
617 p1->kblk_size = req_u->req3.tp_block_size;
618 p1->knum_blocks = req_u->req3.tp_block_nr;
619 p1->hdrlen = po->tp_hdrlen;
620 p1->version = po->tp_version;
621 p1->last_kactive_blk_num = 0;
622 po->stats.stats3.tp_freeze_q_cnt = 0;
623 if (req_u->req3.tp_retire_blk_tov)
624 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
625 else
626 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
627 req_u->req3.tp_block_size);
628 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
629 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
630
631 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
632 prb_init_ft_ops(p1, req_u);
633 prb_setup_retire_blk_timer(po);
634 prb_open_block(p1, pbd);
635}

/* Re-arm the block-retire timer relative to now and remember which block
 * was active when it was armed.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/* Timer callback for retiring the currently open TPACKET_V3 block when it
 * has not filled up within the configured timeout. Runs under the socket's
 * receive-queue lock so it cannot race with the receive fast path.
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* If the block already holds packets, wait for any receive path
	 * that is still copying data into it to finish before touching it.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block; simply re-arm the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* The queue is frozen: user space has not yet
			 * released the block we want to reuse.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/* Still owned by user space; just re-arm
				 * the timer and check again later.
				 */
				goto refresh_timer;
			} else {
				/* User space returned the block in the
				 * meantime; thaw the queue by re-opening
				 * it and let the fast path continue.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* On architectures with a real flush_dcache_page(), make sure the
	 * block's data pages have reached memory before its status is
	 * published.
	 */
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header page; it is flushed separately below. */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Publish the block to user space. */
	BLOCK_STATUS(pbd1) = status;

	/* Finally flush the block descriptor page itself. */
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Close the currently open block: record the timestamp of the last packet
 * (or the close time if the block is empty), mark it TP_STATUS_USER, flush
 * it out to user space and advance to the next block number.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the timestamp of the last packet in this block. */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Normally an empty block is never closed (see
		 * prb_retire_rx_blk_timer_expired()); if it happens anyway,
		 * fall back to the current time.
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block before handing it to user space. */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Open a block for the kernel to fill: stamp it with the next sequence
 * number and the open time, reset its length to just the header plus the
 * private area, point nxt_offset at the first frame slot and re-arm the
 * retire timer. Side effect: the queue is thawed if it was frozen.
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* Reinitialise the descriptor fields individually rather than
	 * memset() the whole block, so a sticky private area is preserved.
	 */
	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Freezing the queue: when the kernel runs out of free blocks (user space
 * still owns the next block), the queue is marked frozen and incoming
 * packets are dropped (and counted) until user space releases a block.
 * The queue is thawed either by the retire timer or by the receive path
 * noticing that the wanted block has been returned, at which point
 * prb_open_block() re-opens it for the kernel.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * Hand out the next block for the kernel to fill. If user space still owns
 * it, freeze the queue and return NULL; the caller will then drop packets
 * until a block is released. Otherwise open the block and return its first
 * frame slot.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get the current block. */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If it is still in use by user space, freeze the queue. */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/* 3. Open the block and return its first frame slot. */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Only a block still owned by the kernel can be closed and handed
	 * over to user space.
	 */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/* The block-timeout path has already waited for in-flight
		 * copies; any other caller waits here for the receive path
		 * to finish filling the block before closing it.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Find space in the current TPACKET_V3 block for a packet of @len bytes,
 * retiring the block and dispatching the next one if it is full. Returns
 * the frame slot to copy into, or NULL if the ring is out of space (queue
 * frozen and the wanted block still owned by user space).
 */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* The queue is frozen when user space is lagging behind. */
	if (prb_queue_frozen(pkc)) {
		/* Check whether the block that caused the freeze is still
		 * in use by user space; if so this packet cannot be stored.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet. */
			return NULL;
		} else {
			/* User space returned the block in the meantime:
			 * thaw the queue by re-opening the block and fall
			 * through to record the packet in it.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* First try the current block. */
	if (curr + TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block. */
	prb_retire_current_block(pkc, po, 0);

	/* Now try to dispatch the next block. */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* No free blocks are available: user space owns them all. The queue
	 * has just been frozen and this packet will be dropped.
	 */
	return NULL;
}
1101
1102static void *packet_current_rx_frame(struct packet_sock *po,
1103 struct sk_buff *skb,
1104 int status, unsigned int len)
1105{
1106 char *curr = NULL;
1107 switch (po->tp_version) {
1108 case TPACKET_V1:
1109 case TPACKET_V2:
1110 curr = packet_lookup_frame(po, &po->rx_ring,
1111 po->rx_ring.head, status);
1112 return curr;
1113 case TPACKET_V3:
1114 return __packet_lookup_frame_in_block(po, skb, status, len);
1115 default:
1116 WARN(1, "TPACKET version not supported\n");
1117 BUG();
1118 return NULL;
1119 }
1120}
1121
1122static void *prb_lookup_block(struct packet_sock *po,
1123 struct packet_ring_buffer *rb,
1124 unsigned int idx,
1125 int status)
1126{
1127 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1128 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1129
1130 if (status != BLOCK_STATUS(pbd))
1131 return NULL;
1132 return pbd;
1133}
1134
1135static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1136{
1137 unsigned int prev;
1138 if (rb->prb_bdqc.kactive_blk_num)
1139 prev = rb->prb_bdqc.kactive_blk_num-1;
1140 else
1141 prev = rb->prb_bdqc.knum_blocks-1;
1142 return prev;
1143}
1144
1145
1146static void *__prb_previous_block(struct packet_sock *po,
1147 struct packet_ring_buffer *rb,
1148 int status)
1149{
1150 unsigned int previous = prb_previous_blk_num(rb);
1151 return prb_lookup_block(po, rb, previous, status);
1152}
1153
1154static void *packet_previous_rx_frame(struct packet_sock *po,
1155 struct packet_ring_buffer *rb,
1156 int status)
1157{
1158 if (po->tp_version <= TPACKET_V2)
1159 return packet_previous_frame(po, rb, status);
1160
1161 return __prb_previous_block(po, rb, status);
1162}
1163
1164static void packet_increment_rx_head(struct packet_sock *po,
1165 struct packet_ring_buffer *rb)
1166{
1167 switch (po->tp_version) {
1168 case TPACKET_V1:
1169 case TPACKET_V2:
1170 return packet_increment_head(rb);
1171 case TPACKET_V3:
1172 default:
1173 WARN(1, "TPACKET version not supported.\n");
1174 BUG();
1175 return;
1176 }
1177}
1178
1179static void *packet_previous_frame(struct packet_sock *po,
1180 struct packet_ring_buffer *rb,
1181 int status)
1182{
1183 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1184 return packet_lookup_frame(po, rb, previous, status);
1185}
1186
1187static void packet_increment_head(struct packet_ring_buffer *buff)
1188{
1189 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1190}
1191
1192static void packet_inc_pending(struct packet_ring_buffer *rb)
1193{
1194 this_cpu_inc(*rb->pending_refcnt);
1195}
1196
1197static void packet_dec_pending(struct packet_ring_buffer *rb)
1198{
1199 this_cpu_dec(*rb->pending_refcnt);
1200}
1201
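/* Sum the per-cpu count of TX-ring frames that have been handed to the
 * driver but whose destructor has not run yet; the RX ring never allocates
 * a pending counter, so it always reads as zero.
 */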
1202static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1203{
1204 unsigned int refcnt = 0;
1205 int cpu;
1206
1207
1208 if (rb->pending_refcnt == NULL)
1209 return 0;
1210
1211 for_each_possible_cpu(cpu)
1212 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1213
1214 return refcnt;
1215}
1216
1217static int packet_alloc_pending(struct packet_sock *po)
1218{
1219 po->rx_ring.pending_refcnt = NULL;
1220
1221 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1222 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1223 return -ENOBUFS;
1224
1225 return 0;
1226}
1227
1228static void packet_free_pending(struct packet_sock *po)
1229{
1230 free_percpu(po->tx_ring.pending_refcnt);
1231}
1232
1233#define ROOM_POW_OFF 2
1234#define ROOM_NONE 0x0
1235#define ROOM_LOW 0x1
1236#define ROOM_NORMAL 0x2
1237
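/* The *_has_room() helpers answer "can the receive ring (or socket buffer)
 * still absorb packets?", optionally with headroom: a non-zero pow_off asks
 * whether roughly 1/2^pow_off of the ring is still free ahead of the head.
 */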
1238static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1239{
1240 int idx, len;
1241
1242 len = po->rx_ring.frame_max + 1;
1243 idx = po->rx_ring.head;
1244 if (pow_off)
1245 idx += len >> pow_off;
1246 if (idx >= len)
1247 idx -= len;
1248 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1249}
1250
1251static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1252{
1253 int idx, len;
1254
1255 len = po->rx_ring.prb_bdqc.knum_blocks;
1256 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1257 if (pow_off)
1258 idx += len >> pow_off;
1259 if (idx >= len)
1260 idx -= len;
1261 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1262}
1263
1264static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1265{
1266 struct sock *sk = &po->sk;
1267 int ret = ROOM_NONE;
1268
1269 if (po->prot_hook.func != tpacket_rcv) {
1270 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1271 - (skb ? skb->truesize : 0);
1272 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1273 return ROOM_NORMAL;
1274 else if (avail > 0)
1275 return ROOM_LOW;
1276 else
1277 return ROOM_NONE;
1278 }
1279
1280 if (po->tp_version == TPACKET_V3) {
1281 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1282 ret = ROOM_NORMAL;
1283 else if (__tpacket_v3_has_room(po, 0))
1284 ret = ROOM_LOW;
1285 } else {
1286 if (__tpacket_has_room(po, ROOM_POW_OFF))
1287 ret = ROOM_NORMAL;
1288 else if (__tpacket_has_room(po, 0))
1289 ret = ROOM_LOW;
1290 }
1291
1292 return ret;
1293}
1294
1295static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 int ret;
1298 bool has_room;
1299
1300 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1301 ret = __packet_rcv_has_room(po, skb);
1302 has_room = ret == ROOM_NORMAL;
1303 if (po->pressure == has_room)
1304 po->pressure = !has_room;
1305 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1306
1307 return ret;
1308}
1309
1310static void packet_sock_destruct(struct sock *sk)
1311{
1312 skb_queue_purge(&sk->sk_error_queue);
1313
1314 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1315 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1316
1317 if (!sock_flag(sk, SOCK_DEAD)) {
1318 pr_err("Attempt to release alive packet socket: %p\n", sk);
1319 return;
1320 }
1321
1322 sk_refcnt_debug_dec(sk);
1323}
1324
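/* Rough per-socket flow accounting for rollover: remember the last few
 * rxhashes seen and report a flow as "huge" when it dominates the recent
 * history; used under memory pressure to decide whether a flow is worth
 * rolling over to another socket.
 */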
1325static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1326{
1327 u32 rxhash;
1328 int i, count = 0;
1329
1330 rxhash = skb_get_hash(skb);
1331 for (i = 0; i < ROLLOVER_HLEN; i++)
1332 if (po->rollover->history[i] == rxhash)
1333 count++;
1334
1335 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1336 return count > (ROLLOVER_HLEN >> 1);
1337}
1338
1339static unsigned int fanout_demux_hash(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int num)
1342{
1343 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1344}
1345
1346static unsigned int fanout_demux_lb(struct packet_fanout *f,
1347 struct sk_buff *skb,
1348 unsigned int num)
1349{
1350 unsigned int val = atomic_inc_return(&f->rr_cur);
1351
1352 return val % num;
1353}
1354
1355static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int num)
1358{
1359 return smp_processor_id() % num;
1360}
1361
1362static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365{
1366 return prandom_u32_max(num);
1367}
1368
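/* Rollover demux: if @try_self is set, keep delivering to the originally
 * selected socket while it still has room; otherwise scan the other group
 * members for one with room, remember it (po->rollover->sock) and account
 * the rollover statistics.
 */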
1369static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1370 struct sk_buff *skb,
1371 unsigned int idx, bool try_self,
1372 unsigned int num)
1373{
1374 struct packet_sock *po, *po_next, *po_skip = NULL;
1375 unsigned int i, j, room = ROOM_NONE;
1376
1377 po = pkt_sk(f->arr[idx]);
1378
1379 if (try_self) {
1380 room = packet_rcv_has_room(po, skb);
1381 if (room == ROOM_NORMAL ||
1382 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1383 return idx;
1384 po_skip = po;
1385 }
1386
1387 i = j = min_t(int, po->rollover->sock, num - 1);
1388 do {
1389 po_next = pkt_sk(f->arr[i]);
1390 if (po_next != po_skip && !po_next->pressure &&
1391 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1392 if (i != j)
1393 po->rollover->sock = i;
1394 atomic_long_inc(&po->rollover->num);
1395 if (room == ROOM_LOW)
1396 atomic_long_inc(&po->rollover->num_huge);
1397 return i;
1398 }
1399
1400 if (++i == num)
1401 i = 0;
1402 } while (i != j);
1403
1404 atomic_long_inc(&po->rollover->num_failed);
1405 return idx;
1406}
1407
1408static unsigned int fanout_demux_qm(struct packet_fanout *f,
1409 struct sk_buff *skb,
1410 unsigned int num)
1411{
1412 return skb_get_queue_mapping(skb) % num;
1413}
1414
1415static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1416 struct sk_buff *skb,
1417 unsigned int num)
1418{
1419 struct bpf_prog *prog;
1420 unsigned int ret = 0;
1421
1422 rcu_read_lock();
1423 prog = rcu_dereference(f->bpf_prog);
1424 if (prog)
1425 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1426 rcu_read_unlock();
1427
1428 return ret;
1429}
1430
1431static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1432{
1433 return f->flags & (flag >> 8);
1434}
1435
1436static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1437 struct packet_type *pt, struct net_device *orig_dev)
1438{
1439 struct packet_fanout *f = pt->af_packet_priv;
1440 unsigned int num = READ_ONCE(f->num_members);
1441 struct net *net = read_pnet(&f->net);
1442 struct packet_sock *po;
1443 unsigned int idx;
1444
1445 if (!net_eq(dev_net(dev), net) || !num) {
1446 kfree_skb(skb);
1447 return 0;
1448 }
1449
1450 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1451 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1452 if (!skb)
1453 return 0;
1454 }
1455 switch (f->type) {
1456 case PACKET_FANOUT_HASH:
1457 default:
1458 idx = fanout_demux_hash(f, skb, num);
1459 break;
1460 case PACKET_FANOUT_LB:
1461 idx = fanout_demux_lb(f, skb, num);
1462 break;
1463 case PACKET_FANOUT_CPU:
1464 idx = fanout_demux_cpu(f, skb, num);
1465 break;
1466 case PACKET_FANOUT_RND:
1467 idx = fanout_demux_rnd(f, skb, num);
1468 break;
1469 case PACKET_FANOUT_QM:
1470 idx = fanout_demux_qm(f, skb, num);
1471 break;
1472 case PACKET_FANOUT_ROLLOVER:
1473 idx = fanout_demux_rollover(f, skb, 0, false, num);
1474 break;
1475 case PACKET_FANOUT_CBPF:
1476 case PACKET_FANOUT_EBPF:
1477 idx = fanout_demux_bpf(f, skb, num);
1478 break;
1479 }
1480
1481 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1482 idx = fanout_demux_rollover(f, skb, idx, true, num);
1483
1484 po = pkt_sk(f->arr[idx]);
1485 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1486}
1487
1488DEFINE_MUTEX(fanout_mutex);
1489EXPORT_SYMBOL_GPL(fanout_mutex);
1490static LIST_HEAD(fanout_list);
1491
1492static void __fanout_link(struct sock *sk, struct packet_sock *po)
1493{
1494 struct packet_fanout *f = po->fanout;
1495
1496 spin_lock(&f->lock);
1497 f->arr[f->num_members] = sk;
1498 smp_wmb();
1499 f->num_members++;
1500 spin_unlock(&f->lock);
1501}
1502
1503static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1504{
1505 struct packet_fanout *f = po->fanout;
1506 int i;
1507
1508 spin_lock(&f->lock);
1509 for (i = 0; i < f->num_members; i++) {
1510 if (f->arr[i] == sk)
1511 break;
1512 }
1513 BUG_ON(i >= f->num_members);
1514 f->arr[i] = f->arr[f->num_members - 1];
1515 f->num_members--;
1516 spin_unlock(&f->lock);
1517}
1518
1519static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1520{
1521 if (sk->sk_family != PF_PACKET)
1522 return false;
1523
1524 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1525}
1526
1527static void fanout_init_data(struct packet_fanout *f)
1528{
1529 switch (f->type) {
1530 case PACKET_FANOUT_LB:
1531 atomic_set(&f->rr_cur, 0);
1532 break;
1533 case PACKET_FANOUT_CBPF:
1534 case PACKET_FANOUT_EBPF:
1535 RCU_INIT_POINTER(f->bpf_prog, NULL);
1536 break;
1537 }
1538}
1539
1540static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1541{
1542 struct bpf_prog *old;
1543
1544 spin_lock(&f->lock);
1545 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1546 rcu_assign_pointer(f->bpf_prog, new);
1547 spin_unlock(&f->lock);
1548
1549 if (old) {
1550 synchronize_net();
1551 bpf_prog_destroy(old);
1552 }
1553}
1554
1555static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1556 unsigned int len)
1557{
1558 struct bpf_prog *new;
1559 struct sock_fprog fprog;
1560 int ret;
1561
1562 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1563 return -EPERM;
1564 if (len != sizeof(fprog))
1565 return -EINVAL;
1566 if (copy_from_user(&fprog, data, len))
1567 return -EFAULT;
1568
1569 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1570 if (ret)
1571 return ret;
1572
1573 __fanout_set_data_bpf(po->fanout, new);
1574 return 0;
1575}
1576
1577static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1578 unsigned int len)
1579{
1580 struct bpf_prog *new;
1581 u32 fd;
1582
1583 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1584 return -EPERM;
1585 if (len != sizeof(fd))
1586 return -EINVAL;
1587 if (copy_from_user(&fd, data, len))
1588 return -EFAULT;
1589
1590 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1591 if (IS_ERR(new))
1592 return PTR_ERR(new);
1593
1594 __fanout_set_data_bpf(po->fanout, new);
1595 return 0;
1596}
1597
static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	}
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	}
}
1619
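/* Join (or create) a fanout group: sockets in the same net namespace that
 * share a fanout id, mode and protocol/device binding have their prot_hook
 * replaced by the group's packet_rcv_fanout hook, which then demuxes each
 * packet to one member according to the group's mode.
 */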
1620static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1621{
1622 struct packet_sock *po = pkt_sk(sk);
1623 struct packet_fanout *f, *match;
1624 u8 type = type_flags & 0xff;
1625 u8 flags = type_flags >> 8;
1626 int err;
1627
1628 switch (type) {
1629 case PACKET_FANOUT_ROLLOVER:
1630 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1631 return -EINVAL;
1632 case PACKET_FANOUT_HASH:
1633 case PACKET_FANOUT_LB:
1634 case PACKET_FANOUT_CPU:
1635 case PACKET_FANOUT_RND:
1636 case PACKET_FANOUT_QM:
1637 case PACKET_FANOUT_CBPF:
1638 case PACKET_FANOUT_EBPF:
1639 break;
1640 default:
1641 return -EINVAL;
1642 }
1643
1644 if (!po->running)
1645 return -EINVAL;
1646
1647 if (po->fanout)
1648 return -EALREADY;
1649
1650 if (type == PACKET_FANOUT_ROLLOVER ||
1651 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1652 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1653 if (!po->rollover)
1654 return -ENOMEM;
1655 atomic_long_set(&po->rollover->num, 0);
1656 atomic_long_set(&po->rollover->num_huge, 0);
1657 atomic_long_set(&po->rollover->num_failed, 0);
1658 }
1659
1660 mutex_lock(&fanout_mutex);
1661 match = NULL;
1662 list_for_each_entry(f, &fanout_list, list) {
1663 if (f->id == id &&
1664 read_pnet(&f->net) == sock_net(sk)) {
1665 match = f;
1666 break;
1667 }
1668 }
1669 err = -EINVAL;
1670 if (match && match->flags != flags)
1671 goto out;
1672 if (!match) {
1673 err = -ENOMEM;
1674 match = kzalloc(sizeof(*match), GFP_KERNEL);
1675 if (!match)
1676 goto out;
1677 write_pnet(&match->net, sock_net(sk));
1678 match->id = id;
1679 match->type = type;
1680 match->flags = flags;
1681 INIT_LIST_HEAD(&match->list);
1682 spin_lock_init(&match->lock);
1683 atomic_set(&match->sk_ref, 0);
1684 fanout_init_data(match);
1685 match->prot_hook.type = po->prot_hook.type;
1686 match->prot_hook.dev = po->prot_hook.dev;
1687 match->prot_hook.func = packet_rcv_fanout;
1688 match->prot_hook.af_packet_priv = match;
1689 match->prot_hook.id_match = match_fanout_group;
1690 dev_add_pack(&match->prot_hook);
1691 list_add(&match->list, &fanout_list);
1692 }
1693 err = -EINVAL;
1694 if (match->type == type &&
1695 match->prot_hook.type == po->prot_hook.type &&
1696 match->prot_hook.dev == po->prot_hook.dev) {
1697 err = -ENOSPC;
1698 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1699 __dev_remove_pack(&po->prot_hook);
1700 po->fanout = match;
1701 atomic_inc(&match->sk_ref);
1702 __fanout_link(sk, po);
1703 err = 0;
1704 }
1705 }
1706out:
1707 mutex_unlock(&fanout_mutex);
1708 if (err) {
1709 kfree(po->rollover);
1710 po->rollover = NULL;
1711 }
1712 return err;
1713}
1714
1715static void fanout_release(struct sock *sk)
1716{
1717 struct packet_sock *po = pkt_sk(sk);
1718 struct packet_fanout *f;
1719
1720 f = po->fanout;
1721 if (!f)
1722 return;
1723
1724 mutex_lock(&fanout_mutex);
1725 po->fanout = NULL;
1726
1727 if (atomic_dec_and_test(&f->sk_ref)) {
1728 list_del(&f->list);
1729 dev_remove_pack(&f->prot_hook);
1730 fanout_release_data(f);
1731 kfree(f);
1732 }
1733 mutex_unlock(&fanout_mutex);
1734
1735 if (po->rollover)
1736 kfree_rcu(po->rollover, rcu);
1737}
1738
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* A frame may be up to VLAN_HLEN larger than the device MTU when it
	 * carries an in-band 802.1Q tag; only Ethernet devices whose header
	 * really announces ETH_P_8021Q get this allowance.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}
1752
1753static const struct proto_ops packet_ops;
1754
1755static const struct proto_ops packet_ops_spkt;
1756
/*
 * SOCK_PACKET receive path: deliver every incoming frame on the bound
 * protocol to the socket, prefixed with a sockaddr_pkt describing the
 * originating device.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/* When we registered the protocol we saved the socket in the
	 * af_packet_priv field of the packet_type; fetch it back here.
	 */
	sk = pt->af_packet_priv;

	/* Outgoing loopback copies and packets from foreign network
	 * namespaces are not delivered. The skb may still be shared with
	 * other users, so it is checked and cloned before being modified.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* Drop the route and conntrack references; the socket does not
	 * need them and they would only pin memory.
	 */
	skb_dst_drop(skb);

	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/* The SOCK_PACKET socket receives the whole frame starting at the
	 * link-layer header; fill in the address that recvfrom() reports.
	 */
	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/* Charge the frame to the socket; on success it now owns the skb. */
	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}

/*
 * Output path for SOCK_PACKET sockets: build a raw frame from the user's
 * buffer and transmit it directly on the device named in the sockaddr_pkt,
 * bypassing any routing.
 */
1829static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1830 size_t len)
1831{
1832 struct sock *sk = sock->sk;
1833 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1834 struct sk_buff *skb = NULL;
1835 struct net_device *dev;
1836 struct sockcm_cookie sockc;
1837 __be16 proto = 0;
1838 int err;
1839 int extra_len = 0;
1840
1841
1842
1843
1844
1845 if (saddr) {
1846 if (msg->msg_namelen < sizeof(struct sockaddr))
1847 return -EINVAL;
1848 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1849 proto = saddr->spkt_protocol;
1850 } else
1851 return -ENOTCONN;
1852
1853
1854
1855
1856
1857 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1858retry:
1859 rcu_read_lock();
1860 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1861 err = -ENODEV;
1862 if (dev == NULL)
1863 goto out_unlock;
1864
1865 err = -ENETDOWN;
1866 if (!(dev->flags & IFF_UP))
1867 goto out_unlock;
1868
1869
1870
1871
1872
1873
1874 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1875 if (!netif_supports_nofcs(dev)) {
1876 err = -EPROTONOSUPPORT;
1877 goto out_unlock;
1878 }
1879 extra_len = 4;
1880 }
1881
1882 err = -EMSGSIZE;
1883 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1884 goto out_unlock;
1885
1886 if (!skb) {
1887 size_t reserved = LL_RESERVED_SPACE(dev);
1888 int tlen = dev->needed_tailroom;
1889 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1890
1891 rcu_read_unlock();
1892 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1893 if (skb == NULL)
1894 return -ENOBUFS;
1895
1896
1897
1898
1899 skb_reserve(skb, reserved);
1900 skb_reset_network_header(skb);
1901
1902
1903 if (hhlen) {
1904 skb->data -= hhlen;
1905 skb->tail -= hhlen;
1906 if (len < hhlen)
1907 skb_reset_network_header(skb);
1908 }
1909 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1910 if (err)
1911 goto out_free;
1912 goto retry;
1913 }
1914
1915 if (!dev_validate_header(dev, skb->data, len)) {
1916 err = -EINVAL;
1917 goto out_unlock;
1918 }
1919 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1920 !packet_extra_vlan_len_allowed(dev, skb)) {
1921 err = -EMSGSIZE;
1922 goto out_unlock;
1923 }
1924
1925 sockc.tsflags = sk->sk_tsflags;
1926 if (msg->msg_controllen) {
1927 err = sock_cmsg_send(sk, msg, &sockc);
1928 if (unlikely(err))
1929 goto out_unlock;
1930 }
1931
1932 skb->protocol = proto;
1933 skb->dev = dev;
1934 skb->priority = sk->sk_priority;
1935 skb->mark = sk->sk_mark;
1936
1937 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1938
1939 if (unlikely(extra_len == 4))
1940 skb->no_fcs = 1;
1941
1942 skb_probe_transport_header(skb, 0);
1943
1944 dev_queue_xmit(skb);
1945 rcu_read_unlock();
1946 return len;
1947
1948out_unlock:
1949 rcu_read_unlock();
1950out_free:
1951 kfree_skb(skb);
1952 return err;
1953}
1954
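/* Run the socket's attached BPF filter (if any) over the skb and return
 * the snap length it allows: 0 means drop, anything else caps how many
 * bytes of the packet are copied to the socket.
 */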
1955static unsigned int run_filter(struct sk_buff *skb,
1956 const struct sock *sk,
1957 unsigned int res)
1958{
1959 struct sk_filter *filter;
1960
1961 rcu_read_lock();
1962 filter = rcu_dereference(sk->sk_filter);
1963 if (filter != NULL)
1964 res = bpf_prog_run_clear_cb(filter->prog, skb);
1965 rcu_read_unlock();
1966
1967 return res;
1968}
1969
1970static int __packet_rcv_vnet(const struct sk_buff *skb,
1971 struct virtio_net_hdr *vnet_hdr)
1972{
1973 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1974
1975 if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
1976 BUG();
1977
1978 return 0;
1979}
1980
1981static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1982 size_t *len)
1983{
1984 struct virtio_net_hdr vnet_hdr;
1985
1986 if (*len < sizeof(vnet_hdr))
1987 return -EINVAL;
1988 *len -= sizeof(vnet_hdr);
1989
1990 if (__packet_rcv_vnet(skb, &vnet_hdr))
1991 return -EINVAL;
1992
1993 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
1994}

/*
 * packet_rcv() is the receive hook for AF_PACKET sockets that do not use a
 * ring: run the filter, trim to the allowed snap length, fill in the
 * sockaddr_ll control block and queue the skb on sk_receive_queue.
 */
2008static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2009 struct packet_type *pt, struct net_device *orig_dev)
2010{
2011 struct sock *sk;
2012 struct sockaddr_ll *sll;
2013 struct packet_sock *po;
2014 u8 *skb_head = skb->data;
2015 int skb_len = skb->len;
2016 unsigned int snaplen, res;
2017 bool is_drop_n_account = false;
2018
2019 if (skb->pkt_type == PACKET_LOOPBACK)
2020 goto drop;
2021
2022 sk = pt->af_packet_priv;
2023 po = pkt_sk(sk);
2024
2025 if (!net_eq(dev_net(dev), sock_net(sk)))
2026 goto drop;
2027
2028 skb->dev = dev;
2029
2030 if (dev->header_ops) {
2031
2032
2033
2034
2035
2036
2037
2038 if (sk->sk_type != SOCK_DGRAM)
2039 skb_push(skb, skb->data - skb_mac_header(skb));
2040 else if (skb->pkt_type == PACKET_OUTGOING) {
2041
2042 skb_pull(skb, skb_network_offset(skb));
2043 }
2044 }
2045
2046 snaplen = skb->len;
2047
2048 res = run_filter(skb, sk, snaplen);
2049 if (!res)
2050 goto drop_n_restore;
2051 if (snaplen > res)
2052 snaplen = res;
2053
2054 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2055 goto drop_n_acct;
2056
2057 if (skb_shared(skb)) {
2058 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2059 if (nskb == NULL)
2060 goto drop_n_acct;
2061
2062 if (skb_head != skb->data) {
2063 skb->data = skb_head;
2064 skb->len = skb_len;
2065 }
2066 consume_skb(skb);
2067 skb = nskb;
2068 }
2069
2070 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2071
2072 sll = &PACKET_SKB_CB(skb)->sa.ll;
2073 sll->sll_hatype = dev->type;
2074 sll->sll_pkttype = skb->pkt_type;
2075 if (unlikely(po->origdev))
2076 sll->sll_ifindex = orig_dev->ifindex;
2077 else
2078 sll->sll_ifindex = dev->ifindex;
2079
2080 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2081
2082
2083
2084
2085 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2086
2087 if (pskb_trim(skb, snaplen))
2088 goto drop_n_acct;
2089
2090 skb_set_owner_r(skb, sk);
2091 skb->dev = NULL;
2092 skb_dst_drop(skb);
2093
2094
2095 nf_reset(skb);
2096
2097 spin_lock(&sk->sk_receive_queue.lock);
2098 po->stats.stats1.tp_packets++;
2099 sock_skb_set_dropcount(sk, skb);
2100 __skb_queue_tail(&sk->sk_receive_queue, skb);
2101 spin_unlock(&sk->sk_receive_queue.lock);
2102 sk->sk_data_ready(sk);
2103 return 0;
2104
2105drop_n_acct:
2106 is_drop_n_account = true;
2107 spin_lock(&sk->sk_receive_queue.lock);
2108 po->stats.stats1.tp_drops++;
2109 atomic_inc(&sk->sk_drops);
2110 spin_unlock(&sk->sk_receive_queue.lock);
2111
2112drop_n_restore:
2113 if (skb_head != skb->data && skb_shared(skb)) {
2114 skb->data = skb_head;
2115 skb->len = skb_len;
2116 }
2117drop:
2118 if (!is_drop_n_account)
2119 consume_skb(skb);
2120 else
2121 kfree_skb(skb);
2122 return 0;
2123}
2124
2125static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2126 struct packet_type *pt, struct net_device *orig_dev)
2127{
2128 struct sock *sk;
2129 struct packet_sock *po;
2130 struct sockaddr_ll *sll;
2131 union tpacket_uhdr h;
2132 u8 *skb_head = skb->data;
2133 int skb_len = skb->len;
2134 unsigned int snaplen, res;
2135 unsigned long status = TP_STATUS_USER;
2136 unsigned short macoff, netoff, hdrlen;
2137 struct sk_buff *copy_skb = NULL;
2138 struct timespec ts;
2139 __u32 ts_status;
2140 bool is_drop_n_account = false;
2141
2142
2143
2144
2145
2146 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2147 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2148
2149 if (skb->pkt_type == PACKET_LOOPBACK)
2150 goto drop;
2151
2152 sk = pt->af_packet_priv;
2153 po = pkt_sk(sk);
2154
2155 if (!net_eq(dev_net(dev), sock_net(sk)))
2156 goto drop;
2157
2158 if (dev->header_ops) {
2159 if (sk->sk_type != SOCK_DGRAM)
2160 skb_push(skb, skb->data - skb_mac_header(skb));
2161 else if (skb->pkt_type == PACKET_OUTGOING) {
2162
2163 skb_pull(skb, skb_network_offset(skb));
2164 }
2165 }
2166
2167 snaplen = skb->len;
2168
2169 res = run_filter(skb, sk, snaplen);
2170 if (!res)
2171 goto drop_n_restore;
2172
2173 if (skb->ip_summed == CHECKSUM_PARTIAL)
2174 status |= TP_STATUS_CSUMNOTREADY;
2175 else if (skb->pkt_type != PACKET_OUTGOING &&
2176 (skb->ip_summed == CHECKSUM_COMPLETE ||
2177 skb_csum_unnecessary(skb)))
2178 status |= TP_STATUS_CSUM_VALID;
2179
2180 if (snaplen > res)
2181 snaplen = res;
2182
2183 if (sk->sk_type == SOCK_DGRAM) {
2184 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2185 po->tp_reserve;
2186 } else {
2187 unsigned int maclen = skb_network_offset(skb);
2188 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2189 (maclen < 16 ? 16 : maclen)) +
2190 po->tp_reserve;
2191 if (po->has_vnet_hdr)
2192 netoff += sizeof(struct virtio_net_hdr);
2193 macoff = netoff - maclen;
2194 }
2195 if (po->tp_version <= TPACKET_V2) {
2196 if (macoff + snaplen > po->rx_ring.frame_size) {
2197 if (po->copy_thresh &&
2198 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2199 if (skb_shared(skb)) {
2200 copy_skb = skb_clone(skb, GFP_ATOMIC);
2201 } else {
2202 copy_skb = skb_get(skb);
2203 skb_head = skb->data;
2204 }
2205 if (copy_skb)
2206 skb_set_owner_r(copy_skb, sk);
2207 }
2208 snaplen = po->rx_ring.frame_size - macoff;
2209 if ((int)snaplen < 0)
2210 snaplen = 0;
2211 }
2212 } else if (unlikely(macoff + snaplen >
2213 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2214 u32 nval;
2215
2216 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2217 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2218 snaplen, nval, macoff);
2219 snaplen = nval;
2220 if (unlikely((int)snaplen < 0)) {
2221 snaplen = 0;
2222 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2223 }
2224 }
2225 spin_lock(&sk->sk_receive_queue.lock);
2226 h.raw = packet_current_rx_frame(po, skb,
2227 TP_STATUS_KERNEL, (macoff+snaplen));
2228 if (!h.raw)
2229 goto drop_n_account;
2230 if (po->tp_version <= TPACKET_V2) {
2231 packet_increment_rx_head(po, &po->rx_ring);
2232
2233
2234
2235
2236
2237
2238 if (po->stats.stats1.tp_drops)
2239 status |= TP_STATUS_LOSING;
2240 }
2241 po->stats.stats1.tp_packets++;
2242 if (copy_skb) {
2243 status |= TP_STATUS_COPY;
2244 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2245 }
2246 spin_unlock(&sk->sk_receive_queue.lock);
2247
2248 if (po->has_vnet_hdr) {
2249 if (__packet_rcv_vnet(skb, h.raw + macoff -
2250 sizeof(struct virtio_net_hdr))) {
2251 spin_lock(&sk->sk_receive_queue.lock);
2252 goto drop_n_account;
2253 }
2254 }
2255
2256 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2257
2258 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2259 getnstimeofday(&ts);
2260
2261 status |= ts_status;
2262
2263 switch (po->tp_version) {
2264 case TPACKET_V1:
2265 h.h1->tp_len = skb->len;
2266 h.h1->tp_snaplen = snaplen;
2267 h.h1->tp_mac = macoff;
2268 h.h1->tp_net = netoff;
2269 h.h1->tp_sec = ts.tv_sec;
2270 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2271 hdrlen = sizeof(*h.h1);
2272 break;
2273 case TPACKET_V2:
2274 h.h2->tp_len = skb->len;
2275 h.h2->tp_snaplen = snaplen;
2276 h.h2->tp_mac = macoff;
2277 h.h2->tp_net = netoff;
2278 h.h2->tp_sec = ts.tv_sec;
2279 h.h2->tp_nsec = ts.tv_nsec;
2280 if (skb_vlan_tag_present(skb)) {
2281 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2282 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2283 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2284 } else {
2285 h.h2->tp_vlan_tci = 0;
2286 h.h2->tp_vlan_tpid = 0;
2287 }
2288 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2289 hdrlen = sizeof(*h.h2);
2290 break;
2291 case TPACKET_V3:
2292
2293
2294
2295 h.h3->tp_status |= status;
2296 h.h3->tp_len = skb->len;
2297 h.h3->tp_snaplen = snaplen;
2298 h.h3->tp_mac = macoff;
2299 h.h3->tp_net = netoff;
2300 h.h3->tp_sec = ts.tv_sec;
2301 h.h3->tp_nsec = ts.tv_nsec;
2302 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2303 hdrlen = sizeof(*h.h3);
2304 break;
2305 default:
2306 BUG();
2307 }
2308
2309 sll = h.raw + TPACKET_ALIGN(hdrlen);
2310 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2311 sll->sll_family = AF_PACKET;
2312 sll->sll_hatype = dev->type;
2313 sll->sll_protocol = skb->protocol;
2314 sll->sll_pkttype = skb->pkt_type;
2315 if (unlikely(po->origdev))
2316 sll->sll_ifindex = orig_dev->ifindex;
2317 else
2318 sll->sll_ifindex = dev->ifindex;
2319
2320 smp_mb();
2321
2322#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2323 if (po->tp_version <= TPACKET_V2) {
2324 u8 *start, *end;
2325
2326 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2327 macoff + snaplen);
2328
2329 for (start = h.raw; start < end; start += PAGE_SIZE)
2330 flush_dcache_page(pgv_to_page(start));
2331 }
2332 smp_wmb();
2333#endif
2334
2335 if (po->tp_version <= TPACKET_V2) {
2336 __packet_set_status(po, h.raw, status);
2337 sk->sk_data_ready(sk);
2338 } else {
2339 prb_clear_blk_fill_status(&po->rx_ring);
2340 }
2341
2342drop_n_restore:
2343 if (skb_head != skb->data && skb_shared(skb)) {
2344 skb->data = skb_head;
2345 skb->len = skb_len;
2346 }
2347drop:
2348 if (!is_drop_n_account)
2349 consume_skb(skb);
2350 else
2351 kfree_skb(skb);
2352 return 0;
2353
2354drop_n_account:
2355 is_drop_n_account = true;
2356 po->stats.stats1.tp_drops++;
2357 spin_unlock(&sk->sk_receive_queue.lock);
2358
2359 sk->sk_data_ready(sk);
2360 kfree_skb(copy_skb);
2361 goto drop_n_restore;
2362}
2363
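/* Destructor for skbs built from the TX ring: hand the frame slot back to
 * user space (TP_STATUS_AVAILABLE, plus any timestamp status) and drop the
 * ring's pending-frame count.
 */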
2364static void tpacket_destruct_skb(struct sk_buff *skb)
2365{
2366 struct packet_sock *po = pkt_sk(skb->sk);
2367
2368 if (likely(po->tx_ring.pg_vec)) {
2369 void *ph;
2370 __u32 ts;
2371
2372 ph = skb_shinfo(skb)->destructor_arg;
2373 packet_dec_pending(&po->tx_ring);
2374
2375 ts = __packet_set_timestamp(po, ph, skb);
2376 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2377 }
2378
2379 sock_wfree(skb);
2380}
2381
2382static void tpacket_set_protocol(const struct net_device *dev,
2383 struct sk_buff *skb)
2384{
2385 if (dev->type == ARPHRD_ETHER) {
2386 skb_reset_mac_header(skb);
2387 skb->protocol = eth_hdr(skb)->h_proto;
2388 }
2389}
2390
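/* Validate a virtio_net_hdr supplied by user space for transmission and
 * translate its GSO type into the corresponding SKB_GSO_* flags; a header
 * that claims more than the buffer contains is rejected with -EINVAL.
 */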
2391static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2392{
2393 unsigned short gso_type = 0;
2394
2395 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2396 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2397 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2398 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2399 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2400 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2401 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2402
2403 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2404 return -EINVAL;
2405
2406 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2407 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2408 case VIRTIO_NET_HDR_GSO_TCPV4:
2409 gso_type = SKB_GSO_TCPV4;
2410 break;
2411 case VIRTIO_NET_HDR_GSO_TCPV6:
2412 gso_type = SKB_GSO_TCPV6;
2413 break;
2414 case VIRTIO_NET_HDR_GSO_UDP:
2415 gso_type = SKB_GSO_UDP;
2416 break;
2417 default:
2418 return -EINVAL;
2419 }
2420
2421 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2422 gso_type |= SKB_GSO_TCP_ECN;
2423
2424 if (vnet_hdr->gso_size == 0)
2425 return -EINVAL;
2426 }
2427
2428 vnet_hdr->gso_type = gso_type;
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
2435 int n;
2436
2437 if (*len < sizeof(*vnet_hdr))
2438 return -EINVAL;
2439 *len -= sizeof(*vnet_hdr);
2440
2441 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
2442 if (n != sizeof(*vnet_hdr))
2443 return -EFAULT;
2444
2445 return __packet_snd_vnet_parse(vnet_hdr, *len);
2446}
2447
2448static int packet_snd_vnet_gso(struct sk_buff *skb,
2449 struct virtio_net_hdr *vnet_hdr)
2450{
2451 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2452 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2453 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2454
2455 if (!skb_partial_csum_set(skb, s, o))
2456 return -EINVAL;
2457 }
2458
2459 skb_shinfo(skb)->gso_size =
2460 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2461 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2462
2463 /* Header must be checked, and gso_segs computed. */
2464 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2465 skb_shinfo(skb)->gso_segs = 0;
2466 return 0;
2467}
2468
2469static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2470 void *frame, struct net_device *dev, void *data, int tp_len,
2471 __be16 proto, unsigned char *addr, int hlen, int copylen,
2472 const struct sockcm_cookie *sockc)
2473{
2474 union tpacket_uhdr ph;
2475 int to_write, offset, len, nr_frags, len_max;
2476 struct socket *sock = po->sk.sk_socket;
2477 struct page *page;
2478 int err;
2479
2480 ph.raw = frame;
2481
2482 skb->protocol = proto;
2483 skb->dev = dev;
2484 skb->priority = po->sk.sk_priority;
2485 skb->mark = po->sk.sk_mark;
2486 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2487 skb_shinfo(skb)->destructor_arg = ph.raw;
2488
2489 skb_reserve(skb, hlen);
2490 skb_reset_network_header(skb);
2491
2492 to_write = tp_len;
2493
2494 if (sock->type == SOCK_DGRAM) {
2495 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2496 NULL, tp_len);
2497 if (unlikely(err < 0))
2498 return -EINVAL;
2499 } else if (copylen) {
2500 int hdrlen = min_t(int, copylen, tp_len);
2501
2502 skb_push(skb, dev->hard_header_len);
2503 skb_put(skb, copylen - dev->hard_header_len);
2504 err = skb_store_bits(skb, 0, data, hdrlen);
2505 if (unlikely(err))
2506 return err;
2507 if (!dev_validate_header(dev, skb->data, hdrlen))
2508 return -EINVAL;
2509 if (!skb->protocol)
2510 tpacket_set_protocol(dev, skb);
2511
2512 data += hdrlen;
2513 to_write -= hdrlen;
2514 }
2515
2516 offset = offset_in_page(data);
2517 len_max = PAGE_SIZE - offset;
2518 len = ((to_write > len_max) ? len_max : to_write);
2519
2520 skb->data_len = to_write;
2521 skb->len += to_write;
2522 skb->truesize += to_write;
2523 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2524
2525 while (likely(to_write)) {
2526 nr_frags = skb_shinfo(skb)->nr_frags;
2527
2528 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2529 pr_err("Packet exceeds the number of skb frags(%lu)\n",
2530 MAX_SKB_FRAGS);
2531 return -EFAULT;
2532 }
2533
2534 page = pgv_to_page(data);
2535 data += len;
2536 flush_dcache_page(page);
2537 get_page(page);
2538 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2539 to_write -= len;
2540 offset = 0;
2541 len_max = PAGE_SIZE;
2542 len = ((to_write > len_max) ? len_max : to_write);
2543 }
2544
2545 skb_probe_transport_header(skb, 0);
2546
2547 return tp_len;
2548}
2549
2550static int tpacket_parse_header(struct packet_sock *po, void *frame,
2551 int size_max, void **data)
2552{
2553 union tpacket_uhdr ph;
2554 int tp_len, off;
2555
2556 ph.raw = frame;
2557
2558 switch (po->tp_version) {
2559 case TPACKET_V2:
2560 tp_len = ph.h2->tp_len;
2561 break;
2562 default:
2563 tp_len = ph.h1->tp_len;
2564 break;
2565 }
2566 if (unlikely(tp_len > size_max)) {
2567 pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
2568 return -EMSGSIZE;
2569 }
2570
2571 if (unlikely(po->tp_tx_has_off)) {
2572 int off_min, off_max;
2573
2574 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2575 off_max = po->tx_ring.frame_size - tp_len;
2576 if (po->sk.sk_type == SOCK_DGRAM) {
2577 switch (po->tp_version) {
2578 case TPACKET_V2:
2579 off = ph.h2->tp_net;
2580 break;
2581 default:
2582 off = ph.h1->tp_net;
2583 break;
2584 }
2585 } else {
2586 switch (po->tp_version) {
2587 case TPACKET_V2:
2588 off = ph.h2->tp_mac;
2589 break;
2590 default:
2591 off = ph.h1->tp_mac;
2592 break;
2593 }
2594 }
2595 if (unlikely((off < off_min) || (off_max < off)))
2596 return -EINVAL;
2597 } else {
2598 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2599 }
2600
2601 *data = frame + off;
2602 return tp_len;
2603}
2604
2605static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2606{
2607 struct sk_buff *skb;
2608 struct net_device *dev;
2609 struct virtio_net_hdr *vnet_hdr = NULL;
2610 struct sockcm_cookie sockc;
2611 __be16 proto;
2612 int err, reserve = 0;
2613 void *ph;
2614 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2615 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2616 int tp_len, size_max;
2617 unsigned char *addr;
2618 void *data;
2619 int len_sum = 0;
2620 int status = TP_STATUS_AVAILABLE;
2621 int hlen, tlen, copylen = 0;
2622
2623 mutex_lock(&po->pg_vec_lock);
2624
2625 if (likely(saddr == NULL)) {
2626 dev = packet_cached_dev_get(po);
2627 proto = po->num;
2628 addr = NULL;
2629 } else {
2630 err = -EINVAL;
2631 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2632 goto out;
2633 if (msg->msg_namelen < (saddr->sll_halen
2634 + offsetof(struct sockaddr_ll,
2635 sll_addr)))
2636 goto out;
2637 proto = saddr->sll_protocol;
2638 addr = saddr->sll_addr;
2639 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2640 }
2641
2642 sockc.tsflags = po->sk.sk_tsflags;
2643 if (msg->msg_controllen) {
2644 err = sock_cmsg_send(&po->sk, msg, &sockc);
2645 if (unlikely(err))
2646 goto out;
2647 }
2648
2649 err = -ENXIO;
2650 if (unlikely(dev == NULL))
2651 goto out;
2652 err = -ENETDOWN;
2653 if (unlikely(!(dev->flags & IFF_UP)))
2654 goto out_put;
2655
2656 if (po->sk.sk_socket->type == SOCK_RAW)
2657 reserve = dev->hard_header_len;
2658 size_max = po->tx_ring.frame_size
2659 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2660
2661 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2662 size_max = dev->mtu + reserve + VLAN_HLEN;
2663
2664 do {
2665 ph = packet_current_frame(po, &po->tx_ring,
2666 TP_STATUS_SEND_REQUEST);
2667 if (unlikely(ph == NULL)) {
2668 if (need_wait && need_resched())
2669 schedule();
2670 continue;
2671 }
2672
2673 skb = NULL;
2674 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2675 if (tp_len < 0)
2676 goto tpacket_error;
2677
2678 status = TP_STATUS_SEND_REQUEST;
2679 hlen = LL_RESERVED_SPACE(dev);
2680 tlen = dev->needed_tailroom;
2681 if (po->has_vnet_hdr) {
2682 vnet_hdr = data;
2683 data += sizeof(*vnet_hdr);
2684 tp_len -= sizeof(*vnet_hdr);
2685 if (tp_len < 0 ||
2686 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2687 tp_len = -EINVAL;
2688 goto tpacket_error;
2689 }
2690 copylen = __virtio16_to_cpu(vio_le(),
2691 vnet_hdr->hdr_len);
2692 }
2693 copylen = max_t(int, copylen, dev->hard_header_len);
2694 skb = sock_alloc_send_skb(&po->sk,
2695 hlen + tlen + sizeof(struct sockaddr_ll) +
2696 (copylen - dev->hard_header_len),
2697 !need_wait, &err);
2698
2699 if (unlikely(skb == NULL)) {
2700 /* allocation failed: report what was already sent, if anything */
2701 if (likely(len_sum > 0))
2702 err = len_sum;
2703 goto out_status;
2704 }
2705 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2706 addr, hlen, copylen, &sockc);
2707 if (likely(tp_len >= 0) &&
2708 tp_len > dev->mtu + reserve &&
2709 !po->has_vnet_hdr &&
2710 !packet_extra_vlan_len_allowed(dev, skb))
2711 tp_len = -EMSGSIZE;
2712
2713 if (unlikely(tp_len < 0)) {
2714tpacket_error:
2715 if (po->tp_loss) {
2716 __packet_set_status(po, ph,
2717 TP_STATUS_AVAILABLE);
2718 packet_increment_head(&po->tx_ring);
2719 kfree_skb(skb);
2720 continue;
2721 } else {
2722 status = TP_STATUS_WRONG_FORMAT;
2723 err = tp_len;
2724 goto out_status;
2725 }
2726 }
2727
2728 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
2729 tp_len = -EINVAL;
2730 goto tpacket_error;
2731 }
2732
2733 packet_pick_tx_queue(dev, skb);
2734
2735 skb->destructor = tpacket_destruct_skb;
2736 __packet_set_status(po, ph, TP_STATUS_SENDING);
2737 packet_inc_pending(&po->tx_ring);
2738
2739 status = TP_STATUS_SEND_REQUEST;
2740 err = po->xmit(skb);
2741 if (unlikely(err > 0)) {
2742 err = net_xmit_errno(err);
2743 if (err && __packet_get_status(po, ph) ==
2744 TP_STATUS_AVAILABLE) {
2745 /* the destructor already ran and released this frame */
2746 skb = NULL;
2747 goto out_status;
2748 }
2749 /*
2750  * The skb was dropped but not yet destructed;
2751  * treat it like congestion or a local error.
2752  */
2753 err = 0;
2754 }
2755 packet_increment_head(&po->tx_ring);
2756 len_sum += tp_len;
2757 } while (likely((ph != NULL) ||
2758 /*
2759  * Note: packet_read_pending() can be slow since it walks a
2760  * per-cpu counter, but the first condition short-circuits the
2761  * loop in the fast path, so we rarely evaluate it.
2762  */
2763
2764 (need_wait && packet_read_pending(&po->tx_ring))));
2765
2766 err = len_sum;
2767 goto out_put;
2768
2769out_status:
2770 __packet_set_status(po, ph, status);
2771 kfree_skb(skb);
2772out_put:
2773 dev_put(dev);
2774out:
2775 mutex_unlock(&po->pg_vec_lock);
2776 return err;
2777}
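
/*
 * Illustrative userspace sketch of the TX ring path above (assumes a
 * TPACKET_V2 TX ring already set up with PACKET_TX_RING and mmap()ed;
 * "fd", "slot", "frame" and "frame_len" are hypothetical variables):
 *
 *	struct tpacket2_hdr *hdr = slot;	// next free ring slot
 *	char *data = (char *)slot + TPACKET2_HDRLEN -
 *		     sizeof(struct sockaddr_ll);
 *	memcpy(data, frame, frame_len);
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// kernel walks the ring
 *
 * The data offset mirrors the tp_hdrlen - sizeof(struct sockaddr_ll)
 * computation in tpacket_parse_header() when PACKET_TX_HAS_OFF is off.
 */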
2778
2779static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2780 size_t reserve, size_t len,
2781 size_t linear, int noblock,
2782 int *err)
2783{
2784 struct sk_buff *skb;
2785
2786 /* Small packets, or callers without a linear hint, get a fully linear skb */
2787 if (prepad + len < PAGE_SIZE || !linear)
2788 linear = len;
2789
2790 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2791 err, 0);
2792 if (!skb)
2793 return NULL;
2794
2795 skb_reserve(skb, reserve);
2796 skb_put(skb, linear);
2797 skb->data_len = len - linear;
2798 skb->len += len - linear;
2799
2800 return skb;
2801}
2802
2803static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2804{
2805 struct sock *sk = sock->sk;
2806 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2807 struct sk_buff *skb;
2808 struct net_device *dev;
2809 __be16 proto;
2810 unsigned char *addr;
2811 int err, reserve = 0;
2812 struct sockcm_cookie sockc;
2813 struct virtio_net_hdr vnet_hdr = { 0 };
2814 int offset = 0;
2815 struct packet_sock *po = pkt_sk(sk);
2816 int hlen, tlen;
2817 int extra_len = 0;
2818
2819 /*
2820  *	Get and verify the destination address.
2821  */
2822
2823 if (likely(saddr == NULL)) {
2824 dev = packet_cached_dev_get(po);
2825 proto = po->num;
2826 addr = NULL;
2827 } else {
2828 err = -EINVAL;
2829 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2830 goto out;
2831 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2832 goto out;
2833 proto = saddr->sll_protocol;
2834 addr = saddr->sll_addr;
2835 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2836 }
2837
2838 err = -ENXIO;
2839 if (unlikely(dev == NULL))
2840 goto out_unlock;
2841 err = -ENETDOWN;
2842 if (unlikely(!(dev->flags & IFF_UP)))
2843 goto out_unlock;
2844
2845 sockc.tsflags = sk->sk_tsflags;
2846 sockc.mark = sk->sk_mark;
2847 if (msg->msg_controllen) {
2848 err = sock_cmsg_send(sk, msg, &sockc);
2849 if (unlikely(err))
2850 goto out_unlock;
2851 }
2852
2853 if (sock->type == SOCK_RAW)
2854 reserve = dev->hard_header_len;
2855 if (po->has_vnet_hdr) {
2856 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2857 if (err)
2858 goto out_unlock;
2859 }
2860
2861 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2862 if (!netif_supports_nofcs(dev)) {
2863 err = -EPROTONOSUPPORT;
2864 goto out_unlock;
2865 }
2866 extra_len = 4;
2867 }
2868
2869 err = -EMSGSIZE;
2870 if (!vnet_hdr.gso_type &&
2871 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2872 goto out_unlock;
2873
2874 err = -ENOBUFS;
2875 hlen = LL_RESERVED_SPACE(dev);
2876 tlen = dev->needed_tailroom;
2877 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2878 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
2879 msg->msg_flags & MSG_DONTWAIT, &err);
2880 if (skb == NULL)
2881 goto out_unlock;
2882
2883 skb_set_network_header(skb, reserve);
2884
2885 err = -EINVAL;
2886 if (sock->type == SOCK_DGRAM) {
2887 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2888 if (unlikely(offset < 0))
2889 goto out_free;
2890 }
2891
2892 /* Copy the user data into the skb; may return -EFAULT */
2893 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2894 if (err)
2895 goto out_free;
2896
2897 if (sock->type == SOCK_RAW &&
2898 !dev_validate_header(dev, skb->data, len)) {
2899 err = -EINVAL;
2900 goto out_free;
2901 }
2902
2903 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2904
2905 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2906 !packet_extra_vlan_len_allowed(dev, skb)) {
2907 err = -EMSGSIZE;
2908 goto out_free;
2909 }
2910
2911 skb->protocol = proto;
2912 skb->dev = dev;
2913 skb->priority = sk->sk_priority;
2914 skb->mark = sockc.mark;
2915
2916 packet_pick_tx_queue(dev, skb);
2917
2918 if (po->has_vnet_hdr) {
2919 err = packet_snd_vnet_gso(skb, &vnet_hdr);
2920 if (err)
2921 goto out_free;
2922 len += sizeof(vnet_hdr);
2923 }
2924
2925 skb_probe_transport_header(skb, reserve);
2926
2927 if (unlikely(extra_len == 4))
2928 skb->no_fcs = 1;
2929
2930 err = po->xmit(skb);
2931 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2932 goto out_unlock;
2933
2934 dev_put(dev);
2935
2936 return len;
2937
2938out_free:
2939 kfree_skb(skb);
2940out_unlock:
2941 if (dev)
2942 dev_put(dev);
2943out:
2944 return err;
2945}
2946
2947static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2948{
2949 struct sock *sk = sock->sk;
2950 struct packet_sock *po = pkt_sk(sk);
2951
2952 if (po->tx_ring.pg_vec)
2953 return tpacket_snd(po, msg);
2954 else
2955 return packet_snd(sock, msg, len);
2956}
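
/*
 * Illustrative userspace sketch of the non-ring send path (packet_snd):
 * on a SOCK_DGRAM packet socket the kernel builds the link-layer header
 * from the sockaddr_ll, so the buffer holds only the payload ("fd",
 * "ifindex", "dest_mac", "payload" and "payload_len" are hypothetical):
 *
 *	struct sockaddr_ll dst = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = ifindex,
 *		.sll_halen    = ETH_ALEN,
 *	};
 *	memcpy(dst.sll_addr, dest_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&dst, sizeof(dst));
 */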
2957
2958
2959 /*
2960  *	Close a PACKET socket. This is fairly simple. We immediately go
2961  *	to 'closed' state and remove our protocol entry in the device list.
2962  */
2963static int packet_release(struct socket *sock)
2964{
2965 struct sock *sk = sock->sk;
2966 struct packet_sock *po;
2967 struct net *net;
2968 union tpacket_req_u req_u;
2969
2970 if (!sk)
2971 return 0;
2972
2973 net = sock_net(sk);
2974 po = pkt_sk(sk);
2975
2976 mutex_lock(&net->packet.sklist_lock);
2977 sk_del_node_init_rcu(sk);
2978 mutex_unlock(&net->packet.sklist_lock);
2979
2980 preempt_disable();
2981 sock_prot_inuse_add(net, sk->sk_prot, -1);
2982 preempt_enable();
2983
2984 spin_lock(&po->bind_lock);
2985 unregister_prot_hook(sk, false);
2986 packet_cached_dev_reset(po);
2987
2988 if (po->prot_hook.dev) {
2989 dev_put(po->prot_hook.dev);
2990 po->prot_hook.dev = NULL;
2991 }
2992 spin_unlock(&po->bind_lock);
2993
2994 packet_flush_mclist(sk);
2995
2996 if (po->rx_ring.pg_vec) {
2997 memset(&req_u, 0, sizeof(req_u));
2998 packet_set_ring(sk, &req_u, 1, 0);
2999 }
3000
3001 if (po->tx_ring.pg_vec) {
3002 memset(&req_u, 0, sizeof(req_u));
3003 packet_set_ring(sk, &req_u, 1, 1);
3004 }
3005
3006 fanout_release(sk);
3007
3008 synchronize_net();
3009
3010
3011 /* Now the socket is dead - no more input will appear. */
3012 sock_orphan(sk);
3013 sock->sk = NULL;
3014
3015
3016 /* Purge the receive queue and drop any pending frames. */
3017 skb_queue_purge(&sk->sk_receive_queue);
3018 packet_free_pending(po);
3019 sk_refcnt_debug_release(sk);
3020
3021 sock_put(sk);
3022 return 0;
3023}
3024
3025
3026 /*
3027  *	Attach a packet hook.
3028  */
3029static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3030 __be16 proto)
3031{
3032 struct packet_sock *po = pkt_sk(sk);
3033 struct net_device *dev_curr;
3034 __be16 proto_curr;
3035 bool need_rehook;
3036 struct net_device *dev = NULL;
3037 int ret = 0;
3038 bool unlisted = false;
3039
3040 if (po->fanout)
3041 return -EINVAL;
3042
3043 lock_sock(sk);
3044 spin_lock(&po->bind_lock);
3045 rcu_read_lock();
3046
3047 if (name) {
3048 dev = dev_get_by_name_rcu(sock_net(sk), name);
3049 if (!dev) {
3050 ret = -ENODEV;
3051 goto out_unlock;
3052 }
3053 } else if (ifindex) {
3054 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3055 if (!dev) {
3056 ret = -ENODEV;
3057 goto out_unlock;
3058 }
3059 }
3060
3061 if (dev)
3062 dev_hold(dev);
3063
3064 proto_curr = po->prot_hook.type;
3065 dev_curr = po->prot_hook.dev;
3066
3067 need_rehook = proto_curr != proto || dev_curr != dev;
3068
3069 if (need_rehook) {
3070 if (po->running) {
3071 rcu_read_unlock();
3072 __unregister_prot_hook(sk, true);
3073 rcu_read_lock();
3074 dev_curr = po->prot_hook.dev;
3075 if (dev)
3076 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3077 dev->ifindex);
3078 }
3079
3080 po->num = proto;
3081 po->prot_hook.type = proto;
3082
3083 if (unlikely(unlisted)) {
3084 dev_put(dev);
3085 po->prot_hook.dev = NULL;
3086 po->ifindex = -1;
3087 packet_cached_dev_reset(po);
3088 } else {
3089 po->prot_hook.dev = dev;
3090 po->ifindex = dev ? dev->ifindex : 0;
3091 packet_cached_dev_assign(po, dev);
3092 }
3093 }
3094 if (dev_curr)
3095 dev_put(dev_curr);
3096
3097 if (proto == 0 || !need_rehook)
3098 goto out_unlock;
3099
3100 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3101 register_prot_hook(sk);
3102 } else {
3103 sk->sk_err = ENETDOWN;
3104 if (!sock_flag(sk, SOCK_DEAD))
3105 sk->sk_error_report(sk);
3106 }
3107
3108out_unlock:
3109 rcu_read_unlock();
3110 spin_unlock(&po->bind_lock);
3111 release_sock(sk);
3112 return ret;
3113}
3114
3115
3116 /*
3117  *	Bind a packet socket to a device.
3118  */
3119static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3120 int addr_len)
3121{
3122 struct sock *sk = sock->sk;
3123 char name[15];
3124
3125 /*
3126  *	Check legality: SOCK_PACKET addresses carry the device name
3127  *	in sa_data.
3128  */
3129 if (addr_len != sizeof(struct sockaddr))
3130 return -EINVAL;
3131 strlcpy(name, uaddr->sa_data, sizeof(name));
3132
3133 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3134}
3135
3136static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3137{
3138 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3139 struct sock *sk = sock->sk;
3140
3141 /*
3142  *	Check legality.
3143  */
3144
3145 if (addr_len < sizeof(struct sockaddr_ll))
3146 return -EINVAL;
3147 if (sll->sll_family != AF_PACKET)
3148 return -EINVAL;
3149
3150 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3151 sll->sll_protocol ? : pkt_sk(sk)->num);
3152}
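
/*
 * Illustrative userspace sketch (assumes an interface named "eth0"):
 * binding restricts the socket to one device and, through sll_protocol,
 * to one ethertype.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */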
3153
3154static struct proto packet_proto = {
3155 .name = "PACKET",
3156 .owner = THIS_MODULE,
3157 .obj_size = sizeof(struct packet_sock),
3158};
3159
3160
3161 /*
3162  *	Create a packet socket of type SOCK_RAW, SOCK_DGRAM or SOCK_PACKET.
3163  */
3164static int packet_create(struct net *net, struct socket *sock, int protocol,
3165 int kern)
3166{
3167 struct sock *sk;
3168 struct packet_sock *po;
3169 __be16 proto = (__force __be16)protocol;
3170 int err;
3171
3172 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3173 return -EPERM;
3174 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3175 sock->type != SOCK_PACKET)
3176 return -ESOCKTNOSUPPORT;
3177
3178 sock->state = SS_UNCONNECTED;
3179
3180 err = -ENOBUFS;
3181 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3182 if (sk == NULL)
3183 goto out;
3184
3185 sock->ops = &packet_ops;
3186 if (sock->type == SOCK_PACKET)
3187 sock->ops = &packet_ops_spkt;
3188
3189 sock_init_data(sock, sk);
3190
3191 po = pkt_sk(sk);
3192 sk->sk_family = PF_PACKET;
3193 po->num = proto;
3194 po->xmit = dev_queue_xmit;
3195
3196 err = packet_alloc_pending(po);
3197 if (err)
3198 goto out2;
3199
3200 packet_cached_dev_reset(po);
3201
3202 sk->sk_destruct = packet_sock_destruct;
3203 sk_refcnt_debug_inc(sk);
3204
3205
3206 /*
3207  *	Attach a protocol block.
3208  */
3209 spin_lock_init(&po->bind_lock);
3210 mutex_init(&po->pg_vec_lock);
3211 po->rollover = NULL;
3212 po->prot_hook.func = packet_rcv;
3213
3214 if (sock->type == SOCK_PACKET)
3215 po->prot_hook.func = packet_rcv_spkt;
3216
3217 po->prot_hook.af_packet_priv = sk;
3218
3219 if (proto) {
3220 po->prot_hook.type = proto;
3221 register_prot_hook(sk);
3222 }
3223
3224 mutex_lock(&net->packet.sklist_lock);
3225 sk_add_node_rcu(sk, &net->packet.sklist);
3226 mutex_unlock(&net->packet.sklist_lock);
3227
3228 preempt_disable();
3229 sock_prot_inuse_add(net, &packet_proto, 1);
3230 preempt_enable();
3231
3232 return 0;
3233out2:
3234 sk_free(sk);
3235out:
3236 return err;
3237}
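
/*
 * Userspace view of packet_create() (illustrative sketch): the socket()
 * call needs CAP_NET_RAW, and the protocol argument is passed in
 * network byte order.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>	// ETH_P_ALL
 *	#include <arpa/inet.h>		// htons()
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	if (fd < 0)
 *		perror("socket");	// EPERM without CAP_NET_RAW
 */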
3238
3239
3240 /*
3241  *	Pull a packet from our receive queue and hand it to the user.
3242  *	If necessary we block.
3243  */
3244static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3245 int flags)
3246{
3247 struct sock *sk = sock->sk;
3248 struct sk_buff *skb;
3249 int copied, err;
3250 int vnet_hdr_len = 0;
3251 unsigned int origlen = 0;
3252
3253 err = -EINVAL;
3254 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3255 goto out;
3256
3257#if 0
3258
3259 if (pkt_sk(sk)->ifindex < 0)
3260 return -ENODEV;
3261#endif
3262
3263 if (flags & MSG_ERRQUEUE) {
3264 err = sock_recv_errqueue(sk, msg, len,
3265 SOL_PACKET, PACKET_TX_TIMESTAMP);
3266 goto out;
3267 }
3268
3269 /*
3270  *	Call the generic datagram receiver. This handles all sorts
3271  *	of horrible races and re-entrancy so we can forget about it
3272  *	in the protocol layers.
3273  *
3274  *	It will return ENETDOWN if the device has just gone down,
3275  *	but otherwise it blocks for us when needed.
3276  */
3277
3278 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3279
3280 /*
3281  *	An error occurred, so return it. skb_recv_datagram()
3282  *	handles the blocking for us, so there is no retry logic here.
3283  */
3284
3285
3286 if (skb == NULL)
3287 goto out;
3288
3289 if (pkt_sk(sk)->pressure)
3290 packet_rcv_has_room(pkt_sk(sk), NULL);
3291
3292 if (pkt_sk(sk)->has_vnet_hdr) {
3293 err = packet_rcv_vnet(msg, skb, &len);
3294 if (err)
3295 goto out_free;
3296 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3297 }
3298
3299 /*
3300  *	You lose any data beyond the buffer you gave. If it worries
3301  *	a user program they can ask the device for its MTU anyway.
3302  */
3303 copied = skb->len;
3304 if (copied > len) {
3305 copied = len;
3306 msg->msg_flags |= MSG_TRUNC;
3307 }
3308
3309 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3310 if (err)
3311 goto out_free;
3312
3313 if (sock->type != SOCK_PACKET) {
3314 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3315
3316 /* The original frame length was stashed next to the address */
3317 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3318 sll->sll_family = AF_PACKET;
3319 sll->sll_protocol = skb->protocol;
3320 }
3321
3322 sock_recv_ts_and_drops(msg, sk, skb);
3323
3324 if (msg->msg_name) {
3325 /*
3326  *	If the caller supplied an address buffer, fill it in now.
3327  */
3328 if (sock->type == SOCK_PACKET) {
3329 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3330 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3331 } else {
3332 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3333
3334 msg->msg_namelen = sll->sll_halen +
3335 offsetof(struct sockaddr_ll, sll_addr);
3336 }
3337 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3338 msg->msg_namelen);
3339 }
3340
3341 if (pkt_sk(sk)->auxdata) {
3342 struct tpacket_auxdata aux;
3343
3344 aux.tp_status = TP_STATUS_USER;
3345 if (skb->ip_summed == CHECKSUM_PARTIAL)
3346 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3347 else if (skb->pkt_type != PACKET_OUTGOING &&
3348 (skb->ip_summed == CHECKSUM_COMPLETE ||
3349 skb_csum_unnecessary(skb)))
3350 aux.tp_status |= TP_STATUS_CSUM_VALID;
3351
3352 aux.tp_len = origlen;
3353 aux.tp_snaplen = skb->len;
3354 aux.tp_mac = 0;
3355 aux.tp_net = skb_network_offset(skb);
3356 if (skb_vlan_tag_present(skb)) {
3357 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3358 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3359 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3360 } else {
3361 aux.tp_vlan_tci = 0;
3362 aux.tp_vlan_tpid = 0;
3363 }
3364 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3365 }
3366
3367 /*
3368  *	Free or return the buffer as appropriate. Again this
3369  *	hides all the races and re-entrancy issues from us.
3370  */
3371 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3372
3373out_free:
3374 skb_free_datagram(sk, skb);
3375out:
3376 return err;
3377}
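
/*
 * Illustrative userspace sketch: the source address filled in above is
 * a sockaddr_ll describing the interface and link-layer sender of the
 * captured frame ("fd" and "buf" are hypothetical):
 *
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// from.sll_ifindex, from.sll_pkttype and from.sll_addr now
 *	// describe where the frame came from
 */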
3378
3379static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3380 int *uaddr_len, int peer)
3381{
3382 struct net_device *dev;
3383 struct sock *sk = sock->sk;
3384
3385 if (peer)
3386 return -EOPNOTSUPP;
3387
3388 uaddr->sa_family = AF_PACKET;
3389 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3390 rcu_read_lock();
3391 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3392 if (dev)
3393 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3394 rcu_read_unlock();
3395 *uaddr_len = sizeof(*uaddr);
3396
3397 return 0;
3398}
3399
3400static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3401 int *uaddr_len, int peer)
3402{
3403 struct net_device *dev;
3404 struct sock *sk = sock->sk;
3405 struct packet_sock *po = pkt_sk(sk);
3406 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3407
3408 if (peer)
3409 return -EOPNOTSUPP;
3410
3411 sll->sll_family = AF_PACKET;
3412 sll->sll_ifindex = po->ifindex;
3413 sll->sll_protocol = po->num;
3414 sll->sll_pkttype = 0;
3415 rcu_read_lock();
3416 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3417 if (dev) {
3418 sll->sll_hatype = dev->type;
3419 sll->sll_halen = dev->addr_len;
3420 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3421 } else {
3422 sll->sll_hatype = 0;
3423 sll->sll_halen = 0;
3424 }
3425 rcu_read_unlock();
3426 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3427
3428 return 0;
3429}
3430
3431static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3432 int what)
3433{
3434 switch (i->type) {
3435 case PACKET_MR_MULTICAST:
3436 if (i->alen != dev->addr_len)
3437 return -EINVAL;
3438 if (what > 0)
3439 return dev_mc_add(dev, i->addr);
3440 else
3441 return dev_mc_del(dev, i->addr);
3442 break;
3443 case PACKET_MR_PROMISC:
3444 return dev_set_promiscuity(dev, what);
3445 case PACKET_MR_ALLMULTI:
3446 return dev_set_allmulti(dev, what);
3447 case PACKET_MR_UNICAST:
3448 if (i->alen != dev->addr_len)
3449 return -EINVAL;
3450 if (what > 0)
3451 return dev_uc_add(dev, i->addr);
3452 else
3453 return dev_uc_del(dev, i->addr);
3454 break;
3455 default:
3456 break;
3457 }
3458 return 0;
3459}
3460
3461static void packet_dev_mclist_delete(struct net_device *dev,
3462 struct packet_mclist **mlp)
3463{
3464 struct packet_mclist *ml;
3465
3466 while ((ml = *mlp) != NULL) {
3467 if (ml->ifindex == dev->ifindex) {
3468 packet_dev_mc(dev, ml, -1);
3469 *mlp = ml->next;
3470 kfree(ml);
3471 } else
3472 mlp = &ml->next;
3473 }
3474}
3475
3476static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3477{
3478 struct packet_sock *po = pkt_sk(sk);
3479 struct packet_mclist *ml, *i;
3480 struct net_device *dev;
3481 int err;
3482
3483 rtnl_lock();
3484
3485 err = -ENODEV;
3486 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3487 if (!dev)
3488 goto done;
3489
3490 err = -EINVAL;
3491 if (mreq->mr_alen > dev->addr_len)
3492 goto done;
3493
3494 err = -ENOBUFS;
3495 i = kmalloc(sizeof(*i), GFP_KERNEL);
3496 if (i == NULL)
3497 goto done;
3498
3499 err = 0;
3500 for (ml = po->mclist; ml; ml = ml->next) {
3501 if (ml->ifindex == mreq->mr_ifindex &&
3502 ml->type == mreq->mr_type &&
3503 ml->alen == mreq->mr_alen &&
3504 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3505 ml->count++;
3506 /* already subscribed: just bump the refcount and drop the new entry */
3507 kfree(i);
3508 goto done;
3509 }
3510 }
3511
3512 i->type = mreq->mr_type;
3513 i->ifindex = mreq->mr_ifindex;
3514 i->alen = mreq->mr_alen;
3515 memcpy(i->addr, mreq->mr_address, i->alen);
3516 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3517 i->count = 1;
3518 i->next = po->mclist;
3519 po->mclist = i;
3520 err = packet_dev_mc(dev, i, 1);
3521 if (err) {
3522 po->mclist = i->next;
3523 kfree(i);
3524 }
3525
3526done:
3527 rtnl_unlock();
3528 return err;
3529}
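
/*
 * Illustrative userspace sketch: PACKET_MR_PROMISC via
 * PACKET_ADD_MEMBERSHIP is the reference-counted way to put one
 * interface into promiscuous mode ("ifindex" is a hypothetical value):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */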
3530
3531static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3532{
3533 struct packet_mclist *ml, **mlp;
3534
3535 rtnl_lock();
3536
3537 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3538 if (ml->ifindex == mreq->mr_ifindex &&
3539 ml->type == mreq->mr_type &&
3540 ml->alen == mreq->mr_alen &&
3541 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3542 if (--ml->count == 0) {
3543 struct net_device *dev;
3544 *mlp = ml->next;
3545 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3546 if (dev)
3547 packet_dev_mc(dev, ml, -1);
3548 kfree(ml);
3549 }
3550 break;
3551 }
3552 }
3553 rtnl_unlock();
3554 return 0;
3555}
3556
3557static void packet_flush_mclist(struct sock *sk)
3558{
3559 struct packet_sock *po = pkt_sk(sk);
3560 struct packet_mclist *ml;
3561
3562 if (!po->mclist)
3563 return;
3564
3565 rtnl_lock();
3566 while ((ml = po->mclist) != NULL) {
3567 struct net_device *dev;
3568
3569 po->mclist = ml->next;
3570 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3571 if (dev != NULL)
3572 packet_dev_mc(dev, ml, -1);
3573 kfree(ml);
3574 }
3575 rtnl_unlock();
3576}
3577
3578static int
3579packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3580{
3581 struct sock *sk = sock->sk;
3582 struct packet_sock *po = pkt_sk(sk);
3583 int ret;
3584
3585 if (level != SOL_PACKET)
3586 return -ENOPROTOOPT;
3587
3588 switch (optname) {
3589 case PACKET_ADD_MEMBERSHIP:
3590 case PACKET_DROP_MEMBERSHIP:
3591 {
3592 struct packet_mreq_max mreq;
3593 int len = optlen;
3594 memset(&mreq, 0, sizeof(mreq));
3595 if (len < sizeof(struct packet_mreq))
3596 return -EINVAL;
3597 if (len > sizeof(mreq))
3598 len = sizeof(mreq);
3599 if (copy_from_user(&mreq, optval, len))
3600 return -EFAULT;
3601 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3602 return -EINVAL;
3603 if (optname == PACKET_ADD_MEMBERSHIP)
3604 ret = packet_mc_add(sk, &mreq);
3605 else
3606 ret = packet_mc_drop(sk, &mreq);
3607 return ret;
3608 }
3609
3610 case PACKET_RX_RING:
3611 case PACKET_TX_RING:
3612 {
3613 union tpacket_req_u req_u;
3614 int len;
3615
3616 switch (po->tp_version) {
3617 case TPACKET_V1:
3618 case TPACKET_V2:
3619 len = sizeof(req_u.req);
3620 break;
3621 case TPACKET_V3:
3622 default:
3623 len = sizeof(req_u.req3);
3624 break;
3625 }
3626 if (optlen < len)
3627 return -EINVAL;
3628 if (copy_from_user(&req_u.req, optval, len))
3629 return -EFAULT;
3630 return packet_set_ring(sk, &req_u, 0,
3631 optname == PACKET_TX_RING);
3632 }
3633 case PACKET_COPY_THRESH:
3634 {
3635 int val;
3636
3637 if (optlen != sizeof(val))
3638 return -EINVAL;
3639 if (copy_from_user(&val, optval, sizeof(val)))
3640 return -EFAULT;
3641
3642 pkt_sk(sk)->copy_thresh = val;
3643 return 0;
3644 }
3645 case PACKET_VERSION:
3646 {
3647 int val;
3648
3649 if (optlen != sizeof(val))
3650 return -EINVAL;
3651 if (copy_from_user(&val, optval, sizeof(val)))
3652 return -EFAULT;
3653 switch (val) {
3654 case TPACKET_V1:
3655 case TPACKET_V2:
3656 case TPACKET_V3:
3657 break;
3658 default:
3659 return -EINVAL;
3660 }
3661 lock_sock(sk);
3662 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3663 ret = -EBUSY;
3664 } else {
3665 po->tp_version = val;
3666 ret = 0;
3667 }
3668 release_sock(sk);
3669 return ret;
3670 }
3671 case PACKET_RESERVE:
3672 {
3673 unsigned int val;
3674
3675 if (optlen != sizeof(val))
3676 return -EINVAL;
3677 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3678 return -EBUSY;
3679 if (copy_from_user(&val, optval, sizeof(val)))
3680 return -EFAULT;
3681 po->tp_reserve = val;
3682 return 0;
3683 }
3684 case PACKET_LOSS:
3685 {
3686 unsigned int val;
3687
3688 if (optlen != sizeof(val))
3689 return -EINVAL;
3690 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3691 return -EBUSY;
3692 if (copy_from_user(&val, optval, sizeof(val)))
3693 return -EFAULT;
3694 po->tp_loss = !!val;
3695 return 0;
3696 }
3697 case PACKET_AUXDATA:
3698 {
3699 int val;
3700
3701 if (optlen < sizeof(val))
3702 return -EINVAL;
3703 if (copy_from_user(&val, optval, sizeof(val)))
3704 return -EFAULT;
3705
3706 po->auxdata = !!val;
3707 return 0;
3708 }
3709 case PACKET_ORIGDEV:
3710 {
3711 int val;
3712
3713 if (optlen < sizeof(val))
3714 return -EINVAL;
3715 if (copy_from_user(&val, optval, sizeof(val)))
3716 return -EFAULT;
3717
3718 po->origdev = !!val;
3719 return 0;
3720 }
3721 case PACKET_VNET_HDR:
3722 {
3723 int val;
3724
3725 if (sock->type != SOCK_RAW)
3726 return -EINVAL;
3727 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3728 return -EBUSY;
3729 if (optlen < sizeof(val))
3730 return -EINVAL;
3731 if (copy_from_user(&val, optval, sizeof(val)))
3732 return -EFAULT;
3733
3734 po->has_vnet_hdr = !!val;
3735 return 0;
3736 }
3737 case PACKET_TIMESTAMP:
3738 {
3739 int val;
3740
3741 if (optlen != sizeof(val))
3742 return -EINVAL;
3743 if (copy_from_user(&val, optval, sizeof(val)))
3744 return -EFAULT;
3745
3746 po->tp_tstamp = val;
3747 return 0;
3748 }
3749 case PACKET_FANOUT:
3750 {
3751 int val;
3752
3753 if (optlen != sizeof(val))
3754 return -EINVAL;
3755 if (copy_from_user(&val, optval, sizeof(val)))
3756 return -EFAULT;
3757
3758 return fanout_add(sk, val & 0xffff, val >> 16);
3759 }
3760 case PACKET_FANOUT_DATA:
3761 {
3762 if (!po->fanout)
3763 return -EINVAL;
3764
3765 return fanout_set_data(po, optval, optlen);
3766 }
3767 case PACKET_TX_HAS_OFF:
3768 {
3769 unsigned int val;
3770
3771 if (optlen != sizeof(val))
3772 return -EINVAL;
3773 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3774 return -EBUSY;
3775 if (copy_from_user(&val, optval, sizeof(val)))
3776 return -EFAULT;
3777 po->tp_tx_has_off = !!val;
3778 return 0;
3779 }
3780 case PACKET_QDISC_BYPASS:
3781 {
3782 int val;
3783
3784 if (optlen != sizeof(val))
3785 return -EINVAL;
3786 if (copy_from_user(&val, optval, sizeof(val)))
3787 return -EFAULT;
3788
3789 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3790 return 0;
3791 }
3792 default:
3793 return -ENOPROTOOPT;
3794 }
3795}
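
/*
 * Illustrative userspace sketch of PACKET_FANOUT as parsed above: the
 * low 16 bits carry the group id, the upper bits the mode and flags
 * (group id 42 is an arbitrary example):
 *
 *	int val = 42 | (PACKET_FANOUT_LB << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */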
3796
3797static int packet_getsockopt(struct socket *sock, int level, int optname,
3798 char __user *optval, int __user *optlen)
3799{
3800 int len;
3801 int val, lv = sizeof(val);
3802 struct sock *sk = sock->sk;
3803 struct packet_sock *po = pkt_sk(sk);
3804 void *data = &val;
3805 union tpacket_stats_u st;
3806 struct tpacket_rollover_stats rstats;
3807
3808 if (level != SOL_PACKET)
3809 return -ENOPROTOOPT;
3810
3811 if (get_user(len, optlen))
3812 return -EFAULT;
3813
3814 if (len < 0)
3815 return -EINVAL;
3816
3817 switch (optname) {
3818 case PACKET_STATISTICS:
3819 spin_lock_bh(&sk->sk_receive_queue.lock);
3820 memcpy(&st, &po->stats, sizeof(st));
3821 memset(&po->stats, 0, sizeof(po->stats));
3822 spin_unlock_bh(&sk->sk_receive_queue.lock);
3823
3824 if (po->tp_version == TPACKET_V3) {
3825 lv = sizeof(struct tpacket_stats_v3);
3826 st.stats3.tp_packets += st.stats3.tp_drops;
3827 data = &st.stats3;
3828 } else {
3829 lv = sizeof(struct tpacket_stats);
3830 st.stats1.tp_packets += st.stats1.tp_drops;
3831 data = &st.stats1;
3832 }
3833
3834 break;
3835 case PACKET_AUXDATA:
3836 val = po->auxdata;
3837 break;
3838 case PACKET_ORIGDEV:
3839 val = po->origdev;
3840 break;
3841 case PACKET_VNET_HDR:
3842 val = po->has_vnet_hdr;
3843 break;
3844 case PACKET_VERSION:
3845 val = po->tp_version;
3846 break;
3847 case PACKET_HDRLEN:
3848 if (len > sizeof(int))
3849 len = sizeof(int);
3850 if (copy_from_user(&val, optval, len))
3851 return -EFAULT;
3852 switch (val) {
3853 case TPACKET_V1:
3854 val = sizeof(struct tpacket_hdr);
3855 break;
3856 case TPACKET_V2:
3857 val = sizeof(struct tpacket2_hdr);
3858 break;
3859 case TPACKET_V3:
3860 val = sizeof(struct tpacket3_hdr);
3861 break;
3862 default:
3863 return -EINVAL;
3864 }
3865 break;
3866 case PACKET_RESERVE:
3867 val = po->tp_reserve;
3868 break;
3869 case PACKET_LOSS:
3870 val = po->tp_loss;
3871 break;
3872 case PACKET_TIMESTAMP:
3873 val = po->tp_tstamp;
3874 break;
3875 case PACKET_FANOUT:
3876 val = (po->fanout ?
3877 ((u32)po->fanout->id |
3878 ((u32)po->fanout->type << 16) |
3879 ((u32)po->fanout->flags << 24)) :
3880 0);
3881 break;
3882 case PACKET_ROLLOVER_STATS:
3883 if (!po->rollover)
3884 return -EINVAL;
3885 rstats.tp_all = atomic_long_read(&po->rollover->num);
3886 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3887 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3888 data = &rstats;
3889 lv = sizeof(rstats);
3890 break;
3891 case PACKET_TX_HAS_OFF:
3892 val = po->tp_tx_has_off;
3893 break;
3894 case PACKET_QDISC_BYPASS:
3895 val = packet_use_direct_xmit(po);
3896 break;
3897 default:
3898 return -ENOPROTOOPT;
3899 }
3900
3901 if (len > lv)
3902 len = lv;
3903 if (put_user(len, optlen))
3904 return -EFAULT;
3905 if (copy_to_user(optval, data, len))
3906 return -EFAULT;
3907 return 0;
3908}
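
/*
 * Illustrative userspace sketch: reading PACKET_STATISTICS as served
 * above (TPACKET_V1/V2 layout). Note the counters are zeroed on every
 * read, and tp_packets already includes tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 */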
3909
3910
3911#ifdef CONFIG_COMPAT
3912static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3913 char __user *optval, unsigned int optlen)
3914{
3915 struct packet_sock *po = pkt_sk(sock->sk);
3916
3917 if (level != SOL_PACKET)
3918 return -ENOPROTOOPT;
3919
3920 if (optname == PACKET_FANOUT_DATA &&
3921 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3922 optval = (char __user *)get_compat_bpf_fprog(optval);
3923 if (!optval)
3924 return -EFAULT;
3925 optlen = sizeof(struct sock_fprog);
3926 }
3927
3928 return packet_setsockopt(sock, level, optname, optval, optlen);
3929}
3930#endif
3931
3932static int packet_notifier(struct notifier_block *this,
3933 unsigned long msg, void *ptr)
3934{
3935 struct sock *sk;
3936 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3937 struct net *net = dev_net(dev);
3938
3939 rcu_read_lock();
3940 sk_for_each_rcu(sk, &net->packet.sklist) {
3941 struct packet_sock *po = pkt_sk(sk);
3942
3943 switch (msg) {
3944 case NETDEV_UNREGISTER:
3945 if (po->mclist)
3946 packet_dev_mclist_delete(dev, &po->mclist);
3947
3948 /* fallthrough */
3949 case NETDEV_DOWN:
3950 if (dev->ifindex == po->ifindex) {
3951 spin_lock(&po->bind_lock);
3952 if (po->running) {
3953 __unregister_prot_hook(sk, false);
3954 sk->sk_err = ENETDOWN;
3955 if (!sock_flag(sk, SOCK_DEAD))
3956 sk->sk_error_report(sk);
3957 }
3958 if (msg == NETDEV_UNREGISTER) {
3959 packet_cached_dev_reset(po);
3960 fanout_release(sk);
3961 po->ifindex = -1;
3962 if (po->prot_hook.dev)
3963 dev_put(po->prot_hook.dev);
3964 po->prot_hook.dev = NULL;
3965 }
3966 spin_unlock(&po->bind_lock);
3967 }
3968 break;
3969 case NETDEV_UP:
3970 if (dev->ifindex == po->ifindex) {
3971 spin_lock(&po->bind_lock);
3972 if (po->num)
3973 register_prot_hook(sk);
3974 spin_unlock(&po->bind_lock);
3975 }
3976 break;
3977 }
3978 }
3979 rcu_read_unlock();
3980 return NOTIFY_DONE;
3981}
3982
3983
3984static int packet_ioctl(struct socket *sock, unsigned int cmd,
3985 unsigned long arg)
3986{
3987 struct sock *sk = sock->sk;
3988
3989 switch (cmd) {
3990 case SIOCOUTQ:
3991 {
3992 int amount = sk_wmem_alloc_get(sk);
3993
3994 return put_user(amount, (int __user *)arg);
3995 }
3996 case SIOCINQ:
3997 {
3998 struct sk_buff *skb;
3999 int amount = 0;
4000
4001 spin_lock_bh(&sk->sk_receive_queue.lock);
4002 skb = skb_peek(&sk->sk_receive_queue);
4003 if (skb)
4004 amount = skb->len;
4005 spin_unlock_bh(&sk->sk_receive_queue.lock);
4006 return put_user(amount, (int __user *)arg);
4007 }
4008 case SIOCGSTAMP:
4009 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4010 case SIOCGSTAMPNS:
4011 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4012
4013#ifdef CONFIG_INET
4014 case SIOCADDRT:
4015 case SIOCDELRT:
4016 case SIOCDARP:
4017 case SIOCGARP:
4018 case SIOCSARP:
4019 case SIOCGIFADDR:
4020 case SIOCSIFADDR:
4021 case SIOCGIFBRDADDR:
4022 case SIOCSIFBRDADDR:
4023 case SIOCGIFNETMASK:
4024 case SIOCSIFNETMASK:
4025 case SIOCGIFDSTADDR:
4026 case SIOCSIFDSTADDR:
4027 case SIOCSIFFLAGS:
4028 return inet_dgram_ops.ioctl(sock, cmd, arg);
4029#endif
4030
4031 default:
4032 return -ENOIOCTLCMD;
4033 }
4034 return 0;
4035}
4036
4037static unsigned int packet_poll(struct file *file, struct socket *sock,
4038 poll_table *wait)
4039{
4040 struct sock *sk = sock->sk;
4041 struct packet_sock *po = pkt_sk(sk);
4042 unsigned int mask = datagram_poll(file, sock, wait);
4043
4044 spin_lock_bh(&sk->sk_receive_queue.lock);
4045 if (po->rx_ring.pg_vec) {
4046 if (!packet_previous_rx_frame(po, &po->rx_ring,
4047 TP_STATUS_KERNEL))
4048 mask |= POLLIN | POLLRDNORM;
4049 }
4050 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4051 po->pressure = 0;
4052 spin_unlock_bh(&sk->sk_receive_queue.lock);
4053 spin_lock_bh(&sk->sk_write_queue.lock);
4054 if (po->tx_ring.pg_vec) {
4055 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4056 mask |= POLLOUT | POLLWRNORM;
4057 }
4058 spin_unlock_bh(&sk->sk_write_queue.lock);
4059 return mask;
4060}
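
/*
 * Illustrative userspace sketch: with a mapped RX ring, poll() is the
 * usual way to wait for the next frame (TPACKET_V2 layout assumed;
 * "frame" points at the current ring slot):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 */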
4061
4062
4063 /*
4064  * Keep a count of user mappings of the rings so that packet_set_ring()
4065  * can refuse to resize or free them while they are still mapped.
4066  */
4067static void packet_mm_open(struct vm_area_struct *vma)
4068{
4069 struct file *file = vma->vm_file;
4070 struct socket *sock = file->private_data;
4071 struct sock *sk = sock->sk;
4072
4073 if (sk)
4074 atomic_inc(&pkt_sk(sk)->mapped);
4075}
4076
4077static void packet_mm_close(struct vm_area_struct *vma)
4078{
4079 struct file *file = vma->vm_file;
4080 struct socket *sock = file->private_data;
4081 struct sock *sk = sock->sk;
4082
4083 if (sk)
4084 atomic_dec(&pkt_sk(sk)->mapped);
4085}
4086
4087static const struct vm_operations_struct packet_mmap_ops = {
4088 .open = packet_mm_open,
4089 .close = packet_mm_close,
4090};
4091
4092static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4093 unsigned int len)
4094{
4095 int i;
4096
4097 for (i = 0; i < len; i++) {
4098 if (likely(pg_vec[i].buffer)) {
4099 if (is_vmalloc_addr(pg_vec[i].buffer))
4100 vfree(pg_vec[i].buffer);
4101 else
4102 free_pages((unsigned long)pg_vec[i].buffer,
4103 order);
4104 pg_vec[i].buffer = NULL;
4105 }
4106 }
4107 kfree(pg_vec);
4108}
4109
4110static char *alloc_one_pg_vec_page(unsigned long order)
4111{
4112 char *buffer;
4113 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4114 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4115
4116 buffer = (char *) __get_free_pages(gfp_flags, order);
4117 if (buffer)
4118 return buffer;
4119
4120 /* __get_free_pages() failed: fall back to vmalloc */
4121 buffer = vzalloc((1 << order) * PAGE_SIZE);
4122 if (buffer)
4123 return buffer;
4124
4125 /* vmalloc failed too: retry the page allocator, now allowing it to try harder */
4126 gfp_flags &= ~__GFP_NORETRY;
4127 buffer = (char *) __get_free_pages(gfp_flags, order);
4128 if (buffer)
4129 return buffer;
4130
4131 /* complete and utter failure */
4132 return NULL;
4133}
4134
4135static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4136{
4137 unsigned int block_nr = req->tp_block_nr;
4138 struct pgv *pg_vec;
4139 int i;
4140
4141 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4142 if (unlikely(!pg_vec))
4143 goto out;
4144
4145 for (i = 0; i < block_nr; i++) {
4146 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4147 if (unlikely(!pg_vec[i].buffer))
4148 goto out_free_pgvec;
4149 }
4150
4151out:
4152 return pg_vec;
4153
4154out_free_pgvec:
4155 free_pg_vec(pg_vec, order, block_nr);
4156 pg_vec = NULL;
4157 goto out;
4158}
4159
4160static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4161 int closing, int tx_ring)
4162{
4163 struct pgv *pg_vec = NULL;
4164 struct packet_sock *po = pkt_sk(sk);
4165 int was_running, order = 0;
4166 struct packet_ring_buffer *rb;
4167 struct sk_buff_head *rb_queue;
4168 __be16 num;
4169 int err = -EINVAL;
4170
4171 struct tpacket_req *req = &req_u->req;
4172
4173 lock_sock(sk);
4174
4175 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4176 net_warn_ratelimited("Tx-ring is not supported.\n");
4177 goto out;
4178 }
4179
4180 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4181 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4182
4183 err = -EBUSY;
4184 if (!closing) {
4185 if (atomic_read(&po->mapped))
4186 goto out;
4187 if (packet_read_pending(rb))
4188 goto out;
4189 }
4190
4191 if (req->tp_block_nr) {
4192 /* Sanity tests and some calculations */
4193 err = -EBUSY;
4194 if (unlikely(rb->pg_vec))
4195 goto out;
4196
4197 switch (po->tp_version) {
4198 case TPACKET_V1:
4199 po->tp_hdrlen = TPACKET_HDRLEN;
4200 break;
4201 case TPACKET_V2:
4202 po->tp_hdrlen = TPACKET2_HDRLEN;
4203 break;
4204 case TPACKET_V3:
4205 po->tp_hdrlen = TPACKET3_HDRLEN;
4206 break;
4207 }
4208
4209 err = -EINVAL;
4210 if (unlikely((int)req->tp_block_size <= 0))
4211 goto out;
4212 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4213 goto out;
4214 if (po->tp_version >= TPACKET_V3 &&
4215 (int)(req->tp_block_size -
4216 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4217 goto out;
4218 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
4219 po->tp_reserve))
4220 goto out;
4221 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4222 goto out;
4223
4224 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4225 if (unlikely(rb->frames_per_block == 0))
4226 goto out;
4227 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4228 req->tp_frame_nr))
4229 goto out;
4230
4231 err = -ENOMEM;
4232 order = get_order(req->tp_block_size);
4233 pg_vec = alloc_pg_vec(req, order);
4234 if (unlikely(!pg_vec))
4235 goto out;
4236 switch (po->tp_version) {
4237 case TPACKET_V3:
4238 /* Block transmit is not supported yet; only the Rx ring
4239  * gets a block descriptor queue (hence the !tx_ring check below).
4240  */
4241 if (!tx_ring)
4242 init_prb_bdqc(po, rb, pg_vec, req_u);
4243 break;
4244 default:
4245 break;
4246 }
4247 }
4248 /* tp_block_nr == 0 means the caller is tearing the ring down */
4249 else {
4250 err = -EINVAL;
4251 if (unlikely(req->tp_frame_nr))
4252 goto out;
4253 }
4254
4255
4256 /* Detach the socket from the network while the ring is swapped */
4257 spin_lock(&po->bind_lock);
4258 was_running = po->running;
4259 num = po->num;
4260 if (was_running) {
4261 po->num = 0;
4262 __unregister_prot_hook(sk, false);
4263 }
4264 spin_unlock(&po->bind_lock);
4265
4266 synchronize_net();
4267
4268 err = -EBUSY;
4269 mutex_lock(&po->pg_vec_lock);
4270 if (closing || atomic_read(&po->mapped) == 0) {
4271 err = 0;
4272 spin_lock_bh(&rb_queue->lock);
4273 swap(rb->pg_vec, pg_vec);
4274 rb->frame_max = (req->tp_frame_nr - 1);
4275 rb->head = 0;
4276 rb->frame_size = req->tp_frame_size;
4277 spin_unlock_bh(&rb_queue->lock);
4278
4279 swap(rb->pg_vec_order, order);
4280 swap(rb->pg_vec_len, req->tp_block_nr);
4281
4282 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4283 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4284 tpacket_rcv : packet_rcv;
4285 skb_queue_purge(rb_queue);
4286 if (atomic_read(&po->mapped))
4287 pr_err("packet_mmap: vma is busy: %d\n",
4288 atomic_read(&po->mapped));
4289 }
4290 mutex_unlock(&po->pg_vec_lock);
4291
4292 spin_lock(&po->bind_lock);
4293 if (was_running) {
4294 po->num = num;
4295 register_prot_hook(sk);
4296 }
4297 spin_unlock(&po->bind_lock);
4298 if (closing && (po->tp_version > TPACKET_V2)) {
4299 /* Only the Rx ring has a retire-block timer to shut down */
4300 if (!tx_ring)
4301 prb_shutdown_retire_blk_timer(po, rb_queue);
4302 }
4303
4304 if (pg_vec)
4305 free_pg_vec(pg_vec, order, req->tp_block_nr);
4306out:
4307 release_sock(sk);
4308 return err;
4309}
4310
4311static int packet_mmap(struct file *file, struct socket *sock,
4312 struct vm_area_struct *vma)
4313{
4314 struct sock *sk = sock->sk;
4315 struct packet_sock *po = pkt_sk(sk);
4316 unsigned long size, expected_size;
4317 struct packet_ring_buffer *rb;
4318 unsigned long start;
4319 int err = -EINVAL;
4320 int i;
4321
4322 if (vma->vm_pgoff)
4323 return -EINVAL;
4324
4325 mutex_lock(&po->pg_vec_lock);
4326
4327 expected_size = 0;
4328 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4329 if (rb->pg_vec) {
4330 expected_size += rb->pg_vec_len
4331 * rb->pg_vec_pages
4332 * PAGE_SIZE;
4333 }
4334 }
4335
4336 if (expected_size == 0)
4337 goto out;
4338
4339 size = vma->vm_end - vma->vm_start;
4340 if (size != expected_size)
4341 goto out;
4342
4343 start = vma->vm_start;
4344 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4345 if (rb->pg_vec == NULL)
4346 continue;
4347
4348 for (i = 0; i < rb->pg_vec_len; i++) {
4349 struct page *page;
4350 void *kaddr = rb->pg_vec[i].buffer;
4351 int pg_num;
4352
4353 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4354 page = pgv_to_page(kaddr);
4355 err = vm_insert_page(vma, start, page);
4356 if (unlikely(err))
4357 goto out;
4358 start += PAGE_SIZE;
4359 kaddr += PAGE_SIZE;
4360 }
4361 }
4362 }
4363
4364 atomic_inc(&po->mapped);
4365 vma->vm_ops = &packet_mmap_ops;
4366 err = 0;
4367
4368out:
4369 mutex_unlock(&po->pg_vec_lock);
4370 return err;
4371}
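
/*
 * Illustrative userspace sketch of the full ring setup handled by
 * packet_set_ring() and packet_mmap() (sizes are arbitrary example
 * values; frames_per_block * tp_block_nr must equal tp_frame_nr):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */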
4372
4373static const struct proto_ops packet_ops_spkt = {
4374 .family = PF_PACKET,
4375 .owner = THIS_MODULE,
4376 .release = packet_release,
4377 .bind = packet_bind_spkt,
4378 .connect = sock_no_connect,
4379 .socketpair = sock_no_socketpair,
4380 .accept = sock_no_accept,
4381 .getname = packet_getname_spkt,
4382 .poll = datagram_poll,
4383 .ioctl = packet_ioctl,
4384 .listen = sock_no_listen,
4385 .shutdown = sock_no_shutdown,
4386 .setsockopt = sock_no_setsockopt,
4387 .getsockopt = sock_no_getsockopt,
4388 .sendmsg = packet_sendmsg_spkt,
4389 .recvmsg = packet_recvmsg,
4390 .mmap = sock_no_mmap,
4391 .sendpage = sock_no_sendpage,
4392};
4393
4394static const struct proto_ops packet_ops = {
4395 .family = PF_PACKET,
4396 .owner = THIS_MODULE,
4397 .release = packet_release,
4398 .bind = packet_bind,
4399 .connect = sock_no_connect,
4400 .socketpair = sock_no_socketpair,
4401 .accept = sock_no_accept,
4402 .getname = packet_getname,
4403 .poll = packet_poll,
4404 .ioctl = packet_ioctl,
4405 .listen = sock_no_listen,
4406 .shutdown = sock_no_shutdown,
4407 .setsockopt = packet_setsockopt,
4408 .getsockopt = packet_getsockopt,
4409#ifdef CONFIG_COMPAT
4410 .compat_setsockopt = compat_packet_setsockopt,
4411#endif
4412 .sendmsg = packet_sendmsg,
4413 .recvmsg = packet_recvmsg,
4414 .mmap = packet_mmap,
4415 .sendpage = sock_no_sendpage,
4416};
4417
4418static const struct net_proto_family packet_family_ops = {
4419 .family = PF_PACKET,
4420 .create = packet_create,
4421 .owner = THIS_MODULE,
4422};
4423
4424static struct notifier_block packet_netdev_notifier = {
4425 .notifier_call = packet_notifier,
4426};
4427
4428#ifdef CONFIG_PROC_FS
4429
4430static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4431 __acquires(RCU)
4432{
4433 struct net *net = seq_file_net(seq);
4434
4435 rcu_read_lock();
4436 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4437}
4438
4439static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4440{
4441 struct net *net = seq_file_net(seq);
4442 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4443}
4444
4445static void packet_seq_stop(struct seq_file *seq, void *v)
4446 __releases(RCU)
4447{
4448 rcu_read_unlock();
4449}
4450
4451static int packet_seq_show(struct seq_file *seq, void *v)
4452{
4453 if (v == SEQ_START_TOKEN)
4454 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4455 else {
4456 struct sock *s = sk_entry(v);
4457 const struct packet_sock *po = pkt_sk(s);
4458
4459 seq_printf(seq,
4460 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4461 s,
4462 atomic_read(&s->sk_refcnt),
4463 s->sk_type,
4464 ntohs(po->num),
4465 po->ifindex,
4466 po->running,
4467 atomic_read(&s->sk_rmem_alloc),
4468 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4469 sock_i_ino(s));
4470 }
4471
4472 return 0;
4473}
4474
4475static const struct seq_operations packet_seq_ops = {
4476 .start = packet_seq_start,
4477 .next = packet_seq_next,
4478 .stop = packet_seq_stop,
4479 .show = packet_seq_show,
4480};
4481
4482static int packet_seq_open(struct inode *inode, struct file *file)
4483{
4484 return seq_open_net(inode, file, &packet_seq_ops,
4485 sizeof(struct seq_net_private));
4486}
4487
4488static const struct file_operations packet_seq_fops = {
4489 .owner = THIS_MODULE,
4490 .open = packet_seq_open,
4491 .read = seq_read,
4492 .llseek = seq_lseek,
4493 .release = seq_release_net,
4494};
4495
4496#endif
4497
4498static int __net_init packet_net_init(struct net *net)
4499{
4500 mutex_init(&net->packet.sklist_lock);
4501 INIT_HLIST_HEAD(&net->packet.sklist);
4502
4503 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4504 return -ENOMEM;
4505
4506 return 0;
4507}
4508
4509static void __net_exit packet_net_exit(struct net *net)
4510{
4511 remove_proc_entry("packet", net->proc_net);
4512}
4513
4514static struct pernet_operations packet_net_ops = {
4515 .init = packet_net_init,
4516 .exit = packet_net_exit,
4517};
4518
4519
4520static void __exit packet_exit(void)
4521{
4522 unregister_netdevice_notifier(&packet_netdev_notifier);
4523 unregister_pernet_subsys(&packet_net_ops);
4524 sock_unregister(PF_PACKET);
4525 proto_unregister(&packet_proto);
4526}
4527
4528static int __init packet_init(void)
4529{
4530 int rc = proto_register(&packet_proto, 0);
4531
4532 if (rc != 0)
4533 goto out;
4534
4535 sock_register(&packet_family_ops);
4536 register_pernet_subsys(&packet_net_ops);
4537 register_netdevice_notifier(&packet_netdev_notifier);
4538out:
4539 return rc;
4540}
4541
4542module_init(packet_init);
4543module_exit(packet_exit);
4544MODULE_LICENSE("GPL");
4545MODULE_ALIAS_NETPROTO(PF_PACKET);
4546