/*
 * PF_PACKET - raw packet sockets.
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

188struct packet_sock;
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
191
192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
198 struct packet_sock *);
199static void prb_retire_current_block(struct tpacket_kbdq_core *,
200 struct packet_sock *, unsigned int status);
201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
211static void packet_flush_mclist(struct sock *sk);
212static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* origlen aliases the first 4 bytes of ll
			 * (sll_family and sll_protocol), which is why
			 * those two fields are only filled in at
			 * recvmsg() time.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
240
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
244static int packet_direct_xmit(struct sk_buff *skb)
245{
246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
247}
248
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279 struct net_device *sb_dev)
280{
281 return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
282}
283
284static u16 packet_pick_tx_queue(struct sk_buff *skb)
285{
286 struct net_device *dev = skb->dev;
287 const struct net_device_ops *ops = dev->netdev_ops;
288 u16 queue_index;
289
290 if (ops->ndo_select_queue) {
291 queue_index = ops->ndo_select_queue(dev, skb, NULL,
292 __packet_pick_tx_queue);
293 queue_index = netdev_cap_txqueue(dev, queue_index);
294 } else {
295 queue_index = __packet_pick_tx_queue(dev, skb, NULL);
296 }
297
298 return queue_index;
299}
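
/* Attach po->prot_hook (or link the socket into its fanout group) and mark
 * the socket as running.  Callers must hold po->bind_lock or otherwise
 * exclude concurrent access; register_prot_hook() below asserts the lock.
 */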
305static void __register_prot_hook(struct sock *sk)
306{
307 struct packet_sock *po = pkt_sk(sk);
308
309 if (!po->running) {
310 if (po->fanout)
311 __fanout_link(sk, po);
312 else
313 dev_add_pack(&po->prot_hook);
314
315 sock_hold(sk);
316 po->running = 1;
317 }
318}
319
320static void register_prot_hook(struct sock *sk)
321{
322 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
323 __register_prot_hook(sk);
324}
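
/* Detach po->prot_hook and drop the socket reference taken when it was
 * registered.  With @sync true, bind_lock is temporarily released and
 * synchronize_net() is called so that no packet-processing path can still
 * be using the old prot_hook when we return.
 */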
332static void __unregister_prot_hook(struct sock *sk, bool sync)
333{
334 struct packet_sock *po = pkt_sk(sk);
335
336 lockdep_assert_held_once(&po->bind_lock);
337
338 po->running = 0;
339
340 if (po->fanout)
341 __fanout_unlink(sk, po);
342 else
343 __dev_remove_pack(&po->prot_hook);
344
345 __sock_put(sk);
346
347 if (sync) {
348 spin_unlock(&po->bind_lock);
349 synchronize_net();
350 spin_lock(&po->bind_lock);
351 }
352}
353
354static void unregister_prot_hook(struct sock *sk, bool sync)
355{
356 struct packet_sock *po = pkt_sk(sk);
357
358 if (po->running)
359 __unregister_prot_hook(sk, sync);
360}
361
362static inline struct page * __pure pgv_to_page(void *addr)
363{
364 if (is_vmalloc_addr(addr))
365 return vmalloc_to_page(addr);
366 return virt_to_page(addr);
367}
368
369static void __packet_set_status(struct packet_sock *po, void *frame, int status)
370{
371 union tpacket_uhdr h;
372
373 h.raw = frame;
374 switch (po->tp_version) {
375 case TPACKET_V1:
376 h.h1->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
378 break;
379 case TPACKET_V2:
380 h.h2->tp_status = status;
381 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
382 break;
383 case TPACKET_V3:
384 h.h3->tp_status = status;
385 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386 break;
387 default:
388 WARN(1, "TPACKET version not supported.\n");
389 BUG();
390 }
391
392 smp_wmb();
393}
394
395static int __packet_get_status(struct packet_sock *po, void *frame)
396{
397 union tpacket_uhdr h;
398
399 smp_rmb();
400
401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
405 return h.h1->tp_status;
406 case TPACKET_V2:
407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
408 return h.h2->tp_status;
409 case TPACKET_V3:
410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411 return h.h3->tp_status;
412 default:
413 WARN(1, "TPACKET version not supported.\n");
414 BUG();
415 return 0;
416 }
417}
418
419static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
420 unsigned int flags)
421{
422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423
424 if (shhwtstamps &&
425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
427 return TP_STATUS_TS_RAW_HARDWARE;
428
429 if (ktime_to_timespec_cond(skb->tstamp, ts))
430 return TP_STATUS_TS_SOFTWARE;
431
432 return 0;
433}
434
435static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
436 struct sk_buff *skb)
437{
438 union tpacket_uhdr h;
439 struct timespec ts;
440 __u32 ts_status;
441
442 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443 return 0;
444
445 h.raw = frame;
446 switch (po->tp_version) {
447 case TPACKET_V1:
448 h.h1->tp_sec = ts.tv_sec;
449 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
450 break;
451 case TPACKET_V2:
452 h.h2->tp_sec = ts.tv_sec;
453 h.h2->tp_nsec = ts.tv_nsec;
454 break;
455 case TPACKET_V3:
456 h.h3->tp_sec = ts.tv_sec;
457 h.h3->tp_nsec = ts.tv_nsec;
458 break;
459 default:
460 WARN(1, "TPACKET version not supported.\n");
461 BUG();
462 }
463
464
465 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
466 smp_wmb();
467
468 return ts_status;
469}
470
471static void *packet_lookup_frame(struct packet_sock *po,
472 struct packet_ring_buffer *rb,
473 unsigned int position,
474 int status)
475{
476 unsigned int pg_vec_pos, frame_offset;
477 union tpacket_uhdr h;
478
479 pg_vec_pos = position / rb->frames_per_block;
480 frame_offset = position % rb->frames_per_block;
481
482 h.raw = rb->pg_vec[pg_vec_pos].buffer +
483 (frame_offset * rb->frame_size);
484
485 if (status != __packet_get_status(po, h.raw))
486 return NULL;
487
488 return h.raw;
489}
490
491static void *packet_current_frame(struct packet_sock *po,
492 struct packet_ring_buffer *rb,
493 int status)
494{
495 return packet_lookup_frame(po, rb, rb->head, status);
496}
497
498static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
499{
500 del_timer_sync(&pkc->retire_blk_timer);
501}
502
503static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
504 struct sk_buff_head *rb_queue)
505{
506 struct tpacket_kbdq_core *pkc;
507
508 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
509
510 spin_lock_bh(&rb_queue->lock);
511 pkc->delete_blk_timer = 1;
512 spin_unlock_bh(&rb_queue->lock);
513
514 prb_del_retire_blk_timer(pkc);
515}
516
517static void prb_setup_retire_blk_timer(struct packet_sock *po)
518{
519 struct tpacket_kbdq_core *pkc;
520
521 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
522 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
523 0);
524 pkc->retire_blk_timer.expires = jiffies;
525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
532 struct ethtool_link_ksettings ecmd;
533 int err;
534
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
539 return DEFAULT_PRB_RETIRE_TOV;
540 }
541 err = __ethtool_get_link_ksettings(dev, &ecmd);
542 rtnl_unlock();
543 if (!err) {
544
545
546
547
548 if (ecmd.base.speed < SPEED_1000 ||
549 ecmd.base.speed == SPEED_UNKNOWN) {
550 return DEFAULT_PRB_RETIRE_TOV;
551 } else {
552 msec = 1;
553 div = ecmd.base.speed / 1000;
554 }
555 }
556
557 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
558
559 if (div)
560 mbits /= div;
561
562 tmo = mbits * msec;
563
564 if (div)
565 return tmo+1;
566 return tmo;
567}
568
569static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
570 union tpacket_req_u *req_u)
571{
572 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573}
574
575static void init_prb_bdqc(struct packet_sock *po,
576 struct packet_ring_buffer *rb,
577 struct pgv *pg_vec,
578 union tpacket_req_u *req_u)
579{
580 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
581 struct tpacket_block_desc *pbd;
582
583 memset(p1, 0x0, sizeof(*p1));
584
585 p1->knxt_seq_num = 1;
586 p1->pkbdq = pg_vec;
587 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
588 p1->pkblk_start = pg_vec[0].buffer;
589 p1->kblk_size = req_u->req3.tp_block_size;
590 p1->knum_blocks = req_u->req3.tp_block_nr;
591 p1->hdrlen = po->tp_hdrlen;
592 p1->version = po->tp_version;
593 p1->last_kactive_blk_num = 0;
594 po->stats.stats3.tp_freeze_q_cnt = 0;
595 if (req_u->req3.tp_retire_blk_tov)
596 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
597 else
598 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
599 req_u->req3.tp_block_size);
600 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
601 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
602
603 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
604 prb_init_ft_ops(p1, req_u);
605 prb_setup_retire_blk_timer(po);
606 prb_open_block(p1, pbd);
607}
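
/* Re-arm the block-retire timer and remember which block was active when
 * it was armed, so the expiry handler can tell whether any progress has
 * been made since.
 */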
612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
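
/* Expiry handler for the TPACKET_V3 block-retire timer.  If no progress has
 * been made since the timer was armed, the currently active block is closed
 * (retired with TP_STATUS_BLK_TMO) on user space's behalf so the data seen
 * so far becomes visible.  If the queue is frozen, the block is either left
 * to user space or re-opened once it has been returned to the kernel.  The
 * timer is re-armed on every path except ring teardown.
 */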
642static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
643{
644 struct packet_sock *po =
645 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
646 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
647 unsigned int frozen;
648 struct tpacket_block_desc *pbd;
649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
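	/* Wait for any writer that is still filling the current block to
	 * finish before we consider retiring it.
	 */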
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
676 if (!BLOCK_NUM_PKTS(pbd)) {
677
678 goto refresh_timer;
679 }
680 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
681 if (!prb_dispatch_next_block(pkc, po))
682 goto refresh_timer;
683 else
684 goto out;
685 } else {
686
687
688
689 if (prb_curr_blk_in_use(pbd)) {
690
691
692
693
694 goto refresh_timer;
695 } else {
696
697
698
699
700
701
702
703 prb_open_block(pkc, pbd);
704 goto out;
705 }
706 }
707 }
708
709refresh_timer:
710 _prb_refresh_rx_retire_blk_timer(pkc);
711
712out:
713 spin_unlock(&po->sk.sk_receive_queue.lock);
714}
715
716static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
717 struct tpacket_block_desc *pbd1, __u32 status)
718{
719
720
721#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
722 u8 *start, *end;
723
724 start = (u8 *)pbd1;
725
726
727 start += PAGE_SIZE;
728
729 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
730 for (; start < end; start += PAGE_SIZE)
731 flush_dcache_page(pgv_to_page(start));
732
733 smp_wmb();
734#endif
735
736
737
738 BLOCK_STATUS(pbd1) = status;
739
740
741
742#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
743 start = (u8 *)pbd1;
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748}
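
/* Close the currently active block: stamp the last-packet (or current)
 * time, hand the block to user space via its status word, wake the socket,
 * and advance kactive_blk_num to the next block.
 */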
759static void prb_close_block(struct tpacket_kbdq_core *pkc1,
760 struct tpacket_block_desc *pbd1,
761 struct packet_sock *po, unsigned int stat)
762{
763 __u32 status = TP_STATUS_USER | stat;
764
765 struct tpacket3_hdr *last_pkt;
766 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
767 struct sock *sk = &po->sk;
768
769 if (po->stats.stats3.tp_drops)
770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
780
781
782
783
784
785 struct timespec ts;
786 getnstimeofday(&ts);
787 h1->ts_last_pkt.ts_sec = ts.tv_sec;
788 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
789 }
790
791 smp_wmb();
792
793
794 prb_flush_block(pkc1, pbd1, status);
795
796 sk->sk_data_ready(sk);
797
798 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799}
800
801static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
802{
803 pkc->reset_pending_on_curr_blk = 0;
804}
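
/* (Re)initialise a block that is about to be filled: assign its sequence
 * number, reset the packet count and length, record the open timestamp,
 * point nxt_offset just past the block header and private area, thaw the
 * queue and re-arm the retire timer.
 */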
813static void prb_open_block(struct tpacket_kbdq_core *pkc1,
814 struct tpacket_block_desc *pbd1)
815{
816 struct timespec ts;
817 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
818
819 smp_rmb();
820
821
822
823
824
825 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
826 BLOCK_NUM_PKTS(pbd1) = 0;
827 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
828
829 getnstimeofday(&ts);
830
831 h1->ts_first_pkt.ts_sec = ts.tv_sec;
832 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
833
834 pkc1->pkblk_start = (char *)pbd1;
835 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836
837 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
838 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
839
840 pbd1->version = pkc1->version;
841 pkc1->prev = pkc1->nxt_offset;
842 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
843
844 prb_thaw_queue(pkc1);
845 _prb_refresh_rx_retire_blk_timer(pkc1);
846
847 smp_wmb();
848}
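
/* Freeze the queue: the next block is still owned by user space, so
 * incoming packets are dropped (and tp_freeze_q_cnt bumped) until the
 * block is returned to the kernel, at which point the queue is thawed in
 * prb_open_block().
 */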
873static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
874 struct packet_sock *po)
875{
876 pkc->reset_pending_on_curr_blk = 1;
877 po->stats.stats3.tp_freeze_q_cnt++;
878}
879
880#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
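
/* Open the next block if the kernel owns it and return a pointer to its
 * first free byte; if user space still owns it, freeze the queue and
 * return NULL so the caller drops the packet.
 */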
888static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
889 struct packet_sock *po)
890{
891 struct tpacket_block_desc *pbd;
892
893 smp_rmb();
894
895
896 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
897
898
899 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
900 prb_freeze_queue(pkc, po);
901 return NULL;
902 }
903
904
905
906
907
908
909 prb_open_block(pkc, pbd);
910 return (void *)pkc->nxt_offset;
911}
912
913static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
914 struct packet_sock *po, unsigned int status)
915{
916 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
917
918
919 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
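		/* Unless we got here from the retire timer (which has
		 * already waited), wait for any in-flight writer to finish
		 * filling the block before closing it.
		 */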
929 if (!(status & TP_STATUS_BLK_TMO)) {
930 while (atomic_read(&pkc->blk_fill_in_prog)) {
931
932 cpu_relax();
933 }
934 }
935 prb_close_block(pkc, pbd, po, status);
936 return;
937 }
938}
939
940static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
941{
942 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943}
944
945static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
946{
947 return pkc->reset_pending_on_curr_blk;
948}
949
950static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
951{
952 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
953 atomic_dec(&pkc->blk_fill_in_prog);
954}
955
956static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
957 struct tpacket3_hdr *ppd)
958{
959 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
960}
961
962static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
963 struct tpacket3_hdr *ppd)
964{
965 ppd->hv1.tp_rxhash = 0;
966}
967
968static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
969 struct tpacket3_hdr *ppd)
970{
971 if (skb_vlan_tag_present(pkc->skb)) {
972 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
973 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
974 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
975 } else {
976 ppd->hv1.tp_vlan_tci = 0;
977 ppd->hv1.tp_vlan_tpid = 0;
978 ppd->tp_status = TP_STATUS_AVAILABLE;
979 }
980}
981
982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
983 struct tpacket3_hdr *ppd)
984{
985 ppd->hv1.tp_padding = 0;
986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
994static void prb_fill_curr_block(char *curr,
995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
997 unsigned int len)
998{
999 struct tpacket3_hdr *ppd;
1000
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 pkc->prev = curr;
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1009}
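
/* Called with sk->sk_receive_queue.lock held (from tpacket_rcv()). */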
1012static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1014 int status,
1015 unsigned int len
1016 )
1017{
1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
1020 char *curr, *end;
1021
1022 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025
1026 if (prb_queue_frozen(pkc)) {
1027
1028
1029
1030
1031 if (prb_curr_blk_in_use(pbd)) {
1032
1033 return NULL;
1034 } else {
1035
1036
1037
1038
1039
1040
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
1048 end = (char *)pbd + pkc->kblk_size;
1049
1050
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056
1057 prb_retire_current_block(pkc, po, 0);
1058
1059
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067
1068
1069
1070
1071 return NULL;
1072}
1073
1074static void *packet_current_rx_frame(struct packet_sock *po,
1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, status, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
1090 return NULL;
1091 }
1092}
1093
1094static void *prb_lookup_block(struct packet_sock *po,
1095 struct packet_ring_buffer *rb,
1096 unsigned int idx,
1097 int status)
1098{
1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117
1118static void *__prb_previous_block(struct packet_sock *po,
1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
1126static void *packet_previous_rx_frame(struct packet_sock *po,
1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
1136static void packet_increment_rx_head(struct packet_sock *po,
1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
1151static void *packet_previous_frame(struct packet_sock *po,
1152 struct packet_ring_buffer *rb,
1153 int status)
1154{
1155 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1156 return packet_lookup_frame(po, rb, previous, status);
1157}
1158
1159static void packet_increment_head(struct packet_ring_buffer *buff)
1160{
1161 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162}
1163
1164static void packet_inc_pending(struct packet_ring_buffer *rb)
1165{
1166 this_cpu_inc(*rb->pending_refcnt);
1167}
1168
1169static void packet_dec_pending(struct packet_ring_buffer *rb)
1170{
1171 this_cpu_dec(*rb->pending_refcnt);
1172}
1173
1174static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175{
1176 unsigned int refcnt = 0;
1177 int cpu;
1178
1179
1180 if (rb->pending_refcnt == NULL)
1181 return 0;
1182
1183 for_each_possible_cpu(cpu)
1184 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1185
1186 return refcnt;
1187}
1188
1189static int packet_alloc_pending(struct packet_sock *po)
1190{
1191 po->rx_ring.pending_refcnt = NULL;
1192
1193 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1194 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1195 return -ENOBUFS;
1196
1197 return 0;
1198}
1199
1200static void packet_free_pending(struct packet_sock *po)
1201{
1202 free_percpu(po->tx_ring.pending_refcnt);
1203}
1204
1205#define ROOM_POW_OFF 2
1206#define ROOM_NONE 0x0
1207#define ROOM_LOW 0x1
1208#define ROOM_NORMAL 0x2
1209
1210static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1211{
1212 int idx, len;
1213
1214 len = po->rx_ring.frame_max + 1;
1215 idx = po->rx_ring.head;
1216 if (pow_off)
1217 idx += len >> pow_off;
1218 if (idx >= len)
1219 idx -= len;
1220 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221}
1222
1223static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1224{
1225 int idx, len;
1226
1227 len = po->rx_ring.prb_bdqc.knum_blocks;
1228 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1229 if (pow_off)
1230 idx += len >> pow_off;
1231 if (idx >= len)
1232 idx -= len;
1233 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234}
1235
1236static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1237{
1238 struct sock *sk = &po->sk;
1239 int ret = ROOM_NONE;
1240
1241 if (po->prot_hook.func != tpacket_rcv) {
1242 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1243 - (skb ? skb->truesize : 0);
1244 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1245 return ROOM_NORMAL;
1246 else if (avail > 0)
1247 return ROOM_LOW;
1248 else
1249 return ROOM_NONE;
1250 }
1251
1252 if (po->tp_version == TPACKET_V3) {
1253 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_v3_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 } else {
1258 if (__tpacket_has_room(po, ROOM_POW_OFF))
1259 ret = ROOM_NORMAL;
1260 else if (__tpacket_has_room(po, 0))
1261 ret = ROOM_LOW;
1262 }
1263
1264 return ret;
1265}
1266
1267static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1268{
1269 int ret;
1270 bool has_room;
1271
1272 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1273 ret = __packet_rcv_has_room(po, skb);
1274 has_room = ret == ROOM_NORMAL;
1275 if (po->pressure == has_room)
1276 po->pressure = !has_room;
1277 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1278
1279 return ret;
1280}
1281
1282static void packet_sock_destruct(struct sock *sk)
1283{
1284 skb_queue_purge(&sk->sk_error_queue);
1285
1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1291 return;
1292 }
1293
1294 sk_refcnt_debug_dec(sk);
1295}
1296
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299 u32 rxhash;
1300 int i, count = 0;
1301
1302 rxhash = skb_get_hash(skb);
1303 for (i = 0; i < ROLLOVER_HLEN; i++)
1304 if (po->rollover->history[i] == rxhash)
1305 count++;
1306
1307 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1308 return count > (ROLLOVER_HLEN >> 1);
1309}
1310
1311static unsigned int fanout_demux_hash(struct packet_fanout *f,
1312 struct sk_buff *skb,
1313 unsigned int num)
1314{
1315 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1316}
1317
1318static unsigned int fanout_demux_lb(struct packet_fanout *f,
1319 struct sk_buff *skb,
1320 unsigned int num)
1321{
1322 unsigned int val = atomic_inc_return(&f->rr_cur);
1323
1324 return val % num;
1325}
1326
1327static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1328 struct sk_buff *skb,
1329 unsigned int num)
1330{
1331 return smp_processor_id() % num;
1332}
1333
1334static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
1338 return prandom_u32_max(num);
1339}
1340
1341static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1342 struct sk_buff *skb,
1343 unsigned int idx, bool try_self,
1344 unsigned int num)
1345{
1346 struct packet_sock *po, *po_next, *po_skip = NULL;
1347 unsigned int i, j, room = ROOM_NONE;
1348
1349 po = pkt_sk(f->arr[idx]);
1350
1351 if (try_self) {
1352 room = packet_rcv_has_room(po, skb);
1353 if (room == ROOM_NORMAL ||
1354 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1355 return idx;
1356 po_skip = po;
1357 }
1358
1359 i = j = min_t(int, po->rollover->sock, num - 1);
1360 do {
1361 po_next = pkt_sk(f->arr[i]);
1362 if (po_next != po_skip && !po_next->pressure &&
1363 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1364 if (i != j)
1365 po->rollover->sock = i;
1366 atomic_long_inc(&po->rollover->num);
1367 if (room == ROOM_LOW)
1368 atomic_long_inc(&po->rollover->num_huge);
1369 return i;
1370 }
1371
1372 if (++i == num)
1373 i = 0;
1374 } while (i != j);
1375
1376 atomic_long_inc(&po->rollover->num_failed);
1377 return idx;
1378}
1379
1380static unsigned int fanout_demux_qm(struct packet_fanout *f,
1381 struct sk_buff *skb,
1382 unsigned int num)
1383{
1384 return skb_get_queue_mapping(skb) % num;
1385}
1386
1387static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1388 struct sk_buff *skb,
1389 unsigned int num)
1390{
1391 struct bpf_prog *prog;
1392 unsigned int ret = 0;
1393
1394 rcu_read_lock();
1395 prog = rcu_dereference(f->bpf_prog);
1396 if (prog)
1397 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1398 rcu_read_unlock();
1399
1400 return ret;
1401}
1402
1403static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1404{
1405 return f->flags & (flag >> 8);
1406}
1407
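/* Fanout receive entry point: pick a member socket according to the
 * group's demux policy (hash, round-robin, cpu, random, queue mapping,
 * BPF or rollover) and hand the skb to that socket's prot_hook.
 */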
1408static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1409 struct packet_type *pt, struct net_device *orig_dev)
1410{
1411 struct packet_fanout *f = pt->af_packet_priv;
1412 unsigned int num = READ_ONCE(f->num_members);
1413 struct net *net = read_pnet(&f->net);
1414 struct packet_sock *po;
1415 unsigned int idx;
1416
1417 if (!net_eq(dev_net(dev), net) || !num) {
1418 kfree_skb(skb);
1419 return 0;
1420 }
1421
1422 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1423 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1424 if (!skb)
1425 return 0;
1426 }
1427 switch (f->type) {
1428 case PACKET_FANOUT_HASH:
1429 default:
1430 idx = fanout_demux_hash(f, skb, num);
1431 break;
1432 case PACKET_FANOUT_LB:
1433 idx = fanout_demux_lb(f, skb, num);
1434 break;
1435 case PACKET_FANOUT_CPU:
1436 idx = fanout_demux_cpu(f, skb, num);
1437 break;
1438 case PACKET_FANOUT_RND:
1439 idx = fanout_demux_rnd(f, skb, num);
1440 break;
1441 case PACKET_FANOUT_QM:
1442 idx = fanout_demux_qm(f, skb, num);
1443 break;
1444 case PACKET_FANOUT_ROLLOVER:
1445 idx = fanout_demux_rollover(f, skb, 0, false, num);
1446 break;
1447 case PACKET_FANOUT_CBPF:
1448 case PACKET_FANOUT_EBPF:
1449 idx = fanout_demux_bpf(f, skb, num);
1450 break;
1451 }
1452
1453 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1454 idx = fanout_demux_rollover(f, skb, idx, true, num);
1455
1456 po = pkt_sk(f->arr[idx]);
1457 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1458}
1459
1460DEFINE_MUTEX(fanout_mutex);
1461EXPORT_SYMBOL_GPL(fanout_mutex);
1462static LIST_HEAD(fanout_list);
1463static u16 fanout_next_id;
1464
1465static void __fanout_link(struct sock *sk, struct packet_sock *po)
1466{
1467 struct packet_fanout *f = po->fanout;
1468
1469 spin_lock(&f->lock);
1470 f->arr[f->num_members] = sk;
1471 smp_wmb();
1472 f->num_members++;
1473 if (f->num_members == 1)
1474 dev_add_pack(&f->prot_hook);
1475 spin_unlock(&f->lock);
1476}
1477
1478static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1479{
1480 struct packet_fanout *f = po->fanout;
1481 int i;
1482
1483 spin_lock(&f->lock);
1484 for (i = 0; i < f->num_members; i++) {
1485 if (f->arr[i] == sk)
1486 break;
1487 }
1488 BUG_ON(i >= f->num_members);
1489 f->arr[i] = f->arr[f->num_members - 1];
1490 f->num_members--;
1491 if (f->num_members == 0)
1492 __dev_remove_pack(&f->prot_hook);
1493 spin_unlock(&f->lock);
1494}
1495
1496static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1497{
1498 if (sk->sk_family != PF_PACKET)
1499 return false;
1500
1501 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1502}
1503
1504static void fanout_init_data(struct packet_fanout *f)
1505{
1506 switch (f->type) {
1507 case PACKET_FANOUT_LB:
1508 atomic_set(&f->rr_cur, 0);
1509 break;
1510 case PACKET_FANOUT_CBPF:
1511 case PACKET_FANOUT_EBPF:
1512 RCU_INIT_POINTER(f->bpf_prog, NULL);
1513 break;
1514 }
1515}
1516
1517static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1518{
1519 struct bpf_prog *old;
1520
1521 spin_lock(&f->lock);
1522 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1523 rcu_assign_pointer(f->bpf_prog, new);
1524 spin_unlock(&f->lock);
1525
1526 if (old) {
1527 synchronize_net();
1528 bpf_prog_destroy(old);
1529 }
1530}
1531
1532static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 unsigned int len)
1534{
1535 struct bpf_prog *new;
1536 struct sock_fprog fprog;
1537 int ret;
1538
1539 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1540 return -EPERM;
1541 if (len != sizeof(fprog))
1542 return -EINVAL;
1543 if (copy_from_user(&fprog, data, len))
1544 return -EFAULT;
1545
1546 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1547 if (ret)
1548 return ret;
1549
1550 __fanout_set_data_bpf(po->fanout, new);
1551 return 0;
1552}
1553
1554static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 unsigned int len)
1556{
1557 struct bpf_prog *new;
1558 u32 fd;
1559
1560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561 return -EPERM;
1562 if (len != sizeof(fd))
1563 return -EINVAL;
1564 if (copy_from_user(&fd, data, len))
1565 return -EFAULT;
1566
1567 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1568 if (IS_ERR(new))
1569 return PTR_ERR(new);
1570
1571 __fanout_set_data_bpf(po->fanout, new);
1572 return 0;
1573}
1574
1575static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 unsigned int len)
1577{
1578 switch (po->fanout->type) {
1579 case PACKET_FANOUT_CBPF:
1580 return fanout_set_data_cbpf(po, data, len);
1581 case PACKET_FANOUT_EBPF:
1582 return fanout_set_data_ebpf(po, data, len);
1583 default:
1584 return -EINVAL;
1585 }
1586}
1587
1588static void fanout_release_data(struct packet_fanout *f)
1589{
1590 switch (f->type) {
1591 case PACKET_FANOUT_CBPF:
1592 case PACKET_FANOUT_EBPF:
1593 __fanout_set_data_bpf(f, NULL);
1594 }
1595}
1596
1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1598{
1599 struct packet_fanout *f;
1600
1601 list_for_each_entry(f, &fanout_list, list) {
1602 if (f->id == candidate_id &&
1603 read_pnet(&f->net) == sock_net(sk)) {
1604 return false;
1605 }
1606 }
1607 return true;
1608}
1609
1610static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1611{
1612 u16 id = fanout_next_id;
1613
1614 do {
1615 if (__fanout_id_is_free(sk, id)) {
1616 *new_id = id;
1617 fanout_next_id = id + 1;
1618 return true;
1619 }
1620
1621 id++;
1622 } while (id != fanout_next_id);
1623
1624 return false;
1625}
1626
1627static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1628{
1629 struct packet_rollover *rollover = NULL;
1630 struct packet_sock *po = pkt_sk(sk);
1631 struct packet_fanout *f, *match;
1632 u8 type = type_flags & 0xff;
1633 u8 flags = type_flags >> 8;
1634 int err;
1635
1636 switch (type) {
1637 case PACKET_FANOUT_ROLLOVER:
1638 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1639 return -EINVAL;
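		/* fall through - ROLLOVER is itself a valid mode */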
1640 case PACKET_FANOUT_HASH:
1641 case PACKET_FANOUT_LB:
1642 case PACKET_FANOUT_CPU:
1643 case PACKET_FANOUT_RND:
1644 case PACKET_FANOUT_QM:
1645 case PACKET_FANOUT_CBPF:
1646 case PACKET_FANOUT_EBPF:
1647 break;
1648 default:
1649 return -EINVAL;
1650 }
1651
1652 mutex_lock(&fanout_mutex);
1653
1654 err = -EALREADY;
1655 if (po->fanout)
1656 goto out;
1657
1658 if (type == PACKET_FANOUT_ROLLOVER ||
1659 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1660 err = -ENOMEM;
1661 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1662 if (!rollover)
1663 goto out;
1664 atomic_long_set(&rollover->num, 0);
1665 atomic_long_set(&rollover->num_huge, 0);
1666 atomic_long_set(&rollover->num_failed, 0);
1667 }
1668
1669 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1670 if (id != 0) {
1671 err = -EINVAL;
1672 goto out;
1673 }
1674 if (!fanout_find_new_id(sk, &id)) {
1675 err = -ENOMEM;
1676 goto out;
1677 }
1678
1679 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1680 }
1681
1682 match = NULL;
1683 list_for_each_entry(f, &fanout_list, list) {
1684 if (f->id == id &&
1685 read_pnet(&f->net) == sock_net(sk)) {
1686 match = f;
1687 break;
1688 }
1689 }
1690 err = -EINVAL;
1691 if (match && match->flags != flags)
1692 goto out;
1693 if (!match) {
1694 err = -ENOMEM;
1695 match = kzalloc(sizeof(*match), GFP_KERNEL);
1696 if (!match)
1697 goto out;
1698 write_pnet(&match->net, sock_net(sk));
1699 match->id = id;
1700 match->type = type;
1701 match->flags = flags;
1702 INIT_LIST_HEAD(&match->list);
1703 spin_lock_init(&match->lock);
1704 refcount_set(&match->sk_ref, 0);
1705 fanout_init_data(match);
1706 match->prot_hook.type = po->prot_hook.type;
1707 match->prot_hook.dev = po->prot_hook.dev;
1708 match->prot_hook.func = packet_rcv_fanout;
1709 match->prot_hook.af_packet_priv = match;
1710 match->prot_hook.id_match = match_fanout_group;
1711 list_add(&match->list, &fanout_list);
1712 }
1713 err = -EINVAL;
1714
1715 spin_lock(&po->bind_lock);
1716 if (po->running &&
1717 match->type == type &&
1718 match->prot_hook.type == po->prot_hook.type &&
1719 match->prot_hook.dev == po->prot_hook.dev) {
1720 err = -ENOSPC;
1721 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1722 __dev_remove_pack(&po->prot_hook);
1723 po->fanout = match;
1724 po->rollover = rollover;
1725 rollover = NULL;
1726 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1727 __fanout_link(sk, po);
1728 err = 0;
1729 }
1730 }
1731 spin_unlock(&po->bind_lock);
1732
1733 if (err && !refcount_read(&match->sk_ref)) {
1734 list_del(&match->list);
1735 kfree(match);
1736 }
1737
1738out:
1739 kfree(rollover);
1740 mutex_unlock(&fanout_mutex);
1741 return err;
1742}
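
/* Detach the socket from its fanout group.  If this drops the last
 * reference, the group is unlinked from fanout_list and returned so the
 * caller can release its state and free it once no readers remain;
 * otherwise NULL is returned.
 */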
1749static struct packet_fanout *fanout_release(struct sock *sk)
1750{
1751 struct packet_sock *po = pkt_sk(sk);
1752 struct packet_fanout *f;
1753
1754 mutex_lock(&fanout_mutex);
1755 f = po->fanout;
1756 if (f) {
1757 po->fanout = NULL;
1758
1759 if (refcount_dec_and_test(&f->sk_ref))
1760 list_del(&f->list);
1761 else
1762 f = NULL;
1763 }
1764 mutex_unlock(&fanout_mutex);
1765
1766 return f;
1767}
1768
1769static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1770 struct sk_buff *skb)
1771{
1772
1773
1774
1775
1776 if (unlikely(dev->type != ARPHRD_ETHER))
1777 return false;
1778
1779 skb_reset_mac_header(skb);
1780 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1781}
1782
1783static const struct proto_ops packet_ops;
1784
1785static const struct proto_ops packet_ops_spkt;
1786
1787static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1788 struct packet_type *pt, struct net_device *orig_dev)
1789{
1790 struct sock *sk;
1791 struct sockaddr_pkt *spkt;
1792
1793
1794
1795
1796
1797
1798 sk = pt->af_packet_priv;
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811 if (skb->pkt_type == PACKET_LOOPBACK)
1812 goto out;
1813
1814 if (!net_eq(dev_net(dev), sock_net(sk)))
1815 goto out;
1816
1817 skb = skb_share_check(skb, GFP_ATOMIC);
1818 if (skb == NULL)
1819 goto oom;
1820
1821
1822 skb_dst_drop(skb);
1823
1824
1825 nf_reset(skb);
1826
1827 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1828
1829 skb_push(skb, skb->data - skb_mac_header(skb));
1830
1831
1832
1833
1834
1835 spkt->spkt_family = dev->type;
1836 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1837 spkt->spkt_protocol = skb->protocol;
1838
1839
1840
1841
1842
1843
1844 if (sock_queue_rcv_skb(sk, skb) == 0)
1845 return 0;
1846
1847out:
1848 kfree_skb(skb);
1849oom:
1850 return 0;
1851}
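
/* SOCK_PACKET transmit path: the caller supplies a complete link-layer
 * frame and names the output device in the sockaddr_pkt address; no
 * protocol processing is done on the way out.
 */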
1859static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1860 size_t len)
1861{
1862 struct sock *sk = sock->sk;
1863 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1864 struct sk_buff *skb = NULL;
1865 struct net_device *dev;
1866 struct sockcm_cookie sockc;
1867 __be16 proto = 0;
1868 int err;
1869 int extra_len = 0;
1870
1871
1872
1873
1874
1875 if (saddr) {
1876 if (msg->msg_namelen < sizeof(struct sockaddr))
1877 return -EINVAL;
1878 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1879 proto = saddr->spkt_protocol;
1880 } else
1881 return -ENOTCONN;
1882
1883
1884
1885
1886
1887 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1888retry:
1889 rcu_read_lock();
1890 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1891 err = -ENODEV;
1892 if (dev == NULL)
1893 goto out_unlock;
1894
1895 err = -ENETDOWN;
1896 if (!(dev->flags & IFF_UP))
1897 goto out_unlock;
1898
1899
1900
1901
1902
1903
1904 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1905 if (!netif_supports_nofcs(dev)) {
1906 err = -EPROTONOSUPPORT;
1907 goto out_unlock;
1908 }
1909 extra_len = 4;
1910 }
1911
1912 err = -EMSGSIZE;
1913 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1914 goto out_unlock;
1915
1916 if (!skb) {
1917 size_t reserved = LL_RESERVED_SPACE(dev);
1918 int tlen = dev->needed_tailroom;
1919 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1920
1921 rcu_read_unlock();
1922 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1923 if (skb == NULL)
1924 return -ENOBUFS;
1925
1926
1927
1928
1929 skb_reserve(skb, reserved);
1930 skb_reset_network_header(skb);
1931
1932
1933 if (hhlen) {
1934 skb->data -= hhlen;
1935 skb->tail -= hhlen;
1936 if (len < hhlen)
1937 skb_reset_network_header(skb);
1938 }
1939 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1940 if (err)
1941 goto out_free;
1942 goto retry;
1943 }
1944
1945 if (!dev_validate_header(dev, skb->data, len)) {
1946 err = -EINVAL;
1947 goto out_unlock;
1948 }
1949 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1950 !packet_extra_vlan_len_allowed(dev, skb)) {
1951 err = -EMSGSIZE;
1952 goto out_unlock;
1953 }
1954
1955 sockcm_init(&sockc, sk);
1956 if (msg->msg_controllen) {
1957 err = sock_cmsg_send(sk, msg, &sockc);
1958 if (unlikely(err))
1959 goto out_unlock;
1960 }
1961
1962 skb->protocol = proto;
1963 skb->dev = dev;
1964 skb->priority = sk->sk_priority;
1965 skb->mark = sk->sk_mark;
1966 skb->tstamp = sockc.transmit_time;
1967
1968 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1969
1970 if (unlikely(extra_len == 4))
1971 skb->no_fcs = 1;
1972
1973 skb_probe_transport_header(skb, 0);
1974
1975 dev_queue_xmit(skb);
1976 rcu_read_unlock();
1977 return len;
1978
1979out_unlock:
1980 rcu_read_unlock();
1981out_free:
1982 kfree_skb(skb);
1983 return err;
1984}
1985
1986static unsigned int run_filter(struct sk_buff *skb,
1987 const struct sock *sk,
1988 unsigned int res)
1989{
1990 struct sk_filter *filter;
1991
1992 rcu_read_lock();
1993 filter = rcu_dereference(sk->sk_filter);
1994 if (filter != NULL)
1995 res = bpf_prog_run_clear_cb(filter->prog, skb);
1996 rcu_read_unlock();
1997
1998 return res;
1999}
2000
2001static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2002 size_t *len)
2003{
2004 struct virtio_net_hdr vnet_hdr;
2005
2006 if (*len < sizeof(vnet_hdr))
2007 return -EINVAL;
2008 *len -= sizeof(vnet_hdr);
2009
2010 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2011 return -EINVAL;
2012
2013 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2014}
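
/* Receive hook for non-mmap()ed AF_PACKET sockets (SOCK_RAW/SOCK_DGRAM).
 * The skb may be shared, so skb->data and skb->len are restored before
 * returning when the filter rejects it; a clone is only made when the
 * packet is actually queued and the skb is shared.
 */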
2028static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2029 struct packet_type *pt, struct net_device *orig_dev)
2030{
2031 struct sock *sk;
2032 struct sockaddr_ll *sll;
2033 struct packet_sock *po;
2034 u8 *skb_head = skb->data;
2035 int skb_len = skb->len;
2036 unsigned int snaplen, res;
2037 bool is_drop_n_account = false;
2038
2039 if (skb->pkt_type == PACKET_LOOPBACK)
2040 goto drop;
2041
2042 sk = pt->af_packet_priv;
2043 po = pkt_sk(sk);
2044
2045 if (!net_eq(dev_net(dev), sock_net(sk)))
2046 goto drop;
2047
2048 skb->dev = dev;
2049
2050 if (dev->header_ops) {
2051
2052
2053
2054
2055
2056
2057
2058 if (sk->sk_type != SOCK_DGRAM)
2059 skb_push(skb, skb->data - skb_mac_header(skb));
2060 else if (skb->pkt_type == PACKET_OUTGOING) {
2061
2062 skb_pull(skb, skb_network_offset(skb));
2063 }
2064 }
2065
2066 snaplen = skb->len;
2067
2068 res = run_filter(skb, sk, snaplen);
2069 if (!res)
2070 goto drop_n_restore;
2071 if (snaplen > res)
2072 snaplen = res;
2073
2074 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2075 goto drop_n_acct;
2076
2077 if (skb_shared(skb)) {
2078 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2079 if (nskb == NULL)
2080 goto drop_n_acct;
2081
2082 if (skb_head != skb->data) {
2083 skb->data = skb_head;
2084 skb->len = skb_len;
2085 }
2086 consume_skb(skb);
2087 skb = nskb;
2088 }
2089
2090 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2091
2092 sll = &PACKET_SKB_CB(skb)->sa.ll;
2093 sll->sll_hatype = dev->type;
2094 sll->sll_pkttype = skb->pkt_type;
2095 if (unlikely(po->origdev))
2096 sll->sll_ifindex = orig_dev->ifindex;
2097 else
2098 sll->sll_ifindex = dev->ifindex;
2099
2100 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2101
2102
2103
2104
2105 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2106
2107 if (pskb_trim(skb, snaplen))
2108 goto drop_n_acct;
2109
2110 skb_set_owner_r(skb, sk);
2111 skb->dev = NULL;
2112 skb_dst_drop(skb);
2113
2114
2115 nf_reset(skb);
2116
2117 spin_lock(&sk->sk_receive_queue.lock);
2118 po->stats.stats1.tp_packets++;
2119 sock_skb_set_dropcount(sk, skb);
2120 __skb_queue_tail(&sk->sk_receive_queue, skb);
2121 spin_unlock(&sk->sk_receive_queue.lock);
2122 sk->sk_data_ready(sk);
2123 return 0;
2124
2125drop_n_acct:
2126 is_drop_n_account = true;
2127 spin_lock(&sk->sk_receive_queue.lock);
2128 po->stats.stats1.tp_drops++;
2129 atomic_inc(&sk->sk_drops);
2130 spin_unlock(&sk->sk_receive_queue.lock);
2131
2132drop_n_restore:
2133 if (skb_head != skb->data && skb_shared(skb)) {
2134 skb->data = skb_head;
2135 skb->len = skb_len;
2136 }
2137drop:
2138 if (!is_drop_n_account)
2139 consume_skb(skb);
2140 else
2141 kfree_skb(skb);
2142 return 0;
2143}
2144
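/* Receive hook for mmap()ed (PACKET_RX_RING) sockets: the packet is copied
 * straight into the next free frame/block of the ring and user space is
 * woken via sk_data_ready().  For TPACKET_V1/V2, if the packet does not fit
 * in a frame and po->copy_thresh allows it, a copy is also queued on
 * sk_receive_queue.
 */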
2145static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2146 struct packet_type *pt, struct net_device *orig_dev)
2147{
2148 struct sock *sk;
2149 struct packet_sock *po;
2150 struct sockaddr_ll *sll;
2151 union tpacket_uhdr h;
2152 u8 *skb_head = skb->data;
2153 int skb_len = skb->len;
2154 unsigned int snaplen, res;
2155 unsigned long status = TP_STATUS_USER;
2156 unsigned short macoff, netoff, hdrlen;
2157 struct sk_buff *copy_skb = NULL;
2158 struct timespec ts;
2159 __u32 ts_status;
2160 bool is_drop_n_account = false;
2161 bool do_vnet = false;
2162
2163
2164
2165
2166
2167 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2169
2170 if (skb->pkt_type == PACKET_LOOPBACK)
2171 goto drop;
2172
2173 sk = pt->af_packet_priv;
2174 po = pkt_sk(sk);
2175
2176 if (!net_eq(dev_net(dev), sock_net(sk)))
2177 goto drop;
2178
2179 if (dev->header_ops) {
2180 if (sk->sk_type != SOCK_DGRAM)
2181 skb_push(skb, skb->data - skb_mac_header(skb));
2182 else if (skb->pkt_type == PACKET_OUTGOING) {
2183
2184 skb_pull(skb, skb_network_offset(skb));
2185 }
2186 }
2187
2188 snaplen = skb->len;
2189
2190 res = run_filter(skb, sk, snaplen);
2191 if (!res)
2192 goto drop_n_restore;
2193
2194 if (skb->ip_summed == CHECKSUM_PARTIAL)
2195 status |= TP_STATUS_CSUMNOTREADY;
2196 else if (skb->pkt_type != PACKET_OUTGOING &&
2197 (skb->ip_summed == CHECKSUM_COMPLETE ||
2198 skb_csum_unnecessary(skb)))
2199 status |= TP_STATUS_CSUM_VALID;
2200
2201 if (snaplen > res)
2202 snaplen = res;
2203
2204 if (sk->sk_type == SOCK_DGRAM) {
2205 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2206 po->tp_reserve;
2207 } else {
2208 unsigned int maclen = skb_network_offset(skb);
2209 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2210 (maclen < 16 ? 16 : maclen)) +
2211 po->tp_reserve;
2212 if (po->has_vnet_hdr) {
2213 netoff += sizeof(struct virtio_net_hdr);
2214 do_vnet = true;
2215 }
2216 macoff = netoff - maclen;
2217 }
2218 if (po->tp_version <= TPACKET_V2) {
2219 if (macoff + snaplen > po->rx_ring.frame_size) {
2220 if (po->copy_thresh &&
2221 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2222 if (skb_shared(skb)) {
2223 copy_skb = skb_clone(skb, GFP_ATOMIC);
2224 } else {
2225 copy_skb = skb_get(skb);
2226 skb_head = skb->data;
2227 }
2228 if (copy_skb)
2229 skb_set_owner_r(copy_skb, sk);
2230 }
2231 snaplen = po->rx_ring.frame_size - macoff;
2232 if ((int)snaplen < 0) {
2233 snaplen = 0;
2234 do_vnet = false;
2235 }
2236 }
2237 } else if (unlikely(macoff + snaplen >
2238 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2239 u32 nval;
2240
2241 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2242 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2243 snaplen, nval, macoff);
2244 snaplen = nval;
2245 if (unlikely((int)snaplen < 0)) {
2246 snaplen = 0;
2247 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2248 do_vnet = false;
2249 }
2250 }
2251 spin_lock(&sk->sk_receive_queue.lock);
2252 h.raw = packet_current_rx_frame(po, skb,
2253 TP_STATUS_KERNEL, (macoff+snaplen));
2254 if (!h.raw)
2255 goto drop_n_account;
2256 if (po->tp_version <= TPACKET_V2) {
2257 packet_increment_rx_head(po, &po->rx_ring);
2258
2259
2260
2261
2262
2263
2264 if (po->stats.stats1.tp_drops)
2265 status |= TP_STATUS_LOSING;
2266 }
2267
2268 if (do_vnet &&
2269 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2270 sizeof(struct virtio_net_hdr),
2271 vio_le(), true, 0))
2272 goto drop_n_account;
2273
2274 po->stats.stats1.tp_packets++;
2275 if (copy_skb) {
2276 status |= TP_STATUS_COPY;
2277 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2278 }
2279 spin_unlock(&sk->sk_receive_queue.lock);
2280
2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2284 getnstimeofday(&ts);
2285
2286 status |= ts_status;
2287
2288 switch (po->tp_version) {
2289 case TPACKET_V1:
2290 h.h1->tp_len = skb->len;
2291 h.h1->tp_snaplen = snaplen;
2292 h.h1->tp_mac = macoff;
2293 h.h1->tp_net = netoff;
2294 h.h1->tp_sec = ts.tv_sec;
2295 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2296 hdrlen = sizeof(*h.h1);
2297 break;
2298 case TPACKET_V2:
2299 h.h2->tp_len = skb->len;
2300 h.h2->tp_snaplen = snaplen;
2301 h.h2->tp_mac = macoff;
2302 h.h2->tp_net = netoff;
2303 h.h2->tp_sec = ts.tv_sec;
2304 h.h2->tp_nsec = ts.tv_nsec;
2305 if (skb_vlan_tag_present(skb)) {
2306 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2307 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2308 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2309 } else {
2310 h.h2->tp_vlan_tci = 0;
2311 h.h2->tp_vlan_tpid = 0;
2312 }
2313 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2314 hdrlen = sizeof(*h.h2);
2315 break;
2316 case TPACKET_V3:
2317
2318
2319
2320 h.h3->tp_status |= status;
2321 h.h3->tp_len = skb->len;
2322 h.h3->tp_snaplen = snaplen;
2323 h.h3->tp_mac = macoff;
2324 h.h3->tp_net = netoff;
2325 h.h3->tp_sec = ts.tv_sec;
2326 h.h3->tp_nsec = ts.tv_nsec;
2327 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2328 hdrlen = sizeof(*h.h3);
2329 break;
2330 default:
2331 BUG();
2332 }
2333
2334 sll = h.raw + TPACKET_ALIGN(hdrlen);
2335 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2336 sll->sll_family = AF_PACKET;
2337 sll->sll_hatype = dev->type;
2338 sll->sll_protocol = skb->protocol;
2339 sll->sll_pkttype = skb->pkt_type;
2340 if (unlikely(po->origdev))
2341 sll->sll_ifindex = orig_dev->ifindex;
2342 else
2343 sll->sll_ifindex = dev->ifindex;
2344
2345 smp_mb();
2346
2347#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2348 if (po->tp_version <= TPACKET_V2) {
2349 u8 *start, *end;
2350
2351 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2352 macoff + snaplen);
2353
2354 for (start = h.raw; start < end; start += PAGE_SIZE)
2355 flush_dcache_page(pgv_to_page(start));
2356 }
2357 smp_wmb();
2358#endif
2359
2360 if (po->tp_version <= TPACKET_V2) {
2361 __packet_set_status(po, h.raw, status);
2362 sk->sk_data_ready(sk);
2363 } else {
2364 prb_clear_blk_fill_status(&po->rx_ring);
2365 }
2366
2367drop_n_restore:
2368 if (skb_head != skb->data && skb_shared(skb)) {
2369 skb->data = skb_head;
2370 skb->len = skb_len;
2371 }
2372drop:
2373 if (!is_drop_n_account)
2374 consume_skb(skb);
2375 else
2376 kfree_skb(skb);
2377 return 0;
2378
2379drop_n_account:
2380 is_drop_n_account = true;
2381 po->stats.stats1.tp_drops++;
2382 spin_unlock(&sk->sk_receive_queue.lock);
2383
2384 sk->sk_data_ready(sk);
2385 kfree_skb(copy_skb);
2386 goto drop_n_restore;
2387}
2388
2389static void tpacket_destruct_skb(struct sk_buff *skb)
2390{
2391 struct packet_sock *po = pkt_sk(skb->sk);
2392
2393 if (likely(po->tx_ring.pg_vec)) {
2394 void *ph;
2395 __u32 ts;
2396
2397 ph = skb_shinfo(skb)->destructor_arg;
2398 packet_dec_pending(&po->tx_ring);
2399
2400 ts = __packet_set_timestamp(po, ph, skb);
2401 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2402 }
2403
2404 sock_wfree(skb);
2405}
2406
2407static void tpacket_set_protocol(const struct net_device *dev,
2408 struct sk_buff *skb)
2409{
2410 if (dev->type == ARPHRD_ETHER) {
2411 skb_reset_mac_header(skb);
2412 skb->protocol = eth_hdr(skb)->h_proto;
2413 }
2414}
2415
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2446 void *frame, struct net_device *dev, void *data, int tp_len,
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
2449{
2450 union tpacket_uhdr ph;
2451 int to_write, offset, len, nr_frags, len_max;
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2461 skb->mark = po->sk.sk_mark;
2462 skb->tstamp = sockc->transmit_time;
2463 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2464 skb_shinfo(skb)->destructor_arg = ph.raw;
2465
2466 skb_reserve(skb, hlen);
2467 skb_reset_network_header(skb);
2468
2469 to_write = tp_len;
2470
2471 if (sock->type == SOCK_DGRAM) {
2472 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 NULL, tp_len);
2474 if (unlikely(err < 0))
2475 return -EINVAL;
2476 } else if (copylen) {
2477 int hdrlen = min_t(int, copylen, tp_len);
2478
2479 skb_push(skb, dev->hard_header_len);
2480 skb_put(skb, copylen - dev->hard_header_len);
2481 err = skb_store_bits(skb, 0, data, hdrlen);
2482 if (unlikely(err))
2483 return err;
2484 if (!dev_validate_header(dev, skb->data, hdrlen))
2485 return -EINVAL;
2486 if (!skb->protocol)
2487 tpacket_set_protocol(dev, skb);
2488
2489 data += hdrlen;
2490 to_write -= hdrlen;
2491 }
2492
2493 offset = offset_in_page(data);
2494 len_max = PAGE_SIZE - offset;
2495 len = ((to_write > len_max) ? len_max : to_write);
2496
2497 skb->data_len = to_write;
2498 skb->len += to_write;
2499 skb->truesize += to_write;
2500 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2501
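/* Attach the rest of the ring frame as page fragments (no copy) */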
2502 while (likely(to_write)) {
2503 nr_frags = skb_shinfo(skb)->nr_frags;
2504
2505 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
pr_err("Packet exceeds the number of skb frags (%lu)\n",
2507 MAX_SKB_FRAGS);
2508 return -EFAULT;
2509 }
2510
2511 page = pgv_to_page(data);
2512 data += len;
2513 flush_dcache_page(page);
2514 get_page(page);
2515 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2516 to_write -= len;
2517 offset = 0;
2518 len_max = PAGE_SIZE;
2519 len = ((to_write > len_max) ? len_max : to_write);
2520 }
2521
2522 skb_probe_transport_header(skb, 0);
2523
2524 return tp_len;
2525}
2526
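/* Validate one TX-ring frame header: extract tp_len, enforce size_max and
 * the optional user-supplied data offset (PACKET_TX_HAS_OFF), and return a
 * pointer to the packet data inside the frame via *data.
 */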
2527static int tpacket_parse_header(struct packet_sock *po, void *frame,
2528 int size_max, void **data)
2529{
2530 union tpacket_uhdr ph;
2531 int tp_len, off;
2532
2533 ph.raw = frame;
2534
2535 switch (po->tp_version) {
2536 case TPACKET_V3:
2537 if (ph.h3->tp_next_offset != 0) {
2538 pr_warn_once("variable sized slot not supported");
2539 return -EINVAL;
2540 }
2541 tp_len = ph.h3->tp_len;
2542 break;
2543 case TPACKET_V2:
2544 tp_len = ph.h2->tp_len;
2545 break;
2546 default:
2547 tp_len = ph.h1->tp_len;
2548 break;
2549 }
2550 if (unlikely(tp_len > size_max)) {
pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
2552 return -EMSGSIZE;
2553 }
2554
2555 if (unlikely(po->tp_tx_has_off)) {
2556 int off_min, off_max;
2557
2558 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2559 off_max = po->tx_ring.frame_size - tp_len;
2560 if (po->sk.sk_type == SOCK_DGRAM) {
2561 switch (po->tp_version) {
2562 case TPACKET_V3:
2563 off = ph.h3->tp_net;
2564 break;
2565 case TPACKET_V2:
2566 off = ph.h2->tp_net;
2567 break;
2568 default:
2569 off = ph.h1->tp_net;
2570 break;
2571 }
2572 } else {
2573 switch (po->tp_version) {
2574 case TPACKET_V3:
2575 off = ph.h3->tp_mac;
2576 break;
2577 case TPACKET_V2:
2578 off = ph.h2->tp_mac;
2579 break;
2580 default:
2581 off = ph.h1->tp_mac;
2582 break;
2583 }
2584 }
2585 if (unlikely((off < off_min) || (off_max < off)))
2586 return -EINVAL;
2587 } else {
2588 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 }
2590
2591 *data = frame + off;
2592 return tp_len;
2593}
2594
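/*
 * Transmit path for sockets with a PACKET_TX_RING mapped.  As an
 * illustrative userspace sketch (assuming TPACKET_V2, tp_tx_has_off unset,
 * and placeholder names fd/frame/pkt/pkt_len), a frame is handed to this
 * loop roughly like:
 *
 *	struct tpacket2_hdr *hdr = frame;
 *	void *data = (char *)frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *	memcpy(data, pkt, pkt_len);
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);
 *
 * tpacket_snd() walks the ring, transmits every frame marked
 * TP_STATUS_SEND_REQUEST and hands the slot back as TP_STATUS_AVAILABLE
 * (or TP_STATUS_WRONG_FORMAT on a bad frame) once the skb is released.
 */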
2595static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2596{
2597 struct sk_buff *skb;
2598 struct net_device *dev;
2599 struct virtio_net_hdr *vnet_hdr = NULL;
2600 struct sockcm_cookie sockc;
2601 __be16 proto;
2602 int err, reserve = 0;
2603 void *ph;
2604 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2605 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2606 int tp_len, size_max;
2607 unsigned char *addr;
2608 void *data;
2609 int len_sum = 0;
2610 int status = TP_STATUS_AVAILABLE;
2611 int hlen, tlen, copylen = 0;
2612
2613 mutex_lock(&po->pg_vec_lock);
2614
2615 if (likely(saddr == NULL)) {
2616 dev = packet_cached_dev_get(po);
2617 proto = po->num;
2618 addr = NULL;
2619 } else {
2620 err = -EINVAL;
2621 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2622 goto out;
2623 if (msg->msg_namelen < (saddr->sll_halen
2624 + offsetof(struct sockaddr_ll,
2625 sll_addr)))
2626 goto out;
2627 proto = saddr->sll_protocol;
2628 addr = saddr->sll_addr;
2629 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2630 }
2631
2632 err = -ENXIO;
2633 if (unlikely(dev == NULL))
2634 goto out;
2635 err = -ENETDOWN;
2636 if (unlikely(!(dev->flags & IFF_UP)))
2637 goto out_put;
2638
2639 sockcm_init(&sockc, &po->sk);
2640 if (msg->msg_controllen) {
2641 err = sock_cmsg_send(&po->sk, msg, &sockc);
2642 if (unlikely(err))
2643 goto out_put;
2644 }
2645
2646 if (po->sk.sk_socket->type == SOCK_RAW)
2647 reserve = dev->hard_header_len;
2648 size_max = po->tx_ring.frame_size
2649 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2650
2651 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2652 size_max = dev->mtu + reserve + VLAN_HLEN;
2653
2654 do {
2655 ph = packet_current_frame(po, &po->tx_ring,
2656 TP_STATUS_SEND_REQUEST);
2657 if (unlikely(ph == NULL)) {
2658 if (need_wait && need_resched())
2659 schedule();
2660 continue;
2661 }
2662
2663 skb = NULL;
2664 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2665 if (tp_len < 0)
2666 goto tpacket_error;
2667
2668 status = TP_STATUS_SEND_REQUEST;
2669 hlen = LL_RESERVED_SPACE(dev);
2670 tlen = dev->needed_tailroom;
2671 if (po->has_vnet_hdr) {
2672 vnet_hdr = data;
2673 data += sizeof(*vnet_hdr);
2674 tp_len -= sizeof(*vnet_hdr);
2675 if (tp_len < 0 ||
2676 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2677 tp_len = -EINVAL;
2678 goto tpacket_error;
2679 }
2680 copylen = __virtio16_to_cpu(vio_le(),
2681 vnet_hdr->hdr_len);
2682 }
2683 copylen = max_t(int, copylen, dev->hard_header_len);
2684 skb = sock_alloc_send_skb(&po->sk,
2685 hlen + tlen + sizeof(struct sockaddr_ll) +
2686 (copylen - dev->hard_header_len),
2687 !need_wait, &err);
2688
2689 if (unlikely(skb == NULL)) {
2690
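/* Could not allocate an skb: report whatever has already been
 * queued on this call, if anything.
 */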
2691 if (likely(len_sum > 0))
2692 err = len_sum;
2693 goto out_status;
2694 }
2695 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2696 addr, hlen, copylen, &sockc);
2697 if (likely(tp_len >= 0) &&
2698 tp_len > dev->mtu + reserve &&
2699 !po->has_vnet_hdr &&
2700 !packet_extra_vlan_len_allowed(dev, skb))
2701 tp_len = -EMSGSIZE;
2702
2703 if (unlikely(tp_len < 0)) {
2704tpacket_error:
2705 if (po->tp_loss) {
2706 __packet_set_status(po, ph,
2707 TP_STATUS_AVAILABLE);
2708 packet_increment_head(&po->tx_ring);
2709 kfree_skb(skb);
2710 continue;
2711 } else {
2712 status = TP_STATUS_WRONG_FORMAT;
2713 err = tp_len;
2714 goto out_status;
2715 }
2716 }
2717
2718 if (po->has_vnet_hdr) {
2719 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2720 tp_len = -EINVAL;
2721 goto tpacket_error;
2722 }
2723 virtio_net_hdr_set_proto(skb, vnet_hdr);
2724 }
2725
2726 skb->destructor = tpacket_destruct_skb;
2727 __packet_set_status(po, ph, TP_STATUS_SENDING);
2728 packet_inc_pending(&po->tx_ring);
2729
2730 status = TP_STATUS_SEND_REQUEST;
2731 err = po->xmit(skb);
2732 if (unlikely(err > 0)) {
2733 err = net_xmit_errno(err);
2734 if (err && __packet_get_status(po, ph) ==
2735 TP_STATUS_AVAILABLE) {
2736
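/* the skb was already destructed by the transmit path */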
2737 skb = NULL;
2738 goto out_status;
2739 }
2740
2741
2742
2743
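/* skb was dropped but not destructed yet;
 * treat it like congestion and carry on
 */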
2744 err = 0;
2745 }
2746 packet_increment_head(&po->tx_ring);
2747 len_sum += tp_len;
2748 } while (likely((ph != NULL) ||
/* Note: packet_read_pending() might be slow if we
 * have to call it, as it is a per-cpu counter, but in
 * the fast path the first condition already
 * short-circuits the loop so we rarely get here.
 */
2755 (need_wait && packet_read_pending(&po->tx_ring))));
2756
2757 err = len_sum;
2758 goto out_put;
2759
2760out_status:
2761 __packet_set_status(po, ph, status);
2762 kfree_skb(skb);
2763out_put:
2764 dev_put(dev);
2765out:
2766 mutex_unlock(&po->pg_vec_lock);
2767 return err;
2768}
2769
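/* Allocate an skb for packet_snd(): 'linear' bytes end up in the linear
 * area (at least the link-layer header), the rest of 'len' stays as
 * paged data to be filled from the iterator.
 */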
2770static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2771 size_t reserve, size_t len,
2772 size_t linear, int noblock,
2773 int *err)
2774{
2775 struct sk_buff *skb;
2776
/* Under a page? Don't bother with a paged skb. */
2778 if (prepad + len < PAGE_SIZE || !linear)
2779 linear = len;
2780
2781 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2782 err, 0);
2783 if (!skb)
2784 return NULL;
2785
2786 skb_reserve(skb, reserve);
2787 skb_put(skb, linear);
2788 skb->data_len = len - linear;
2789 skb->len += len - linear;
2790
2791 return skb;
2792}
2793
2794static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2795{
2796 struct sock *sk = sock->sk;
2797 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2798 struct sk_buff *skb;
2799 struct net_device *dev;
2800 __be16 proto;
2801 unsigned char *addr;
2802 int err, reserve = 0;
2803 struct sockcm_cookie sockc;
2804 struct virtio_net_hdr vnet_hdr = { 0 };
2805 int offset = 0;
2806 struct packet_sock *po = pkt_sk(sk);
2807 bool has_vnet_hdr = false;
2808 int hlen, tlen, linear;
2809 int extra_len = 0;
2810
/*
 * Get and verify the address.
 */
2815 if (likely(saddr == NULL)) {
2816 dev = packet_cached_dev_get(po);
2817 proto = po->num;
2818 addr = NULL;
2819 } else {
2820 err = -EINVAL;
2821 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2822 goto out;
2823 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2824 goto out;
2825 proto = saddr->sll_protocol;
2826 addr = saddr->sll_addr;
2827 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2828 }
2829
2830 err = -ENXIO;
2831 if (unlikely(dev == NULL))
2832 goto out_unlock;
2833 err = -ENETDOWN;
2834 if (unlikely(!(dev->flags & IFF_UP)))
2835 goto out_unlock;
2836
2837 sockcm_init(&sockc, sk);
2838 sockc.mark = sk->sk_mark;
2839 if (msg->msg_controllen) {
2840 err = sock_cmsg_send(sk, msg, &sockc);
2841 if (unlikely(err))
2842 goto out_unlock;
2843 }
2844
2845 if (sock->type == SOCK_RAW)
2846 reserve = dev->hard_header_len;
2847 if (po->has_vnet_hdr) {
2848 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2849 if (err)
2850 goto out_unlock;
2851 has_vnet_hdr = true;
2852 }
2853
2854 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2855 if (!netif_supports_nofcs(dev)) {
2856 err = -EPROTONOSUPPORT;
2857 goto out_unlock;
2858 }
2859 extra_len = 4;
2860 }
2861
2862 err = -EMSGSIZE;
2863 if (!vnet_hdr.gso_type &&
2864 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2865 goto out_unlock;
2866
2867 err = -ENOBUFS;
2868 hlen = LL_RESERVED_SPACE(dev);
2869 tlen = dev->needed_tailroom;
2870 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2871 linear = max(linear, min_t(int, len, dev->hard_header_len));
2872 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2873 msg->msg_flags & MSG_DONTWAIT, &err);
2874 if (skb == NULL)
2875 goto out_unlock;
2876
2877 skb_reset_network_header(skb);
2878
2879 err = -EINVAL;
2880 if (sock->type == SOCK_DGRAM) {
2881 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2882 if (unlikely(offset < 0))
2883 goto out_free;
2884 } else if (reserve) {
2885 skb_reserve(skb, -reserve);
2886 if (len < reserve)
2887 skb_reset_network_header(skb);
2888 }
2889
2890
2891 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2892 if (err)
2893 goto out_free;
2894
2895 if (sock->type == SOCK_RAW &&
2896 !dev_validate_header(dev, skb->data, len)) {
2897 err = -EINVAL;
2898 goto out_free;
2899 }
2900
2901 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2902
2903 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2904 !packet_extra_vlan_len_allowed(dev, skb)) {
2905 err = -EMSGSIZE;
2906 goto out_free;
2907 }
2908
2909 skb->protocol = proto;
2910 skb->dev = dev;
2911 skb->priority = sk->sk_priority;
2912 skb->mark = sockc.mark;
2913 skb->tstamp = sockc.transmit_time;
2914
2915 if (has_vnet_hdr) {
2916 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
2917 if (err)
2918 goto out_free;
2919 len += sizeof(vnet_hdr);
2920 virtio_net_hdr_set_proto(skb, &vnet_hdr);
2921 }
2922
2923 skb_probe_transport_header(skb, reserve);
2924
2925 if (unlikely(extra_len == 4))
2926 skb->no_fcs = 1;
2927
2928 err = po->xmit(skb);
2929 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2930 goto out_unlock;
2931
2932 dev_put(dev);
2933
2934 return len;
2935
2936out_free:
2937 kfree_skb(skb);
2938out_unlock:
2939 if (dev)
2940 dev_put(dev);
2941out:
2942 return err;
2943}
2944
2945static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2946{
2947 struct sock *sk = sock->sk;
2948 struct packet_sock *po = pkt_sk(sk);
2949
2950 if (po->tx_ring.pg_vec)
2951 return tpacket_snd(po, msg);
2952 else
2953 return packet_snd(sock, msg, len);
2954}
2955
/*
 * Close a PACKET socket. This is fairly simple. We immediately go
 * to 'closed' state and remove our protocol entry in the device list.
 */
2961static int packet_release(struct socket *sock)
2962{
2963 struct sock *sk = sock->sk;
2964 struct packet_sock *po;
2965 struct packet_fanout *f;
2966 struct net *net;
2967 union tpacket_req_u req_u;
2968
2969 if (!sk)
2970 return 0;
2971
2972 net = sock_net(sk);
2973 po = pkt_sk(sk);
2974
2975 mutex_lock(&net->packet.sklist_lock);
2976 sk_del_node_init_rcu(sk);
2977 mutex_unlock(&net->packet.sklist_lock);
2978
2979 preempt_disable();
2980 sock_prot_inuse_add(net, sk->sk_prot, -1);
2981 preempt_enable();
2982
2983 spin_lock(&po->bind_lock);
2984 unregister_prot_hook(sk, false);
2985 packet_cached_dev_reset(po);
2986
2987 if (po->prot_hook.dev) {
2988 dev_put(po->prot_hook.dev);
2989 po->prot_hook.dev = NULL;
2990 }
2991 spin_unlock(&po->bind_lock);
2992
2993 packet_flush_mclist(sk);
2994
2995 lock_sock(sk);
2996 if (po->rx_ring.pg_vec) {
2997 memset(&req_u, 0, sizeof(req_u));
2998 packet_set_ring(sk, &req_u, 1, 0);
2999 }
3000
3001 if (po->tx_ring.pg_vec) {
3002 memset(&req_u, 0, sizeof(req_u));
3003 packet_set_ring(sk, &req_u, 1, 1);
3004 }
3005 release_sock(sk);
3006
3007 f = fanout_release(sk);
3008
3009 synchronize_net();
3010
3011 if (f) {
3012 kfree(po->rollover);
3013 fanout_release_data(f);
3014 kfree(f);
3015 }
3016
3017
3018
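/*
 * Now the socket is dead. No more input will appear.
 */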
3019 sock_orphan(sk);
3020 sock->sk = NULL;
3021
3022
3023
3024 skb_queue_purge(&sk->sk_receive_queue);
3025 packet_free_pending(po);
3026 sk_refcnt_debug_release(sk);
3027
3028 sock_put(sk);
3029 return 0;
3030}
3031
/*
 * Attach a packet hook.
 */
3036static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3037 __be16 proto)
3038{
3039 struct packet_sock *po = pkt_sk(sk);
3040 struct net_device *dev_curr;
3041 __be16 proto_curr;
3042 bool need_rehook;
3043 struct net_device *dev = NULL;
3044 int ret = 0;
3045 bool unlisted = false;
3046
3047 lock_sock(sk);
3048 spin_lock(&po->bind_lock);
3049 rcu_read_lock();
3050
3051 if (po->fanout) {
3052 ret = -EINVAL;
3053 goto out_unlock;
3054 }
3055
3056 if (name) {
3057 dev = dev_get_by_name_rcu(sock_net(sk), name);
3058 if (!dev) {
3059 ret = -ENODEV;
3060 goto out_unlock;
3061 }
3062 } else if (ifindex) {
3063 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3064 if (!dev) {
3065 ret = -ENODEV;
3066 goto out_unlock;
3067 }
3068 }
3069
3070 if (dev)
3071 dev_hold(dev);
3072
3073 proto_curr = po->prot_hook.type;
3074 dev_curr = po->prot_hook.dev;
3075
3076 need_rehook = proto_curr != proto || dev_curr != dev;
3077
3078 if (need_rehook) {
3079 if (po->running) {
3080 rcu_read_unlock();
/* prevents packet_notifier() from calling
 * register_prot_hook() while we are rebinding
 */
3084 po->num = 0;
3085 __unregister_prot_hook(sk, true);
3086 rcu_read_lock();
3087 dev_curr = po->prot_hook.dev;
3088 if (dev)
3089 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3090 dev->ifindex);
3091 }
3092
3093 BUG_ON(po->running);
3094 po->num = proto;
3095 po->prot_hook.type = proto;
3096
3097 if (unlikely(unlisted)) {
3098 dev_put(dev);
3099 po->prot_hook.dev = NULL;
3100 po->ifindex = -1;
3101 packet_cached_dev_reset(po);
3102 } else {
3103 po->prot_hook.dev = dev;
3104 po->ifindex = dev ? dev->ifindex : 0;
3105 packet_cached_dev_assign(po, dev);
3106 }
3107 }
3108 if (dev_curr)
3109 dev_put(dev_curr);
3110
3111 if (proto == 0 || !need_rehook)
3112 goto out_unlock;
3113
3114 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3115 register_prot_hook(sk);
3116 } else {
3117 sk->sk_err = ENETDOWN;
3118 if (!sock_flag(sk, SOCK_DEAD))
3119 sk->sk_error_report(sk);
3120 }
3121
3122out_unlock:
3123 rcu_read_unlock();
3124 spin_unlock(&po->bind_lock);
3125 release_sock(sk);
3126 return ret;
3127}
3128
/*
 * Bind a packet socket to a device.
 */
3133static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3134 int addr_len)
3135{
3136 struct sock *sk = sock->sk;
3137 char name[sizeof(uaddr->sa_data) + 1];
3138
3139
3140
3141
3142
3143 if (addr_len != sizeof(struct sockaddr))
3144 return -EINVAL;
3145
/* uaddr->sa_data comes from user space and is not guaranteed to be
 * NUL-terminated.
 */
3148 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3149 name[sizeof(uaddr->sa_data)] = 0;
3150
3151 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3152}
3153
3154static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3155{
3156 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3157 struct sock *sk = sock->sk;
3158
3159
3160
3161
3162
3163 if (addr_len < sizeof(struct sockaddr_ll))
3164 return -EINVAL;
3165 if (sll->sll_family != AF_PACKET)
3166 return -EINVAL;
3167
3168 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3169 sll->sll_protocol ? : pkt_sk(sk)->num);
3170}
3171
3172static struct proto packet_proto = {
3173 .name = "PACKET",
3174 .owner = THIS_MODULE,
3175 .obj_size = sizeof(struct packet_sock),
3176};
3177
/*
 * Create a packet socket.
 */
3182static int packet_create(struct net *net, struct socket *sock, int protocol,
3183 int kern)
3184{
3185 struct sock *sk;
3186 struct packet_sock *po;
3187 __be16 proto = (__force __be16)protocol;
3188 int err;
3189
3190 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3191 return -EPERM;
3192 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3193 sock->type != SOCK_PACKET)
3194 return -ESOCKTNOSUPPORT;
3195
3196 sock->state = SS_UNCONNECTED;
3197
3198 err = -ENOBUFS;
3199 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3200 if (sk == NULL)
3201 goto out;
3202
3203 sock->ops = &packet_ops;
3204 if (sock->type == SOCK_PACKET)
3205 sock->ops = &packet_ops_spkt;
3206
3207 sock_init_data(sock, sk);
3208
3209 po = pkt_sk(sk);
3210 sk->sk_family = PF_PACKET;
3211 po->num = proto;
3212 po->xmit = dev_queue_xmit;
3213
3214 err = packet_alloc_pending(po);
3215 if (err)
3216 goto out2;
3217
3218 packet_cached_dev_reset(po);
3219
3220 sk->sk_destruct = packet_sock_destruct;
3221 sk_refcnt_debug_inc(sk);
3222
/*
 * Attach a protocol block.
 */
3227 spin_lock_init(&po->bind_lock);
3228 mutex_init(&po->pg_vec_lock);
3229 po->rollover = NULL;
3230 po->prot_hook.func = packet_rcv;
3231
3232 if (sock->type == SOCK_PACKET)
3233 po->prot_hook.func = packet_rcv_spkt;
3234
3235 po->prot_hook.af_packet_priv = sk;
3236
3237 if (proto) {
3238 po->prot_hook.type = proto;
3239 __register_prot_hook(sk);
3240 }
3241
3242 mutex_lock(&net->packet.sklist_lock);
3243 sk_add_node_rcu(sk, &net->packet.sklist);
3244 mutex_unlock(&net->packet.sklist_lock);
3245
3246 preempt_disable();
3247 sock_prot_inuse_add(net, &packet_proto, 1);
3248 preempt_enable();
3249
3250 return 0;
3251out2:
3252 sk_free(sk);
3253out:
3254 return err;
3255}
3256
/*
 * Pull a packet from our receive queue and hand it to the user.
 * If necessary we block.
 */
3262static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3263 int flags)
3264{
3265 struct sock *sk = sock->sk;
3266 struct sk_buff *skb;
3267 int copied, err;
3268 int vnet_hdr_len = 0;
3269 unsigned int origlen = 0;
3270
3271 err = -EINVAL;
3272 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3273 goto out;
3274
3275#if 0
3276
3277 if (pkt_sk(sk)->ifindex < 0)
3278 return -ENODEV;
3279#endif
3280
3281 if (flags & MSG_ERRQUEUE) {
3282 err = sock_recv_errqueue(sk, msg, len,
3283 SOL_PACKET, PACKET_TX_TIMESTAMP);
3284 goto out;
3285 }
3286
/*
 * Call the generic datagram receiver. This handles all sorts
 * of horrible races and re-entrancy so we can forget about it
 * in the protocol layers.
 *
 * It will return ENETDOWN if the device has just gone down,
 * but then it will block.
 */
3296 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3297
/*
 * An error occurred, so return it. Because skb_recv_datagram()
 * handles the blocking we don't have to worry about blocking
 * retries here.
 */
3304 if (skb == NULL)
3305 goto out;
3306
3307 if (pkt_sk(sk)->pressure)
3308 packet_rcv_has_room(pkt_sk(sk), NULL);
3309
3310 if (pkt_sk(sk)->has_vnet_hdr) {
3311 err = packet_rcv_vnet(msg, skb, &len);
3312 if (err)
3313 goto out_free;
3314 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3315 }
3316
/* You lose any data beyond the buffer you gave. If it worries
 * a user program they can ask the device for its MTU
 * anyway.
 */
3321 copied = skb->len;
3322 if (copied > len) {
3323 copied = len;
3324 msg->msg_flags |= MSG_TRUNC;
3325 }
3326
3327 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3328 if (err)
3329 goto out_free;
3330
3331 if (sock->type != SOCK_PACKET) {
3332 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3333
3334
3335 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3336 sll->sll_family = AF_PACKET;
3337 sll->sll_protocol = skb->protocol;
3338 }
3339
3340 sock_recv_ts_and_drops(msg, sk, skb);
3341
3342 if (msg->msg_name) {
/* If the address length field is there to be filled
 * in, we fill it in now.
 */
3346 if (sock->type == SOCK_PACKET) {
3347 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3348 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3349 } else {
3350 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3351
3352 msg->msg_namelen = sll->sll_halen +
3353 offsetof(struct sockaddr_ll, sll_addr);
3354 }
3355 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3356 msg->msg_namelen);
3357 }
3358
3359 if (pkt_sk(sk)->auxdata) {
3360 struct tpacket_auxdata aux;
3361
3362 aux.tp_status = TP_STATUS_USER;
3363 if (skb->ip_summed == CHECKSUM_PARTIAL)
3364 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3365 else if (skb->pkt_type != PACKET_OUTGOING &&
3366 (skb->ip_summed == CHECKSUM_COMPLETE ||
3367 skb_csum_unnecessary(skb)))
3368 aux.tp_status |= TP_STATUS_CSUM_VALID;
3369
3370 aux.tp_len = origlen;
3371 aux.tp_snaplen = skb->len;
3372 aux.tp_mac = 0;
3373 aux.tp_net = skb_network_offset(skb);
3374 if (skb_vlan_tag_present(skb)) {
3375 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3376 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3377 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3378 } else {
3379 aux.tp_vlan_tci = 0;
3380 aux.tp_vlan_tpid = 0;
3381 }
3382 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3383 }
3384
/*
 * Free or return the buffer as appropriate. Again this
 * hides all the races and re-entrancy issues from us.
 */
3389 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3390
3391out_free:
3392 skb_free_datagram(sk, skb);
3393out:
3394 return err;
3395}
3396
3397static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3398 int peer)
3399{
3400 struct net_device *dev;
3401 struct sock *sk = sock->sk;
3402
3403 if (peer)
3404 return -EOPNOTSUPP;
3405
3406 uaddr->sa_family = AF_PACKET;
3407 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3408 rcu_read_lock();
3409 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3410 if (dev)
3411 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3412 rcu_read_unlock();
3413
3414 return sizeof(*uaddr);
3415}
3416
3417static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3418 int peer)
3419{
3420 struct net_device *dev;
3421 struct sock *sk = sock->sk;
3422 struct packet_sock *po = pkt_sk(sk);
3423 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3424
3425 if (peer)
3426 return -EOPNOTSUPP;
3427
3428 sll->sll_family = AF_PACKET;
3429 sll->sll_ifindex = po->ifindex;
3430 sll->sll_protocol = po->num;
3431 sll->sll_pkttype = 0;
3432 rcu_read_lock();
3433 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3434 if (dev) {
3435 sll->sll_hatype = dev->type;
3436 sll->sll_halen = dev->addr_len;
3437 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3438 } else {
3439 sll->sll_hatype = 0;
3440 sll->sll_halen = 0;
3441 }
3442 rcu_read_unlock();
3443
3444 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3445}
3446
3447static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3448 int what)
3449{
3450 switch (i->type) {
3451 case PACKET_MR_MULTICAST:
3452 if (i->alen != dev->addr_len)
3453 return -EINVAL;
3454 if (what > 0)
3455 return dev_mc_add(dev, i->addr);
3456 else
3457 return dev_mc_del(dev, i->addr);
3458 break;
3459 case PACKET_MR_PROMISC:
3460 return dev_set_promiscuity(dev, what);
3461 case PACKET_MR_ALLMULTI:
3462 return dev_set_allmulti(dev, what);
3463 case PACKET_MR_UNICAST:
3464 if (i->alen != dev->addr_len)
3465 return -EINVAL;
3466 if (what > 0)
3467 return dev_uc_add(dev, i->addr);
3468 else
3469 return dev_uc_del(dev, i->addr);
3470 break;
3471 default:
3472 break;
3473 }
3474 return 0;
3475}
3476
3477static void packet_dev_mclist_delete(struct net_device *dev,
3478 struct packet_mclist **mlp)
3479{
3480 struct packet_mclist *ml;
3481
3482 while ((ml = *mlp) != NULL) {
3483 if (ml->ifindex == dev->ifindex) {
3484 packet_dev_mc(dev, ml, -1);
3485 *mlp = ml->next;
3486 kfree(ml);
3487 } else
3488 mlp = &ml->next;
3489 }
3490}
3491
3492static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3493{
3494 struct packet_sock *po = pkt_sk(sk);
3495 struct packet_mclist *ml, *i;
3496 struct net_device *dev;
3497 int err;
3498
3499 rtnl_lock();
3500
3501 err = -ENODEV;
3502 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3503 if (!dev)
3504 goto done;
3505
3506 err = -EINVAL;
3507 if (mreq->mr_alen > dev->addr_len)
3508 goto done;
3509
3510 err = -ENOBUFS;
3511 i = kmalloc(sizeof(*i), GFP_KERNEL);
3512 if (i == NULL)
3513 goto done;
3514
3515 err = 0;
3516 for (ml = po->mclist; ml; ml = ml->next) {
3517 if (ml->ifindex == mreq->mr_ifindex &&
3518 ml->type == mreq->mr_type &&
3519 ml->alen == mreq->mr_alen &&
3520 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3521 ml->count++;
3522
3523 kfree(i);
3524 goto done;
3525 }
3526 }
3527
3528 i->type = mreq->mr_type;
3529 i->ifindex = mreq->mr_ifindex;
3530 i->alen = mreq->mr_alen;
3531 memcpy(i->addr, mreq->mr_address, i->alen);
3532 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3533 i->count = 1;
3534 i->next = po->mclist;
3535 po->mclist = i;
3536 err = packet_dev_mc(dev, i, 1);
3537 if (err) {
3538 po->mclist = i->next;
3539 kfree(i);
3540 }
3541
3542done:
3543 rtnl_unlock();
3544 return err;
3545}
3546
3547static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3548{
3549 struct packet_mclist *ml, **mlp;
3550
3551 rtnl_lock();
3552
3553 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3554 if (ml->ifindex == mreq->mr_ifindex &&
3555 ml->type == mreq->mr_type &&
3556 ml->alen == mreq->mr_alen &&
3557 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3558 if (--ml->count == 0) {
3559 struct net_device *dev;
3560 *mlp = ml->next;
3561 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3562 if (dev)
3563 packet_dev_mc(dev, ml, -1);
3564 kfree(ml);
3565 }
3566 break;
3567 }
3568 }
3569 rtnl_unlock();
3570 return 0;
3571}
3572
3573static void packet_flush_mclist(struct sock *sk)
3574{
3575 struct packet_sock *po = pkt_sk(sk);
3576 struct packet_mclist *ml;
3577
3578 if (!po->mclist)
3579 return;
3580
3581 rtnl_lock();
3582 while ((ml = po->mclist) != NULL) {
3583 struct net_device *dev;
3584
3585 po->mclist = ml->next;
3586 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3587 if (dev != NULL)
3588 packet_dev_mc(dev, ml, -1);
3589 kfree(ml);
3590 }
3591 rtnl_unlock();
3592}
3593
3594static int
3595packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3596{
3597 struct sock *sk = sock->sk;
3598 struct packet_sock *po = pkt_sk(sk);
3599 int ret;
3600
3601 if (level != SOL_PACKET)
3602 return -ENOPROTOOPT;
3603
3604 switch (optname) {
3605 case PACKET_ADD_MEMBERSHIP:
3606 case PACKET_DROP_MEMBERSHIP:
3607 {
3608 struct packet_mreq_max mreq;
3609 int len = optlen;
3610 memset(&mreq, 0, sizeof(mreq));
3611 if (len < sizeof(struct packet_mreq))
3612 return -EINVAL;
3613 if (len > sizeof(mreq))
3614 len = sizeof(mreq);
3615 if (copy_from_user(&mreq, optval, len))
3616 return -EFAULT;
3617 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3618 return -EINVAL;
3619 if (optname == PACKET_ADD_MEMBERSHIP)
3620 ret = packet_mc_add(sk, &mreq);
3621 else
3622 ret = packet_mc_drop(sk, &mreq);
3623 return ret;
3624 }
3625
3626 case PACKET_RX_RING:
3627 case PACKET_TX_RING:
3628 {
3629 union tpacket_req_u req_u;
3630 int len;
3631
3632 lock_sock(sk);
3633 switch (po->tp_version) {
3634 case TPACKET_V1:
3635 case TPACKET_V2:
3636 len = sizeof(req_u.req);
3637 break;
3638 case TPACKET_V3:
3639 default:
3640 len = sizeof(req_u.req3);
3641 break;
3642 }
3643 if (optlen < len) {
3644 ret = -EINVAL;
3645 } else {
3646 if (copy_from_user(&req_u.req, optval, len))
3647 ret = -EFAULT;
3648 else
3649 ret = packet_set_ring(sk, &req_u, 0,
3650 optname == PACKET_TX_RING);
3651 }
3652 release_sock(sk);
3653 return ret;
3654 }
3655 case PACKET_COPY_THRESH:
3656 {
3657 int val;
3658
3659 if (optlen != sizeof(val))
3660 return -EINVAL;
3661 if (copy_from_user(&val, optval, sizeof(val)))
3662 return -EFAULT;
3663
3664 pkt_sk(sk)->copy_thresh = val;
3665 return 0;
3666 }
3667 case PACKET_VERSION:
3668 {
3669 int val;
3670
3671 if (optlen != sizeof(val))
3672 return -EINVAL;
3673 if (copy_from_user(&val, optval, sizeof(val)))
3674 return -EFAULT;
3675 switch (val) {
3676 case TPACKET_V1:
3677 case TPACKET_V2:
3678 case TPACKET_V3:
3679 break;
3680 default:
3681 return -EINVAL;
3682 }
3683 lock_sock(sk);
3684 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3685 ret = -EBUSY;
3686 } else {
3687 po->tp_version = val;
3688 ret = 0;
3689 }
3690 release_sock(sk);
3691 return ret;
3692 }
3693 case PACKET_RESERVE:
3694 {
3695 unsigned int val;
3696
3697 if (optlen != sizeof(val))
3698 return -EINVAL;
3699 if (copy_from_user(&val, optval, sizeof(val)))
3700 return -EFAULT;
3701 if (val > INT_MAX)
3702 return -EINVAL;
3703 lock_sock(sk);
3704 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3705 ret = -EBUSY;
3706 } else {
3707 po->tp_reserve = val;
3708 ret = 0;
3709 }
3710 release_sock(sk);
3711 return ret;
3712 }
3713 case PACKET_LOSS:
3714 {
3715 unsigned int val;
3716
3717 if (optlen != sizeof(val))
3718 return -EINVAL;
3719 if (copy_from_user(&val, optval, sizeof(val)))
3720 return -EFAULT;
3721
3722 lock_sock(sk);
3723 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3724 ret = -EBUSY;
3725 } else {
3726 po->tp_loss = !!val;
3727 ret = 0;
3728 }
3729 release_sock(sk);
3730 return ret;
3731 }
3732 case PACKET_AUXDATA:
3733 {
3734 int val;
3735
3736 if (optlen < sizeof(val))
3737 return -EINVAL;
3738 if (copy_from_user(&val, optval, sizeof(val)))
3739 return -EFAULT;
3740
3741 lock_sock(sk);
3742 po->auxdata = !!val;
3743 release_sock(sk);
3744 return 0;
3745 }
3746 case PACKET_ORIGDEV:
3747 {
3748 int val;
3749
3750 if (optlen < sizeof(val))
3751 return -EINVAL;
3752 if (copy_from_user(&val, optval, sizeof(val)))
3753 return -EFAULT;
3754
3755 lock_sock(sk);
3756 po->origdev = !!val;
3757 release_sock(sk);
3758 return 0;
3759 }
3760 case PACKET_VNET_HDR:
3761 {
3762 int val;
3763
3764 if (sock->type != SOCK_RAW)
3765 return -EINVAL;
3766 if (optlen < sizeof(val))
3767 return -EINVAL;
3768 if (copy_from_user(&val, optval, sizeof(val)))
3769 return -EFAULT;
3770
3771 lock_sock(sk);
3772 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3773 ret = -EBUSY;
3774 } else {
3775 po->has_vnet_hdr = !!val;
3776 ret = 0;
3777 }
3778 release_sock(sk);
3779 return ret;
3780 }
3781 case PACKET_TIMESTAMP:
3782 {
3783 int val;
3784
3785 if (optlen != sizeof(val))
3786 return -EINVAL;
3787 if (copy_from_user(&val, optval, sizeof(val)))
3788 return -EFAULT;
3789
3790 po->tp_tstamp = val;
3791 return 0;
3792 }
3793 case PACKET_FANOUT:
3794 {
3795 int val;
3796
3797 if (optlen != sizeof(val))
3798 return -EINVAL;
3799 if (copy_from_user(&val, optval, sizeof(val)))
3800 return -EFAULT;
3801
3802 return fanout_add(sk, val & 0xffff, val >> 16);
3803 }
3804 case PACKET_FANOUT_DATA:
3805 {
3806 if (!po->fanout)
3807 return -EINVAL;
3808
3809 return fanout_set_data(po, optval, optlen);
3810 }
3811 case PACKET_TX_HAS_OFF:
3812 {
3813 unsigned int val;
3814
3815 if (optlen != sizeof(val))
3816 return -EINVAL;
3817 if (copy_from_user(&val, optval, sizeof(val)))
3818 return -EFAULT;
3819
3820 lock_sock(sk);
3821 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3822 ret = -EBUSY;
3823 } else {
3824 po->tp_tx_has_off = !!val;
3825 ret = 0;
3826 }
3827 release_sock(sk);
return ret;
3829 }
3830 case PACKET_QDISC_BYPASS:
3831 {
3832 int val;
3833
3834 if (optlen != sizeof(val))
3835 return -EINVAL;
3836 if (copy_from_user(&val, optval, sizeof(val)))
3837 return -EFAULT;
3838
3839 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3840 return 0;
3841 }
3842 default:
3843 return -ENOPROTOOPT;
3844 }
3845}
3846
3847static int packet_getsockopt(struct socket *sock, int level, int optname,
3848 char __user *optval, int __user *optlen)
3849{
3850 int len;
3851 int val, lv = sizeof(val);
3852 struct sock *sk = sock->sk;
3853 struct packet_sock *po = pkt_sk(sk);
3854 void *data = &val;
3855 union tpacket_stats_u st;
3856 struct tpacket_rollover_stats rstats;
3857
3858 if (level != SOL_PACKET)
3859 return -ENOPROTOOPT;
3860
3861 if (get_user(len, optlen))
3862 return -EFAULT;
3863
3864 if (len < 0)
3865 return -EINVAL;
3866
3867 switch (optname) {
3868 case PACKET_STATISTICS:
3869 spin_lock_bh(&sk->sk_receive_queue.lock);
3870 memcpy(&st, &po->stats, sizeof(st));
3871 memset(&po->stats, 0, sizeof(po->stats));
3872 spin_unlock_bh(&sk->sk_receive_queue.lock);
3873
3874 if (po->tp_version == TPACKET_V3) {
3875 lv = sizeof(struct tpacket_stats_v3);
3876 st.stats3.tp_packets += st.stats3.tp_drops;
3877 data = &st.stats3;
3878 } else {
3879 lv = sizeof(struct tpacket_stats);
3880 st.stats1.tp_packets += st.stats1.tp_drops;
3881 data = &st.stats1;
3882 }
3883
3884 break;
3885 case PACKET_AUXDATA:
3886 val = po->auxdata;
3887 break;
3888 case PACKET_ORIGDEV:
3889 val = po->origdev;
3890 break;
3891 case PACKET_VNET_HDR:
3892 val = po->has_vnet_hdr;
3893 break;
3894 case PACKET_VERSION:
3895 val = po->tp_version;
3896 break;
3897 case PACKET_HDRLEN:
3898 if (len > sizeof(int))
3899 len = sizeof(int);
3900 if (len < sizeof(int))
3901 return -EINVAL;
3902 if (copy_from_user(&val, optval, len))
3903 return -EFAULT;
3904 switch (val) {
3905 case TPACKET_V1:
3906 val = sizeof(struct tpacket_hdr);
3907 break;
3908 case TPACKET_V2:
3909 val = sizeof(struct tpacket2_hdr);
3910 break;
3911 case TPACKET_V3:
3912 val = sizeof(struct tpacket3_hdr);
3913 break;
3914 default:
3915 return -EINVAL;
3916 }
3917 break;
3918 case PACKET_RESERVE:
3919 val = po->tp_reserve;
3920 break;
3921 case PACKET_LOSS:
3922 val = po->tp_loss;
3923 break;
3924 case PACKET_TIMESTAMP:
3925 val = po->tp_tstamp;
3926 break;
3927 case PACKET_FANOUT:
3928 val = (po->fanout ?
3929 ((u32)po->fanout->id |
3930 ((u32)po->fanout->type << 16) |
3931 ((u32)po->fanout->flags << 24)) :
3932 0);
3933 break;
3934 case PACKET_ROLLOVER_STATS:
3935 if (!po->rollover)
3936 return -EINVAL;
3937 rstats.tp_all = atomic_long_read(&po->rollover->num);
3938 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3939 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3940 data = &rstats;
3941 lv = sizeof(rstats);
3942 break;
3943 case PACKET_TX_HAS_OFF:
3944 val = po->tp_tx_has_off;
3945 break;
3946 case PACKET_QDISC_BYPASS:
3947 val = packet_use_direct_xmit(po);
3948 break;
3949 default:
3950 return -ENOPROTOOPT;
3951 }
3952
3953 if (len > lv)
3954 len = lv;
3955 if (put_user(len, optlen))
3956 return -EFAULT;
3957 if (copy_to_user(optval, data, len))
3958 return -EFAULT;
3959 return 0;
3960}
3961
3962
3963#ifdef CONFIG_COMPAT
3964static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3965 char __user *optval, unsigned int optlen)
3966{
3967 struct packet_sock *po = pkt_sk(sock->sk);
3968
3969 if (level != SOL_PACKET)
3970 return -ENOPROTOOPT;
3971
3972 if (optname == PACKET_FANOUT_DATA &&
3973 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3974 optval = (char __user *)get_compat_bpf_fprog(optval);
3975 if (!optval)
3976 return -EFAULT;
3977 optlen = sizeof(struct sock_fprog);
3978 }
3979
3980 return packet_setsockopt(sock, level, optname, optval, optlen);
3981}
3982#endif
3983
3984static int packet_notifier(struct notifier_block *this,
3985 unsigned long msg, void *ptr)
3986{
3987 struct sock *sk;
3988 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3989 struct net *net = dev_net(dev);
3990
3991 rcu_read_lock();
3992 sk_for_each_rcu(sk, &net->packet.sklist) {
3993 struct packet_sock *po = pkt_sk(sk);
3994
3995 switch (msg) {
3996 case NETDEV_UNREGISTER:
3997 if (po->mclist)
3998 packet_dev_mclist_delete(dev, &po->mclist);
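/* fall through: an unregistered device is also going down */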
3999
4000
4001 case NETDEV_DOWN:
4002 if (dev->ifindex == po->ifindex) {
4003 spin_lock(&po->bind_lock);
4004 if (po->running) {
4005 __unregister_prot_hook(sk, false);
4006 sk->sk_err = ENETDOWN;
4007 if (!sock_flag(sk, SOCK_DEAD))
4008 sk->sk_error_report(sk);
4009 }
4010 if (msg == NETDEV_UNREGISTER) {
4011 packet_cached_dev_reset(po);
4012 po->ifindex = -1;
4013 if (po->prot_hook.dev)
4014 dev_put(po->prot_hook.dev);
4015 po->prot_hook.dev = NULL;
4016 }
4017 spin_unlock(&po->bind_lock);
4018 }
4019 break;
4020 case NETDEV_UP:
4021 if (dev->ifindex == po->ifindex) {
4022 spin_lock(&po->bind_lock);
4023 if (po->num)
4024 register_prot_hook(sk);
4025 spin_unlock(&po->bind_lock);
4026 }
4027 break;
4028 }
4029 }
4030 rcu_read_unlock();
4031 return NOTIFY_DONE;
4032}
4033
4034
4035static int packet_ioctl(struct socket *sock, unsigned int cmd,
4036 unsigned long arg)
4037{
4038 struct sock *sk = sock->sk;
4039
4040 switch (cmd) {
4041 case SIOCOUTQ:
4042 {
4043 int amount = sk_wmem_alloc_get(sk);
4044
4045 return put_user(amount, (int __user *)arg);
4046 }
4047 case SIOCINQ:
4048 {
4049 struct sk_buff *skb;
4050 int amount = 0;
4051
4052 spin_lock_bh(&sk->sk_receive_queue.lock);
4053 skb = skb_peek(&sk->sk_receive_queue);
4054 if (skb)
4055 amount = skb->len;
4056 spin_unlock_bh(&sk->sk_receive_queue.lock);
4057 return put_user(amount, (int __user *)arg);
4058 }
4059 case SIOCGSTAMP:
4060 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4061 case SIOCGSTAMPNS:
4062 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4063
4064#ifdef CONFIG_INET
4065 case SIOCADDRT:
4066 case SIOCDELRT:
4067 case SIOCDARP:
4068 case SIOCGARP:
4069 case SIOCSARP:
4070 case SIOCGIFADDR:
4071 case SIOCSIFADDR:
4072 case SIOCGIFBRDADDR:
4073 case SIOCSIFBRDADDR:
4074 case SIOCGIFNETMASK:
4075 case SIOCSIFNETMASK:
4076 case SIOCGIFDSTADDR:
4077 case SIOCSIFDSTADDR:
4078 case SIOCSIFFLAGS:
4079 return inet_dgram_ops.ioctl(sock, cmd, arg);
4080#endif
4081
4082 default:
4083 return -ENOIOCTLCMD;
4084 }
4085 return 0;
4086}
4087
4088static __poll_t packet_poll(struct file *file, struct socket *sock,
4089 poll_table *wait)
4090{
4091 struct sock *sk = sock->sk;
4092 struct packet_sock *po = pkt_sk(sk);
4093 __poll_t mask = datagram_poll(file, sock, wait);
4094
4095 spin_lock_bh(&sk->sk_receive_queue.lock);
4096 if (po->rx_ring.pg_vec) {
4097 if (!packet_previous_rx_frame(po, &po->rx_ring,
4098 TP_STATUS_KERNEL))
4099 mask |= EPOLLIN | EPOLLRDNORM;
4100 }
4101 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4102 po->pressure = 0;
4103 spin_unlock_bh(&sk->sk_receive_queue.lock);
4104 spin_lock_bh(&sk->sk_write_queue.lock);
4105 if (po->tx_ring.pg_vec) {
4106 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4107 mask |= EPOLLOUT | EPOLLWRNORM;
4108 }
4109 spin_unlock_bh(&sk->sk_write_queue.lock);
4110 return mask;
4111}
4112
/* Track user mmaps of the rings so that packet_set_ring() can refuse
 * to free or resize a ring that is still mapped (see the po->mapped
 * checks there).
 */
4118static void packet_mm_open(struct vm_area_struct *vma)
4119{
4120 struct file *file = vma->vm_file;
4121 struct socket *sock = file->private_data;
4122 struct sock *sk = sock->sk;
4123
4124 if (sk)
4125 atomic_inc(&pkt_sk(sk)->mapped);
4126}
4127
4128static void packet_mm_close(struct vm_area_struct *vma)
4129{
4130 struct file *file = vma->vm_file;
4131 struct socket *sock = file->private_data;
4132 struct sock *sk = sock->sk;
4133
4134 if (sk)
4135 atomic_dec(&pkt_sk(sk)->mapped);
4136}
4137
4138static const struct vm_operations_struct packet_mmap_ops = {
4139 .open = packet_mm_open,
4140 .close = packet_mm_close,
4141};
4142
4143static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4144 unsigned int len)
4145{
4146 int i;
4147
4148 for (i = 0; i < len; i++) {
4149 if (likely(pg_vec[i].buffer)) {
4150 if (is_vmalloc_addr(pg_vec[i].buffer))
4151 vfree(pg_vec[i].buffer);
4152 else
4153 free_pages((unsigned long)pg_vec[i].buffer,
4154 order);
4155 pg_vec[i].buffer = NULL;
4156 }
4157 }
4158 kfree(pg_vec);
4159}
4160
4161static char *alloc_one_pg_vec_page(unsigned long order)
4162{
4163 char *buffer;
4164 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4165 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4166
4167 buffer = (char *) __get_free_pages(gfp_flags, order);
4168 if (buffer)
4169 return buffer;
4170
/* __get_free_pages() failed, fall back to vzalloc() */
4172 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4173 if (buffer)
4174 return buffer;
4175
/* vzalloc() failed as well, retry the page allocator but let it
 * try harder this time (drop __GFP_NORETRY)
 */
4177 gfp_flags &= ~__GFP_NORETRY;
4178 buffer = (char *) __get_free_pages(gfp_flags, order);
4179 if (buffer)
4180 return buffer;
4181
/* nothing worked, give up */
4183 return NULL;
4184}
4185
4186static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4187{
4188 unsigned int block_nr = req->tp_block_nr;
4189 struct pgv *pg_vec;
4190 int i;
4191
4192 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4193 if (unlikely(!pg_vec))
4194 goto out;
4195
4196 for (i = 0; i < block_nr; i++) {
4197 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4198 if (unlikely(!pg_vec[i].buffer))
4199 goto out_free_pgvec;
4200 }
4201
4202out:
4203 return pg_vec;
4204
4205out_free_pgvec:
4206 free_pg_vec(pg_vec, order, block_nr);
4207 pg_vec = NULL;
4208 goto out;
4209}
4210
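/*
 * Ring (re)configuration for PACKET_RX_RING/PACKET_TX_RING.  Illustrative
 * userspace sketch (values are examples only; fd is an AF_PACKET socket):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *	     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The checks below enforce, among other things, that tp_block_size is a
 * multiple of PAGE_SIZE and that
 * tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr.
 */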
4211static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4212 int closing, int tx_ring)
4213{
4214 struct pgv *pg_vec = NULL;
4215 struct packet_sock *po = pkt_sk(sk);
4216 int was_running, order = 0;
4217 struct packet_ring_buffer *rb;
4218 struct sk_buff_head *rb_queue;
4219 __be16 num;
4220 int err = -EINVAL;
4221
4222 struct tpacket_req *req = &req_u->req;
4223
4224 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4225 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4226
4227 err = -EBUSY;
4228 if (!closing) {
4229 if (atomic_read(&po->mapped))
4230 goto out;
4231 if (packet_read_pending(rb))
4232 goto out;
4233 }
4234
4235 if (req->tp_block_nr) {
4236 unsigned int min_frame_size;
4237
/* Sanity tests and some calculations */
4239 err = -EBUSY;
4240 if (unlikely(rb->pg_vec))
4241 goto out;
4242
4243 switch (po->tp_version) {
4244 case TPACKET_V1:
4245 po->tp_hdrlen = TPACKET_HDRLEN;
4246 break;
4247 case TPACKET_V2:
4248 po->tp_hdrlen = TPACKET2_HDRLEN;
4249 break;
4250 case TPACKET_V3:
4251 po->tp_hdrlen = TPACKET3_HDRLEN;
4252 break;
4253 }
4254
4255 err = -EINVAL;
4256 if (unlikely((int)req->tp_block_size <= 0))
4257 goto out;
4258 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4259 goto out;
4260 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4261 if (po->tp_version >= TPACKET_V3 &&
4262 req->tp_block_size <
4263 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4264 goto out;
4265 if (unlikely(req->tp_frame_size < min_frame_size))
4266 goto out;
4267 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4268 goto out;
4269
4270 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4271 if (unlikely(rb->frames_per_block == 0))
4272 goto out;
4273 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4274 goto out;
4275 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4276 req->tp_frame_nr))
4277 goto out;
4278
4279 err = -ENOMEM;
4280 order = get_order(req->tp_block_size);
4281 pg_vec = alloc_pg_vec(req, order);
4282 if (unlikely(!pg_vec))
4283 goto out;
4284 switch (po->tp_version) {
4285 case TPACKET_V3:
/* Block transmit is not supported yet */
4287 if (!tx_ring) {
4288 init_prb_bdqc(po, rb, pg_vec, req_u);
4289 } else {
4290 struct tpacket_req3 *req3 = &req_u->req3;
4291
4292 if (req3->tp_retire_blk_tov ||
4293 req3->tp_sizeof_priv ||
4294 req3->tp_feature_req_word) {
4295 err = -EINVAL;
4296 goto out;
4297 }
4298 }
4299 break;
4300 default:
4301 break;
4302 }
4303 }
4304
4305 else {
4306 err = -EINVAL;
4307 if (unlikely(req->tp_frame_nr))
4308 goto out;
4309 }
4310
/* Detach the socket from the network while the ring is swapped in */
4313 spin_lock(&po->bind_lock);
4314 was_running = po->running;
4315 num = po->num;
4316 if (was_running) {
4317 po->num = 0;
4318 __unregister_prot_hook(sk, false);
4319 }
4320 spin_unlock(&po->bind_lock);
4321
4322 synchronize_net();
4323
4324 err = -EBUSY;
4325 mutex_lock(&po->pg_vec_lock);
4326 if (closing || atomic_read(&po->mapped) == 0) {
4327 err = 0;
4328 spin_lock_bh(&rb_queue->lock);
4329 swap(rb->pg_vec, pg_vec);
4330 rb->frame_max = (req->tp_frame_nr - 1);
4331 rb->head = 0;
4332 rb->frame_size = req->tp_frame_size;
4333 spin_unlock_bh(&rb_queue->lock);
4334
4335 swap(rb->pg_vec_order, order);
4336 swap(rb->pg_vec_len, req->tp_block_nr);
4337
4338 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4339 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4340 tpacket_rcv : packet_rcv;
4341 skb_queue_purge(rb_queue);
4342 if (atomic_read(&po->mapped))
4343 pr_err("packet_mmap: vma is busy: %d\n",
4344 atomic_read(&po->mapped));
4345 }
4346 mutex_unlock(&po->pg_vec_lock);
4347
4348 spin_lock(&po->bind_lock);
4349 if (was_running) {
4350 po->num = num;
4351 register_prot_hook(sk);
4352 }
4353 spin_unlock(&po->bind_lock);
4354 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4355
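/* Block-based V3 is only used on the Rx side, so only the Rx ring
 * has a retire-block timer to shut down.
 */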
4356 if (!tx_ring)
4357 prb_shutdown_retire_blk_timer(po, rb_queue);
4358 }
4359
4360 if (pg_vec)
4361 free_pg_vec(pg_vec, order, req->tp_block_nr);
4362out:
4363 return err;
4364}
4365
4366static int packet_mmap(struct file *file, struct socket *sock,
4367 struct vm_area_struct *vma)
4368{
4369 struct sock *sk = sock->sk;
4370 struct packet_sock *po = pkt_sk(sk);
4371 unsigned long size, expected_size;
4372 struct packet_ring_buffer *rb;
4373 unsigned long start;
4374 int err = -EINVAL;
4375 int i;
4376
4377 if (vma->vm_pgoff)
4378 return -EINVAL;
4379
4380 mutex_lock(&po->pg_vec_lock);
4381
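/* The rx ring followed by the tx ring are mapped back-to-back into a
 * single VMA, so the VMA must cover their combined size exactly.
 */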
4382 expected_size = 0;
4383 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4384 if (rb->pg_vec) {
4385 expected_size += rb->pg_vec_len
4386 * rb->pg_vec_pages
4387 * PAGE_SIZE;
4388 }
4389 }
4390
4391 if (expected_size == 0)
4392 goto out;
4393
4394 size = vma->vm_end - vma->vm_start;
4395 if (size != expected_size)
4396 goto out;
4397
4398 start = vma->vm_start;
4399 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4400 if (rb->pg_vec == NULL)
4401 continue;
4402
4403 for (i = 0; i < rb->pg_vec_len; i++) {
4404 struct page *page;
4405 void *kaddr = rb->pg_vec[i].buffer;
4406 int pg_num;
4407
4408 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4409 page = pgv_to_page(kaddr);
4410 err = vm_insert_page(vma, start, page);
4411 if (unlikely(err))
4412 goto out;
4413 start += PAGE_SIZE;
4414 kaddr += PAGE_SIZE;
4415 }
4416 }
4417 }
4418
4419 atomic_inc(&po->mapped);
4420 vma->vm_ops = &packet_mmap_ops;
4421 err = 0;
4422
4423out:
4424 mutex_unlock(&po->pg_vec_lock);
4425 return err;
4426}
4427
4428static const struct proto_ops packet_ops_spkt = {
4429 .family = PF_PACKET,
4430 .owner = THIS_MODULE,
4431 .release = packet_release,
4432 .bind = packet_bind_spkt,
4433 .connect = sock_no_connect,
4434 .socketpair = sock_no_socketpair,
4435 .accept = sock_no_accept,
4436 .getname = packet_getname_spkt,
4437 .poll = datagram_poll,
4438 .ioctl = packet_ioctl,
4439 .listen = sock_no_listen,
4440 .shutdown = sock_no_shutdown,
4441 .setsockopt = sock_no_setsockopt,
4442 .getsockopt = sock_no_getsockopt,
4443 .sendmsg = packet_sendmsg_spkt,
4444 .recvmsg = packet_recvmsg,
4445 .mmap = sock_no_mmap,
4446 .sendpage = sock_no_sendpage,
4447};
4448
4449static const struct proto_ops packet_ops = {
4450 .family = PF_PACKET,
4451 .owner = THIS_MODULE,
4452 .release = packet_release,
4453 .bind = packet_bind,
4454 .connect = sock_no_connect,
4455 .socketpair = sock_no_socketpair,
4456 .accept = sock_no_accept,
4457 .getname = packet_getname,
4458 .poll = packet_poll,
4459 .ioctl = packet_ioctl,
4460 .listen = sock_no_listen,
4461 .shutdown = sock_no_shutdown,
4462 .setsockopt = packet_setsockopt,
4463 .getsockopt = packet_getsockopt,
4464#ifdef CONFIG_COMPAT
4465 .compat_setsockopt = compat_packet_setsockopt,
4466#endif
4467 .sendmsg = packet_sendmsg,
4468 .recvmsg = packet_recvmsg,
4469 .mmap = packet_mmap,
4470 .sendpage = sock_no_sendpage,
4471};
4472
4473static const struct net_proto_family packet_family_ops = {
4474 .family = PF_PACKET,
4475 .create = packet_create,
4476 .owner = THIS_MODULE,
4477};
4478
4479static struct notifier_block packet_netdev_notifier = {
4480 .notifier_call = packet_notifier,
4481};
4482
4483#ifdef CONFIG_PROC_FS
4484
4485static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4486 __acquires(RCU)
4487{
4488 struct net *net = seq_file_net(seq);
4489
4490 rcu_read_lock();
4491 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4492}
4493
4494static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4495{
4496 struct net *net = seq_file_net(seq);
4497 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4498}
4499
4500static void packet_seq_stop(struct seq_file *seq, void *v)
4501 __releases(RCU)
4502{
4503 rcu_read_unlock();
4504}
4505
4506static int packet_seq_show(struct seq_file *seq, void *v)
4507{
4508 if (v == SEQ_START_TOKEN)
4509 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4510 else {
4511 struct sock *s = sk_entry(v);
4512 const struct packet_sock *po = pkt_sk(s);
4513
4514 seq_printf(seq,
4515 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4516 s,
4517 refcount_read(&s->sk_refcnt),
4518 s->sk_type,
4519 ntohs(po->num),
4520 po->ifindex,
4521 po->running,
4522 atomic_read(&s->sk_rmem_alloc),
4523 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4524 sock_i_ino(s));
4525 }
4526
4527 return 0;
4528}
4529
4530static const struct seq_operations packet_seq_ops = {
4531 .start = packet_seq_start,
4532 .next = packet_seq_next,
4533 .stop = packet_seq_stop,
4534 .show = packet_seq_show,
4535};
4536#endif
4537
4538static int __net_init packet_net_init(struct net *net)
4539{
4540 mutex_init(&net->packet.sklist_lock);
4541 INIT_HLIST_HEAD(&net->packet.sklist);
4542
4543 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4544 sizeof(struct seq_net_private)))
4545 return -ENOMEM;
4546
4547 return 0;
4548}
4549
4550static void __net_exit packet_net_exit(struct net *net)
4551{
4552 remove_proc_entry("packet", net->proc_net);
4553 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4554}
4555
4556static struct pernet_operations packet_net_ops = {
4557 .init = packet_net_init,
4558 .exit = packet_net_exit,
4559};
4560
4561
4562static void __exit packet_exit(void)
4563{
4564 unregister_netdevice_notifier(&packet_netdev_notifier);
4565 unregister_pernet_subsys(&packet_net_ops);
4566 sock_unregister(PF_PACKET);
4567 proto_unregister(&packet_proto);
4568}
4569
4570static int __init packet_init(void)
4571{
4572 int rc = proto_register(&packet_proto, 0);
4573
4574 if (rc != 0)
4575 goto out;
4576
4577 sock_register(&packet_family_ops);
4578 register_pernet_subsys(&packet_net_ops);
4579 register_netdevice_notifier(&packet_netdev_notifier);
4580out:
4581 return rc;
4582}
4583
4584module_init(packet_init);
4585module_exit(packet_exit);
4586MODULE_LICENSE("GPL");
4587MODULE_ALIAS_NETPROTO(PF_PACKET);
4588