linux/net/packet/af_packet.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header itself. In this case the ll header is invisible outside of the
     device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. Very likely it points to the ll header.
                 PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
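
/*
 * For illustration only (not part of this file): a minimal userspace
 * sketch of the two addressing modes described above. A SOCK_RAW packet
 * socket sees and must supply the ll header itself; a SOCK_DGRAM packet
 * socket works on cooked frames with the ll header handled by the
 * kernel and device.
 *
 *      #include <sys/socket.h>
 *      #include <linux/if_packet.h>
 *      #include <linux/if_ether.h>
 *      #include <arpa/inet.h>
 *
 *      int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *      int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */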

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};
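
/*
 * For illustration only: userspace does not see packet_mreq_max
 * directly; it fills the exported struct packet_mreq and joins e.g.
 * promiscuous mode on one interface via setsockopt():
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = if_nametoindex("eth0"),   /+ assumed name +/
 *              .mr_type    = PACKET_MR_PROMISC,
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 */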

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

#define PGV_FROM_VMALLOC 1
struct pgv {
        char *buffer;
};

struct packet_ring_buffer {
        struct pgv              *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};
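
/*
 * Ring geometry, worked through with assumed numbers (illustrative, not
 * from this file): a ring requested with tp_block_size = 4096,
 * tp_frame_size = 2048 and tp_block_nr = 2 gives
 *
 *      frames_per_block = tp_block_size / tp_frame_size = 2
 *      frame_nr         = frames_per_block * tp_block_nr = 4
 *      frame_max        = frame_nr - 1                   = 3
 *
 * so frame i lives in block i / frames_per_block at byte offset
 * (i % frames_per_block) * frame_size, which is exactly the lookup
 * that packet_lookup_frame() below performs.
 */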

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached*/
                                auxdata:1,
                                origdev:1,
                                has_vnet_hdr:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
        unsigned int            tp_tstamp;
        struct packet_type      prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

static inline __pure struct page *pgv_to_page(void *addr)
{
        if (is_vmalloc_addr(addr))
                return vmalloc_to_page(addr);
        return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}

static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos].buffer +
                (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_error_queue);

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled;
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb),
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets from using up all the memory.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
                if (err)
                        goto out_free;
                goto retry;
        }


        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
        if (err < 0)
                goto out_unlock;

        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        kfree_skb(skb);
        return err;
}

static inline unsigned int run_filter(const struct sk_buff *skb,
                                      const struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference_bh(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns);
        rcu_read_unlock_bh();

        return res;
}
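
/*
 * For illustration only: the filter consulted by run_filter() above is
 * installed from userspace as a classic BPF program via
 * SO_ATTACH_FILTER. A one-instruction sketch that accepts every packet
 * (BPF_RET returns the snapshot length; 0 would mean drop):
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 0xffff },
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = code };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */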

/*
 * This function performs lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), and input packets are processed by net_bh
 * sequentially, so if we return the skb to its original state on exit,
 * we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                 * exported to higher levels.
                 *
                 * Otherwise, the device hides details of its frame
                 * structure, so that corresponding packet head is
                 * never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;
        struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }
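
        /*
         * Worked example with assumed numbers (illustrative, not from
         * this file): on Ethernet, maclen is 14 and is rounded up to the
         * 16-byte minimum above, so
         *
         *      netoff = TPACKET_ALIGN(po->tp_hdrlen + 16) + po->tp_reserve
         *      macoff = netoff - 14
         *
         * i.e. the copied frame always lands after the metadata header,
         * with the network header at a predictable, aligned offset.
         */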

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
                                && shhwtstamps->syststamp.tv64)
                        tv = ktime_to_timeval(shhwtstamps->syststamp);
                else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
                                && shhwtstamps->hwtstamp.tv64)
                        tv = ktime_to_timeval(shhwtstamps->hwtstamp);
                else if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
                                && shhwtstamps->syststamp.tv64)
                        ts = ktime_to_timespec(shhwtstamps->syststamp);
                else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
                                && shhwtstamps->hwtstamp.tv64)
                        ts = ktime_to_timespec(shhwtstamps->hwtstamp);
                else if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        {
                u8 *start, *end;

                end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
                for (start = h.raw; start < end; start += PAGE_SIZE)
                        flush_dcache_page(pgv_to_page(start));
        }
#endif

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb->mark = po->sk.sk_mark;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceeds the number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                page = pgv_to_page(data);
                data += len;
                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb, nr_frags, page, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                        ((!(msg->msg_flags & MSG_DONTWAIT)) &&
                         (atomic_read(&po->tx_ring.pending))))
                );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
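
/*
 * For illustration only: the userspace side of the TX ring that
 * tpacket_snd() services. The application maps the ring, marks frames
 * TP_STATUS_SEND_REQUEST and kicks the kernel with send():
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 4096,
 *              .tp_frame_size = 2048,
 *              .tp_block_nr   = 64,
 *              .tp_frame_nr   = 128,
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *      void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *      // ... fill a frame, set hdr->tp_status = TP_STATUS_SEND_REQUEST ...
 *      send(fd, NULL, 0, 0);   // or MSG_DONTWAIT to avoid blocking
 *
 * tpacket_destruct_skb() above flips each frame back to
 * TP_STATUS_AVAILABLE once the driver has freed the skb.
 */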

static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                                               size_t reserve, size_t len,
                                               size_t linear, int noblock,
                                               int *err)
{
        struct sk_buff *skb;

        /* Under a page?  Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE || !linear)
                linear = len;

        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                                   err);
        if (!skb)
                return NULL;

        skb_reserve(skb, reserve);
        skb_put(skb, linear);
        skb->data_len = len - linear;
        skb->len += len - linear;

        return skb;
}

static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        int vnet_hdr_len;
        struct packet_sock *po = pkt_sk(sk);
        unsigned short gso_type = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        if (po->has_vnet_hdr) {
                vnet_hdr_len = sizeof(vnet_hdr);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto out_unlock;

                len -= vnet_hdr_len;

                err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
                                       vnet_hdr_len);
                if (err < 0)
                        goto out_unlock;

                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                      vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                                 vnet_hdr.csum_offset + 2;

                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto out_unlock;

                if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                        switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                        case VIRTIO_NET_HDR_GSO_TCPV4:
                                gso_type = SKB_GSO_TCPV4;
                                break;
                        case VIRTIO_NET_HDR_GSO_TCPV6:
                                gso_type = SKB_GSO_TCPV6;
                                break;
                        case VIRTIO_NET_HDR_GSO_UDP:
                                gso_type = SKB_GSO_UDP;
                                break;
                        default:
                                goto out_unlock;
                        }

                        if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
                                gso_type |= SKB_GSO_TCP_ECN;

                        if (vnet_hdr.gso_size == 0)
                                goto out_unlock;

                }
        }

        err = -EMSGSIZE;
        if (!gso_type && (len > dev->mtu+reserve))
                goto out_unlock;

        err = -ENOBUFS;
        skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
                               LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_set_network_header(skb, reserve);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
        if (err)
                goto out_free;
        err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
        if (err < 0)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                        if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
                                                  vnet_hdr.csum_offset)) {
                                err = -EINVAL;
                                goto out_free;
                        }
                }

                skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;

                len += vnet_hdr_len;
        }

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
        struct tpacket_req req;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        spin_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        spin_unlock_bh(&net->packet.sklist_lock);

        spin_lock(&po->bind_lock);
        if (po->running) {
                /*
                 * Remove from protocol table
                 */
                po->running = 0;
                po->num = 0;
                __dev_remove_pack(&po->prot_hook);
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);

        synchronize_net();
        /*
         *      Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        spin_lock_bh(&net->packet.sklist_lock);
        sk_add_node_rcu(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        spin_unlock_bh(&net->packet.sklist_lock);

        return 0;
out:
        return err;
}

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb, *skb2;
        int copied, err;

        err = -EAGAIN;
        skb = skb_dequeue(&sk->sk_error_queue);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
                 sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

        /* Reset and regenerate socket error */
        spin_lock_bh(&sk->sk_error_queue.lock);
        sk->sk_err = 0;
        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
                spin_unlock_bh(&sk->sk_error_queue.lock);
                sk->sk_error_report(sk);
        } else
                spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}
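
/*
 * For illustration only: the userspace consumer of the error queue
 * drained above. After enabling TX timestamping, completed timestamps
 * come back via MSG_ERRQUEUE as the PACKET_TX_TIMESTAMP control
 * message queued by packet_recv_error():
 *
 *      int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
 *      setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
 *      ...
 *      char ctrl[512], data[2048];
 *      struct iovec iov = { data, sizeof(data) };
 *      struct msghdr msg = {
 *              .msg_iov = &iov, .msg_iovlen = 1,
 *              .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *      };
 *      recvmsg(fd, &msg, MSG_ERRQUEUE);
 */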

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;
        int vnet_hdr_len = 0;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        if (flags & MSG_ERRQUEUE) {
                err = packet_recv_error(sk, msg, len);
                goto out;
        }
1599
1600        /*
1601         *      Call the generic datagram receiver. This handles all sorts
1602         *      of horrible races and re-entrancy so we can forget about it
1603         *      in the protocol layers.
1604         *
1605         *      Now it will return ENETDOWN, if the device has just gone
1606         *      down, but then it will block.
1607         */
1608
1609        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1610
1611        /*
1612         *      If an error occurred, return it. Because skb_recv_datagram()
1613         *      handles the blocking for us, we need not worry about
1614         *      blocking retries here.
1615         */
1616
1617        if (skb == NULL)
1618                goto out;
1619
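            /*
             * PACKET_VNET_HDR sockets get a virtio_net_hdr prepended to
             * the payload, describing the skb's GSO type and size and,
             * for partial checksums, where the checksum still has to be
             * written.
             */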
1620        if (pkt_sk(sk)->has_vnet_hdr) {
1621                struct virtio_net_hdr vnet_hdr = { 0 };
1622
1623                err = -EINVAL;
1624                vnet_hdr_len = sizeof(vnet_hdr);
1625                if (len < vnet_hdr_len)
1626                        goto out_free;
1627
1628                len -= vnet_hdr_len;
1629
1630                if (skb_is_gso(skb)) {
1631                        struct skb_shared_info *sinfo = skb_shinfo(skb);
1632
1633                        /* This is a hint as to how much should be linear. */
1634                        vnet_hdr.hdr_len = skb_headlen(skb);
1635                        vnet_hdr.gso_size = sinfo->gso_size;
1636                        if (sinfo->gso_type & SKB_GSO_TCPV4)
1637                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1638                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
1639                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1640                        else if (sinfo->gso_type & SKB_GSO_UDP)
1641                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1642                        else if (sinfo->gso_type & SKB_GSO_FCOE)
1643                                goto out_free;
1644                        else
1645                                BUG();
1646                        if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1647                                vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1648                } else
1649                        vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1650
1651                if (skb->ip_summed == CHECKSUM_PARTIAL) {
1652                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1653                        vnet_hdr.csum_start = skb_checksum_start_offset(skb);
1654                        vnet_hdr.csum_offset = skb->csum_offset;
1655                } /* else everything is zero */
1656
1657                err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1658                                     vnet_hdr_len);
1659                if (err < 0)
1660                        goto out_free;
1661        }
1662
1663        /*
1664         *      If the address length field is there to be filled in, we fill
1665         *      it in now.
1666         */
1667
1668        sll = &PACKET_SKB_CB(skb)->sa.ll;
1669        if (sock->type == SOCK_PACKET)
1670                msg->msg_namelen = sizeof(struct sockaddr_pkt);
1671        else
1672                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1673
1674        /*
1675 *      You lose any data beyond the buffer you gave. If this worries a
1676 *      user program, it can ask the device for its MTU anyway.
1677         */
1678
1679        copied = skb->len;
1680        if (copied > len) {
1681                copied = len;
1682                msg->msg_flags |= MSG_TRUNC;
1683        }
1684
1685        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1686        if (err)
1687                goto out_free;
1688
1689        sock_recv_ts_and_drops(msg, sk, skb);
1690
1691        if (msg->msg_name)
1692                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1693                       msg->msg_namelen);
1694
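            /*
             * With PACKET_AUXDATA set, attach a tpacket_auxdata control
             * message; TP_STATUS_CSUMNOTREADY tells the reader that the
             * checksum has not been computed yet (CHECKSUM_PARTIAL).
             */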
1695        if (pkt_sk(sk)->auxdata) {
1696                struct tpacket_auxdata aux;
1697
1698                aux.tp_status = TP_STATUS_USER;
1699                if (skb->ip_summed == CHECKSUM_PARTIAL)
1700                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1701                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1702                aux.tp_snaplen = skb->len;
1703                aux.tp_mac = 0;
1704                aux.tp_net = skb_network_offset(skb);
1705                aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1706
1707                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1708        }
1709
1710        /*
1711         *      Free or return the buffer as appropriate. Again this
1712         *      hides all the races and re-entrancy issues from us.
1713         */
1714        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1715
1716out_free:
1717        skb_free_datagram(sk, skb);
1718out:
1719        return err;
1720}
1721
1722static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1723                               int *uaddr_len, int peer)
1724{
1725        struct net_device *dev;
1726        struct sock *sk = sock->sk;
1727
1728        if (peer)
1729                return -EOPNOTSUPP;
1730
1731        uaddr->sa_family = AF_PACKET;
1732        rcu_read_lock();
1733        dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1734        if (dev)
1735                strncpy(uaddr->sa_data, dev->name, 14); /* sizeof(sa_data) */
1736        else
1737                memset(uaddr->sa_data, 0, 14);
1738        rcu_read_unlock();
1739        *uaddr_len = sizeof(*uaddr);
1740
1741        return 0;
1742}
1743
1744static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1745                          int *uaddr_len, int peer)
1746{
1747        struct net_device *dev;
1748        struct sock *sk = sock->sk;
1749        struct packet_sock *po = pkt_sk(sk);
1750        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1751
1752        if (peer)
1753                return -EOPNOTSUPP;
1754
1755        sll->sll_family = AF_PACKET;
1756        sll->sll_ifindex = po->ifindex;
1757        sll->sll_protocol = po->num;
1758        sll->sll_pkttype = 0;
1759        rcu_read_lock();
1760        dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1761        if (dev) {
1762                sll->sll_hatype = dev->type;
1763                sll->sll_halen = dev->addr_len;
1764                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1765        } else {
1766                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1767                sll->sll_halen = 0;
1768        }
1769        rcu_read_unlock();
1770        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1771
1772        return 0;
1773}
1774
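    /*
     * Apply one membership entry to a device.  'what' is a reference
     * delta: +1 adds the address (or a promiscuity/allmulti reference),
     * -1 drops it again.
     */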
1775static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1776                         int what)
1777{
1778        switch (i->type) {
1779        case PACKET_MR_MULTICAST:
1780                if (i->alen != dev->addr_len)
1781                        return -EINVAL;
1782                if (what > 0)
1783                        return dev_mc_add(dev, i->addr);
1784                else
1785                        return dev_mc_del(dev, i->addr);
1787        case PACKET_MR_PROMISC:
1788                return dev_set_promiscuity(dev, what);
1790        case PACKET_MR_ALLMULTI:
1791                return dev_set_allmulti(dev, what);
1793        case PACKET_MR_UNICAST:
1794                if (i->alen != dev->addr_len)
1795                        return -EINVAL;
1796                if (what > 0)
1797                        return dev_uc_add(dev, i->addr);
1798                else
1799                        return dev_uc_del(dev, i->addr);
1801        default:
1802                break;
1803        }
1804        return 0;
1805}
1806
1807static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1808{
1809        for ( ; i; i = i->next) {
1810                if (i->ifindex == dev->ifindex)
1811                        packet_dev_mc(dev, i, what);
1812        }
1813}
1814
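    /*
     * Add a membership under RTNL.  Entries are reference counted, so a
     * duplicate request only bumps the count of the existing element.
     */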
1815static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1816{
1817        struct packet_sock *po = pkt_sk(sk);
1818        struct packet_mclist *ml, *i;
1819        struct net_device *dev;
1820        int err;
1821
1822        rtnl_lock();
1823
1824        err = -ENODEV;
1825        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1826        if (!dev)
1827                goto done;
1828
1829        err = -EINVAL;
1830        if (mreq->mr_alen > dev->addr_len)
1831                goto done;
1832
1833        err = -ENOBUFS;
1834        i = kmalloc(sizeof(*i), GFP_KERNEL);
1835        if (i == NULL)
1836                goto done;
1837
1838        err = 0;
1839        for (ml = po->mclist; ml; ml = ml->next) {
1840                if (ml->ifindex == mreq->mr_ifindex &&
1841                    ml->type == mreq->mr_type &&
1842                    ml->alen == mreq->mr_alen &&
1843                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1844                        ml->count++;
1845                        /* Free the new element ... */
1846                        kfree(i);
1847                        goto done;
1848                }
1849        }
1850
1851        i->type = mreq->mr_type;
1852        i->ifindex = mreq->mr_ifindex;
1853        i->alen = mreq->mr_alen;
1854        memcpy(i->addr, mreq->mr_address, i->alen);
1855        i->count = 1;
1856        i->next = po->mclist;
1857        po->mclist = i;
1858        err = packet_dev_mc(dev, i, 1);
1859        if (err) {
1860                po->mclist = i->next;
1861                kfree(i);
1862        }
1863
1864done:
1865        rtnl_unlock();
1866        return err;
1867}
1868
1869static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1870{
1871        struct packet_mclist *ml, **mlp;
1872
1873        rtnl_lock();
1874
1875        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1876                if (ml->ifindex == mreq->mr_ifindex &&
1877                    ml->type == mreq->mr_type &&
1878                    ml->alen == mreq->mr_alen &&
1879                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1880                        if (--ml->count == 0) {
1881                                struct net_device *dev;
1882                                *mlp = ml->next;
1883                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1884                                if (dev)
1885                                        packet_dev_mc(dev, ml, -1);
1886                                kfree(ml);
1887                        }
1888                        rtnl_unlock();
1889                        return 0;
1890                }
1891        }
1892        rtnl_unlock();
1893        return -EADDRNOTAVAIL;
1894}
1895
1896static void packet_flush_mclist(struct sock *sk)
1897{
1898        struct packet_sock *po = pkt_sk(sk);
1899        struct packet_mclist *ml;
1900
1901        if (!po->mclist)
1902                return;
1903
1904        rtnl_lock();
1905        while ((ml = po->mclist) != NULL) {
1906                struct net_device *dev;
1907
1908                po->mclist = ml->next;
1909                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1910                if (dev != NULL)
1911                        packet_dev_mc(dev, ml, -1);
1912                kfree(ml);
1913        }
1914        rtnl_unlock();
1915}
1916
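    /*
     * setsockopt() handler for SOL_PACKET.  The simple integer options
     * all follow the same userspace pattern (a sketch; 'fd' is a packet
     * socket):
     *
     *	int one = 1;
     *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
     */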
1917static int
1918packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1919{
1920        struct sock *sk = sock->sk;
1921        struct packet_sock *po = pkt_sk(sk);
1922        int ret;
1923
1924        if (level != SOL_PACKET)
1925                return -ENOPROTOOPT;
1926
1927        switch (optname) {
1928        case PACKET_ADD_MEMBERSHIP:
1929        case PACKET_DROP_MEMBERSHIP:
1930        {
1931                struct packet_mreq_max mreq;
1932                int len = optlen;
1933                memset(&mreq, 0, sizeof(mreq));
1934                if (len < sizeof(struct packet_mreq))
1935                        return -EINVAL;
1936                if (len > sizeof(mreq))
1937                        len = sizeof(mreq);
1938                if (copy_from_user(&mreq, optval, len))
1939                        return -EFAULT;
1940                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1941                        return -EINVAL;
1942                if (optname == PACKET_ADD_MEMBERSHIP)
1943                        ret = packet_mc_add(sk, &mreq);
1944                else
1945                        ret = packet_mc_drop(sk, &mreq);
1946                return ret;
1947        }
1948
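            /*
             * Ring geometry arrives as a struct tpacket_req.  A userspace
             * sketch with illustrative sizes that satisfy the checks in
             * packet_set_ring() (block size a multiple of PAGE_SIZE,
             * frame count consistent with the block count):
             *
             *	struct tpacket_req req = {
             *		.tp_block_size = 4096,
             *		.tp_frame_size = 2048,
             *		.tp_block_nr   = 64,
             *		.tp_frame_nr   = 128,
             *	};
             *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req,
             *		   sizeof(req));
             *
             * tp_frame_nr must equal frames-per-block * tp_block_nr,
             * (4096 / 2048) * 64 == 128 here.
             */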
1949        case PACKET_RX_RING:
1950        case PACKET_TX_RING:
1951        {
1952                struct tpacket_req req;
1953
1954                if (optlen < sizeof(req))
1955                        return -EINVAL;
1956                if (pkt_sk(sk)->has_vnet_hdr)
1957                        return -EINVAL;
1958                if (copy_from_user(&req, optval, sizeof(req)))
1959                        return -EFAULT;
1960                return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1961        }
1962        case PACKET_COPY_THRESH:
1963        {
1964                int val;
1965
1966                if (optlen != sizeof(val))
1967                        return -EINVAL;
1968                if (copy_from_user(&val, optval, sizeof(val)))
1969                        return -EFAULT;
1970
1971                pkt_sk(sk)->copy_thresh = val;
1972                return 0;
1973        }
1974        case PACKET_VERSION:
1975        {
1976                int val;
1977
1978                if (optlen != sizeof(val))
1979                        return -EINVAL;
1980                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1981                        return -EBUSY;
1982                if (copy_from_user(&val, optval, sizeof(val)))
1983                        return -EFAULT;
1984                switch (val) {
1985                case TPACKET_V1:
1986                case TPACKET_V2:
1987                        po->tp_version = val;
1988                        return 0;
1989                default:
1990                        return -EINVAL;
1991                }
1992        }
1993        case PACKET_RESERVE:
1994        {
1995                unsigned int val;
1996
1997                if (optlen != sizeof(val))
1998                        return -EINVAL;
1999                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2000                        return -EBUSY;
2001                if (copy_from_user(&val, optval, sizeof(val)))
2002                        return -EFAULT;
2003                po->tp_reserve = val;
2004                return 0;
2005        }
2006        case PACKET_LOSS:
2007        {
2008                unsigned int val;
2009
2010                if (optlen != sizeof(val))
2011                        return -EINVAL;
2012                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2013                        return -EBUSY;
2014                if (copy_from_user(&val, optval, sizeof(val)))
2015                        return -EFAULT;
2016                po->tp_loss = !!val;
2017                return 0;
2018        }
2019        case PACKET_AUXDATA:
2020        {
2021                int val;
2022
2023                if (optlen < sizeof(val))
2024                        return -EINVAL;
2025                if (copy_from_user(&val, optval, sizeof(val)))
2026                        return -EFAULT;
2027
2028                po->auxdata = !!val;
2029                return 0;
2030        }
2031        case PACKET_ORIGDEV:
2032        {
2033                int val;
2034
2035                if (optlen < sizeof(val))
2036                        return -EINVAL;
2037                if (copy_from_user(&val, optval, sizeof(val)))
2038                        return -EFAULT;
2039
2040                po->origdev = !!val;
2041                return 0;
2042        }
2043        case PACKET_VNET_HDR:
2044        {
2045                int val;
2046
2047                if (sock->type != SOCK_RAW)
2048                        return -EINVAL;
2049                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2050                        return -EBUSY;
2051                if (optlen < sizeof(val))
2052                        return -EINVAL;
2053                if (copy_from_user(&val, optval, sizeof(val)))
2054                        return -EFAULT;
2055
2056                po->has_vnet_hdr = !!val;
2057                return 0;
2058        }
2059        case PACKET_TIMESTAMP:
2060        {
2061                int val;
2062
2063                if (optlen != sizeof(val))
2064                        return -EINVAL;
2065                if (copy_from_user(&val, optval, sizeof(val)))
2066                        return -EFAULT;
2067
2068                po->tp_tstamp = val;
2069                return 0;
2070        }
2071        default:
2072                return -ENOPROTOOPT;
2073        }
2074}
2075
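    /*
     * getsockopt() handler for SOL_PACKET.  PACKET_HDRLEN is the odd one
     * out: it reads the queried TPACKET version *from* optval and
     * answers with the matching header length.
     */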
2076static int packet_getsockopt(struct socket *sock, int level, int optname,
2077                             char __user *optval, int __user *optlen)
2078{
2079        int len;
2080        int val;
2081        struct sock *sk = sock->sk;
2082        struct packet_sock *po = pkt_sk(sk);
2083        void *data;
2084        struct tpacket_stats st;
2085
2086        if (level != SOL_PACKET)
2087                return -ENOPROTOOPT;
2088
2089        if (get_user(len, optlen))
2090                return -EFAULT;
2091
2092        if (len < 0)
2093                return -EINVAL;
2094
2095        switch (optname) {
2096        case PACKET_STATISTICS:
2097                if (len > sizeof(struct tpacket_stats))
2098                        len = sizeof(struct tpacket_stats);
2099                spin_lock_bh(&sk->sk_receive_queue.lock);
2100                st = po->stats;
2101                memset(&po->stats, 0, sizeof(st));
2102                spin_unlock_bh(&sk->sk_receive_queue.lock);
2103                st.tp_packets += st.tp_drops;
2104
2105                data = &st;
2106                break;
2107        case PACKET_AUXDATA:
2108                if (len > sizeof(int))
2109                        len = sizeof(int);
2110                val = po->auxdata;
2111
2112                data = &val;
2113                break;
2114        case PACKET_ORIGDEV:
2115                if (len > sizeof(int))
2116                        len = sizeof(int);
2117                val = po->origdev;
2118
2119                data = &val;
2120                break;
2121        case PACKET_VNET_HDR:
2122                if (len > sizeof(int))
2123                        len = sizeof(int);
2124                val = po->has_vnet_hdr;
2125
2126                data = &val;
2127                break;
2128        case PACKET_VERSION:
2129                if (len > sizeof(int))
2130                        len = sizeof(int);
2131                val = po->tp_version;
2132                data = &val;
2133                break;
2134        case PACKET_HDRLEN:
2135                if (len > sizeof(int))
2136                        len = sizeof(int);
2137                if (copy_from_user(&val, optval, len))
2138                        return -EFAULT;
2139                switch (val) {
2140                case TPACKET_V1:
2141                        val = sizeof(struct tpacket_hdr);
2142                        break;
2143                case TPACKET_V2:
2144                        val = sizeof(struct tpacket2_hdr);
2145                        break;
2146                default:
2147                        return -EINVAL;
2148                }
2149                data = &val;
2150                break;
2151        case PACKET_RESERVE:
2152                if (len > sizeof(unsigned int))
2153                        len = sizeof(unsigned int);
2154                val = po->tp_reserve;
2155                data = &val;
2156                break;
2157        case PACKET_LOSS:
2158                if (len > sizeof(unsigned int))
2159                        len = sizeof(unsigned int);
2160                val = po->tp_loss;
2161                data = &val;
2162                break;
2163        case PACKET_TIMESTAMP:
2164                if (len > sizeof(int))
2165                        len = sizeof(int);
2166                val = po->tp_tstamp;
2167                data = &val;
2168                break;
2169        default:
2170                return -ENOPROTOOPT;
2171        }
2172
2173        if (put_user(len, optlen))
2174                return -EFAULT;
2175        if (copy_to_user(optval, data, len))
2176                return -EFAULT;
2177        return 0;
2178}
2179
2180
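    /*
     * Netdevice notifier: drop memberships and unhook on
     * NETDEV_UNREGISTER, raise ENETDOWN when the bound device goes
     * down, and rehook when it comes back up.
     */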
2181static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2182{
2183        struct sock *sk;
2184        struct hlist_node *node;
2185        struct net_device *dev = data;
2186        struct net *net = dev_net(dev);
2187
2188        rcu_read_lock();
2189        sk_for_each_rcu(sk, node, &net->packet.sklist) {
2190                struct packet_sock *po = pkt_sk(sk);
2191
2192                switch (msg) {
2193                case NETDEV_UNREGISTER:
2194                        if (po->mclist)
2195                                packet_dev_mclist(dev, po->mclist, -1);
2196                        /* fallthrough */
2197
2198                case NETDEV_DOWN:
2199                        if (dev->ifindex == po->ifindex) {
2200                                spin_lock(&po->bind_lock);
2201                                if (po->running) {
2202                                        __dev_remove_pack(&po->prot_hook);
2203                                        __sock_put(sk);
2204                                        po->running = 0;
2205                                        sk->sk_err = ENETDOWN;
2206                                        if (!sock_flag(sk, SOCK_DEAD))
2207                                                sk->sk_error_report(sk);
2208                                }
2209                                if (msg == NETDEV_UNREGISTER) {
2210                                        po->ifindex = -1;
2211                                        po->prot_hook.dev = NULL;
2212                                }
2213                                spin_unlock(&po->bind_lock);
2214                        }
2215                        break;
2216                case NETDEV_UP:
2217                        if (dev->ifindex == po->ifindex) {
2218                                spin_lock(&po->bind_lock);
2219                                if (po->num && !po->running) {
2220                                        dev_add_pack(&po->prot_hook);
2221                                        sock_hold(sk);
2222                                        po->running = 1;
2223                                }
2224                                spin_unlock(&po->bind_lock);
2225                        }
2226                        break;
2227                }
2228        }
2229        rcu_read_unlock();
2230        return NOTIFY_DONE;
2231}
2232
2233
2234static int packet_ioctl(struct socket *sock, unsigned int cmd,
2235                        unsigned long arg)
2236{
2237        struct sock *sk = sock->sk;
2238
2239        switch (cmd) {
2240        case SIOCOUTQ:
2241        {
2242                int amount = sk_wmem_alloc_get(sk);
2243
2244                return put_user(amount, (int __user *)arg);
2245        }
2246        case SIOCINQ:
2247        {
2248                struct sk_buff *skb;
2249                int amount = 0;
2250
2251                spin_lock_bh(&sk->sk_receive_queue.lock);
2252                skb = skb_peek(&sk->sk_receive_queue);
2253                if (skb)
2254                        amount = skb->len;
2255                spin_unlock_bh(&sk->sk_receive_queue.lock);
2256                return put_user(amount, (int __user *)arg);
2257        }
2258        case SIOCGSTAMP:
2259                return sock_get_timestamp(sk, (struct timeval __user *)arg);
2260        case SIOCGSTAMPNS:
2261                return sock_get_timestampns(sk, (struct timespec __user *)arg);
2262
2263#ifdef CONFIG_INET
2264        case SIOCADDRT:
2265        case SIOCDELRT:
2266        case SIOCDARP:
2267        case SIOCGARP:
2268        case SIOCSARP:
2269        case SIOCGIFADDR:
2270        case SIOCSIFADDR:
2271        case SIOCGIFBRDADDR:
2272        case SIOCSIFBRDADDR:
2273        case SIOCGIFNETMASK:
2274        case SIOCSIFNETMASK:
2275        case SIOCGIFDSTADDR:
2276        case SIOCSIFDSTADDR:
2277        case SIOCSIFFLAGS:
2278                return inet_dgram_ops.ioctl(sock, cmd, arg);
2279#endif
2280
2281        default:
2282                return -ENOIOCTLCMD;
2283        }
2284        return 0;
2285}
2286
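    /*
     * Start from the generic datagram poll and fold in ring state:
     * readable once the previous rx frame is no longer owned by the
     * kernel, writable while the current tx frame is still available.
     */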
2287static unsigned int packet_poll(struct file *file, struct socket *sock,
2288                                poll_table *wait)
2289{
2290        struct sock *sk = sock->sk;
2291        struct packet_sock *po = pkt_sk(sk);
2292        unsigned int mask = datagram_poll(file, sock, wait);
2293
2294        spin_lock_bh(&sk->sk_receive_queue.lock);
2295        if (po->rx_ring.pg_vec) {
2296                if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2297                        mask |= POLLIN | POLLRDNORM;
2298        }
2299        spin_unlock_bh(&sk->sk_receive_queue.lock);
2300        spin_lock_bh(&sk->sk_write_queue.lock);
2301        if (po->tx_ring.pg_vec) {
2302                if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2303                        mask |= POLLOUT | POLLWRNORM;
2304        }
2305        spin_unlock_bh(&sk->sk_write_queue.lock);
2306        return mask;
2307}
2308
2309
2310/* Dirty? Well, I have still not found a better way to account
2311 * for user mmaps.
2312 */
2313
2314static void packet_mm_open(struct vm_area_struct *vma)
2315{
2316        struct file *file = vma->vm_file;
2317        struct socket *sock = file->private_data;
2318        struct sock *sk = sock->sk;
2319
2320        if (sk)
2321                atomic_inc(&pkt_sk(sk)->mapped);
2322}
2323
2324static void packet_mm_close(struct vm_area_struct *vma)
2325{
2326        struct file *file = vma->vm_file;
2327        struct socket *sock = file->private_data;
2328        struct sock *sk = sock->sk;
2329
2330        if (sk)
2331                atomic_dec(&pkt_sk(sk)->mapped);
2332}
2333
2334static const struct vm_operations_struct packet_mmap_ops = {
2335        .open   =       packet_mm_open,
2336        .close  =       packet_mm_close,
2337};
2338
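    /*
     * Ring blocks come either from the page allocator or from vmalloc
     * (see alloc_one_pg_vec_page), so free each block according to its
     * origin.
     */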
2339static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2340                        unsigned int len)
2341{
2342        int i;
2343
2344        for (i = 0; i < len; i++) {
2345                if (likely(pg_vec[i].buffer)) {
2346                        if (is_vmalloc_addr(pg_vec[i].buffer))
2347                                vfree(pg_vec[i].buffer);
2348                        else
2349                                free_pages((unsigned long)pg_vec[i].buffer,
2350                                           order);
2351                        pg_vec[i].buffer = NULL;
2352                }
2353        }
2354        kfree(pg_vec);
2355}
2356
2357static inline char *alloc_one_pg_vec_page(unsigned long order)
2358{
2359        char *buffer = NULL;
2360        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2361                          __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2362
2363        buffer = (char *) __get_free_pages(gfp_flags, order);
2364
2365        if (buffer)
2366                return buffer;
2367
2368        /*
2369         * __get_free_pages failed, fall back to vmalloc
2370         */
2371        buffer = vzalloc((1 << order) * PAGE_SIZE);
2372
2373        if (buffer)
2374                return buffer;
2375
2376        /*
2377         * vmalloc failed, let's dig into swap here
2378         */
2379        gfp_flags &= ~__GFP_NORETRY;
2380        buffer = (char *)__get_free_pages(gfp_flags, order);
2381        if (buffer)
2382                return buffer;
2383
2384        /*
2385         * complete and utter failure
2386         */
2387        return NULL;
2388}
2389
2390static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
2391{
2392        unsigned int block_nr = req->tp_block_nr;
2393        struct pgv *pg_vec;
2394        int i;
2395
2396        pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
2397        if (unlikely(!pg_vec))
2398                goto out;
2399
2400        for (i = 0; i < block_nr; i++) {
2401                pg_vec[i].buffer = alloc_one_pg_vec_page(order);
2402                if (unlikely(!pg_vec[i].buffer))
2403                        goto out_free_pgvec;
2404        }
2405
2406out:
2407        return pg_vec;
2408
2409out_free_pgvec:
2410        free_pg_vec(pg_vec, order, block_nr);
2411        pg_vec = NULL;
2412        goto out;
2413}
2414
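    /*
     * Install or tear down a ring: validate the geometry, allocate the
     * new block vector, quiesce the socket (unhook plus
     * synchronize_net), swap the vectors under the queue lock, then
     * rehook and free the old vector.
     */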
2415static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2416                int closing, int tx_ring)
2417{
2418        struct pgv *pg_vec = NULL;
2419        struct packet_sock *po = pkt_sk(sk);
2420        int was_running, order = 0;
2421        struct packet_ring_buffer *rb;
2422        struct sk_buff_head *rb_queue;
2423        __be16 num;
2424        int err;
2425
2426        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2427        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2428
2429        err = -EBUSY;
2430        if (!closing) {
2431                if (atomic_read(&po->mapped))
2432                        goto out;
2433                if (atomic_read(&rb->pending))
2434                        goto out;
2435        }
2436
2437        if (req->tp_block_nr) {
2438                /* Sanity tests and some calculations */
2439                err = -EBUSY;
2440                if (unlikely(rb->pg_vec))
2441                        goto out;
2442
2443                switch (po->tp_version) {
2444                case TPACKET_V1:
2445                        po->tp_hdrlen = TPACKET_HDRLEN;
2446                        break;
2447                case TPACKET_V2:
2448                        po->tp_hdrlen = TPACKET2_HDRLEN;
2449                        break;
2450                }
2451
2452                err = -EINVAL;
2453                if (unlikely((int)req->tp_block_size <= 0))
2454                        goto out;
2455                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2456                        goto out;
2457                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2458                                        po->tp_reserve))
2459                        goto out;
2460                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2461                        goto out;
2462
2463                rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2464                if (unlikely(rb->frames_per_block <= 0))
2465                        goto out;
2466                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2467                                        req->tp_frame_nr))
2468                        goto out;
2469
2470                err = -ENOMEM;
2471                order = get_order(req->tp_block_size);
2472                pg_vec = alloc_pg_vec(req, order);
2473                if (unlikely(!pg_vec))
2474                        goto out;
2475        } else {
2476                /* tp_block_nr == 0 requests teardown, so no frames either */
2478                err = -EINVAL;
2479                if (unlikely(req->tp_frame_nr))
2480                        goto out;
2481        }
2482
2483        lock_sock(sk);
2484
2485        /* Detach socket from network */
2486        spin_lock(&po->bind_lock);
2487        was_running = po->running;
2488        num = po->num;
2489        if (was_running) {
2490                __dev_remove_pack(&po->prot_hook);
2491                po->num = 0;
2492                po->running = 0;
2493                __sock_put(sk);
2494        }
2495        spin_unlock(&po->bind_lock);
2496
2497        synchronize_net();
2498
2499        err = -EBUSY;
2500        mutex_lock(&po->pg_vec_lock);
2501        if (closing || atomic_read(&po->mapped) == 0) {
2502                err = 0;
2503                spin_lock_bh(&rb_queue->lock);
2504                swap(rb->pg_vec, pg_vec);
2505                rb->frame_max = (req->tp_frame_nr - 1);
2506                rb->head = 0;
2507                rb->frame_size = req->tp_frame_size;
2508                spin_unlock_bh(&rb_queue->lock);
2509
2510                swap(rb->pg_vec_order, order);
2511                swap(rb->pg_vec_len, req->tp_block_nr);
2512
2513                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2514                po->prot_hook.func = (po->rx_ring.pg_vec) ?
2515                                                tpacket_rcv : packet_rcv;
2516                skb_queue_purge(rb_queue);
2517                if (atomic_read(&po->mapped))
2518                        pr_err("packet_mmap: vma is busy: %d\n",
2519                               atomic_read(&po->mapped));
2520        }
2521        mutex_unlock(&po->pg_vec_lock);
2522
2523        spin_lock(&po->bind_lock);
2524        if (was_running && !po->running) {
2525                sock_hold(sk);
2526                po->running = 1;
2527                po->num = num;
2528                dev_add_pack(&po->prot_hook);
2529        }
2530        spin_unlock(&po->bind_lock);
2531
2532        release_sock(sk);
2533
2534        if (pg_vec)
2535                free_pg_vec(pg_vec, order, req->tp_block_nr);
2536out:
2537        return err;
2538}
2539
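    /*
     * Map the rings into the caller's address space, rx ring first with
     * the tx ring directly behind it.  The mapping must start at offset
     * 0 and cover both rings exactly; roughly, from userspace (a
     * sketch):
     *
     *	mmap(NULL, rx_size + tx_size, PROT_READ | PROT_WRITE,
     *	     MAP_SHARED, fd, 0);
     */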
2540static int packet_mmap(struct file *file, struct socket *sock,
2541                struct vm_area_struct *vma)
2542{
2543        struct sock *sk = sock->sk;
2544        struct packet_sock *po = pkt_sk(sk);
2545        unsigned long size, expected_size;
2546        struct packet_ring_buffer *rb;
2547        unsigned long start;
2548        int err = -EINVAL;
2549        int i;
2550
2551        if (vma->vm_pgoff)
2552                return -EINVAL;
2553
2554        mutex_lock(&po->pg_vec_lock);
2555
2556        expected_size = 0;
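            /* rx_ring and tx_ring are adjacent members, so one loop walks both */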
2557        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2558                if (rb->pg_vec) {
2559                        expected_size += rb->pg_vec_len
2560                                                * rb->pg_vec_pages
2561                                                * PAGE_SIZE;
2562                }
2563        }
2564
2565        if (expected_size == 0)
2566                goto out;
2567
2568        size = vma->vm_end - vma->vm_start;
2569        if (size != expected_size)
2570                goto out;
2571
2572        start = vma->vm_start;
2573        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2574                if (rb->pg_vec == NULL)
2575                        continue;
2576
2577                for (i = 0; i < rb->pg_vec_len; i++) {
2578                        struct page *page;
2579                        void *kaddr = rb->pg_vec[i].buffer;
2580                        int pg_num;
2581
2582                        for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
2583                                page = pgv_to_page(kaddr);
2584                                err = vm_insert_page(vma, start, page);
2585                                if (unlikely(err))
2586                                        goto out;
2587                                start += PAGE_SIZE;
2588                                kaddr += PAGE_SIZE;
2589                        }
2590                }
2591        }
2592
2593        atomic_inc(&po->mapped);
2594        vma->vm_ops = &packet_mmap_ops;
2595        err = 0;
2596
2597out:
2598        mutex_unlock(&po->pg_vec_lock);
2599        return err;
2600}
2601
2602static const struct proto_ops packet_ops_spkt = {
2603        .family =       PF_PACKET,
2604        .owner =        THIS_MODULE,
2605        .release =      packet_release,
2606        .bind =         packet_bind_spkt,
2607        .connect =      sock_no_connect,
2608        .socketpair =   sock_no_socketpair,
2609        .accept =       sock_no_accept,
2610        .getname =      packet_getname_spkt,
2611        .poll =         datagram_poll,
2612        .ioctl =        packet_ioctl,
2613        .listen =       sock_no_listen,
2614        .shutdown =     sock_no_shutdown,
2615        .setsockopt =   sock_no_setsockopt,
2616        .getsockopt =   sock_no_getsockopt,
2617        .sendmsg =      packet_sendmsg_spkt,
2618        .recvmsg =      packet_recvmsg,
2619        .mmap =         sock_no_mmap,
2620        .sendpage =     sock_no_sendpage,
2621};
2622
2623static const struct proto_ops packet_ops = {
2624        .family =       PF_PACKET,
2625        .owner =        THIS_MODULE,
2626        .release =      packet_release,
2627        .bind =         packet_bind,
2628        .connect =      sock_no_connect,
2629        .socketpair =   sock_no_socketpair,
2630        .accept =       sock_no_accept,
2631        .getname =      packet_getname,
2632        .poll =         packet_poll,
2633        .ioctl =        packet_ioctl,
2634        .listen =       sock_no_listen,
2635        .shutdown =     sock_no_shutdown,
2636        .setsockopt =   packet_setsockopt,
2637        .getsockopt =   packet_getsockopt,
2638        .sendmsg =      packet_sendmsg,
2639        .recvmsg =      packet_recvmsg,
2640        .mmap =         packet_mmap,
2641        .sendpage =     sock_no_sendpage,
2642};
2643
2644static const struct net_proto_family packet_family_ops = {
2645        .family =       PF_PACKET,
2646        .create =       packet_create,
2647        .owner  =       THIS_MODULE,
2648};
2649
2650static struct notifier_block packet_netdev_notifier = {
2651        .notifier_call =        packet_notifier,
2652};
2653
2654#ifdef CONFIG_PROC_FS
2655
2656static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2657        __acquires(RCU)
2658{
2659        struct net *net = seq_file_net(seq);
2660
2661        rcu_read_lock();
2662        return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2663}
2664
2665static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2666{
2667        struct net *net = seq_file_net(seq);
2668        return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2669}
2670
2671static void packet_seq_stop(struct seq_file *seq, void *v)
2672        __releases(RCU)
2673{
2674        rcu_read_unlock();
2675}
2676
2677static int packet_seq_show(struct seq_file *seq, void *v)
2678{
2679        if (v == SEQ_START_TOKEN)
2680                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2681        else {
2682                struct sock *s = sk_entry(v);
2683                const struct packet_sock *po = pkt_sk(s);
2684
2685                seq_printf(seq,
2686                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2687                           s,
2688                           atomic_read(&s->sk_refcnt),
2689                           s->sk_type,
2690                           ntohs(po->num),
2691                           po->ifindex,
2692                           po->running,
2693                           atomic_read(&s->sk_rmem_alloc),
2694                           sock_i_uid(s),
2695                           sock_i_ino(s));
2696        }
2697
2698        return 0;
2699}
2700
2701static const struct seq_operations packet_seq_ops = {
2702        .start  = packet_seq_start,
2703        .next   = packet_seq_next,
2704        .stop   = packet_seq_stop,
2705        .show   = packet_seq_show,
2706};
2707
2708static int packet_seq_open(struct inode *inode, struct file *file)
2709{
2710        return seq_open_net(inode, file, &packet_seq_ops,
2711                            sizeof(struct seq_net_private));
2712}
2713
2714static const struct file_operations packet_seq_fops = {
2715        .owner          = THIS_MODULE,
2716        .open           = packet_seq_open,
2717        .read           = seq_read,
2718        .llseek         = seq_lseek,
2719        .release        = seq_release_net,
2720};
2721
2722#endif
2723
2724static int __net_init packet_net_init(struct net *net)
2725{
2726        spin_lock_init(&net->packet.sklist_lock);
2727        INIT_HLIST_HEAD(&net->packet.sklist);
2728
2729        if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2730                return -ENOMEM;
2731
2732        return 0;
2733}
2734
2735static void __net_exit packet_net_exit(struct net *net)
2736{
2737        proc_net_remove(net, "packet");
2738}
2739
2740static struct pernet_operations packet_net_ops = {
2741        .init = packet_net_init,
2742        .exit = packet_net_exit,
2743};
2744
2745
2746static void __exit packet_exit(void)
2747{
2748        unregister_netdevice_notifier(&packet_netdev_notifier);
2749        unregister_pernet_subsys(&packet_net_ops);
2750        sock_unregister(PF_PACKET);
2751        proto_unregister(&packet_proto);
2752}
2753
2754static int __init packet_init(void)
2755{
2756        int rc = proto_register(&packet_proto, 0);
2757
2758        if (rc != 0)
2759                goto out;
2760
2761        sock_register(&packet_family_ops);
2762        register_pernet_subsys(&packet_net_ops);
2763        register_netdevice_notifier(&packet_netdev_notifier);
2764out:
2765        return rc;
2766}
2767
2768module_init(packet_init);
2769module_exit(packet_exit);
2770MODULE_LICENSE("GPL");
2771MODULE_ALIAS_NETPROTO(PF_PACKET);
2772