linux/net/packet/af_packet.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              PACKET - implements raw packet sockets.
   8 *
   9 * Authors:     Ross Biro
  10 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *
  13 * Fixes:
  14 *              Alan Cox        :       verify_area() now used correctly
  15 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
  16 *              Alan Cox        :       tidied skbuff lists.
  17 *              Alan Cox        :       Now uses generic datagram routines I
  18 *                                      added. Also fixed the peek/read crash
  19 *                                      from all old Linux datagram code.
  20 *              Alan Cox        :       Uses the improved datagram code.
  21 *              Alan Cox        :       Added NULL's for socket options.
  22 *              Alan Cox        :       Re-commented the code.
  23 *              Alan Cox        :       Use new kernel side addressing
  24 *              Rob Janssen     :       Correct MTU usage.
  25 *              Dave Platt      :       Counter leaks caused by incorrect
  26 *                                      interrupt locking and some slightly
  27 *                                      dubious gcc output. Can you read
  28 *                                      compiler: it said _VOLATILE_
  29 *      Richard Kooijman        :       Timestamp fixes.
  30 *              Alan Cox        :       New buffers. Use sk->mac.raw.
  31 *              Alan Cox        :       sendmsg/recvmsg support.
  32 *              Alan Cox        :       Protocol setting support
  33 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
  34 *      Cyrus Durgin            :       Fixed kerneld for kmod.
  35 *      Michal Ostrowski        :       Module initialization cleanup.
  36 *         Ulises Alonso        :       Frame number limit removal and
  37 *                                      packet_set_ring memory leak.
  38 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
  39 *                                      The convention is that longer addresses
  40 *                                      will simply extend the hardware address
  41 *                                      byte arrays at the end of sockaddr_ll
  42 *                                      and packet_mreq.
  43 *              Johann Baudy    :       Added TX RING.
  44 *              Chetan Loke     :       Implemented TPACKET_V3 block abstraction
  45 *                                      layer.
  46 *                                      Copyright (C) 2011, <lokec@ccs.neu.edu>
  47 */
  48
  49#include <linux/types.h>
  50#include <linux/mm.h>
  51#include <linux/capability.h>
  52#include <linux/fcntl.h>
  53#include <linux/socket.h>
  54#include <linux/in.h>
  55#include <linux/inet.h>
  56#include <linux/netdevice.h>
  57#include <linux/if_packet.h>
  58#include <linux/wireless.h>
  59#include <linux/kernel.h>
  60#include <linux/kmod.h>
  61#include <linux/slab.h>
  62#include <linux/vmalloc.h>
  63#include <net/net_namespace.h>
  64#include <net/ip.h>
  65#include <net/protocol.h>
  66#include <linux/skbuff.h>
  67#include <net/sock.h>
  68#include <linux/errno.h>
  69#include <linux/timer.h>
  70#include <linux/uaccess.h>
  71#include <asm/ioctls.h>
  72#include <asm/page.h>
  73#include <asm/cacheflush.h>
  74#include <asm/io.h>
  75#include <linux/proc_fs.h>
  76#include <linux/seq_file.h>
  77#include <linux/poll.h>
  78#include <linux/module.h>
  79#include <linux/init.h>
  80#include <linux/mutex.h>
  81#include <linux/if_vlan.h>
  82#include <linux/virtio_net.h>
  83#include <linux/errqueue.h>
  84#include <linux/net_tstamp.h>
  85#include <linux/percpu.h>
  86#ifdef CONFIG_INET
  87#include <net/inet_common.h>
  88#endif
  89#include <linux/bpf.h>
  90#include <net/compat.h>
  91
  92#include "internal.h"
  93
  94/*
  95   Assumptions:
   96   - if the device has no dev->hard_header routine, it adds and removes the ll
   97     header inside itself. In this case the ll header is invisible outside
   98     of the device, but higher levels should still reserve dev->hard_header_len.
   99     Some devices are clever enough to reallocate the skb when the header
  100     will not fit in the reserved space (tunnels); others are not so
  101     clever (PPP).
  102   - a packet socket receives packets with the ll header already pulled,
  103     so SOCK_RAW should push it back.
 104
 105On receive:
 106-----------
 107
 108Incoming, dev->hard_header!=NULL
 109   mac_header -> ll header
 110   data       -> data
 111
 112Outgoing, dev->hard_header!=NULL
 113   mac_header -> ll header
 114   data       -> ll header
 115
 116Incoming, dev->hard_header==NULL
  117   mac_header -> UNKNOWN position. It very likely points to the ll
  118                 header.  PPP does this, which is wrong, because it
  119                 introduces asymmetry between the rx and tx paths.
 120   data       -> data
 121
 122Outgoing, dev->hard_header==NULL
 123   mac_header -> data. ll header is still not built!
 124   data       -> data
 125
  126Summary
  127  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
 128
 129
 130On transmit:
 131------------
 132
 133dev->hard_header != NULL
 134   mac_header -> ll header
 135   data       -> ll header
 136
 137dev->hard_header == NULL (ll header is added by device, we cannot control it)
 138   mac_header -> data
 139   data       -> data
 140
  141   We should set nh.raw on output to the correct position;
  142   the packet classifier depends on it.
 143 */
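
     /*
      * A rough user-space illustration of the above (not kernel code):
      * the visibility of the ll header depends on the packet socket type.
      *
      *     int raw = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
      *     int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
      *
      * Frames read from 'raw' start at the link-layer (e.g. Ethernet)
      * header, because it has been pushed back; frames read from 'dgr'
      * start at the network header, and the link-layer address is
      * reported through the sockaddr_ll returned by recvfrom() instead.
      */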
 144
 145/* Private packet socket structures. */
 146
 147/* identical to struct packet_mreq except it has
 148 * a longer address field.
 149 */
 150struct packet_mreq_max {
 151        int             mr_ifindex;
 152        unsigned short  mr_type;
 153        unsigned short  mr_alen;
 154        unsigned char   mr_address[MAX_ADDR_LEN];
 155};
 156
 157union tpacket_uhdr {
 158        struct tpacket_hdr  *h1;
 159        struct tpacket2_hdr *h2;
 160        struct tpacket3_hdr *h3;
 161        void *raw;
 162};
 163
 164static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 165                int closing, int tx_ring);
 166
 167#define V3_ALIGNMENT    (8)
 168
 169#define BLK_HDR_LEN     (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
 170
 171#define BLK_PLUS_PRIV(sz_of_priv) \
 172        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
 173
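     /* Accessors for fields of the TPACKET_V3 per-block descriptor: block
      * status, packet count, offset of the first packet, running block
      * length and sequence number (all held in hdr.bh1), plus the optional
      * block-private area that starts right after the aligned block header.
      */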
 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
 175#define BLOCK_NUM_PKTS(x)       ((x)->hdr.bh1.num_pkts)
 176#define BLOCK_O2FP(x)           ((x)->hdr.bh1.offset_to_first_pkt)
 177#define BLOCK_LEN(x)            ((x)->hdr.bh1.blk_len)
 178#define BLOCK_SNUM(x)           ((x)->hdr.bh1.seq_num)
 179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
 180#define BLOCK_PRIV(x)           ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
 181
 182struct packet_sock;
 183static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 184                       struct packet_type *pt, struct net_device *orig_dev);
 185
 186static void *packet_previous_frame(struct packet_sock *po,
 187                struct packet_ring_buffer *rb,
 188                int status);
 189static void packet_increment_head(struct packet_ring_buffer *buff);
 190static int prb_curr_blk_in_use(struct tpacket_block_desc *);
 191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
 192                        struct packet_sock *);
 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
 194                struct packet_sock *, unsigned int status);
 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
 196static void prb_open_block(struct tpacket_kbdq_core *,
 197                struct tpacket_block_desc *);
 198static void prb_retire_rx_blk_timer_expired(struct timer_list *);
 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
 200static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 201static void prb_clear_rxhash(struct tpacket_kbdq_core *,
 202                struct tpacket3_hdr *);
 203static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
 204                struct tpacket3_hdr *);
 205static void packet_flush_mclist(struct sock *sk);
 206static u16 packet_pick_tx_queue(struct sk_buff *skb);
 207
 208struct packet_skb_cb {
 209        union {
 210                struct sockaddr_pkt pkt;
 211                union {
 212                        /* Trick: alias skb original length with
 213                         * ll.sll_family and ll.protocol in order
 214                         * to save room.
 215                         */
 216                        unsigned int origlen;
 217                        struct sockaddr_ll ll;
 218                };
 219        } sa;
 220};
 221
 222#define vio_le() virtio_legacy_is_little_endian()
 223
 224#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
 225
 226#define GET_PBDQC_FROM_RB(x)    ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
 227#define GET_PBLOCK_DESC(x, bid) \
 228        ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
 229#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)       \
 230        ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
 231#define GET_NEXT_PRB_BLK_NUM(x) \
 232        (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
 233        ((x)->kactive_blk_num+1) : 0)
 234
 235static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
 236static void __fanout_link(struct sock *sk, struct packet_sock *po);
 237
 238static int packet_direct_xmit(struct sk_buff *skb)
 239{
 240        return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
 241}
 242
 243static struct net_device *packet_cached_dev_get(struct packet_sock *po)
 244{
 245        struct net_device *dev;
 246
 247        rcu_read_lock();
 248        dev = rcu_dereference(po->cached_dev);
 249        if (likely(dev))
 250                dev_hold(dev);
 251        rcu_read_unlock();
 252
 253        return dev;
 254}
 255
 256static void packet_cached_dev_assign(struct packet_sock *po,
 257                                     struct net_device *dev)
 258{
 259        rcu_assign_pointer(po->cached_dev, dev);
 260}
 261
 262static void packet_cached_dev_reset(struct packet_sock *po)
 263{
 264        RCU_INIT_POINTER(po->cached_dev, NULL);
 265}
 266
 267static bool packet_use_direct_xmit(const struct packet_sock *po)
 268{
 269        return po->xmit == packet_direct_xmit;
 270}
 271
 272static u16 packet_pick_tx_queue(struct sk_buff *skb)
 273{
 274        struct net_device *dev = skb->dev;
 275        const struct net_device_ops *ops = dev->netdev_ops;
 276        int cpu = raw_smp_processor_id();
 277        u16 queue_index;
 278
 279#ifdef CONFIG_XPS
 280        skb->sender_cpu = cpu + 1;
 281#endif
 282        skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
 283        if (ops->ndo_select_queue) {
 284                queue_index = ops->ndo_select_queue(dev, skb, NULL);
 285                queue_index = netdev_cap_txqueue(dev, queue_index);
 286        } else {
 287                queue_index = netdev_pick_tx(dev, skb, NULL);
 288        }
 289
 290        return queue_index;
 291}
 292
 293/* __register_prot_hook must be invoked through register_prot_hook
 294 * or from a context in which asynchronous accesses to the packet
  295 * socket are not possible (packet_create()).
 296 */
 297static void __register_prot_hook(struct sock *sk)
 298{
 299        struct packet_sock *po = pkt_sk(sk);
 300
 301        if (!po->running) {
 302                if (po->fanout)
 303                        __fanout_link(sk, po);
 304                else
 305                        dev_add_pack(&po->prot_hook);
 306
 307                sock_hold(sk);
 308                po->running = 1;
 309        }
 310}
 311
 312static void register_prot_hook(struct sock *sk)
 313{
 314        lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
 315        __register_prot_hook(sk);
 316}
 317
 318/* If the sync parameter is true, we will temporarily drop
 319 * the po->bind_lock and do a synchronize_net to make sure no
 320 * asynchronous packet processing paths still refer to the elements
 321 * of po->prot_hook.  If the sync parameter is false, it is the
 322 * callers responsibility to take care of this.
 323 */
 324static void __unregister_prot_hook(struct sock *sk, bool sync)
 325{
 326        struct packet_sock *po = pkt_sk(sk);
 327
 328        lockdep_assert_held_once(&po->bind_lock);
 329
 330        po->running = 0;
 331
 332        if (po->fanout)
 333                __fanout_unlink(sk, po);
 334        else
 335                __dev_remove_pack(&po->prot_hook);
 336
 337        __sock_put(sk);
 338
 339        if (sync) {
 340                spin_unlock(&po->bind_lock);
 341                synchronize_net();
 342                spin_lock(&po->bind_lock);
 343        }
 344}
 345
 346static void unregister_prot_hook(struct sock *sk, bool sync)
 347{
 348        struct packet_sock *po = pkt_sk(sk);
 349
 350        if (po->running)
 351                __unregister_prot_hook(sk, sync);
 352}
 353
 354static inline struct page * __pure pgv_to_page(void *addr)
 355{
 356        if (is_vmalloc_addr(addr))
 357                return vmalloc_to_page(addr);
 358        return virt_to_page(addr);
 359}
 360
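     /* Ownership of each frame in the mmap()ed ring is handed back and
      * forth through the tp_status word in its header (TP_STATUS_KERNEL
      * vs. TP_STATUS_USER).  The helpers below read/write that word with
      * the memory barriers, and on architectures that need it the
      * flush_dcache_page() calls, required to keep the user mapping
      * coherent.
      */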
 361static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 362{
 363        union tpacket_uhdr h;
 364
 365        h.raw = frame;
 366        switch (po->tp_version) {
 367        case TPACKET_V1:
 368                h.h1->tp_status = status;
 369                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 370                break;
 371        case TPACKET_V2:
 372                h.h2->tp_status = status;
 373                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 374                break;
 375        case TPACKET_V3:
 376                h.h3->tp_status = status;
 377                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
 378                break;
 379        default:
 380                WARN(1, "TPACKET version not supported.\n");
 381                BUG();
 382        }
 383
 384        smp_wmb();
 385}
 386
 387static int __packet_get_status(const struct packet_sock *po, void *frame)
 388{
 389        union tpacket_uhdr h;
 390
 391        smp_rmb();
 392
 393        h.raw = frame;
 394        switch (po->tp_version) {
 395        case TPACKET_V1:
 396                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 397                return h.h1->tp_status;
 398        case TPACKET_V2:
 399                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 400                return h.h2->tp_status;
 401        case TPACKET_V3:
 402                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
 403                return h.h3->tp_status;
 404        default:
 405                WARN(1, "TPACKET version not supported.\n");
 406                BUG();
 407                return 0;
 408        }
 409}
 410
 411static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
 412                                   unsigned int flags)
 413{
 414        struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
 415
 416        if (shhwtstamps &&
 417            (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
 418            ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
 419                return TP_STATUS_TS_RAW_HARDWARE;
 420
 421        if (ktime_to_timespec_cond(skb->tstamp, ts))
 422                return TP_STATUS_TS_SOFTWARE;
 423
 424        return 0;
 425}
 426
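     /* Store the timestamp chosen by tpacket_get_timestamp() in the
      * version-specific frame header: TPACKET_V1 carries microseconds,
      * V2/V3 carry nanoseconds.  The TP_STATUS_TS_* bits are returned so
      * the caller can fold them into tp_status.
      */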
 427static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
 428                                    struct sk_buff *skb)
 429{
 430        union tpacket_uhdr h;
 431        struct timespec ts;
 432        __u32 ts_status;
 433
 434        if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
 435                return 0;
 436
 437        h.raw = frame;
 438        switch (po->tp_version) {
 439        case TPACKET_V1:
 440                h.h1->tp_sec = ts.tv_sec;
 441                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
 442                break;
 443        case TPACKET_V2:
 444                h.h2->tp_sec = ts.tv_sec;
 445                h.h2->tp_nsec = ts.tv_nsec;
 446                break;
 447        case TPACKET_V3:
 448                h.h3->tp_sec = ts.tv_sec;
 449                h.h3->tp_nsec = ts.tv_nsec;
 450                break;
 451        default:
 452                WARN(1, "TPACKET version not supported.\n");
 453                BUG();
 454        }
 455
 456        /* one flush is safe, as both fields always lie on the same cacheline */
 457        flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
 458        smp_wmb();
 459
 460        return ts_status;
 461}
 462
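     /* Translate a frame index into its address inside the pg_vec and
      * return it only if the frame's status matches the requested one
      * (e.g. TP_STATUS_KERNEL for a slot the kernel may still fill).
      */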
 463static void *packet_lookup_frame(const struct packet_sock *po,
 464                                 const struct packet_ring_buffer *rb,
 465                                 unsigned int position,
 466                                 int status)
 467{
 468        unsigned int pg_vec_pos, frame_offset;
 469        union tpacket_uhdr h;
 470
 471        pg_vec_pos = position / rb->frames_per_block;
 472        frame_offset = position % rb->frames_per_block;
 473
 474        h.raw = rb->pg_vec[pg_vec_pos].buffer +
 475                (frame_offset * rb->frame_size);
 476
 477        if (status != __packet_get_status(po, h.raw))
 478                return NULL;
 479
 480        return h.raw;
 481}
 482
 483static void *packet_current_frame(struct packet_sock *po,
 484                struct packet_ring_buffer *rb,
 485                int status)
 486{
 487        return packet_lookup_frame(po, rb, rb->head, status);
 488}
 489
 490static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 491{
 492        del_timer_sync(&pkc->retire_blk_timer);
 493}
 494
 495static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
 496                struct sk_buff_head *rb_queue)
 497{
 498        struct tpacket_kbdq_core *pkc;
 499
 500        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 501
 502        spin_lock_bh(&rb_queue->lock);
 503        pkc->delete_blk_timer = 1;
 504        spin_unlock_bh(&rb_queue->lock);
 505
 506        prb_del_retire_blk_timer(pkc);
 507}
 508
 509static void prb_setup_retire_blk_timer(struct packet_sock *po)
 510{
 511        struct tpacket_kbdq_core *pkc;
 512
 513        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 514        timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
 515                    0);
 516        pkc->retire_blk_timer.expires = jiffies;
 517}
 518
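     /* Derive a block-retire timeout (in msecs) from link speed and block
      * size, used when user space passes tp_retire_blk_tov == 0.  Worked
      * example: for a 1 MiB block on a 1 Gb/s link, mbits = 8 and div = 1,
      * so the function returns 8 + 1 = 9 msecs.  Links slower than 1 Gb/s
      * (or with unknown speed) simply get DEFAULT_PRB_RETIRE_TOV.
      */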
 519static int prb_calc_retire_blk_tmo(struct packet_sock *po,
 520                                int blk_size_in_bytes)
 521{
 522        struct net_device *dev;
 523        unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
 524        struct ethtool_link_ksettings ecmd;
 525        int err;
 526
 527        rtnl_lock();
 528        dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
 529        if (unlikely(!dev)) {
 530                rtnl_unlock();
 531                return DEFAULT_PRB_RETIRE_TOV;
 532        }
 533        err = __ethtool_get_link_ksettings(dev, &ecmd);
 534        rtnl_unlock();
 535        if (!err) {
 536                /*
  537                 * If the link speed is that slow, you don't really
  538                 * need to worry about perf anyway.
 539                 */
 540                if (ecmd.base.speed < SPEED_1000 ||
 541                    ecmd.base.speed == SPEED_UNKNOWN) {
 542                        return DEFAULT_PRB_RETIRE_TOV;
 543                } else {
 544                        msec = 1;
 545                        div = ecmd.base.speed / 1000;
 546                }
 547        } else
 548                return DEFAULT_PRB_RETIRE_TOV;
 549
 550        mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
 551
 552        if (div)
 553                mbits /= div;
 554
 555        tmo = mbits * msec;
 556
 557        if (div)
 558                return tmo+1;
 559        return tmo;
 560}
 561
 562static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
 563                        union tpacket_req_u *req_u)
 564{
 565        p1->feature_req_word = req_u->req3.tp_feature_req_word;
 566}
 567
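     /* init_prb_bdqc() consumes the tpacket_req3 that user space passed to
      * the PACKET_RX_RING setsockopt after selecting TPACKET_V3 through
      * PACKET_VERSION.  As a rough illustrative sketch of the user-space
      * side (not kernel code), such a request might look like:
      *
      *     struct tpacket_req3 req = {
      *             .tp_block_size       = 1 << 20,   // bytes per block
      *             .tp_block_nr         = 8,
      *             .tp_frame_size       = 2048,
      *             .tp_frame_nr         = (1 << 20) / 2048 * 8,
      *             .tp_retire_blk_tov   = 10,        // msecs, 0 = derive
      *             .tp_sizeof_priv      = 0,
      *             .tp_feature_req_word = 0,
      *     };
      *     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
      */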
 568static void init_prb_bdqc(struct packet_sock *po,
 569                        struct packet_ring_buffer *rb,
 570                        struct pgv *pg_vec,
 571                        union tpacket_req_u *req_u)
 572{
 573        struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
 574        struct tpacket_block_desc *pbd;
 575
 576        memset(p1, 0x0, sizeof(*p1));
 577
 578        p1->knxt_seq_num = 1;
 579        p1->pkbdq = pg_vec;
 580        pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
 581        p1->pkblk_start = pg_vec[0].buffer;
 582        p1->kblk_size = req_u->req3.tp_block_size;
 583        p1->knum_blocks = req_u->req3.tp_block_nr;
 584        p1->hdrlen = po->tp_hdrlen;
 585        p1->version = po->tp_version;
 586        p1->last_kactive_blk_num = 0;
 587        po->stats.stats3.tp_freeze_q_cnt = 0;
 588        if (req_u->req3.tp_retire_blk_tov)
 589                p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
 590        else
 591                p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
 592                                                req_u->req3.tp_block_size);
 593        p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
 594        p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 595
 596        p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 597        prb_init_ft_ops(p1, req_u);
 598        prb_setup_retire_blk_timer(po);
 599        prb_open_block(p1, pbd);
 600}
 601
 602/*  Do NOT update the last_blk_num first.
 603 *  Assumes sk_buff_head lock is held.
 604 */
 605static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 606{
 607        mod_timer(&pkc->retire_blk_timer,
 608                        jiffies + pkc->tov_in_jiffies);
 609        pkc->last_kactive_blk_num = pkc->kactive_blk_num;
 610}
 611
 612/*
 613 * Timer logic:
 614 * 1) We refresh the timer only when we open a block.
 615 *    By doing this we don't waste cycles refreshing the timer
  616 *    on a packet-by-packet basis.
 617 *
 618 * With a 1MB block-size, on a 1Gbps line, it will take
 619 * i) ~8 ms to fill a block + ii) memcpy etc.
 620 * In this cut we are not accounting for the memcpy time.
 621 *
 622 * So, if the user sets the 'tmo' to 10ms then the timer
 623 * will never fire while the block is still getting filled
 624 * (which is what we want). However, the user could choose
 625 * to close a block early and that's fine.
 626 *
 627 * But when the timer does fire, we check whether or not to refresh it.
 628 * Since the tmo granularity is in msecs, it is not too expensive
  629 * to refresh the timer, let's say every '8' msecs.
 630 * Either the user can set the 'tmo' or we can derive it based on
 631 * a) line-speed and b) block-size.
 632 * prb_calc_retire_blk_tmo() calculates the tmo.
 633 *
 634 */
 635static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 636{
 637        struct packet_sock *po =
 638                from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
 639        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 640        unsigned int frozen;
 641        struct tpacket_block_desc *pbd;
 642
 643        spin_lock(&po->sk.sk_receive_queue.lock);
 644
 645        frozen = prb_queue_frozen(pkc);
 646        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 647
 648        if (unlikely(pkc->delete_blk_timer))
 649                goto out;
 650
 651        /* We only need to plug the race when the block is partially filled.
 652         * tpacket_rcv:
 653         *              lock(); increment BLOCK_NUM_PKTS; unlock()
 654         *              copy_bits() is in progress ...
 655         *              timer fires on other cpu:
 656         *              we can't retire the current block because copy_bits
 657         *              is in progress.
 658         *
 659         */
 660        if (BLOCK_NUM_PKTS(pbd)) {
 661                while (atomic_read(&pkc->blk_fill_in_prog)) {
 662                        /* Waiting for skb_copy_bits to finish... */
 663                        cpu_relax();
 664                }
 665        }
 666
 667        if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
 668                if (!frozen) {
 669                        if (!BLOCK_NUM_PKTS(pbd)) {
 670                                /* An empty block. Just refresh the timer. */
 671                                goto refresh_timer;
 672                        }
 673                        prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
 674                        if (!prb_dispatch_next_block(pkc, po))
 675                                goto refresh_timer;
 676                        else
 677                                goto out;
 678                } else {
 679                        /* Case 1. Queue was frozen because user-space was
 680                         *         lagging behind.
 681                         */
 682                        if (prb_curr_blk_in_use(pbd)) {
 683                                /*
 684                                 * Ok, user-space is still behind.
 685                                 * So just refresh the timer.
 686                                 */
 687                                goto refresh_timer;
 688                        } else {
  689                               /* Case 2. The queue was frozen, user-space caught
  690                                * up, now the link went idle && the timer fired.
  691                                * We don't have a block to close, so we open this
  692                                * block and restart the timer.
  693                                * Opening a block thaws the queue and restarts the
  694                                * timer; thawing/timer-refresh is a side effect.
 695                                */
 696                                prb_open_block(pkc, pbd);
 697                                goto out;
 698                        }
 699                }
 700        }
 701
 702refresh_timer:
 703        _prb_refresh_rx_retire_blk_timer(pkc);
 704
 705out:
 706        spin_unlock(&po->sk.sk_receive_queue.lock);
 707}
 708
 709static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
 710                struct tpacket_block_desc *pbd1, __u32 status)
 711{
 712        /* Flush everything minus the block header */
 713
 714#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 715        u8 *start, *end;
 716
 717        start = (u8 *)pbd1;
 718
  719        /* Skip the block header (we know the header WILL fit in 4K) */
 720        start += PAGE_SIZE;
 721
 722        end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
 723        for (; start < end; start += PAGE_SIZE)
 724                flush_dcache_page(pgv_to_page(start));
 725
 726        smp_wmb();
 727#endif
 728
 729        /* Now update the block status. */
 730
 731        BLOCK_STATUS(pbd1) = status;
 732
 733        /* Flush the block header */
 734
 735#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 736        start = (u8 *)pbd1;
 737        flush_dcache_page(pgv_to_page(start));
 738
 739        smp_wmb();
 740#endif
 741}
 742
 743/*
 744 * Side effect:
 745 *
 746 * 1) flush the block
 747 * 2) Increment active_blk_num
 748 *
  749 * Note: We deliberately do NOT refresh the timer here,
  750 *       because almost always the next block will be opened.
 751 */
 752static void prb_close_block(struct tpacket_kbdq_core *pkc1,
 753                struct tpacket_block_desc *pbd1,
 754                struct packet_sock *po, unsigned int stat)
 755{
 756        __u32 status = TP_STATUS_USER | stat;
 757
 758        struct tpacket3_hdr *last_pkt;
 759        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
 760        struct sock *sk = &po->sk;
 761
 762        if (atomic_read(&po->tp_drops))
 763                status |= TP_STATUS_LOSING;
 764
 765        last_pkt = (struct tpacket3_hdr *)pkc1->prev;
 766        last_pkt->tp_next_offset = 0;
 767
 768        /* Get the ts of the last pkt */
 769        if (BLOCK_NUM_PKTS(pbd1)) {
 770                h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
 771                h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
 772        } else {
 773                /* Ok, we tmo'd - so get the current time.
 774                 *
 775                 * It shouldn't really happen as we don't close empty
 776                 * blocks. See prb_retire_rx_blk_timer_expired().
 777                 */
 778                struct timespec ts;
 779                getnstimeofday(&ts);
 780                h1->ts_last_pkt.ts_sec = ts.tv_sec;
 781                h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
 782        }
 783
 784        smp_wmb();
 785
 786        /* Flush the block */
 787        prb_flush_block(pkc1, pbd1, status);
 788
 789        sk->sk_data_ready(sk);
 790
 791        pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
 792}
 793
 794static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
 795{
 796        pkc->reset_pending_on_curr_blk = 0;
 797}
 798
 799/*
 800 * Side effect of opening a block:
 801 *
 802 * 1) prb_queue is thawed.
 803 * 2) retire_blk_timer is refreshed.
 804 *
 805 */
 806static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 807        struct tpacket_block_desc *pbd1)
 808{
 809        struct timespec ts;
 810        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
 811
 812        smp_rmb();
 813
  814        /* We could have just memset this, but we would lose the
  815         * flexibility of making the priv area sticky.
 816         */
 817
 818        BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
 819        BLOCK_NUM_PKTS(pbd1) = 0;
 820        BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 821
 822        getnstimeofday(&ts);
 823
 824        h1->ts_first_pkt.ts_sec = ts.tv_sec;
 825        h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
 826
 827        pkc1->pkblk_start = (char *)pbd1;
 828        pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 829
 830        BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 831        BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
 832
 833        pbd1->version = pkc1->version;
 834        pkc1->prev = pkc1->nxt_offset;
 835        pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 836
 837        prb_thaw_queue(pkc1);
 838        _prb_refresh_rx_retire_blk_timer(pkc1);
 839
 840        smp_wmb();
 841}
 842
 843/*
 844 * Queue freeze logic:
 845 * 1) Assume tp_block_nr = 8 blocks.
 846 * 2) At time 't0', user opens Rx ring.
 847 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 848 * 4) user-space is either sleeping or processing block '0'.
  849 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
  850 *    it will close block-7, loop around and try to fill block '0'.
 851 *    call-flow:
 852 *    __packet_lookup_frame_in_block
 853 *      prb_retire_current_block()
 854 *      prb_dispatch_next_block()
 855 *        |->(BLOCK_STATUS == USER) evaluates to true
 856 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 857 * 6) Now there are two cases:
 858 *    6.1) Link goes idle right after the queue is frozen.
 859 *         But remember, the last open_block() refreshed the timer.
  860 *         When this timer expires, it will refresh itself so that we can
  861 *         re-open block-0 in the near future.
 862 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 863 *         case and __packet_lookup_frame_in_block will check if block-0
 864 *         is free and can now be re-used.
 865 */
 866static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
 867                                  struct packet_sock *po)
 868{
 869        pkc->reset_pending_on_curr_blk = 1;
 870        po->stats.stats3.tp_freeze_q_cnt++;
 871}
 872
 873#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
 874
 875/*
 876 * If the next block is free then we will dispatch it
 877 * and return a good offset.
 878 * Else, we will freeze the queue.
  879 * So, the caller must check the return value.
 880 */
 881static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
 882                struct packet_sock *po)
 883{
 884        struct tpacket_block_desc *pbd;
 885
 886        smp_rmb();
 887
 888        /* 1. Get current block num */
 889        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 890
 891        /* 2. If this block is currently in_use then freeze the queue */
 892        if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
 893                prb_freeze_queue(pkc, po);
 894                return NULL;
 895        }
 896
 897        /*
 898         * 3.
 899         * open this block and return the offset where the first packet
 900         * needs to get stored.
 901         */
 902        prb_open_block(pkc, pbd);
 903        return (void *)pkc->nxt_offset;
 904}
 905
 906static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
 907                struct packet_sock *po, unsigned int status)
 908{
 909        struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 910
 911        /* retire/close the current block */
 912        if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
 913                /*
 914                 * Plug the case where copy_bits() is in progress on
 915                 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
 916                 * have space to copy the pkt in the current block and
 917                 * called prb_retire_current_block()
 918                 *
 919                 * We don't need to worry about the TMO case because
 920                 * the timer-handler already handled this case.
 921                 */
 922                if (!(status & TP_STATUS_BLK_TMO)) {
 923                        while (atomic_read(&pkc->blk_fill_in_prog)) {
 924                                /* Waiting for skb_copy_bits to finish... */
 925                                cpu_relax();
 926                        }
 927                }
 928                prb_close_block(pkc, pbd, po, status);
 929                return;
 930        }
 931}
 932
 933static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
 934{
 935        return TP_STATUS_USER & BLOCK_STATUS(pbd);
 936}
 937
 938static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
 939{
 940        return pkc->reset_pending_on_curr_blk;
 941}
 942
 943static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
 944{
 945        struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
 946        atomic_dec(&pkc->blk_fill_in_prog);
 947}
 948
 949static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
 950                        struct tpacket3_hdr *ppd)
 951{
 952        ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
 953}
 954
 955static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
 956                        struct tpacket3_hdr *ppd)
 957{
 958        ppd->hv1.tp_rxhash = 0;
 959}
 960
 961static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 962                        struct tpacket3_hdr *ppd)
 963{
 964        if (skb_vlan_tag_present(pkc->skb)) {
 965                ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
 966                ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
 967                ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 968        } else {
 969                ppd->hv1.tp_vlan_tci = 0;
 970                ppd->hv1.tp_vlan_tpid = 0;
 971                ppd->tp_status = TP_STATUS_AVAILABLE;
 972        }
 973}
 974
 975static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
 976                        struct tpacket3_hdr *ppd)
 977{
 978        ppd->hv1.tp_padding = 0;
 979        prb_fill_vlan_info(pkc, ppd);
 980
 981        if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
 982                prb_fill_rxhash(pkc, ppd);
 983        else
 984                prb_clear_rxhash(pkc, ppd);
 985}
 986
 987static void prb_fill_curr_block(char *curr,
 988                                struct tpacket_kbdq_core *pkc,
 989                                struct tpacket_block_desc *pbd,
 990                                unsigned int len)
 991{
 992        struct tpacket3_hdr *ppd;
 993
 994        ppd  = (struct tpacket3_hdr *)curr;
 995        ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
 996        pkc->prev = curr;
 997        pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
 998        BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
 999        BLOCK_NUM_PKTS(pbd) += 1;
1000        atomic_inc(&pkc->blk_fill_in_prog);
1001        prb_run_all_ft_ops(pkc, ppd);
1002}
1003
1004/* Assumes caller has the sk->rx_queue.lock */
1005static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1006                                            struct sk_buff *skb,
1007                                            unsigned int len
1008                                            )
1009{
1010        struct tpacket_kbdq_core *pkc;
1011        struct tpacket_block_desc *pbd;
1012        char *curr, *end;
1013
1014        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1015        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1016
1017        /* Queue is frozen when user space is lagging behind */
1018        if (prb_queue_frozen(pkc)) {
1019                /*
1020                 * Check if that last block which caused the queue to freeze,
 1021                 * Check if the last block, which caused the queue to freeze,
1022                 */
1023                if (prb_curr_blk_in_use(pbd)) {
1024                        /* Can't record this packet */
1025                        return NULL;
1026                } else {
1027                        /*
1028                         * Ok, the block was released by user-space.
1029                         * Now let's open that block.
 1030                         * Opening a block also thaws the queue;
 1031                         * thawing is a side effect.
1032                         */
1033                        prb_open_block(pkc, pbd);
1034                }
1035        }
1036
1037        smp_mb();
1038        curr = pkc->nxt_offset;
1039        pkc->skb = skb;
1040        end = (char *)pbd + pkc->kblk_size;
1041
1042        /* first try the current block */
1043        if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1044                prb_fill_curr_block(curr, pkc, pbd, len);
1045                return (void *)curr;
1046        }
1047
1048        /* Ok, close the current block */
1049        prb_retire_current_block(pkc, po, 0);
1050
1051        /* Now, try to dispatch the next block */
1052        curr = (char *)prb_dispatch_next_block(pkc, po);
1053        if (curr) {
1054                pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1055                prb_fill_curr_block(curr, pkc, pbd, len);
1056                return (void *)curr;
1057        }
1058
1059        /*
 1060         * No free blocks are available. User-space hasn't caught up yet.
1061         * Queue was just frozen and now this packet will get dropped.
1062         */
1063        return NULL;
1064}
1065
1066static void *packet_current_rx_frame(struct packet_sock *po,
1067                                            struct sk_buff *skb,
1068                                            int status, unsigned int len)
1069{
1070        char *curr = NULL;
1071        switch (po->tp_version) {
1072        case TPACKET_V1:
1073        case TPACKET_V2:
1074                curr = packet_lookup_frame(po, &po->rx_ring,
1075                                        po->rx_ring.head, status);
1076                return curr;
1077        case TPACKET_V3:
1078                return __packet_lookup_frame_in_block(po, skb, len);
1079        default:
1080                WARN(1, "TPACKET version not supported\n");
1081                BUG();
1082                return NULL;
1083        }
1084}
1085
1086static void *prb_lookup_block(const struct packet_sock *po,
1087                              const struct packet_ring_buffer *rb,
1088                              unsigned int idx,
1089                              int status)
1090{
1091        struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
1092        struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1093
1094        if (status != BLOCK_STATUS(pbd))
1095                return NULL;
1096        return pbd;
1097}
1098
1099static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1100{
1101        unsigned int prev;
1102        if (rb->prb_bdqc.kactive_blk_num)
1103                prev = rb->prb_bdqc.kactive_blk_num-1;
1104        else
1105                prev = rb->prb_bdqc.knum_blocks-1;
1106        return prev;
1107}
1108
1109/* Assumes caller has held the rx_queue.lock */
1110static void *__prb_previous_block(struct packet_sock *po,
1111                                         struct packet_ring_buffer *rb,
1112                                         int status)
1113{
1114        unsigned int previous = prb_previous_blk_num(rb);
1115        return prb_lookup_block(po, rb, previous, status);
1116}
1117
1118static void *packet_previous_rx_frame(struct packet_sock *po,
1119                                             struct packet_ring_buffer *rb,
1120                                             int status)
1121{
1122        if (po->tp_version <= TPACKET_V2)
1123                return packet_previous_frame(po, rb, status);
1124
1125        return __prb_previous_block(po, rb, status);
1126}
1127
1128static void packet_increment_rx_head(struct packet_sock *po,
1129                                            struct packet_ring_buffer *rb)
1130{
1131        switch (po->tp_version) {
1132        case TPACKET_V1:
1133        case TPACKET_V2:
1134                return packet_increment_head(rb);
1135        case TPACKET_V3:
1136        default:
1137                WARN(1, "TPACKET version not supported.\n");
1138                BUG();
1139                return;
1140        }
1141}
1142
1143static void *packet_previous_frame(struct packet_sock *po,
1144                struct packet_ring_buffer *rb,
1145                int status)
1146{
1147        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1148        return packet_lookup_frame(po, rb, previous, status);
1149}
1150
1151static void packet_increment_head(struct packet_ring_buffer *buff)
1152{
1153        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1154}
1155
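     /* pending_refcnt is a per-cpu count of TX ring frames currently in
      * flight (incremented when a frame is handed to the transmit path,
      * decremented when its skb is destructed).  The RX ring never
      * allocates it, so packet_read_pending() reports 0 there.
      */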
1156static void packet_inc_pending(struct packet_ring_buffer *rb)
1157{
1158        this_cpu_inc(*rb->pending_refcnt);
1159}
1160
1161static void packet_dec_pending(struct packet_ring_buffer *rb)
1162{
1163        this_cpu_dec(*rb->pending_refcnt);
1164}
1165
1166static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1167{
1168        unsigned int refcnt = 0;
1169        int cpu;
1170
1171        /* We don't use pending refcount in rx_ring. */
1172        if (rb->pending_refcnt == NULL)
1173                return 0;
1174
1175        for_each_possible_cpu(cpu)
1176                refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1177
1178        return refcnt;
1179}
1180
1181static int packet_alloc_pending(struct packet_sock *po)
1182{
1183        po->rx_ring.pending_refcnt = NULL;
1184
1185        po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1186        if (unlikely(po->tx_ring.pending_refcnt == NULL))
1187                return -ENOBUFS;
1188
1189        return 0;
1190}
1191
1192static void packet_free_pending(struct packet_sock *po)
1193{
1194        free_percpu(po->tx_ring.pending_refcnt);
1195}
1196
1197#define ROOM_POW_OFF    2
1198#define ROOM_NONE       0x0
1199#define ROOM_LOW        0x1
1200#define ROOM_NORMAL     0x2
1201
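     /* Receive-queue occupancy levels used for flow control and fanout
      * rollover decisions.  For mmap()ed rings, ROOM_NORMAL means the slot
      * a quarter of the ring ahead of head (len >> ROOM_POW_OFF) is still
      * free and ROOM_LOW means only the head slot is free; for regular
      * sockets the same levels are derived from the sk_rcvbuf headroom.
      */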
1202static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1203{
1204        int idx, len;
1205
1206        len = READ_ONCE(po->rx_ring.frame_max) + 1;
1207        idx = READ_ONCE(po->rx_ring.head);
1208        if (pow_off)
1209                idx += len >> pow_off;
1210        if (idx >= len)
1211                idx -= len;
1212        return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1213}
1214
1215static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1216{
1217        int idx, len;
1218
1219        len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1220        idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1221        if (pow_off)
1222                idx += len >> pow_off;
1223        if (idx >= len)
1224                idx -= len;
1225        return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1226}
1227
1228static int __packet_rcv_has_room(const struct packet_sock *po,
1229                                 const struct sk_buff *skb)
1230{
1231        const struct sock *sk = &po->sk;
1232        int ret = ROOM_NONE;
1233
1234        if (po->prot_hook.func != tpacket_rcv) {
1235                int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1236                int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1237                                   - (skb ? skb->truesize : 0);
1238
1239                if (avail > (rcvbuf >> ROOM_POW_OFF))
1240                        return ROOM_NORMAL;
1241                else if (avail > 0)
1242                        return ROOM_LOW;
1243                else
1244                        return ROOM_NONE;
1245        }
1246
1247        if (po->tp_version == TPACKET_V3) {
1248                if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1249                        ret = ROOM_NORMAL;
1250                else if (__tpacket_v3_has_room(po, 0))
1251                        ret = ROOM_LOW;
1252        } else {
1253                if (__tpacket_has_room(po, ROOM_POW_OFF))
1254                        ret = ROOM_NORMAL;
1255                else if (__tpacket_has_room(po, 0))
1256                        ret = ROOM_LOW;
1257        }
1258
1259        return ret;
1260}
1261
1262static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1263{
1264        int pressure, ret;
1265
1266        ret = __packet_rcv_has_room(po, skb);
1267        pressure = ret != ROOM_NORMAL;
1268
1269        if (READ_ONCE(po->pressure) != pressure)
1270                WRITE_ONCE(po->pressure, pressure);
1271
1272        return ret;
1273}
1274
1275static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1276{
1277        if (READ_ONCE(po->pressure) &&
1278            __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1279                WRITE_ONCE(po->pressure,  0);
1280}
1281
1282static void packet_sock_destruct(struct sock *sk)
1283{
1284        skb_queue_purge(&sk->sk_error_queue);
1285
1286        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1287        WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1288
1289        if (!sock_flag(sk, SOCK_DEAD)) {
1290                pr_err("Attempt to release alive packet socket: %p\n", sk);
1291                return;
1292        }
1293
1294        sk_refcnt_debug_dec(sk);
1295}
1296
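     /* Rollover heuristic: a flow counts as "huge" if its rxhash already
      * occupies more than half of the ROLLOVER_HLEN-entry history.  Each
      * call also stores the hash in a randomly chosen slot so that the
      * history decays over time.
      */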
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299        u32 *history = po->rollover->history;
1300        u32 victim, rxhash;
1301        int i, count = 0;
1302
1303        rxhash = skb_get_hash(skb);
1304        for (i = 0; i < ROLLOVER_HLEN; i++)
1305                if (READ_ONCE(history[i]) == rxhash)
1306                        count++;
1307
1308        victim = prandom_u32() % ROLLOVER_HLEN;
1309
1310        /* Avoid dirtying the cache line if possible */
1311        if (READ_ONCE(history[victim]) != rxhash)
1312                WRITE_ONCE(history[victim], rxhash);
1313
1314        return count > (ROLLOVER_HLEN >> 1);
1315}
1316
1317static unsigned int fanout_demux_hash(struct packet_fanout *f,
1318                                      struct sk_buff *skb,
1319                                      unsigned int num)
1320{
1321        return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1322}
1323
1324static unsigned int fanout_demux_lb(struct packet_fanout *f,
1325                                    struct sk_buff *skb,
1326                                    unsigned int num)
1327{
1328        unsigned int val = atomic_inc_return(&f->rr_cur);
1329
1330        return val % num;
1331}
1332
1333static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1334                                     struct sk_buff *skb,
1335                                     unsigned int num)
1336{
1337        return smp_processor_id() % num;
1338}
1339
1340static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1341                                     struct sk_buff *skb,
1342                                     unsigned int num)
1343{
1344        return prandom_u32_max(num);
1345}
1346
1347static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1348                                          struct sk_buff *skb,
1349                                          unsigned int idx, bool try_self,
1350                                          unsigned int num)
1351{
1352        struct packet_sock *po, *po_next, *po_skip = NULL;
1353        unsigned int i, j, room = ROOM_NONE;
1354
1355        po = pkt_sk(f->arr[idx]);
1356
1357        if (try_self) {
1358                room = packet_rcv_has_room(po, skb);
1359                if (room == ROOM_NORMAL ||
1360                    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1361                        return idx;
1362                po_skip = po;
1363        }
1364
1365        i = j = min_t(int, po->rollover->sock, num - 1);
1366        do {
1367                po_next = pkt_sk(f->arr[i]);
1368                if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
1369                    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1370                        if (i != j)
1371                                po->rollover->sock = i;
1372                        atomic_long_inc(&po->rollover->num);
1373                        if (room == ROOM_LOW)
1374                                atomic_long_inc(&po->rollover->num_huge);
1375                        return i;
1376                }
1377
1378                if (++i == num)
1379                        i = 0;
1380        } while (i != j);
1381
1382        atomic_long_inc(&po->rollover->num_failed);
1383        return idx;
1384}
1385
1386static unsigned int fanout_demux_qm(struct packet_fanout *f,
1387                                    struct sk_buff *skb,
1388                                    unsigned int num)
1389{
1390        return skb_get_queue_mapping(skb) % num;
1391}
1392
1393static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1394                                     struct sk_buff *skb,
1395                                     unsigned int num)
1396{
1397        struct bpf_prog *prog;
1398        unsigned int ret = 0;
1399
1400        rcu_read_lock();
1401        prog = rcu_dereference(f->bpf_prog);
1402        if (prog)
1403                ret = bpf_prog_run_clear_cb(prog, skb) % num;
1404        rcu_read_unlock();
1405
1406        return ret;
1407}
1408
1409static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1410{
1411        return f->flags & (flag >> 8);
1412}
1413
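     /* Entry point for packets arriving on a fanout group: pick one member
      * socket according to the group's policy (hash, lb, cpu, rnd, qm,
      * cbpf/ebpf or plain rollover), optionally roll over to a less loaded
      * member when PACKET_FANOUT_FLAG_ROLLOVER is set, and hand the skb to
      * that socket's prot_hook.
      */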
1414static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1415                             struct packet_type *pt, struct net_device *orig_dev)
1416{
1417        struct packet_fanout *f = pt->af_packet_priv;
1418        unsigned int num = READ_ONCE(f->num_members);
1419        struct net *net = read_pnet(&f->net);
1420        struct packet_sock *po;
1421        unsigned int idx;
1422
1423        if (!net_eq(dev_net(dev), net) || !num) {
1424                kfree_skb(skb);
1425                return 0;
1426        }
1427
1428        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1429                skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1430                if (!skb)
1431                        return 0;
1432        }
1433        switch (f->type) {
1434        case PACKET_FANOUT_HASH:
1435        default:
1436                idx = fanout_demux_hash(f, skb, num);
1437                break;
1438        case PACKET_FANOUT_LB:
1439                idx = fanout_demux_lb(f, skb, num);
1440                break;
1441        case PACKET_FANOUT_CPU:
1442                idx = fanout_demux_cpu(f, skb, num);
1443                break;
1444        case PACKET_FANOUT_RND:
1445                idx = fanout_demux_rnd(f, skb, num);
1446                break;
1447        case PACKET_FANOUT_QM:
1448                idx = fanout_demux_qm(f, skb, num);
1449                break;
1450        case PACKET_FANOUT_ROLLOVER:
1451                idx = fanout_demux_rollover(f, skb, 0, false, num);
1452                break;
1453        case PACKET_FANOUT_CBPF:
1454        case PACKET_FANOUT_EBPF:
1455                idx = fanout_demux_bpf(f, skb, num);
1456                break;
1457        }
1458
1459        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1460                idx = fanout_demux_rollover(f, skb, idx, true, num);
1461
1462        po = pkt_sk(f->arr[idx]);
1463        return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1464}
1465
1466DEFINE_MUTEX(fanout_mutex);
1467EXPORT_SYMBOL_GPL(fanout_mutex);
1468static LIST_HEAD(fanout_list);
1469static u16 fanout_next_id;
1470
1471static void __fanout_link(struct sock *sk, struct packet_sock *po)
1472{
1473        struct packet_fanout *f = po->fanout;
1474
1475        spin_lock(&f->lock);
1476        f->arr[f->num_members] = sk;
1477        smp_wmb();
1478        f->num_members++;
1479        if (f->num_members == 1)
1480                dev_add_pack(&f->prot_hook);
1481        spin_unlock(&f->lock);
1482}
1483
1484static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1485{
1486        struct packet_fanout *f = po->fanout;
1487        int i;
1488
1489        spin_lock(&f->lock);
1490        for (i = 0; i < f->num_members; i++) {
1491                if (f->arr[i] == sk)
1492                        break;
1493        }
1494        BUG_ON(i >= f->num_members);
1495        f->arr[i] = f->arr[f->num_members - 1];
1496        f->num_members--;
1497        if (f->num_members == 0)
1498                __dev_remove_pack(&f->prot_hook);
1499        spin_unlock(&f->lock);
1500}
1501
1502static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1503{
1504        if (sk->sk_family != PF_PACKET)
1505                return false;
1506
1507        return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1508}
1509
1510static void fanout_init_data(struct packet_fanout *f)
1511{
1512        switch (f->type) {
1513        case PACKET_FANOUT_LB:
1514                atomic_set(&f->rr_cur, 0);
1515                break;
1516        case PACKET_FANOUT_CBPF:
1517        case PACKET_FANOUT_EBPF:
1518                RCU_INIT_POINTER(f->bpf_prog, NULL);
1519                break;
1520        }
1521}
1522
1523static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1524{
1525        struct bpf_prog *old;
1526
1527        spin_lock(&f->lock);
1528        old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1529        rcu_assign_pointer(f->bpf_prog, new);
1530        spin_unlock(&f->lock);
1531
1532        if (old) {
1533                synchronize_net();
1534                bpf_prog_destroy(old);
1535        }
1536}
1537
1538static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1539                                unsigned int len)
1540{
1541        struct bpf_prog *new;
1542        struct sock_fprog fprog;
1543        int ret;
1544
1545        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1546                return -EPERM;
1547        if (len != sizeof(fprog))
1548                return -EINVAL;
1549        if (copy_from_user(&fprog, data, len))
1550                return -EFAULT;
1551
1552        ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1553        if (ret)
1554                return ret;
1555
1556        __fanout_set_data_bpf(po->fanout, new);
1557        return 0;
1558}
1559
1560static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1561                                unsigned int len)
1562{
1563        struct bpf_prog *new;
1564        u32 fd;
1565
1566        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1567                return -EPERM;
1568        if (len != sizeof(fd))
1569                return -EINVAL;
1570        if (copy_from_user(&fd, data, len))
1571                return -EFAULT;
1572
1573        new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1574        if (IS_ERR(new))
1575                return PTR_ERR(new);
1576
1577        __fanout_set_data_bpf(po->fanout, new);
1578        return 0;
1579}
1580
1581static int fanout_set_data(struct packet_sock *po, char __user *data,
1582                           unsigned int len)
1583{
1584        switch (po->fanout->type) {
1585        case PACKET_FANOUT_CBPF:
1586                return fanout_set_data_cbpf(po, data, len);
1587        case PACKET_FANOUT_EBPF:
1588                return fanout_set_data_ebpf(po, data, len);
1589        default:
1590                return -EINVAL;
1591        }
1592}
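
/*
 * Example userspace usage of PACKET_FANOUT_DATA (illustrative sketch; the
 * group must already be of type PACKET_FANOUT_CBPF, and the trivial program
 * below always returns 0, steering every packet to member socket 0, since
 * the program's return value is taken modulo the group size):
 *
 *      struct sock_filter code[] = {
 *              BPF_STMT(BPF_RET | BPF_K, 0),
 *      };
 *      struct sock_fprog fprog = {
 *              .len    = sizeof(code) / sizeof(code[0]),
 *              .filter = code,
 *      };
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &fprog, sizeof(fprog));
 *
 * For a PACKET_FANOUT_EBPF group the option value is instead a single u32
 * holding a bpf(2) program fd of type BPF_PROG_TYPE_SOCKET_FILTER, as
 * parsed by fanout_set_data_ebpf() above.
 */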
1593
1594static void fanout_release_data(struct packet_fanout *f)
1595{
1596        switch (f->type) {
1597        case PACKET_FANOUT_CBPF:
1598        case PACKET_FANOUT_EBPF:
1599                __fanout_set_data_bpf(f, NULL);
1600        }
1601}
1602
1603static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1604{
1605        struct packet_fanout *f;
1606
1607        list_for_each_entry(f, &fanout_list, list) {
1608                if (f->id == candidate_id &&
1609                    read_pnet(&f->net) == sock_net(sk)) {
1610                        return false;
1611                }
1612        }
1613        return true;
1614}
1615
1616static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1617{
1618        u16 id = fanout_next_id;
1619
1620        do {
1621                if (__fanout_id_is_free(sk, id)) {
1622                        *new_id = id;
1623                        fanout_next_id = id + 1;
1624                        return true;
1625                }
1626
1627                id++;
1628        } while (id != fanout_next_id);
1629
1630        return false;
1631}
1632
1633static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1634{
1635        struct packet_rollover *rollover = NULL;
1636        struct packet_sock *po = pkt_sk(sk);
1637        struct packet_fanout *f, *match;
1638        u8 type = type_flags & 0xff;
1639        u8 flags = type_flags >> 8;
1640        int err;
1641
1642        switch (type) {
1643        case PACKET_FANOUT_ROLLOVER:
1644                if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1645                        return -EINVAL;
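                /* fall through */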
1646        case PACKET_FANOUT_HASH:
1647        case PACKET_FANOUT_LB:
1648        case PACKET_FANOUT_CPU:
1649        case PACKET_FANOUT_RND:
1650        case PACKET_FANOUT_QM:
1651        case PACKET_FANOUT_CBPF:
1652        case PACKET_FANOUT_EBPF:
1653                break;
1654        default:
1655                return -EINVAL;
1656        }
1657
1658        mutex_lock(&fanout_mutex);
1659
1660        err = -EALREADY;
1661        if (po->fanout)
1662                goto out;
1663
1664        if (type == PACKET_FANOUT_ROLLOVER ||
1665            (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1666                err = -ENOMEM;
1667                rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1668                if (!rollover)
1669                        goto out;
1670                atomic_long_set(&rollover->num, 0);
1671                atomic_long_set(&rollover->num_huge, 0);
1672                atomic_long_set(&rollover->num_failed, 0);
1673        }
1674
1675        if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1676                if (id != 0) {
1677                        err = -EINVAL;
1678                        goto out;
1679                }
1680                if (!fanout_find_new_id(sk, &id)) {
1681                        err = -ENOMEM;
1682                        goto out;
1683                }
1684                /* ephemeral flag for the first socket in the group: drop it */
1685                flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1686        }
1687
1688        match = NULL;
1689        list_for_each_entry(f, &fanout_list, list) {
1690                if (f->id == id &&
1691                    read_pnet(&f->net) == sock_net(sk)) {
1692                        match = f;
1693                        break;
1694                }
1695        }
1696        err = -EINVAL;
1697        if (match && match->flags != flags)
1698                goto out;
1699        if (!match) {
1700                err = -ENOMEM;
1701                match = kzalloc(sizeof(*match), GFP_KERNEL);
1702                if (!match)
1703                        goto out;
1704                write_pnet(&match->net, sock_net(sk));
1705                match->id = id;
1706                match->type = type;
1707                match->flags = flags;
1708                INIT_LIST_HEAD(&match->list);
1709                spin_lock_init(&match->lock);
1710                refcount_set(&match->sk_ref, 0);
1711                fanout_init_data(match);
1712                match->prot_hook.type = po->prot_hook.type;
1713                match->prot_hook.dev = po->prot_hook.dev;
1714                match->prot_hook.func = packet_rcv_fanout;
1715                match->prot_hook.af_packet_priv = match;
1716                match->prot_hook.id_match = match_fanout_group;
1717                list_add(&match->list, &fanout_list);
1718        }
1719        err = -EINVAL;
1720
1721        spin_lock(&po->bind_lock);
1722        if (po->running &&
1723            match->type == type &&
1724            match->prot_hook.type == po->prot_hook.type &&
1725            match->prot_hook.dev == po->prot_hook.dev) {
1726                err = -ENOSPC;
1727                if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1728                        __dev_remove_pack(&po->prot_hook);
1729                        po->fanout = match;
1730                        po->rollover = rollover;
1731                        rollover = NULL;
1732                        refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1733                        __fanout_link(sk, po);
1734                        err = 0;
1735                }
1736        }
1737        spin_unlock(&po->bind_lock);
1738
1739        if (err && !refcount_read(&match->sk_ref)) {
1740                list_del(&match->list);
1741                kfree(match);
1742        }
1743
1744out:
1745        kfree(rollover);
1746        mutex_unlock(&fanout_mutex);
1747        return err;
1748}
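
/*
 * Example userspace usage (illustrative sketch; the group id 42 is
 * arbitrary). Each socket that should join the group issues:
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
 *                 &fanout_arg, sizeof(fanout_arg));
 *
 * The low 16 bits carry the group id and the high 16 bits the mode plus
 * optional PACKET_FANOUT_FLAG_* bits, matching the id/type_flags split
 * handled above.
 */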
1749
1750/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1751 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1752 * It is the responsibility of the caller to call fanout_release_data() and
1753 * free the returned packet_fanout (after synchronize_net())
1754 */
1755static struct packet_fanout *fanout_release(struct sock *sk)
1756{
1757        struct packet_sock *po = pkt_sk(sk);
1758        struct packet_fanout *f;
1759
1760        mutex_lock(&fanout_mutex);
1761        f = po->fanout;
1762        if (f) {
1763                po->fanout = NULL;
1764
1765                if (refcount_dec_and_test(&f->sk_ref))
1766                        list_del(&f->list);
1767                else
1768                        f = NULL;
1769        }
1770        mutex_unlock(&fanout_mutex);
1771
1772        return f;
1773}
1774
1775static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1776                                          struct sk_buff *skb)
1777{
1778        /* Earlier code assumed this would be a VLAN pkt, double-check
1779         * this now that we have the actual packet in hand. We can only
1780         * do this check on Ethernet devices.
1781         */
1782        if (unlikely(dev->type != ARPHRD_ETHER))
1783                return false;
1784
1785        skb_reset_mac_header(skb);
1786        return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1787}
1788
1789static const struct proto_ops packet_ops;
1790
1791static const struct proto_ops packet_ops_spkt;
1792
1793static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1794                           struct packet_type *pt, struct net_device *orig_dev)
1795{
1796        struct sock *sk;
1797        struct sockaddr_pkt *spkt;
1798
1799        /*
1800         *      When we registered the protocol we saved the socket in the data
1801         *      field for just this event.
1802         */
1803
1804        sk = pt->af_packet_priv;
1805
1806        /*
1807         *      Yank back the headers [hope the device set this
1808         *      right or kerboom...]
1809         *
1810         *      Incoming packets have the ll header pulled;
1811         *      push it back.
1812         *
1813         *      For outgoing packets skb->data == skb_mac_header(skb),
1814         *      so this procedure is a no-op.
1815         */
1816
1817        if (skb->pkt_type == PACKET_LOOPBACK)
1818                goto out;
1819
1820        if (!net_eq(dev_net(dev), sock_net(sk)))
1821                goto out;
1822
1823        skb = skb_share_check(skb, GFP_ATOMIC);
1824        if (skb == NULL)
1825                goto oom;
1826
1827        /* drop any routing info */
1828        skb_dst_drop(skb);
1829
1830        /* drop conntrack reference */
1831        nf_reset_ct(skb);
1832
1833        spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1834
1835        skb_push(skb, skb->data - skb_mac_header(skb));
1836
1837        /*
1838         *      The SOCK_PACKET socket receives _all_ frames.
1839         */
1840
1841        spkt->spkt_family = dev->type;
1842        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1843        spkt->spkt_protocol = skb->protocol;
1844
1845        /*
1846         *      Charge the memory to the socket. This is done specifically
1847         *      to prevent sockets from using up all the memory.
1848         */
1849
1850        if (sock_queue_rcv_skb(sk, skb) == 0)
1851                return 0;
1852
1853out:
1854        kfree_skb(skb);
1855oom:
1856        return 0;
1857}
1858
1859static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1860{
1861        if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1862            sock->type == SOCK_RAW) {
1863                skb_reset_mac_header(skb);
1864                skb->protocol = dev_parse_header_protocol(skb);
1865        }
1866
1867        skb_probe_transport_header(skb);
1868}
1869
1870/*
1871 *      Output a raw packet to the device layer. This bypasses all the other
1872 *      protocol layers and you must therefore supply it with a complete frame.
1873 */
1874
1875static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1876                               size_t len)
1877{
1878        struct sock *sk = sock->sk;
1879        DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1880        struct sk_buff *skb = NULL;
1881        struct net_device *dev;
1882        struct sockcm_cookie sockc;
1883        __be16 proto = 0;
1884        int err;
1885        int extra_len = 0;
1886
1887        /*
1888         *      Get and verify the address.
1889         */
1890
1891        if (saddr) {
1892                if (msg->msg_namelen < sizeof(struct sockaddr))
1893                        return -EINVAL;
1894                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1895                        proto = saddr->spkt_protocol;
1896        } else
1897                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
1898
1899        /*
1900         *      Find the device first so we can size-check against it
1901         */
1902
1903        saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1904retry:
1905        rcu_read_lock();
1906        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1907        err = -ENODEV;
1908        if (dev == NULL)
1909                goto out_unlock;
1910
1911        err = -ENETDOWN;
1912        if (!(dev->flags & IFF_UP))
1913                goto out_unlock;
1914
1915        /*
1916         * You may not queue a frame bigger than the mtu. This is the lowest level
1917         * raw protocol and you must do your own fragmentation at this level.
1918         */
1919
1920        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1921                if (!netif_supports_nofcs(dev)) {
1922                        err = -EPROTONOSUPPORT;
1923                        goto out_unlock;
1924                }
1925                extra_len = 4; /* We're doing our own CRC */
1926        }
1927
1928        err = -EMSGSIZE;
1929        if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1930                goto out_unlock;
1931
1932        if (!skb) {
1933                size_t reserved = LL_RESERVED_SPACE(dev);
1934                int tlen = dev->needed_tailroom;
1935                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1936
1937                rcu_read_unlock();
1938                skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1939                if (skb == NULL)
1940                        return -ENOBUFS;
1941                /* FIXME: Save some space for broken drivers that write a hard
1942                 * header at transmission time by themselves. PPP is the notable
1943                 * one here. This should really be fixed at the driver level.
1944                 */
1945                skb_reserve(skb, reserved);
1946                skb_reset_network_header(skb);
1947
1948                /* Try to align data part correctly */
1949                if (hhlen) {
1950                        skb->data -= hhlen;
1951                        skb->tail -= hhlen;
1952                        if (len < hhlen)
1953                                skb_reset_network_header(skb);
1954                }
1955                err = memcpy_from_msg(skb_put(skb, len), msg, len);
1956                if (err)
1957                        goto out_free;
1958                goto retry;
1959        }
1960
1961        if (!dev_validate_header(dev, skb->data, len)) {
1962                err = -EINVAL;
1963                goto out_unlock;
1964        }
1965        if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1966            !packet_extra_vlan_len_allowed(dev, skb)) {
1967                err = -EMSGSIZE;
1968                goto out_unlock;
1969        }
1970
1971        sockcm_init(&sockc, sk);
1972        if (msg->msg_controllen) {
1973                err = sock_cmsg_send(sk, msg, &sockc);
1974                if (unlikely(err))
1975                        goto out_unlock;
1976        }
1977
1978        skb->protocol = proto;
1979        skb->dev = dev;
1980        skb->priority = sk->sk_priority;
1981        skb->mark = sk->sk_mark;
1982        skb->tstamp = sockc.transmit_time;
1983
1984        skb_setup_tx_timestamp(skb, sockc.tsflags);
1985
1986        if (unlikely(extra_len == 4))
1987                skb->no_fcs = 1;
1988
1989        packet_parse_headers(skb, sock);
1990
1991        dev_queue_xmit(skb);
1992        rcu_read_unlock();
1993        return len;
1994
1995out_unlock:
1996        rcu_read_unlock();
1997out_free:
1998        kfree_skb(skb);
1999        return err;
2000}
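
/*
 * Example userspace transmit over the legacy SOCK_PACKET interface
 * (illustrative sketch; "eth0" and the frame buffer are placeholders, and
 * the caller must supply the complete link-layer header itself):
 *
 *      int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *      struct sockaddr_pkt spkt = { 0 };
 *
 *      spkt.spkt_family = AF_PACKET;
 *      strncpy((char *)spkt.spkt_device, "eth0",
 *              sizeof(spkt.spkt_device) - 1);
 *      spkt.spkt_protocol = htons(ETH_P_IP);
 *
 *      sendto(fd, frame, frame_len, 0,
 *             (struct sockaddr *)&spkt, sizeof(spkt));
 */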
2001
2002static unsigned int run_filter(struct sk_buff *skb,
2003                               const struct sock *sk,
2004                               unsigned int res)
2005{
2006        struct sk_filter *filter;
2007
2008        rcu_read_lock();
2009        filter = rcu_dereference(sk->sk_filter);
2010        if (filter != NULL)
2011                res = bpf_prog_run_clear_cb(filter->prog, skb);
2012        rcu_read_unlock();
2013
2014        return res;
2015}
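
/*
 * Example of installing the filter consulted above from userspace
 * (illustrative sketch; this one-instruction classic BPF program keeps at
 * most the first 96 bytes of every packet):
 *
 *      struct sock_filter code[] = {
 *              BPF_STMT(BPF_RET | BPF_K, 96),
 *      };
 *      struct sock_fprog fprog = {
 *              .len    = sizeof(code) / sizeof(code[0]),
 *              .filter = code,
 *      };
 *
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * The program's return value becomes 'res' here: 0 drops the packet, a
 * positive value caps the snaplen used by the callers below.
 */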
2016
2017static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2018                           size_t *len)
2019{
2020        struct virtio_net_hdr vnet_hdr;
2021
2022        if (*len < sizeof(vnet_hdr))
2023                return -EINVAL;
2024        *len -= sizeof(vnet_hdr);
2025
2026        if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2027                return -EINVAL;
2028
2029        return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2030}
2031
2032/*
2033 * This function performs lazy skb cloning in the hope that most packets
2034 * are discarded by BPF.
2035 *
2036 * Note the tricky part: we DO mangle shared skbs! skb->data, skb->len
2037 * and skb->cb are mangled. It works because (and until) packets
2038 * falling here are owned by the current CPU. Output packets are cloned
2039 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2040 * sequentially, so that if we restore the skb to its original state on
2041 * exit, we will not harm anyone.
2042 */
2043
2044static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2045                      struct packet_type *pt, struct net_device *orig_dev)
2046{
2047        struct sock *sk;
2048        struct sockaddr_ll *sll;
2049        struct packet_sock *po;
2050        u8 *skb_head = skb->data;
2051        int skb_len = skb->len;
2052        unsigned int snaplen, res;
2053        bool is_drop_n_account = false;
2054
2055        if (skb->pkt_type == PACKET_LOOPBACK)
2056                goto drop;
2057
2058        sk = pt->af_packet_priv;
2059        po = pkt_sk(sk);
2060
2061        if (!net_eq(dev_net(dev), sock_net(sk)))
2062                goto drop;
2063
2064        skb->dev = dev;
2065
2066        if (dev->header_ops) {
2067                /* The device has an explicit notion of ll header,
2068                 * exported to higher levels.
2069                 *
2070                 * Otherwise, the device hides the details of its frame
2071                 * structure, so that the corresponding packet head is
2072                 * never delivered to the user.
2073                 */
2074                if (sk->sk_type != SOCK_DGRAM)
2075                        skb_push(skb, skb->data - skb_mac_header(skb));
2076                else if (skb->pkt_type == PACKET_OUTGOING) {
2077                        /* Special case: outgoing packets have ll header at head */
2078                        skb_pull(skb, skb_network_offset(skb));
2079                }
2080        }
2081
2082        snaplen = skb->len;
2083
2084        res = run_filter(skb, sk, snaplen);
2085        if (!res)
2086                goto drop_n_restore;
2087        if (snaplen > res)
2088                snaplen = res;
2089
2090        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2091                goto drop_n_acct;
2092
2093        if (skb_shared(skb)) {
2094                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2095                if (nskb == NULL)
2096                        goto drop_n_acct;
2097
2098                if (skb_head != skb->data) {
2099                        skb->data = skb_head;
2100                        skb->len = skb_len;
2101                }
2102                consume_skb(skb);
2103                skb = nskb;
2104        }
2105
2106        sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2107
2108        sll = &PACKET_SKB_CB(skb)->sa.ll;
2109        sll->sll_hatype = dev->type;
2110        sll->sll_pkttype = skb->pkt_type;
2111        if (unlikely(po->origdev))
2112                sll->sll_ifindex = orig_dev->ifindex;
2113        else
2114                sll->sll_ifindex = dev->ifindex;
2115
2116        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2117
2118        /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2119         * Use their space for storing the original skb length.
2120         */
2121        PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2122
2123        if (pskb_trim(skb, snaplen))
2124                goto drop_n_acct;
2125
2126        skb_set_owner_r(skb, sk);
2127        skb->dev = NULL;
2128        skb_dst_drop(skb);
2129
2130        /* drop conntrack reference */
2131        nf_reset_ct(skb);
2132
2133        spin_lock(&sk->sk_receive_queue.lock);
2134        po->stats.stats1.tp_packets++;
2135        sock_skb_set_dropcount(sk, skb);
2136        __skb_queue_tail(&sk->sk_receive_queue, skb);
2137        spin_unlock(&sk->sk_receive_queue.lock);
2138        sk->sk_data_ready(sk);
2139        return 0;
2140
2141drop_n_acct:
2142        is_drop_n_account = true;
2143        atomic_inc(&po->tp_drops);
2144        atomic_inc(&sk->sk_drops);
2145
2146drop_n_restore:
2147        if (skb_head != skb->data && skb_shared(skb)) {
2148                skb->data = skb_head;
2149                skb->len = skb_len;
2150        }
2151drop:
2152        if (!is_drop_n_account)
2153                consume_skb(skb);
2154        else
2155                kfree_skb(skb);
2156        return 0;
2157}
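
/*
 * Example of consuming packets queued by this function from userspace
 * (illustrative sketch; the buffer size is arbitrary):
 *
 *      unsigned char buf[2048];
 *      struct sockaddr_ll sll;
 *      socklen_t alen = sizeof(sll);
 *      ssize_t n;
 *
 *      n = recvfrom(fd, buf, sizeof(buf), 0,
 *                   (struct sockaddr *)&sll, &alen);
 *
 * sll_ifindex, sll_pkttype, sll_hatype and sll_addr are filled in above;
 * sll_family and sll_protocol are completed later in packet_recvmsg().
 * With the PACKET_ORIGDEV socket option set, sll_ifindex reports the
 * original ingress device rather than an aggregating upper device.
 */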
2158
2159static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2160                       struct packet_type *pt, struct net_device *orig_dev)
2161{
2162        struct sock *sk;
2163        struct packet_sock *po;
2164        struct sockaddr_ll *sll;
2165        union tpacket_uhdr h;
2166        u8 *skb_head = skb->data;
2167        int skb_len = skb->len;
2168        unsigned int snaplen, res;
2169        unsigned long status = TP_STATUS_USER;
2170        unsigned short macoff, netoff, hdrlen;
2171        struct sk_buff *copy_skb = NULL;
2172        struct timespec ts;
2173        __u32 ts_status;
2174        bool is_drop_n_account = false;
2175        bool do_vnet = false;
2176
2177        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2178         * We may add new members to them up to the current aligned size without
2179         * forcing userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2180         */
2181        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2182        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2183
2184        if (skb->pkt_type == PACKET_LOOPBACK)
2185                goto drop;
2186
2187        sk = pt->af_packet_priv;
2188        po = pkt_sk(sk);
2189
2190        if (!net_eq(dev_net(dev), sock_net(sk)))
2191                goto drop;
2192
2193        if (dev->header_ops) {
2194                if (sk->sk_type != SOCK_DGRAM)
2195                        skb_push(skb, skb->data - skb_mac_header(skb));
2196                else if (skb->pkt_type == PACKET_OUTGOING) {
2197                        /* Special case: outgoing packets have ll header at head */
2198                        skb_pull(skb, skb_network_offset(skb));
2199                }
2200        }
2201
2202        snaplen = skb->len;
2203
2204        res = run_filter(skb, sk, snaplen);
2205        if (!res)
2206                goto drop_n_restore;
2207
2208        /* If we are flooded, just give up */
2209        if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2210                atomic_inc(&po->tp_drops);
2211                goto drop_n_restore;
2212        }
2213
2214        if (skb->ip_summed == CHECKSUM_PARTIAL)
2215                status |= TP_STATUS_CSUMNOTREADY;
2216        else if (skb->pkt_type != PACKET_OUTGOING &&
2217                 (skb->ip_summed == CHECKSUM_COMPLETE ||
2218                  skb_csum_unnecessary(skb)))
2219                status |= TP_STATUS_CSUM_VALID;
2220
2221        if (snaplen > res)
2222                snaplen = res;
2223
2224        if (sk->sk_type == SOCK_DGRAM) {
2225                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2226                                  po->tp_reserve;
2227        } else {
2228                unsigned int maclen = skb_network_offset(skb);
2229                netoff = TPACKET_ALIGN(po->tp_hdrlen +
2230                                       (maclen < 16 ? 16 : maclen)) +
2231                                       po->tp_reserve;
2232                if (po->has_vnet_hdr) {
2233                        netoff += sizeof(struct virtio_net_hdr);
2234                        do_vnet = true;
2235                }
2236                macoff = netoff - maclen;
2237        }
2238        if (po->tp_version <= TPACKET_V2) {
2239                if (macoff + snaplen > po->rx_ring.frame_size) {
2240                        if (po->copy_thresh &&
2241                            atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2242                                if (skb_shared(skb)) {
2243                                        copy_skb = skb_clone(skb, GFP_ATOMIC);
2244                                } else {
2245                                        copy_skb = skb_get(skb);
2246                                        skb_head = skb->data;
2247                                }
2248                                if (copy_skb)
2249                                        skb_set_owner_r(copy_skb, sk);
2250                        }
2251                        snaplen = po->rx_ring.frame_size - macoff;
2252                        if ((int)snaplen < 0) {
2253                                snaplen = 0;
2254                                do_vnet = false;
2255                        }
2256                }
2257        } else if (unlikely(macoff + snaplen >
2258                            GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2259                u32 nval;
2260
2261                nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2262                pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2263                            snaplen, nval, macoff);
2264                snaplen = nval;
2265                if (unlikely((int)snaplen < 0)) {
2266                        snaplen = 0;
2267                        macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2268                        do_vnet = false;
2269                }
2270        }
2271        spin_lock(&sk->sk_receive_queue.lock);
2272        h.raw = packet_current_rx_frame(po, skb,
2273                                        TP_STATUS_KERNEL, (macoff+snaplen));
2274        if (!h.raw)
2275                goto drop_n_account;
2276        if (po->tp_version <= TPACKET_V2) {
2277                packet_increment_rx_head(po, &po->rx_ring);
2278        /*
2279         * TP_STATUS_LOSING will be reported until you read the stats,
2280         * because it is COR - Clear On Read.
2281         * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2282         * at the packet level.
2283         */
2284                if (atomic_read(&po->tp_drops))
2285                        status |= TP_STATUS_LOSING;
2286        }
2287
2288        if (do_vnet &&
2289            virtio_net_hdr_from_skb(skb, h.raw + macoff -
2290                                    sizeof(struct virtio_net_hdr),
2291                                    vio_le(), true, 0))
2292                goto drop_n_account;
2293
2294        po->stats.stats1.tp_packets++;
2295        if (copy_skb) {
2296                status |= TP_STATUS_COPY;
2297                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2298        }
2299        spin_unlock(&sk->sk_receive_queue.lock);
2300
2301        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2302
2303        if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2304                getnstimeofday(&ts);
2305
2306        status |= ts_status;
2307
2308        switch (po->tp_version) {
2309        case TPACKET_V1:
2310                h.h1->tp_len = skb->len;
2311                h.h1->tp_snaplen = snaplen;
2312                h.h1->tp_mac = macoff;
2313                h.h1->tp_net = netoff;
2314                h.h1->tp_sec = ts.tv_sec;
2315                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2316                hdrlen = sizeof(*h.h1);
2317                break;
2318        case TPACKET_V2:
2319                h.h2->tp_len = skb->len;
2320                h.h2->tp_snaplen = snaplen;
2321                h.h2->tp_mac = macoff;
2322                h.h2->tp_net = netoff;
2323                h.h2->tp_sec = ts.tv_sec;
2324                h.h2->tp_nsec = ts.tv_nsec;
2325                if (skb_vlan_tag_present(skb)) {
2326                        h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2327                        h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2328                        status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2329                } else {
2330                        h.h2->tp_vlan_tci = 0;
2331                        h.h2->tp_vlan_tpid = 0;
2332                }
2333                memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2334                hdrlen = sizeof(*h.h2);
2335                break;
2336        case TPACKET_V3:
2337                /* tp_next_offset and the vlan fields are already populated
2338                 * above, so DON'T clear them here.
2339                 */
2340                h.h3->tp_status |= status;
2341                h.h3->tp_len = skb->len;
2342                h.h3->tp_snaplen = snaplen;
2343                h.h3->tp_mac = macoff;
2344                h.h3->tp_net = netoff;
2345                h.h3->tp_sec  = ts.tv_sec;
2346                h.h3->tp_nsec = ts.tv_nsec;
2347                memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2348                hdrlen = sizeof(*h.h3);
2349                break;
2350        default:
2351                BUG();
2352        }
2353
2354        sll = h.raw + TPACKET_ALIGN(hdrlen);
2355        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2356        sll->sll_family = AF_PACKET;
2357        sll->sll_hatype = dev->type;
2358        sll->sll_protocol = skb->protocol;
2359        sll->sll_pkttype = skb->pkt_type;
2360        if (unlikely(po->origdev))
2361                sll->sll_ifindex = orig_dev->ifindex;
2362        else
2363                sll->sll_ifindex = dev->ifindex;
2364
2365        smp_mb();
2366
2367#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2368        if (po->tp_version <= TPACKET_V2) {
2369                u8 *start, *end;
2370
2371                end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2372                                        macoff + snaplen);
2373
2374                for (start = h.raw; start < end; start += PAGE_SIZE)
2375                        flush_dcache_page(pgv_to_page(start));
2376        }
2377        smp_wmb();
2378#endif
2379
2380        if (po->tp_version <= TPACKET_V2) {
2381                __packet_set_status(po, h.raw, status);
2382                sk->sk_data_ready(sk);
2383        } else {
2384                prb_clear_blk_fill_status(&po->rx_ring);
2385        }
2386
2387drop_n_restore:
2388        if (skb_head != skb->data && skb_shared(skb)) {
2389                skb->data = skb_head;
2390                skb->len = skb_len;
2391        }
2392drop:
2393        if (!is_drop_n_account)
2394                consume_skb(skb);
2395        else
2396                kfree_skb(skb);
2397        return 0;
2398
2399drop_n_account:
2400        spin_unlock(&sk->sk_receive_queue.lock);
2401        atomic_inc(&po->tp_drops);
2402        is_drop_n_account = true;
2403
2404        sk->sk_data_ready(sk);
2405        kfree_skb(copy_skb);
2406        goto drop_n_restore;
2407}
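
/*
 * Example userspace setup for the memory-mapped RX path filled in above
 * (illustrative sketch using TPACKET_V2; the ring geometry is arbitrary but
 * must satisfy tp_frame_nr == tp_block_nr * tp_block_size / tp_frame_size):
 *
 *      int ver = TPACKET_V2;
 *      struct tpacket_req req = {
 *              .tp_block_size  = 4096,
 *              .tp_block_nr    = 64,
 *              .tp_frame_size  = 2048,
 *              .tp_frame_nr    = 128,
 *      };
 *      void *ring;
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * The reader then poll()s the socket, waits for tp_status & TP_STATUS_USER
 * on the current frame, processes it, and hands the frame back to the
 * kernel by writing TP_STATUS_KERNEL before advancing.
 */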
2408
2409static void tpacket_destruct_skb(struct sk_buff *skb)
2410{
2411        struct packet_sock *po = pkt_sk(skb->sk);
2412
2413        if (likely(po->tx_ring.pg_vec)) {
2414                void *ph;
2415                __u32 ts;
2416
2417                ph = skb_zcopy_get_nouarg(skb);
2418                packet_dec_pending(&po->tx_ring);
2419
2420                ts = __packet_set_timestamp(po, ph, skb);
2421                __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2422
2423                if (!packet_read_pending(&po->tx_ring))
2424                        complete(&po->skb_completion);
2425        }
2426
2427        sock_wfree(skb);
2428}
2429
2430static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2431{
2432        if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2433            (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2434             __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2435              __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2436                vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2437                         __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2438                        __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2439
2440        if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2441                return -EINVAL;
2442
2443        return 0;
2444}
2445
2446static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2447                                 struct virtio_net_hdr *vnet_hdr)
2448{
2449        if (*len < sizeof(*vnet_hdr))
2450                return -EINVAL;
2451        *len -= sizeof(*vnet_hdr);
2452
2453        if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2454                return -EFAULT;
2455
2456        return __packet_snd_vnet_parse(vnet_hdr, *len);
2457}
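
/*
 * The vnet header parsed above is supplied by userspace once the
 * PACKET_VNET_HDR option has been enabled (illustrative sketch):
 *
 *      int on = 1;
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *
 * Every subsequent send must then start with a struct virtio_net_hdr
 * immediately followed by the frame, and every receive is prefixed the
 * same way (see packet_rcv_vnet() above).
 */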
2458
2459static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2460                void *frame, struct net_device *dev, void *data, int tp_len,
2461                __be16 proto, unsigned char *addr, int hlen, int copylen,
2462                const struct sockcm_cookie *sockc)
2463{
2464        union tpacket_uhdr ph;
2465        int to_write, offset, len, nr_frags, len_max;
2466        struct socket *sock = po->sk.sk_socket;
2467        struct page *page;
2468        int err;
2469
2470        ph.raw = frame;
2471
2472        skb->protocol = proto;
2473        skb->dev = dev;
2474        skb->priority = po->sk.sk_priority;
2475        skb->mark = po->sk.sk_mark;
2476        skb->tstamp = sockc->transmit_time;
2477        skb_setup_tx_timestamp(skb, sockc->tsflags);
2478        skb_zcopy_set_nouarg(skb, ph.raw);
2479
2480        skb_reserve(skb, hlen);
2481        skb_reset_network_header(skb);
2482
2483        to_write = tp_len;
2484
2485        if (sock->type == SOCK_DGRAM) {
2486                err = dev_hard_header(skb, dev, ntohs(proto), addr,
2487                                NULL, tp_len);
2488                if (unlikely(err < 0))
2489                        return -EINVAL;
2490        } else if (copylen) {
2491                int hdrlen = min_t(int, copylen, tp_len);
2492
2493                skb_push(skb, dev->hard_header_len);
2494                skb_put(skb, copylen - dev->hard_header_len);
2495                err = skb_store_bits(skb, 0, data, hdrlen);
2496                if (unlikely(err))
2497                        return err;
2498                if (!dev_validate_header(dev, skb->data, hdrlen))
2499                        return -EINVAL;
2500
2501                data += hdrlen;
2502                to_write -= hdrlen;
2503        }
2504
2505        offset = offset_in_page(data);
2506        len_max = PAGE_SIZE - offset;
2507        len = ((to_write > len_max) ? len_max : to_write);
2508
2509        skb->data_len = to_write;
2510        skb->len += to_write;
2511        skb->truesize += to_write;
2512        refcount_add(to_write, &po->sk.sk_wmem_alloc);
2513
2514        while (likely(to_write)) {
2515                nr_frags = skb_shinfo(skb)->nr_frags;
2516
2517                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2518                        pr_err("Packet exceeds the number of skb frags (%lu)\n",
2519                               MAX_SKB_FRAGS);
2520                        return -EFAULT;
2521                }
2522
2523                page = pgv_to_page(data);
2524                data += len;
2525                flush_dcache_page(page);
2526                get_page(page);
2527                skb_fill_page_desc(skb, nr_frags, page, offset, len);
2528                to_write -= len;
2529                offset = 0;
2530                len_max = PAGE_SIZE;
2531                len = ((to_write > len_max) ? len_max : to_write);
2532        }
2533
2534        packet_parse_headers(skb, sock);
2535
2536        return tp_len;
2537}
2538
2539static int tpacket_parse_header(struct packet_sock *po, void *frame,
2540                                int size_max, void **data)
2541{
2542        union tpacket_uhdr ph;
2543        int tp_len, off;
2544
2545        ph.raw = frame;
2546
2547        switch (po->tp_version) {
2548        case TPACKET_V3:
2549                if (ph.h3->tp_next_offset != 0) {
2550                        pr_warn_once("variable sized slot not supported");
2551                        return -EINVAL;
2552                }
2553                tp_len = ph.h3->tp_len;
2554                break;
2555        case TPACKET_V2:
2556                tp_len = ph.h2->tp_len;
2557                break;
2558        default:
2559                tp_len = ph.h1->tp_len;
2560                break;
2561        }
2562        if (unlikely(tp_len > size_max)) {
2563                pr_err("packet size is too big (%d > %d)\n", tp_len, size_max);
2564                return -EMSGSIZE;
2565        }
2566
2567        if (unlikely(po->tp_tx_has_off)) {
2568                int off_min, off_max;
2569
2570                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2571                off_max = po->tx_ring.frame_size - tp_len;
2572                if (po->sk.sk_type == SOCK_DGRAM) {
2573                        switch (po->tp_version) {
2574                        case TPACKET_V3:
2575                                off = ph.h3->tp_net;
2576                                break;
2577                        case TPACKET_V2:
2578                                off = ph.h2->tp_net;
2579                                break;
2580                        default:
2581                                off = ph.h1->tp_net;
2582                                break;
2583                        }
2584                } else {
2585                        switch (po->tp_version) {
2586                        case TPACKET_V3:
2587                                off = ph.h3->tp_mac;
2588                                break;
2589                        case TPACKET_V2:
2590                                off = ph.h2->tp_mac;
2591                                break;
2592                        default:
2593                                off = ph.h1->tp_mac;
2594                                break;
2595                        }
2596                }
2597                if (unlikely((off < off_min) || (off_max < off)))
2598                        return -EINVAL;
2599        } else {
2600                off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2601        }
2602
2603        *data = frame + off;
2604        return tp_len;
2605}
2606
2607static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2608{
2609        struct sk_buff *skb = NULL;
2610        struct net_device *dev;
2611        struct virtio_net_hdr *vnet_hdr = NULL;
2612        struct sockcm_cookie sockc;
2613        __be16 proto;
2614        int err, reserve = 0;
2615        void *ph;
2616        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2617        bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2618        unsigned char *addr = NULL;
2619        int tp_len, size_max;
2620        void *data;
2621        int len_sum = 0;
2622        int status = TP_STATUS_AVAILABLE;
2623        int hlen, tlen, copylen = 0;
2624        long timeo = 0;
2625
2626        mutex_lock(&po->pg_vec_lock);
2627
2628        /* The packet_sendmsg() check on tx_ring.pg_vec was lockless, so we
2629         * need to confirm it under the protection of pg_vec_lock.
2630         */
2631        if (unlikely(!po->tx_ring.pg_vec)) {
2632                err = -EBUSY;
2633                goto out;
2634        }
2635        if (likely(saddr == NULL)) {
2636                dev     = packet_cached_dev_get(po);
2637                proto   = po->num;
2638        } else {
2639                err = -EINVAL;
2640                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2641                        goto out;
2642                if (msg->msg_namelen < (saddr->sll_halen
2643                                        + offsetof(struct sockaddr_ll,
2644                                                sll_addr)))
2645                        goto out;
2646                proto   = saddr->sll_protocol;
2647                dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2648                if (po->sk.sk_socket->type == SOCK_DGRAM) {
2649                        if (dev && msg->msg_namelen < dev->addr_len +
2650                                   offsetof(struct sockaddr_ll, sll_addr))
2651                                goto out_put;
2652                        addr = saddr->sll_addr;
2653                }
2654        }
2655
2656        err = -ENXIO;
2657        if (unlikely(dev == NULL))
2658                goto out;
2659        err = -ENETDOWN;
2660        if (unlikely(!(dev->flags & IFF_UP)))
2661                goto out_put;
2662
2663        sockcm_init(&sockc, &po->sk);
2664        if (msg->msg_controllen) {
2665                err = sock_cmsg_send(&po->sk, msg, &sockc);
2666                if (unlikely(err))
2667                        goto out_put;
2668        }
2669
2670        if (po->sk.sk_socket->type == SOCK_RAW)
2671                reserve = dev->hard_header_len;
2672        size_max = po->tx_ring.frame_size
2673                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2674
2675        if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2676                size_max = dev->mtu + reserve + VLAN_HLEN;
2677
2678        reinit_completion(&po->skb_completion);
2679
2680        do {
2681                ph = packet_current_frame(po, &po->tx_ring,
2682                                          TP_STATUS_SEND_REQUEST);
2683                if (unlikely(ph == NULL)) {
2684                        if (need_wait && skb) {
2685                                timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2686                                timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2687                                if (timeo <= 0) {
2688                                        err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2689                                        goto out_put;
2690                                }
2691                        }
2692                        /* check for additional frames */
2693                        continue;
2694                }
2695
2696                skb = NULL;
2697                tp_len = tpacket_parse_header(po, ph, size_max, &data);
2698                if (tp_len < 0)
2699                        goto tpacket_error;
2700
2701                status = TP_STATUS_SEND_REQUEST;
2702                hlen = LL_RESERVED_SPACE(dev);
2703                tlen = dev->needed_tailroom;
2704                if (po->has_vnet_hdr) {
2705                        vnet_hdr = data;
2706                        data += sizeof(*vnet_hdr);
2707                        tp_len -= sizeof(*vnet_hdr);
2708                        if (tp_len < 0 ||
2709                            __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2710                                tp_len = -EINVAL;
2711                                goto tpacket_error;
2712                        }
2713                        copylen = __virtio16_to_cpu(vio_le(),
2714                                                    vnet_hdr->hdr_len);
2715                }
2716                copylen = max_t(int, copylen, dev->hard_header_len);
2717                skb = sock_alloc_send_skb(&po->sk,
2718                                hlen + tlen + sizeof(struct sockaddr_ll) +
2719                                (copylen - dev->hard_header_len),
2720                                !need_wait, &err);
2721
2722                if (unlikely(skb == NULL)) {
2723                        /* we assume the socket was initially writeable ... */
2724                        if (likely(len_sum > 0))
2725                                err = len_sum;
2726                        goto out_status;
2727                }
2728                tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2729                                          addr, hlen, copylen, &sockc);
2730                if (likely(tp_len >= 0) &&
2731                    tp_len > dev->mtu + reserve &&
2732                    !po->has_vnet_hdr &&
2733                    !packet_extra_vlan_len_allowed(dev, skb))
2734                        tp_len = -EMSGSIZE;
2735
2736                if (unlikely(tp_len < 0)) {
2737tpacket_error:
2738                        if (po->tp_loss) {
2739                                __packet_set_status(po, ph,
2740                                                TP_STATUS_AVAILABLE);
2741                                packet_increment_head(&po->tx_ring);
2742                                kfree_skb(skb);
2743                                continue;
2744                        } else {
2745                                status = TP_STATUS_WRONG_FORMAT;
2746                                err = tp_len;
2747                                goto out_status;
2748                        }
2749                }
2750
2751                if (po->has_vnet_hdr) {
2752                        if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2753                                tp_len = -EINVAL;
2754                                goto tpacket_error;
2755                        }
2756                        virtio_net_hdr_set_proto(skb, vnet_hdr);
2757                }
2758
2759                skb->destructor = tpacket_destruct_skb;
2760                __packet_set_status(po, ph, TP_STATUS_SENDING);
2761                packet_inc_pending(&po->tx_ring);
2762
2763                status = TP_STATUS_SEND_REQUEST;
2764                err = po->xmit(skb);
2765                if (unlikely(err > 0)) {
2766                        err = net_xmit_errno(err);
2767                        if (err && __packet_get_status(po, ph) ==
2768                                   TP_STATUS_AVAILABLE) {
2769                                /* skb was destructed already */
2770                                skb = NULL;
2771                                goto out_status;
2772                        }
2773                        /*
2774                         * skb was dropped but not destructed yet;
2775                         * let's treat it like congestion or err < 0
2776                         */
2777                        err = 0;
2778                }
2779                packet_increment_head(&po->tx_ring);
2780                len_sum += tp_len;
2781        } while (likely((ph != NULL) ||
2782                /* Note: packet_read_pending() might be slow if we have
2783                 * to call it, as it's a per-cpu variable, but in the
2784                 * fast path we already short-circuit the loop with the
2785                 * first condition, and luckily don't have to go down
2786                 * that path anyway.
2787                 */
2788                 (need_wait && packet_read_pending(&po->tx_ring))));
2789
2790        err = len_sum;
2791        goto out_put;
2792
2793out_status:
2794        __packet_set_status(po, ph, status);
2795        kfree_skb(skb);
2796out_put:
2797        dev_put(dev);
2798out:
2799        mutex_unlock(&po->pg_vec_lock);
2800        return err;
2801}
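
/*
 * Example userspace transmit through the TX ring drained above
 * (illustrative sketch using TPACKET_V2; the socket is assumed to be bound
 * to an interface and the ring set up with PACKET_TX_RING and mmap() in
 * the same way as the RX example; "frame", "pkt" and "pkt_len" are
 * placeholders, with "frame" pointing at a free slot in the mapped ring):
 *
 *      struct tpacket2_hdr *hdr = frame;
 *      void *data = (void *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *      memcpy(data, pkt, pkt_len);
 *      hdr->tp_len = pkt_len;
 *      hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *
 *      send(fd, NULL, 0, 0);
 *
 * On completion tpacket_destruct_skb() flips the slot back to
 * TP_STATUS_AVAILABLE; some error paths above mark it
 * TP_STATUS_WRONG_FORMAT instead.
 */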
2802
2803static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2804                                        size_t reserve, size_t len,
2805                                        size_t linear, int noblock,
2806                                        int *err)
2807{
2808        struct sk_buff *skb;
2809
2810        /* Under a page?  Don't bother with paged skb. */
2811        if (prepad + len < PAGE_SIZE || !linear)
2812                linear = len;
2813
2814        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2815                                   err, 0);
2816        if (!skb)
2817                return NULL;
2818
2819        skb_reserve(skb, reserve);
2820        skb_put(skb, linear);
2821        skb->data_len = len - linear;
2822        skb->len += len - linear;
2823
2824        return skb;
2825}
2826
2827static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2828{
2829        struct sock *sk = sock->sk;
2830        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2831        struct sk_buff *skb;
2832        struct net_device *dev;
2833        __be16 proto;
2834        unsigned char *addr = NULL;
2835        int err, reserve = 0;
2836        struct sockcm_cookie sockc;
2837        struct virtio_net_hdr vnet_hdr = { 0 };
2838        int offset = 0;
2839        struct packet_sock *po = pkt_sk(sk);
2840        bool has_vnet_hdr = false;
2841        int hlen, tlen, linear;
2842        int extra_len = 0;
2843
2844        /*
2845         *      Get and verify the address.
2846         */
2847
2848        if (likely(saddr == NULL)) {
2849                dev     = packet_cached_dev_get(po);
2850                proto   = po->num;
2851        } else {
2852                err = -EINVAL;
2853                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2854                        goto out;
2855                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2856                        goto out;
2857                proto   = saddr->sll_protocol;
2858                dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2859                if (sock->type == SOCK_DGRAM) {
2860                        if (dev && msg->msg_namelen < dev->addr_len +
2861                                   offsetof(struct sockaddr_ll, sll_addr))
2862                                goto out_unlock;
2863                        addr = saddr->sll_addr;
2864                }
2865        }
2866
2867        err = -ENXIO;
2868        if (unlikely(dev == NULL))
2869                goto out_unlock;
2870        err = -ENETDOWN;
2871        if (unlikely(!(dev->flags & IFF_UP)))
2872                goto out_unlock;
2873
2874        sockcm_init(&sockc, sk);
2875        sockc.mark = sk->sk_mark;
2876        if (msg->msg_controllen) {
2877                err = sock_cmsg_send(sk, msg, &sockc);
2878                if (unlikely(err))
2879                        goto out_unlock;
2880        }
2881
2882        if (sock->type == SOCK_RAW)
2883                reserve = dev->hard_header_len;
2884        if (po->has_vnet_hdr) {
2885                err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2886                if (err)
2887                        goto out_unlock;
2888                has_vnet_hdr = true;
2889        }
2890
2891        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2892                if (!netif_supports_nofcs(dev)) {
2893                        err = -EPROTONOSUPPORT;
2894                        goto out_unlock;
2895                }
2896                extra_len = 4; /* We're doing our own CRC */
2897        }
2898
2899        err = -EMSGSIZE;
2900        if (!vnet_hdr.gso_type &&
2901            (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2902                goto out_unlock;
2903
2904        err = -ENOBUFS;
2905        hlen = LL_RESERVED_SPACE(dev);
2906        tlen = dev->needed_tailroom;
2907        linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2908        linear = max(linear, min_t(int, len, dev->hard_header_len));
2909        skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2910                               msg->msg_flags & MSG_DONTWAIT, &err);
2911        if (skb == NULL)
2912                goto out_unlock;
2913
2914        skb_reset_network_header(skb);
2915
2916        err = -EINVAL;
2917        if (sock->type == SOCK_DGRAM) {
2918                offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2919                if (unlikely(offset < 0))
2920                        goto out_free;
2921        } else if (reserve) {
2922                skb_reserve(skb, -reserve);
2923                if (len < reserve + sizeof(struct ipv6hdr) &&
2924                    dev->min_header_len != dev->hard_header_len)
2925                        skb_reset_network_header(skb);
2926        }
2927
2928        /* Returns -EFAULT on error */
2929        err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2930        if (err)
2931                goto out_free;
2932
2933        if (sock->type == SOCK_RAW &&
2934            !dev_validate_header(dev, skb->data, len)) {
2935                err = -EINVAL;
2936                goto out_free;
2937        }
2938
2939        skb_setup_tx_timestamp(skb, sockc.tsflags);
2940
2941        if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2942            !packet_extra_vlan_len_allowed(dev, skb)) {
2943                err = -EMSGSIZE;
2944                goto out_free;
2945        }
2946
2947        skb->protocol = proto;
2948        skb->dev = dev;
2949        skb->priority = sk->sk_priority;
2950        skb->mark = sockc.mark;
2951        skb->tstamp = sockc.transmit_time;
2952
2953        if (has_vnet_hdr) {
2954                err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
2955                if (err)
2956                        goto out_free;
2957                len += sizeof(vnet_hdr);
2958                virtio_net_hdr_set_proto(skb, &vnet_hdr);
2959        }
2960
2961        packet_parse_headers(skb, sock);
2962
2963        if (unlikely(extra_len == 4))
2964                skb->no_fcs = 1;
2965
2966        err = po->xmit(skb);
2967        if (err > 0 && (err = net_xmit_errno(err)) != 0)
2968                goto out_unlock;
2969
2970        dev_put(dev);
2971
2972        return len;
2973
2974out_free:
2975        kfree_skb(skb);
2976out_unlock:
2977        if (dev)
2978                dev_put(dev);
2979out:
2980        return err;
2981}
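
/*
 * Example userspace transmit through this non-ring path (illustrative
 * sketch for a SOCK_DGRAM packet socket, where the kernel builds the
 * link-layer header from the destination in sockaddr_ll; "dest_mac",
 * "payload" and "payload_len" are placeholders):
 *
 *      struct sockaddr_ll sll = { 0 };
 *
 *      sll.sll_family   = AF_PACKET;
 *      sll.sll_protocol = htons(ETH_P_IP);
 *      sll.sll_ifindex  = if_nametoindex("eth0");
 *      sll.sll_halen    = ETH_ALEN;
 *      memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *
 *      sendto(fd, payload, payload_len, 0,
 *             (struct sockaddr *)&sll, sizeof(sll));
 *
 * A SOCK_RAW socket instead sends the frame verbatim, including the
 * link-layer header, which dev_validate_header() checks above.
 */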
2982
2983static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2984{
2985        struct sock *sk = sock->sk;
2986        struct packet_sock *po = pkt_sk(sk);
2987
2988        if (po->tx_ring.pg_vec)
2989                return tpacket_snd(po, msg);
2990        else
2991                return packet_snd(sock, msg, len);
2992}
2993
2994/*
2995 *      Close a PACKET socket. This is fairly simple. We immediately go
2996 *      to the 'closed' state and remove our protocol entry from the device list.
2997 */
2998
2999static int packet_release(struct socket *sock)
3000{
3001        struct sock *sk = sock->sk;
3002        struct packet_sock *po;
3003        struct packet_fanout *f;
3004        struct net *net;
3005        union tpacket_req_u req_u;
3006
3007        if (!sk)
3008                return 0;
3009
3010        net = sock_net(sk);
3011        po = pkt_sk(sk);
3012
3013        mutex_lock(&net->packet.sklist_lock);
3014        sk_del_node_init_rcu(sk);
3015        mutex_unlock(&net->packet.sklist_lock);
3016
3017        preempt_disable();
3018        sock_prot_inuse_add(net, sk->sk_prot, -1);
3019        preempt_enable();
3020
3021        spin_lock(&po->bind_lock);
3022        unregister_prot_hook(sk, false);
3023        packet_cached_dev_reset(po);
3024
3025        if (po->prot_hook.dev) {
3026                dev_put(po->prot_hook.dev);
3027                po->prot_hook.dev = NULL;
3028        }
3029        spin_unlock(&po->bind_lock);
3030
3031        packet_flush_mclist(sk);
3032
3033        lock_sock(sk);
3034        if (po->rx_ring.pg_vec) {
3035                memset(&req_u, 0, sizeof(req_u));
3036                packet_set_ring(sk, &req_u, 1, 0);
3037        }
3038
3039        if (po->tx_ring.pg_vec) {
3040                memset(&req_u, 0, sizeof(req_u));
3041                packet_set_ring(sk, &req_u, 1, 1);
3042        }
3043        release_sock(sk);
3044
3045        f = fanout_release(sk);
3046
3047        synchronize_net();
3048
3049        kfree(po->rollover);
3050        if (f) {
3051                fanout_release_data(f);
3052                kfree(f);
3053        }
3054        /*
3055         *      Now the socket is dead. No more input will appear.
3056         */
3057        sock_orphan(sk);
3058        sock->sk = NULL;
3059
3060        /* Purge queues */
3061
3062        skb_queue_purge(&sk->sk_receive_queue);
3063        packet_free_pending(po);
3064        sk_refcnt_debug_release(sk);
3065
3066        sock_put(sk);
3067        return 0;
3068}
3069
3070/*
3071 *      Attach a packet hook.
3072 */
3073
3074static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3075                          __be16 proto)
3076{
3077        struct packet_sock *po = pkt_sk(sk);
3078        struct net_device *dev_curr;
3079        __be16 proto_curr;
3080        bool need_rehook;
3081        struct net_device *dev = NULL;
3082        int ret = 0;
3083        bool unlisted = false;
3084
3085        lock_sock(sk);
3086        spin_lock(&po->bind_lock);
3087        rcu_read_lock();
3088
3089        if (po->fanout) {
3090                ret = -EINVAL;
3091                goto out_unlock;
3092        }
3093
3094        if (name) {
3095                dev = dev_get_by_name_rcu(sock_net(sk), name);
3096                if (!dev) {
3097                        ret = -ENODEV;
3098                        goto out_unlock;
3099                }
3100        } else if (ifindex) {
3101                dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3102                if (!dev) {
3103                        ret = -ENODEV;
3104                        goto out_unlock;
3105                }
3106        }
3107
3108        if (dev)
3109                dev_hold(dev);
3110
3111        proto_curr = po->prot_hook.type;
3112        dev_curr = po->prot_hook.dev;
3113
3114        need_rehook = proto_curr != proto || dev_curr != dev;
3115
3116        if (need_rehook) {
3117                if (po->running) {
3118                        rcu_read_unlock();
3119                        /* prevents packet_notifier() from calling
3120                         * register_prot_hook()
3121                         */
3122                        po->num = 0;
3123                        __unregister_prot_hook(sk, true);
3124                        rcu_read_lock();
3125                        dev_curr = po->prot_hook.dev;
3126                        if (dev)
3127                                unlisted = !dev_get_by_index_rcu(sock_net(sk),
3128                                                                 dev->ifindex);
3129                }
3130
3131                BUG_ON(po->running);
3132                po->num = proto;
3133                po->prot_hook.type = proto;
3134
3135                if (unlikely(unlisted)) {
3136                        dev_put(dev);
3137                        po->prot_hook.dev = NULL;
3138                        po->ifindex = -1;
3139                        packet_cached_dev_reset(po);
3140                } else {
3141                        po->prot_hook.dev = dev;
3142                        po->ifindex = dev ? dev->ifindex : 0;
3143                        packet_cached_dev_assign(po, dev);
3144                }
3145        }
3146        if (dev_curr)
3147                dev_put(dev_curr);
3148
3149        if (proto == 0 || !need_rehook)
3150                goto out_unlock;
3151
3152        if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3153                register_prot_hook(sk);
3154        } else {
3155                sk->sk_err = ENETDOWN;
3156                if (!sock_flag(sk, SOCK_DEAD))
3157                        sk->sk_error_report(sk);
3158        }
3159
3160out_unlock:
3161        rcu_read_unlock();
3162        spin_unlock(&po->bind_lock);
3163        release_sock(sk);
3164        return ret;
3165}
3166
3167/*
3168 *      Bind a packet socket to a device
3169 */
3170
3171static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3172                            int addr_len)
3173{
3174        struct sock *sk = sock->sk;
3175        char name[sizeof(uaddr->sa_data) + 1];
3176
3177        /*
3178         *      Check legality
3179         */
3180
3181        if (addr_len != sizeof(struct sockaddr))
3182                return -EINVAL;
3183        /* uaddr->sa_data comes from userspace; it is not guaranteed to be
3184         * zero-terminated.
3185         */
3186        memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3187        name[sizeof(uaddr->sa_data)] = 0;
3188
3189        return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3190}
3191
3192static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3193{
3194        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3195        struct sock *sk = sock->sk;
3196
3197        /*
3198         *      Check legality
3199         */
3200
3201        if (addr_len < sizeof(struct sockaddr_ll))
3202                return -EINVAL;
3203        if (sll->sll_family != AF_PACKET)
3204                return -EINVAL;
3205
3206        return packet_do_bind(sk, NULL, sll->sll_ifindex,
3207                              sll->sll_protocol ? : pkt_sk(sk)->num);
3208}
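
/* Usage sketch (illustrative only): how packet_bind() above is reached.
 * bind(2) with a struct sockaddr_ll attaches the socket to one interface
 * and one protocol; a zero sll_protocol keeps the protocol the socket was
 * created with.  The interface name "eth0" is an assumption and error
 * handling is omitted.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */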
3209
3210static struct proto packet_proto = {
3211        .name     = "PACKET",
3212        .owner    = THIS_MODULE,
3213        .obj_size = sizeof(struct packet_sock),
3214};
3215
3216/*
3217 *      Create a packet of type SOCK_PACKET.
3218 */
3219
3220static int packet_create(struct net *net, struct socket *sock, int protocol,
3221                         int kern)
3222{
3223        struct sock *sk;
3224        struct packet_sock *po;
3225        __be16 proto = (__force __be16)protocol; /* weird, but documented */
3226        int err;
3227
3228        if (!ns_capable(net->user_ns, CAP_NET_RAW))
3229                return -EPERM;
3230        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3231            sock->type != SOCK_PACKET)
3232                return -ESOCKTNOSUPPORT;
3233
3234        sock->state = SS_UNCONNECTED;
3235
3236        err = -ENOBUFS;
3237        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3238        if (sk == NULL)
3239                goto out;
3240
3241        sock->ops = &packet_ops;
3242        if (sock->type == SOCK_PACKET)
3243                sock->ops = &packet_ops_spkt;
3244
3245        sock_init_data(sock, sk);
3246
3247        po = pkt_sk(sk);
3248        init_completion(&po->skb_completion);
3249        sk->sk_family = PF_PACKET;
3250        po->num = proto;
3251        po->xmit = dev_queue_xmit;
3252
3253        err = packet_alloc_pending(po);
3254        if (err)
3255                goto out2;
3256
3257        packet_cached_dev_reset(po);
3258
3259        sk->sk_destruct = packet_sock_destruct;
3260        sk_refcnt_debug_inc(sk);
3261
3262        /*
3263         *      Attach a protocol block
3264         */
3265
3266        spin_lock_init(&po->bind_lock);
3267        mutex_init(&po->pg_vec_lock);
3268        po->rollover = NULL;
3269        po->prot_hook.func = packet_rcv;
3270
3271        if (sock->type == SOCK_PACKET)
3272                po->prot_hook.func = packet_rcv_spkt;
3273
3274        po->prot_hook.af_packet_priv = sk;
3275
3276        if (proto) {
3277                po->prot_hook.type = proto;
3278                __register_prot_hook(sk);
3279        }
3280
3281        mutex_lock(&net->packet.sklist_lock);
3282        sk_add_node_tail_rcu(sk, &net->packet.sklist);
3283        mutex_unlock(&net->packet.sklist_lock);
3284
3285        preempt_disable();
3286        sock_prot_inuse_add(net, &packet_proto, 1);
3287        preempt_enable();
3288
3289        return 0;
3290out2:
3291        sk_free(sk);
3292out:
3293        return err;
3294}
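
/* Usage sketch (illustrative only): packet_create() above is reached via
 * socket(2).  It requires CAP_NET_RAW in the user namespace owning the
 * network namespace, and the protocol argument is expected in network
 * byte order.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <sys/socket.h>
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 * SOCK_RAW delivers and expects complete link-layer frames, while
 * SOCK_DGRAM is "cooked" mode, where the kernel strips and builds the
 * link-layer header.
 */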
3295
3296/*
3297 *      Pull a packet from our receive queue and hand it to the user.
3298 *      If necessary we block.
3299 */
3300
3301static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3302                          int flags)
3303{
3304        struct sock *sk = sock->sk;
3305        struct sk_buff *skb;
3306        int copied, err;
3307        int vnet_hdr_len = 0;
3308        unsigned int origlen = 0;
3309
3310        err = -EINVAL;
3311        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3312                goto out;
3313
3314#if 0
3315        /* What error should we return now? EUNATTACH? */
3316        if (pkt_sk(sk)->ifindex < 0)
3317                return -ENODEV;
3318#endif
3319
3320        if (flags & MSG_ERRQUEUE) {
3321                err = sock_recv_errqueue(sk, msg, len,
3322                                         SOL_PACKET, PACKET_TX_TIMESTAMP);
3323                goto out;
3324        }
3325
3326        /*
3327         *      Call the generic datagram receiver. This handles all sorts
3328         *      of horrible races and re-entrancy so we can forget about it
3329         *      in the protocol layers.
3330         *
3331 *      Now it will return ENETDOWN, if the device has just gone down,
3332         *      but then it will block.
3333         */
3334
3335        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3336
3337        /*
3338 *      An error occurred, so return it. Because skb_recv_datagram()
3339 *      handles the blocking for us, we don't need to see or worry about
3340 *      blocking retries.
3341         */
3342
3343        if (skb == NULL)
3344                goto out;
3345
3346        packet_rcv_try_clear_pressure(pkt_sk(sk));
3347
3348        if (pkt_sk(sk)->has_vnet_hdr) {
3349                err = packet_rcv_vnet(msg, skb, &len);
3350                if (err)
3351                        goto out_free;
3352                vnet_hdr_len = sizeof(struct virtio_net_hdr);
3353        }
3354
3355        /* You lose any data beyond the buffer you gave. If that worries
3356         * a user program, it can ask the device for its MTU
3357         * anyway.
3358         */
3359        copied = skb->len;
3360        if (copied > len) {
3361                copied = len;
3362                msg->msg_flags |= MSG_TRUNC;
3363        }
3364
3365        err = skb_copy_datagram_msg(skb, 0, msg, copied);
3366        if (err)
3367                goto out_free;
3368
3369        if (sock->type != SOCK_PACKET) {
3370                struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3371
3372                /* Original length was stored in sockaddr_ll fields */
3373                origlen = PACKET_SKB_CB(skb)->sa.origlen;
3374                sll->sll_family = AF_PACKET;
3375                sll->sll_protocol = skb->protocol;
3376        }
3377
3378        sock_recv_ts_and_drops(msg, sk, skb);
3379
3380        if (msg->msg_name) {
3381                int copy_len;
3382
3383                /* If the address length field is there to be filled
3384                 * in, we fill it in now.
3385                 */
3386                if (sock->type == SOCK_PACKET) {
3387                        __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3388                        msg->msg_namelen = sizeof(struct sockaddr_pkt);
3389                        copy_len = msg->msg_namelen;
3390                } else {
3391                        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3392
3393                        msg->msg_namelen = sll->sll_halen +
3394                                offsetof(struct sockaddr_ll, sll_addr);
3395                        copy_len = msg->msg_namelen;
3396                        if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3397                                memset(msg->msg_name +
3398                                       offsetof(struct sockaddr_ll, sll_addr),
3399                                       0, sizeof(sll->sll_addr));
3400                                msg->msg_namelen = sizeof(struct sockaddr_ll);
3401                        }
3402                }
3403                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3404        }
3405
3406        if (pkt_sk(sk)->auxdata) {
3407                struct tpacket_auxdata aux;
3408
3409                aux.tp_status = TP_STATUS_USER;
3410                if (skb->ip_summed == CHECKSUM_PARTIAL)
3411                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3412                else if (skb->pkt_type != PACKET_OUTGOING &&
3413                         (skb->ip_summed == CHECKSUM_COMPLETE ||
3414                          skb_csum_unnecessary(skb)))
3415                        aux.tp_status |= TP_STATUS_CSUM_VALID;
3416
3417                aux.tp_len = origlen;
3418                aux.tp_snaplen = skb->len;
3419                aux.tp_mac = 0;
3420                aux.tp_net = skb_network_offset(skb);
3421                if (skb_vlan_tag_present(skb)) {
3422                        aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3423                        aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3424                        aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3425                } else {
3426                        aux.tp_vlan_tci = 0;
3427                        aux.tp_vlan_tpid = 0;
3428                }
3429                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3430        }
3431
3432        /*
3433         *      Free or return the buffer as appropriate. Again this
3434         *      hides all the races and re-entrancy issues from us.
3435         */
3436        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3437
3438out_free:
3439        skb_free_datagram(sk, skb);
3440out:
3441        return err;
3442}
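
/* Usage sketch (illustrative only): receiving through packet_recvmsg()
 * above with PACKET_AUXDATA enabled, so each packet carries a struct
 * tpacket_auxdata control message (original length, VLAN tag, checksum
 * status).  "fd" is assumed to be a bound AF_PACKET socket; error
 * handling is omitted.
 *
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	char buf[2048];
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(fd, &msg, 0);
 *	struct cmsghdr *c;
 *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *		if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata aux;
 *			memcpy(&aux, CMSG_DATA(c), sizeof(aux));
 *			(use aux.tp_len, aux.tp_status, aux.tp_vlan_tci here)
 *		}
 *	}
 */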
3443
3444static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3445                               int peer)
3446{
3447        struct net_device *dev;
3448        struct sock *sk = sock->sk;
3449
3450        if (peer)
3451                return -EOPNOTSUPP;
3452
3453        uaddr->sa_family = AF_PACKET;
3454        memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3455        rcu_read_lock();
3456        dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3457        if (dev)
3458                strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3459        rcu_read_unlock();
3460
3461        return sizeof(*uaddr);
3462}
3463
3464static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3465                          int peer)
3466{
3467        struct net_device *dev;
3468        struct sock *sk = sock->sk;
3469        struct packet_sock *po = pkt_sk(sk);
3470        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3471
3472        if (peer)
3473                return -EOPNOTSUPP;
3474
3475        sll->sll_family = AF_PACKET;
3476        sll->sll_ifindex = po->ifindex;
3477        sll->sll_protocol = po->num;
3478        sll->sll_pkttype = 0;
3479        rcu_read_lock();
3480        dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3481        if (dev) {
3482                sll->sll_hatype = dev->type;
3483                sll->sll_halen = dev->addr_len;
3484                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3485        } else {
3486                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
3487                sll->sll_halen = 0;
3488        }
3489        rcu_read_unlock();
3490
3491        return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3492}
3493
3494static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3495                         int what)
3496{
3497        switch (i->type) {
3498        case PACKET_MR_MULTICAST:
3499                if (i->alen != dev->addr_len)
3500                        return -EINVAL;
3501                if (what > 0)
3502                        return dev_mc_add(dev, i->addr);
3503                else
3504                        return dev_mc_del(dev, i->addr);
3505                break;
3506        case PACKET_MR_PROMISC:
3507                return dev_set_promiscuity(dev, what);
3508        case PACKET_MR_ALLMULTI:
3509                return dev_set_allmulti(dev, what);
3510        case PACKET_MR_UNICAST:
3511                if (i->alen != dev->addr_len)
3512                        return -EINVAL;
3513                if (what > 0)
3514                        return dev_uc_add(dev, i->addr);
3515                else
3516                        return dev_uc_del(dev, i->addr);
3517                break;
3518        default:
3519                break;
3520        }
3521        return 0;
3522}
3523
3524static void packet_dev_mclist_delete(struct net_device *dev,
3525                                     struct packet_mclist **mlp)
3526{
3527        struct packet_mclist *ml;
3528
3529        while ((ml = *mlp) != NULL) {
3530                if (ml->ifindex == dev->ifindex) {
3531                        packet_dev_mc(dev, ml, -1);
3532                        *mlp = ml->next;
3533                        kfree(ml);
3534                } else
3535                        mlp = &ml->next;
3536        }
3537}
3538
3539static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3540{
3541        struct packet_sock *po = pkt_sk(sk);
3542        struct packet_mclist *ml, *i;
3543        struct net_device *dev;
3544        int err;
3545
3546        rtnl_lock();
3547
3548        err = -ENODEV;
3549        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3550        if (!dev)
3551                goto done;
3552
3553        err = -EINVAL;
3554        if (mreq->mr_alen > dev->addr_len)
3555                goto done;
3556
3557        err = -ENOBUFS;
3558        i = kmalloc(sizeof(*i), GFP_KERNEL);
3559        if (i == NULL)
3560                goto done;
3561
3562        err = 0;
3563        for (ml = po->mclist; ml; ml = ml->next) {
3564                if (ml->ifindex == mreq->mr_ifindex &&
3565                    ml->type == mreq->mr_type &&
3566                    ml->alen == mreq->mr_alen &&
3567                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3568                        ml->count++;
3569                        /* Free the new element ... */
3570                        kfree(i);
3571                        goto done;
3572                }
3573        }
3574
3575        i->type = mreq->mr_type;
3576        i->ifindex = mreq->mr_ifindex;
3577        i->alen = mreq->mr_alen;
3578        memcpy(i->addr, mreq->mr_address, i->alen);
3579        memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3580        i->count = 1;
3581        i->next = po->mclist;
3582        po->mclist = i;
3583        err = packet_dev_mc(dev, i, 1);
3584        if (err) {
3585                po->mclist = i->next;
3586                kfree(i);
3587        }
3588
3589done:
3590        rtnl_unlock();
3591        return err;
3592}
3593
3594static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3595{
3596        struct packet_mclist *ml, **mlp;
3597
3598        rtnl_lock();
3599
3600        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3601                if (ml->ifindex == mreq->mr_ifindex &&
3602                    ml->type == mreq->mr_type &&
3603                    ml->alen == mreq->mr_alen &&
3604                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3605                        if (--ml->count == 0) {
3606                                struct net_device *dev;
3607                                *mlp = ml->next;
3608                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3609                                if (dev)
3610                                        packet_dev_mc(dev, ml, -1);
3611                                kfree(ml);
3612                        }
3613                        break;
3614                }
3615        }
3616        rtnl_unlock();
3617        return 0;
3618}
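
/* Usage sketch (illustrative only): packet_mc_add() and packet_mc_drop()
 * above implement the PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP
 * socket options.  The sketch below puts an interface into promiscuous
 * mode for the lifetime of the socket ("eth0" is an assumption):
 *
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	struct packet_mreq mreq;
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = if_nametoindex("eth0");
 *	mreq.mr_type    = PACKET_MR_PROMISC;
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * For PACKET_MR_MULTICAST and PACKET_MR_UNICAST, mr_alen and mr_address
 * carry the hardware address.  Memberships are reference counted and are
 * released by packet_flush_mclist() when the socket is closed.
 */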
3619
3620static void packet_flush_mclist(struct sock *sk)
3621{
3622        struct packet_sock *po = pkt_sk(sk);
3623        struct packet_mclist *ml;
3624
3625        if (!po->mclist)
3626                return;
3627
3628        rtnl_lock();
3629        while ((ml = po->mclist) != NULL) {
3630                struct net_device *dev;
3631
3632                po->mclist = ml->next;
3633                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3634                if (dev != NULL)
3635                        packet_dev_mc(dev, ml, -1);
3636                kfree(ml);
3637        }
3638        rtnl_unlock();
3639}
3640
3641static int
3642packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3643{
3644        struct sock *sk = sock->sk;
3645        struct packet_sock *po = pkt_sk(sk);
3646        int ret;
3647
3648        if (level != SOL_PACKET)
3649                return -ENOPROTOOPT;
3650
3651        switch (optname) {
3652        case PACKET_ADD_MEMBERSHIP:
3653        case PACKET_DROP_MEMBERSHIP:
3654        {
3655                struct packet_mreq_max mreq;
3656                int len = optlen;
3657                memset(&mreq, 0, sizeof(mreq));
3658                if (len < sizeof(struct packet_mreq))
3659                        return -EINVAL;
3660                if (len > sizeof(mreq))
3661                        len = sizeof(mreq);
3662                if (copy_from_user(&mreq, optval, len))
3663                        return -EFAULT;
3664                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3665                        return -EINVAL;
3666                if (optname == PACKET_ADD_MEMBERSHIP)
3667                        ret = packet_mc_add(sk, &mreq);
3668                else
3669                        ret = packet_mc_drop(sk, &mreq);
3670                return ret;
3671        }
3672
3673        case PACKET_RX_RING:
3674        case PACKET_TX_RING:
3675        {
3676                union tpacket_req_u req_u;
3677                int len;
3678
3679                lock_sock(sk);
3680                switch (po->tp_version) {
3681                case TPACKET_V1:
3682                case TPACKET_V2:
3683                        len = sizeof(req_u.req);
3684                        break;
3685                case TPACKET_V3:
3686                default:
3687                        len = sizeof(req_u.req3);
3688                        break;
3689                }
3690                if (optlen < len) {
3691                        ret = -EINVAL;
3692                } else {
3693                        if (copy_from_user(&req_u.req, optval, len))
3694                                ret = -EFAULT;
3695                        else
3696                                ret = packet_set_ring(sk, &req_u, 0,
3697                                                    optname == PACKET_TX_RING);
3698                }
3699                release_sock(sk);
3700                return ret;
3701        }
3702        case PACKET_COPY_THRESH:
3703        {
3704                int val;
3705
3706                if (optlen != sizeof(val))
3707                        return -EINVAL;
3708                if (copy_from_user(&val, optval, sizeof(val)))
3709                        return -EFAULT;
3710
3711                pkt_sk(sk)->copy_thresh = val;
3712                return 0;
3713        }
3714        case PACKET_VERSION:
3715        {
3716                int val;
3717
3718                if (optlen != sizeof(val))
3719                        return -EINVAL;
3720                if (copy_from_user(&val, optval, sizeof(val)))
3721                        return -EFAULT;
3722                switch (val) {
3723                case TPACKET_V1:
3724                case TPACKET_V2:
3725                case TPACKET_V3:
3726                        break;
3727                default:
3728                        return -EINVAL;
3729                }
3730                lock_sock(sk);
3731                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3732                        ret = -EBUSY;
3733                } else {
3734                        po->tp_version = val;
3735                        ret = 0;
3736                }
3737                release_sock(sk);
3738                return ret;
3739        }
3740        case PACKET_RESERVE:
3741        {
3742                unsigned int val;
3743
3744                if (optlen != sizeof(val))
3745                        return -EINVAL;
3746                if (copy_from_user(&val, optval, sizeof(val)))
3747                        return -EFAULT;
3748                if (val > INT_MAX)
3749                        return -EINVAL;
3750                lock_sock(sk);
3751                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3752                        ret = -EBUSY;
3753                } else {
3754                        po->tp_reserve = val;
3755                        ret = 0;
3756                }
3757                release_sock(sk);
3758                return ret;
3759        }
3760        case PACKET_LOSS:
3761        {
3762                unsigned int val;
3763
3764                if (optlen != sizeof(val))
3765                        return -EINVAL;
3766                if (copy_from_user(&val, optval, sizeof(val)))
3767                        return -EFAULT;
3768
3769                lock_sock(sk);
3770                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3771                        ret = -EBUSY;
3772                } else {
3773                        po->tp_loss = !!val;
3774                        ret = 0;
3775                }
3776                release_sock(sk);
3777                return ret;
3778        }
3779        case PACKET_AUXDATA:
3780        {
3781                int val;
3782
3783                if (optlen < sizeof(val))
3784                        return -EINVAL;
3785                if (copy_from_user(&val, optval, sizeof(val)))
3786                        return -EFAULT;
3787
3788                lock_sock(sk);
3789                po->auxdata = !!val;
3790                release_sock(sk);
3791                return 0;
3792        }
3793        case PACKET_ORIGDEV:
3794        {
3795                int val;
3796
3797                if (optlen < sizeof(val))
3798                        return -EINVAL;
3799                if (copy_from_user(&val, optval, sizeof(val)))
3800                        return -EFAULT;
3801
3802                lock_sock(sk);
3803                po->origdev = !!val;
3804                release_sock(sk);
3805                return 0;
3806        }
3807        case PACKET_VNET_HDR:
3808        {
3809                int val;
3810
3811                if (sock->type != SOCK_RAW)
3812                        return -EINVAL;
3813                if (optlen < sizeof(val))
3814                        return -EINVAL;
3815                if (copy_from_user(&val, optval, sizeof(val)))
3816                        return -EFAULT;
3817
3818                lock_sock(sk);
3819                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3820                        ret = -EBUSY;
3821                } else {
3822                        po->has_vnet_hdr = !!val;
3823                        ret = 0;
3824                }
3825                release_sock(sk);
3826                return ret;
3827        }
3828        case PACKET_TIMESTAMP:
3829        {
3830                int val;
3831
3832                if (optlen != sizeof(val))
3833                        return -EINVAL;
3834                if (copy_from_user(&val, optval, sizeof(val)))
3835                        return -EFAULT;
3836
3837                po->tp_tstamp = val;
3838                return 0;
3839        }
3840        case PACKET_FANOUT:
3841        {
3842                int val;
3843
3844                if (optlen != sizeof(val))
3845                        return -EINVAL;
3846                if (copy_from_user(&val, optval, sizeof(val)))
3847                        return -EFAULT;
3848
3849                return fanout_add(sk, val & 0xffff, val >> 16);
3850        }
3851        case PACKET_FANOUT_DATA:
3852        {
3853                if (!po->fanout)
3854                        return -EINVAL;
3855
3856                return fanout_set_data(po, optval, optlen);
3857        }
3858        case PACKET_IGNORE_OUTGOING:
3859        {
3860                int val;
3861
3862                if (optlen != sizeof(val))
3863                        return -EINVAL;
3864                if (copy_from_user(&val, optval, sizeof(val)))
3865                        return -EFAULT;
3866                if (val < 0 || val > 1)
3867                        return -EINVAL;
3868
3869                po->prot_hook.ignore_outgoing = !!val;
3870                return 0;
3871        }
3872        case PACKET_TX_HAS_OFF:
3873        {
3874                unsigned int val;
3875
3876                if (optlen != sizeof(val))
3877                        return -EINVAL;
3878                if (copy_from_user(&val, optval, sizeof(val)))
3879                        return -EFAULT;
3880
3881                lock_sock(sk);
3882                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3883                        ret = -EBUSY;
3884                } else {
3885                        po->tp_tx_has_off = !!val;
3886                        ret = 0;
3887                }
3888                release_sock(sk);
3889                return ret;
3890        }
3891        case PACKET_QDISC_BYPASS:
3892        {
3893                int val;
3894
3895                if (optlen != sizeof(val))
3896                        return -EINVAL;
3897                if (copy_from_user(&val, optval, sizeof(val)))
3898                        return -EFAULT;
3899
3900                po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3901                return 0;
3902        }
3903        default:
3904                return -ENOPROTOOPT;
3905        }
3906}
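
/* Usage sketch (illustrative only): ring setup through packet_setsockopt()
 * above.  PACKET_VERSION has to be set before PACKET_RX_RING (it is
 * refused once a ring exists), and the geometry must satisfy the checks in
 * packet_set_ring(): page-aligned block size, frame size at least
 * tp_hdrlen + tp_reserve, and tp_frame_nr == frames-per-block * blocks.
 * The sizes below are assumptions, not recommendations.
 *
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req3 req;
 *	memset(&req, 0, sizeof(req));
 *	req.tp_block_size     = 1 << 22;	(4 MiB, page aligned)
 *	req.tp_block_nr       = 64;
 *	req.tp_frame_size     = 1 << 11;
 *	req.tp_frame_nr       = (req.tp_block_size / req.tp_frame_size) *
 *				req.tp_block_nr;
 *	req.tp_retire_blk_tov = 60;		(block retire timeout, ms)
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */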
3907
3908static int packet_getsockopt(struct socket *sock, int level, int optname,
3909                             char __user *optval, int __user *optlen)
3910{
3911        int len;
3912        int val, lv = sizeof(val);
3913        struct sock *sk = sock->sk;
3914        struct packet_sock *po = pkt_sk(sk);
3915        void *data = &val;
3916        union tpacket_stats_u st;
3917        struct tpacket_rollover_stats rstats;
3918        int drops;
3919
3920        if (level != SOL_PACKET)
3921                return -ENOPROTOOPT;
3922
3923        if (get_user(len, optlen))
3924                return -EFAULT;
3925
3926        if (len < 0)
3927                return -EINVAL;
3928
3929        switch (optname) {
3930        case PACKET_STATISTICS:
3931                spin_lock_bh(&sk->sk_receive_queue.lock);
3932                memcpy(&st, &po->stats, sizeof(st));
3933                memset(&po->stats, 0, sizeof(po->stats));
3934                spin_unlock_bh(&sk->sk_receive_queue.lock);
3935                drops = atomic_xchg(&po->tp_drops, 0);
3936
3937                if (po->tp_version == TPACKET_V3) {
3938                        lv = sizeof(struct tpacket_stats_v3);
3939                        st.stats3.tp_drops = drops;
3940                        st.stats3.tp_packets += drops;
3941                        data = &st.stats3;
3942                } else {
3943                        lv = sizeof(struct tpacket_stats);
3944                        st.stats1.tp_drops = drops;
3945                        st.stats1.tp_packets += drops;
3946                        data = &st.stats1;
3947                }
3948
3949                break;
3950        case PACKET_AUXDATA:
3951                val = po->auxdata;
3952                break;
3953        case PACKET_ORIGDEV:
3954                val = po->origdev;
3955                break;
3956        case PACKET_VNET_HDR:
3957                val = po->has_vnet_hdr;
3958                break;
3959        case PACKET_VERSION:
3960                val = po->tp_version;
3961                break;
3962        case PACKET_HDRLEN:
3963                if (len > sizeof(int))
3964                        len = sizeof(int);
3965                if (len < sizeof(int))
3966                        return -EINVAL;
3967                if (copy_from_user(&val, optval, len))
3968                        return -EFAULT;
3969                switch (val) {
3970                case TPACKET_V1:
3971                        val = sizeof(struct tpacket_hdr);
3972                        break;
3973                case TPACKET_V2:
3974                        val = sizeof(struct tpacket2_hdr);
3975                        break;
3976                case TPACKET_V3:
3977                        val = sizeof(struct tpacket3_hdr);
3978                        break;
3979                default:
3980                        return -EINVAL;
3981                }
3982                break;
3983        case PACKET_RESERVE:
3984                val = po->tp_reserve;
3985                break;
3986        case PACKET_LOSS:
3987                val = po->tp_loss;
3988                break;
3989        case PACKET_TIMESTAMP:
3990                val = po->tp_tstamp;
3991                break;
3992        case PACKET_FANOUT:
3993                val = (po->fanout ?
3994                       ((u32)po->fanout->id |
3995                        ((u32)po->fanout->type << 16) |
3996                        ((u32)po->fanout->flags << 24)) :
3997                       0);
3998                break;
3999        case PACKET_IGNORE_OUTGOING:
4000                val = po->prot_hook.ignore_outgoing;
4001                break;
4002        case PACKET_ROLLOVER_STATS:
4003                if (!po->rollover)
4004                        return -EINVAL;
4005                rstats.tp_all = atomic_long_read(&po->rollover->num);
4006                rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4007                rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4008                data = &rstats;
4009                lv = sizeof(rstats);
4010                break;
4011        case PACKET_TX_HAS_OFF:
4012                val = po->tp_tx_has_off;
4013                break;
4014        case PACKET_QDISC_BYPASS:
4015                val = packet_use_direct_xmit(po);
4016                break;
4017        default:
4018                return -ENOPROTOOPT;
4019        }
4020
4021        if (len > lv)
4022                len = lv;
4023        if (put_user(len, optlen))
4024                return -EFAULT;
4025        if (copy_to_user(optval, data, len))
4026                return -EFAULT;
4027        return 0;
4028}
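
/* Usage sketch (illustrative only): reading PACKET_STATISTICS through
 * packet_getsockopt() above.  The counters are cleared on every read, and
 * tp_packets includes the dropped frames; with TPACKET_V3 a struct
 * tpacket_stats_v3 is returned instead.
 *
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	(st.tp_packets frames seen, st.tp_drops dropped for lack of space)
 */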
4029
4030
4031#ifdef CONFIG_COMPAT
4032static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4033                                    char __user *optval, unsigned int optlen)
4034{
4035        struct packet_sock *po = pkt_sk(sock->sk);
4036
4037        if (level != SOL_PACKET)
4038                return -ENOPROTOOPT;
4039
4040        if (optname == PACKET_FANOUT_DATA &&
4041            po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4042                optval = (char __user *)get_compat_bpf_fprog(optval);
4043                if (!optval)
4044                        return -EFAULT;
4045                optlen = sizeof(struct sock_fprog);
4046        }
4047
4048        return packet_setsockopt(sock, level, optname, optval, optlen);
4049}
4050#endif
4051
4052static int packet_notifier(struct notifier_block *this,
4053                           unsigned long msg, void *ptr)
4054{
4055        struct sock *sk;
4056        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4057        struct net *net = dev_net(dev);
4058
4059        rcu_read_lock();
4060        sk_for_each_rcu(sk, &net->packet.sklist) {
4061                struct packet_sock *po = pkt_sk(sk);
4062
4063                switch (msg) {
4064                case NETDEV_UNREGISTER:
4065                        if (po->mclist)
4066                                packet_dev_mclist_delete(dev, &po->mclist);
4067                        /* fallthrough */
4068
4069                case NETDEV_DOWN:
4070                        if (dev->ifindex == po->ifindex) {
4071                                spin_lock(&po->bind_lock);
4072                                if (po->running) {
4073                                        __unregister_prot_hook(sk, false);
4074                                        sk->sk_err = ENETDOWN;
4075                                        if (!sock_flag(sk, SOCK_DEAD))
4076                                                sk->sk_error_report(sk);
4077                                }
4078                                if (msg == NETDEV_UNREGISTER) {
4079                                        packet_cached_dev_reset(po);
4080                                        po->ifindex = -1;
4081                                        if (po->prot_hook.dev)
4082                                                dev_put(po->prot_hook.dev);
4083                                        po->prot_hook.dev = NULL;
4084                                }
4085                                spin_unlock(&po->bind_lock);
4086                        }
4087                        break;
4088                case NETDEV_UP:
4089                        if (dev->ifindex == po->ifindex) {
4090                                spin_lock(&po->bind_lock);
4091                                if (po->num)
4092                                        register_prot_hook(sk);
4093                                spin_unlock(&po->bind_lock);
4094                        }
4095                        break;
4096                }
4097        }
4098        rcu_read_unlock();
4099        return NOTIFY_DONE;
4100}
4101
4102
4103static int packet_ioctl(struct socket *sock, unsigned int cmd,
4104                        unsigned long arg)
4105{
4106        struct sock *sk = sock->sk;
4107
4108        switch (cmd) {
4109        case SIOCOUTQ:
4110        {
4111                int amount = sk_wmem_alloc_get(sk);
4112
4113                return put_user(amount, (int __user *)arg);
4114        }
4115        case SIOCINQ:
4116        {
4117                struct sk_buff *skb;
4118                int amount = 0;
4119
4120                spin_lock_bh(&sk->sk_receive_queue.lock);
4121                skb = skb_peek(&sk->sk_receive_queue);
4122                if (skb)
4123                        amount = skb->len;
4124                spin_unlock_bh(&sk->sk_receive_queue.lock);
4125                return put_user(amount, (int __user *)arg);
4126        }
4127#ifdef CONFIG_INET
4128        case SIOCADDRT:
4129        case SIOCDELRT:
4130        case SIOCDARP:
4131        case SIOCGARP:
4132        case SIOCSARP:
4133        case SIOCGIFADDR:
4134        case SIOCSIFADDR:
4135        case SIOCGIFBRDADDR:
4136        case SIOCSIFBRDADDR:
4137        case SIOCGIFNETMASK:
4138        case SIOCSIFNETMASK:
4139        case SIOCGIFDSTADDR:
4140        case SIOCSIFDSTADDR:
4141        case SIOCSIFFLAGS:
4142                return inet_dgram_ops.ioctl(sock, cmd, arg);
4143#endif
4144
4145        default:
4146                return -ENOIOCTLCMD;
4147        }
4148        return 0;
4149}
4150
4151static __poll_t packet_poll(struct file *file, struct socket *sock,
4152                                poll_table *wait)
4153{
4154        struct sock *sk = sock->sk;
4155        struct packet_sock *po = pkt_sk(sk);
4156        __poll_t mask = datagram_poll(file, sock, wait);
4157
4158        spin_lock_bh(&sk->sk_receive_queue.lock);
4159        if (po->rx_ring.pg_vec) {
4160                if (!packet_previous_rx_frame(po, &po->rx_ring,
4161                        TP_STATUS_KERNEL))
4162                        mask |= EPOLLIN | EPOLLRDNORM;
4163        }
4164        packet_rcv_try_clear_pressure(po);
4165        spin_unlock_bh(&sk->sk_receive_queue.lock);
4166        spin_lock_bh(&sk->sk_write_queue.lock);
4167        if (po->tx_ring.pg_vec) {
4168                if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4169                        mask |= EPOLLOUT | EPOLLWRNORM;
4170        }
4171        spin_unlock_bh(&sk->sk_write_queue.lock);
4172        return mask;
4173}
4174
4175
4176/* Dirty? Well, I still have not found a better way to account
4177 * for user mmaps.
4178 */
4179
4180static void packet_mm_open(struct vm_area_struct *vma)
4181{
4182        struct file *file = vma->vm_file;
4183        struct socket *sock = file->private_data;
4184        struct sock *sk = sock->sk;
4185
4186        if (sk)
4187                atomic_inc(&pkt_sk(sk)->mapped);
4188}
4189
4190static void packet_mm_close(struct vm_area_struct *vma)
4191{
4192        struct file *file = vma->vm_file;
4193        struct socket *sock = file->private_data;
4194        struct sock *sk = sock->sk;
4195
4196        if (sk)
4197                atomic_dec(&pkt_sk(sk)->mapped);
4198}
4199
4200static const struct vm_operations_struct packet_mmap_ops = {
4201        .open   =       packet_mm_open,
4202        .close  =       packet_mm_close,
4203};
4204
4205static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4206                        unsigned int len)
4207{
4208        int i;
4209
4210        for (i = 0; i < len; i++) {
4211                if (likely(pg_vec[i].buffer)) {
4212                        if (is_vmalloc_addr(pg_vec[i].buffer))
4213                                vfree(pg_vec[i].buffer);
4214                        else
4215                                free_pages((unsigned long)pg_vec[i].buffer,
4216                                           order);
4217                        pg_vec[i].buffer = NULL;
4218                }
4219        }
4220        kfree(pg_vec);
4221}
4222
4223static char *alloc_one_pg_vec_page(unsigned long order)
4224{
4225        char *buffer;
4226        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4227                          __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4228
4229        buffer = (char *) __get_free_pages(gfp_flags, order);
4230        if (buffer)
4231                return buffer;
4232
4233        /* __get_free_pages failed, fall back to vmalloc */
4234        buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4235        if (buffer)
4236                return buffer;
4237
4238        /* vmalloc failed, let's dig into swap here */
4239        gfp_flags &= ~__GFP_NORETRY;
4240        buffer = (char *) __get_free_pages(gfp_flags, order);
4241        if (buffer)
4242                return buffer;
4243
4244        /* complete and utter failure */
4245        return NULL;
4246}
4247
4248static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4249{
4250        unsigned int block_nr = req->tp_block_nr;
4251        struct pgv *pg_vec;
4252        int i;
4253
4254        pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4255        if (unlikely(!pg_vec))
4256                goto out;
4257
4258        for (i = 0; i < block_nr; i++) {
4259                pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4260                if (unlikely(!pg_vec[i].buffer))
4261                        goto out_free_pgvec;
4262        }
4263
4264out:
4265        return pg_vec;
4266
4267out_free_pgvec:
4268        free_pg_vec(pg_vec, order, block_nr);
4269        pg_vec = NULL;
4270        goto out;
4271}
4272
4273static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4274                int closing, int tx_ring)
4275{
4276        struct pgv *pg_vec = NULL;
4277        struct packet_sock *po = pkt_sk(sk);
4278        int was_running, order = 0;
4279        struct packet_ring_buffer *rb;
4280        struct sk_buff_head *rb_queue;
4281        __be16 num;
4282        int err = -EINVAL;
4283        /* Added to keep code churn minimal */
4284        struct tpacket_req *req = &req_u->req;
4285
4286        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4287        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4288
4289        err = -EBUSY;
4290        if (!closing) {
4291                if (atomic_read(&po->mapped))
4292                        goto out;
4293                if (packet_read_pending(rb))
4294                        goto out;
4295        }
4296
4297        if (req->tp_block_nr) {
4298                unsigned int min_frame_size;
4299
4300                /* Sanity tests and some calculations */
4301                err = -EBUSY;
4302                if (unlikely(rb->pg_vec))
4303                        goto out;
4304
4305                switch (po->tp_version) {
4306                case TPACKET_V1:
4307                        po->tp_hdrlen = TPACKET_HDRLEN;
4308                        break;
4309                case TPACKET_V2:
4310                        po->tp_hdrlen = TPACKET2_HDRLEN;
4311                        break;
4312                case TPACKET_V3:
4313                        po->tp_hdrlen = TPACKET3_HDRLEN;
4314                        break;
4315                }
4316
4317                err = -EINVAL;
4318                if (unlikely((int)req->tp_block_size <= 0))
4319                        goto out;
4320                if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4321                        goto out;
4322                min_frame_size = po->tp_hdrlen + po->tp_reserve;
4323                if (po->tp_version >= TPACKET_V3 &&
4324                    req->tp_block_size <
4325                    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4326                        goto out;
4327                if (unlikely(req->tp_frame_size < min_frame_size))
4328                        goto out;
4329                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4330                        goto out;
4331
4332                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4333                if (unlikely(rb->frames_per_block == 0))
4334                        goto out;
4335                if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4336                        goto out;
4337                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4338                                        req->tp_frame_nr))
4339                        goto out;
4340
4341                err = -ENOMEM;
4342                order = get_order(req->tp_block_size);
4343                pg_vec = alloc_pg_vec(req, order);
4344                if (unlikely(!pg_vec))
4345                        goto out;
4346                switch (po->tp_version) {
4347                case TPACKET_V3:
4348                        /* Block transmit is not supported yet */
4349                        if (!tx_ring) {
4350                                init_prb_bdqc(po, rb, pg_vec, req_u);
4351                        } else {
4352                                struct tpacket_req3 *req3 = &req_u->req3;
4353
4354                                if (req3->tp_retire_blk_tov ||
4355                                    req3->tp_sizeof_priv ||
4356                                    req3->tp_feature_req_word) {
4357                                        err = -EINVAL;
4358                                        goto out_free_pg_vec;
4359                                }
4360                        }
4361                        break;
4362                default:
4363                        break;
4364                }
4365        }
4366        /* Done */
4367        else {
4368                err = -EINVAL;
4369                if (unlikely(req->tp_frame_nr))
4370                        goto out;
4371        }
4372
4373
4374        /* Detach socket from network */
4375        spin_lock(&po->bind_lock);
4376        was_running = po->running;
4377        num = po->num;
4378        if (was_running) {
4379                po->num = 0;
4380                __unregister_prot_hook(sk, false);
4381        }
4382        spin_unlock(&po->bind_lock);
4383
4384        synchronize_net();
4385
4386        err = -EBUSY;
4387        mutex_lock(&po->pg_vec_lock);
4388        if (closing || atomic_read(&po->mapped) == 0) {
4389                err = 0;
4390                spin_lock_bh(&rb_queue->lock);
4391                swap(rb->pg_vec, pg_vec);
4392                rb->frame_max = (req->tp_frame_nr - 1);
4393                rb->head = 0;
4394                rb->frame_size = req->tp_frame_size;
4395                spin_unlock_bh(&rb_queue->lock);
4396
4397                swap(rb->pg_vec_order, order);
4398                swap(rb->pg_vec_len, req->tp_block_nr);
4399
4400                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4401                po->prot_hook.func = (po->rx_ring.pg_vec) ?
4402                                                tpacket_rcv : packet_rcv;
4403                skb_queue_purge(rb_queue);
4404                if (atomic_read(&po->mapped))
4405                        pr_err("packet_mmap: vma is busy: %d\n",
4406                               atomic_read(&po->mapped));
4407        }
4408        mutex_unlock(&po->pg_vec_lock);
4409
4410        spin_lock(&po->bind_lock);
4411        if (was_running) {
4412                po->num = num;
4413                register_prot_hook(sk);
4414        }
4415        spin_unlock(&po->bind_lock);
4416        if (pg_vec && (po->tp_version > TPACKET_V2)) {
4417                /* Because we don't support block-based V3 on tx-ring */
4418                if (!tx_ring)
4419                        prb_shutdown_retire_blk_timer(po, rb_queue);
4420        }
4421
4422out_free_pg_vec:
4423        if (pg_vec)
4424                free_pg_vec(pg_vec, order, req->tp_block_nr);
4425out:
4426        return err;
4427}
4428
4429static int packet_mmap(struct file *file, struct socket *sock,
4430                struct vm_area_struct *vma)
4431{
4432        struct sock *sk = sock->sk;
4433        struct packet_sock *po = pkt_sk(sk);
4434        unsigned long size, expected_size;
4435        struct packet_ring_buffer *rb;
4436        unsigned long start;
4437        int err = -EINVAL;
4438        int i;
4439
4440        if (vma->vm_pgoff)
4441                return -EINVAL;
4442
4443        mutex_lock(&po->pg_vec_lock);
4444
4445        expected_size = 0;
4446        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4447                if (rb->pg_vec) {
4448                        expected_size += rb->pg_vec_len
4449                                                * rb->pg_vec_pages
4450                                                * PAGE_SIZE;
4451                }
4452        }
4453
4454        if (expected_size == 0)
4455                goto out;
4456
4457        size = vma->vm_end - vma->vm_start;
4458        if (size != expected_size)
4459                goto out;
4460
4461        start = vma->vm_start;
4462        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4463                if (rb->pg_vec == NULL)
4464                        continue;
4465
4466                for (i = 0; i < rb->pg_vec_len; i++) {
4467                        struct page *page;
4468                        void *kaddr = rb->pg_vec[i].buffer;
4469                        int pg_num;
4470
4471                        for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4472                                page = pgv_to_page(kaddr);
4473                                err = vm_insert_page(vma, start, page);
4474                                if (unlikely(err))
4475                                        goto out;
4476                                start += PAGE_SIZE;
4477                                kaddr += PAGE_SIZE;
4478                        }
4479                }
4480        }
4481
4482        atomic_inc(&po->mapped);
4483        vma->vm_ops = &packet_mmap_ops;
4484        err = 0;
4485
4486out:
4487        mutex_unlock(&po->pg_vec_lock);
4488        return err;
4489}
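
/* Usage sketch (illustrative only): packet_mmap() above maps the RX ring
 * followed by the TX ring (when both are configured) as one contiguous
 * area; vm_pgoff must be zero and the length must match the total ring
 * size exactly.  "req" is assumed to be the tpacket_req3 passed to
 * PACKET_RX_RING earlier.
 *
 *	#include <sys/mman.h>
 *
 *	size_t ring_len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * Each block then begins at ring + i * tp_block_size and is parsed with
 * the struct tpacket_block_desc / struct tpacket3_hdr layout from
 * <linux/if_packet.h>.
 */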
4490
4491static const struct proto_ops packet_ops_spkt = {
4492        .family =       PF_PACKET,
4493        .owner =        THIS_MODULE,
4494        .release =      packet_release,
4495        .bind =         packet_bind_spkt,
4496        .connect =      sock_no_connect,
4497        .socketpair =   sock_no_socketpair,
4498        .accept =       sock_no_accept,
4499        .getname =      packet_getname_spkt,
4500        .poll =         datagram_poll,
4501        .ioctl =        packet_ioctl,
4502        .gettstamp =    sock_gettstamp,
4503        .listen =       sock_no_listen,
4504        .shutdown =     sock_no_shutdown,
4505        .setsockopt =   sock_no_setsockopt,
4506        .getsockopt =   sock_no_getsockopt,
4507        .sendmsg =      packet_sendmsg_spkt,
4508        .recvmsg =      packet_recvmsg,
4509        .mmap =         sock_no_mmap,
4510        .sendpage =     sock_no_sendpage,
4511};
4512
4513static const struct proto_ops packet_ops = {
4514        .family =       PF_PACKET,
4515        .owner =        THIS_MODULE,
4516        .release =      packet_release,
4517        .bind =         packet_bind,
4518        .connect =      sock_no_connect,
4519        .socketpair =   sock_no_socketpair,
4520        .accept =       sock_no_accept,
4521        .getname =      packet_getname,
4522        .poll =         packet_poll,
4523        .ioctl =        packet_ioctl,
4524        .gettstamp =    sock_gettstamp,
4525        .listen =       sock_no_listen,
4526        .shutdown =     sock_no_shutdown,
4527        .setsockopt =   packet_setsockopt,
4528        .getsockopt =   packet_getsockopt,
4529#ifdef CONFIG_COMPAT
4530        .compat_setsockopt = compat_packet_setsockopt,
4531#endif
4532        .sendmsg =      packet_sendmsg,
4533        .recvmsg =      packet_recvmsg,
4534        .mmap =         packet_mmap,
4535        .sendpage =     sock_no_sendpage,
4536};
4537
4538static const struct net_proto_family packet_family_ops = {
4539        .family =       PF_PACKET,
4540        .create =       packet_create,
4541        .owner  =       THIS_MODULE,
4542};
4543
4544static struct notifier_block packet_netdev_notifier = {
4545        .notifier_call =        packet_notifier,
4546};
4547
4548#ifdef CONFIG_PROC_FS
4549
4550static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4551        __acquires(RCU)
4552{
4553        struct net *net = seq_file_net(seq);
4554
4555        rcu_read_lock();
4556        return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4557}
4558
4559static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4560{
4561        struct net *net = seq_file_net(seq);
4562        return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4563}
4564
4565static void packet_seq_stop(struct seq_file *seq, void *v)
4566        __releases(RCU)
4567{
4568        rcu_read_unlock();
4569}
4570
4571static int packet_seq_show(struct seq_file *seq, void *v)
4572{
4573        if (v == SEQ_START_TOKEN)
4574                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
4575        else {
4576                struct sock *s = sk_entry(v);
4577                const struct packet_sock *po = pkt_sk(s);
4578
4579                seq_printf(seq,
4580                           "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
4581                           s,
4582                           refcount_read(&s->sk_refcnt),
4583                           s->sk_type,
4584                           ntohs(po->num),
4585                           po->ifindex,
4586                           po->running,
4587                           atomic_read(&s->sk_rmem_alloc),
4588                           from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4589                           sock_i_ino(s));
4590        }
4591
4592        return 0;
4593}
4594
4595static const struct seq_operations packet_seq_ops = {
4596        .start  = packet_seq_start,
4597        .next   = packet_seq_next,
4598        .stop   = packet_seq_stop,
4599        .show   = packet_seq_show,
4600};
4601#endif
4602
4603static int __net_init packet_net_init(struct net *net)
4604{
4605        mutex_init(&net->packet.sklist_lock);
4606        INIT_HLIST_HEAD(&net->packet.sklist);
4607
4608        if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4609                        sizeof(struct seq_net_private)))
4610                return -ENOMEM;
4611
4612        return 0;
4613}
4614
4615static void __net_exit packet_net_exit(struct net *net)
4616{
4617        remove_proc_entry("packet", net->proc_net);
4618        WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4619}
4620
4621static struct pernet_operations packet_net_ops = {
4622        .init = packet_net_init,
4623        .exit = packet_net_exit,
4624};
4625
4626
4627static void __exit packet_exit(void)
4628{
4629        unregister_netdevice_notifier(&packet_netdev_notifier);
4630        unregister_pernet_subsys(&packet_net_ops);
4631        sock_unregister(PF_PACKET);
4632        proto_unregister(&packet_proto);
4633}
4634
4635static int __init packet_init(void)
4636{
4637        int rc;
4638
4639        rc = proto_register(&packet_proto, 0);
4640        if (rc)
4641                goto out;
4642        rc = sock_register(&packet_family_ops);
4643        if (rc)
4644                goto out_proto;
4645        rc = register_pernet_subsys(&packet_net_ops);
4646        if (rc)
4647                goto out_sock;
4648        rc = register_netdevice_notifier(&packet_netdev_notifier);
4649        if (rc)
4650                goto out_pernet;
4651
4652        return 0;
4653
4654out_pernet:
4655        unregister_pernet_subsys(&packet_net_ops);
4656out_sock:
4657        sock_unregister(PF_PACKET);
4658out_proto:
4659        proto_unregister(&packet_proto);
4660out:
4661        return rc;
4662}
4663
4664module_init(packet_init);
4665module_exit(packet_exit);
4666MODULE_LICENSE("GPL");
4667MODULE_ALIAS_NETPROTO(PF_PACKET);
4668