linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/net_tstamp.h>
 136#include <linux/static_key.h>
 137#include <net/flow_keys.h>
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147/*
 148 *      The list of packet types we will receive (as opposed to discard)
 149 *      and the routines to invoke.
 150 *
 151 *      Why 16. Because with 16 the only overlap we get on a hash of the
 152 *      low nibble of the protocol value is RARP/SNAP/X.25.
 153 *
 154 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155 *             sure which should go first, but I bet it won't make much
 156 *             difference if we are running VLANs.  The good news is that
 157 *             this protocol won't be in the list unless compiled in, so
 158 *             the average user (w/out VLANs) will not be adversely affected.
 159 *             --BLG
 160 *
 161 *              0800    IP
 162 *              8100    802.1Q VLAN
 163 *              0001    802.3
 164 *              0002    AX.25
 165 *              0004    802.2
 166 *              8035    RARP
 167 *              0005    SNAP
 168 *              0805    X.25
 169 *              0806    ARP
 170 *              8137    IPX
 171 *              0009    Localtalk
 172 *              86DD    IPv6
 173 */
 174
 175#define PTYPE_HASH_SIZE (16)
 176#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178static DEFINE_SPINLOCK(ptype_lock);
 179static DEFINE_SPINLOCK(offload_lock);
 180static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 181static struct list_head ptype_all __read_mostly;        /* Taps */
 182static struct list_head offload_base __read_mostly;
 183
 184/*
 185 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186 * semaphore.
 187 *
 188 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189 *
 190 * Writers must hold the rtnl semaphore while they loop through the
 191 * dev_base_head list, and hold dev_base_lock for writing when they do the
 192 * actual updates.  This allows pure readers to access the list even
 193 * while a writer is preparing to update it.
 194 *
 195 * To put it another way, dev_base_lock is held for writing only to
 196 * protect against pure readers; the rtnl semaphore provides the
 197 * protection against other writers.
 198 *
 199 * See, for example usages, register_netdevice() and
 200 * unregister_netdevice(), which must be called with the rtnl
 201 * semaphore held.
 202 */
 203DEFINE_RWLOCK(dev_base_lock);
 204EXPORT_SYMBOL(dev_base_lock);
 205
 206seqcount_t devnet_rename_seq;
 207
 208static inline void dev_base_seq_inc(struct net *net)
 209{
 210        while (++net->dev_base_seq == 0);
 211}
 212
 213static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 214{
 215        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 216
 217        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 218}
 219
 220static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 221{
 222        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 223}
 224
 225static inline void rps_lock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228        spin_lock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232static inline void rps_unlock(struct softnet_data *sd)
 233{
 234#ifdef CONFIG_RPS
 235        spin_unlock(&sd->input_pkt_queue.lock);
 236#endif
 237}
 238
 239/* Device list insertion */
 240static int list_netdevice(struct net_device *dev)
 241{
 242        struct net *net = dev_net(dev);
 243
 244        ASSERT_RTNL();
 245
 246        write_lock_bh(&dev_base_lock);
 247        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 248        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 249        hlist_add_head_rcu(&dev->index_hlist,
 250                           dev_index_hash(net, dev->ifindex));
 251        write_unlock_bh(&dev_base_lock);
 252
 253        dev_base_seq_inc(net);
 254
 255        return 0;
 256}
 257
 258/* Device list removal
 259 * caller must respect a RCU grace period before freeing/reusing dev
 260 */
 261static void unlist_netdevice(struct net_device *dev)
 262{
 263        ASSERT_RTNL();
 264
 265        /* Unlink dev from the device chain */
 266        write_lock_bh(&dev_base_lock);
 267        list_del_rcu(&dev->dev_list);
 268        hlist_del_rcu(&dev->name_hlist);
 269        hlist_del_rcu(&dev->index_hlist);
 270        write_unlock_bh(&dev_base_lock);
 271
 272        dev_base_seq_inc(dev_net(dev));
 273}
 274
 275/*
 276 *      Our notifier list
 277 */
 278
 279static RAW_NOTIFIER_HEAD(netdev_chain);
 280
 281/*
 282 *      Device drivers call our routines to queue packets here. We empty the
 283 *      queue in the local softnet handler.
 284 */
 285
 286DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 287EXPORT_PER_CPU_SYMBOL(softnet_data);
 288
 289#ifdef CONFIG_LOCKDEP
 290/*
 291 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 292 * according to dev->type
 293 */
 294static const unsigned short netdev_lock_type[] =
 295        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 296         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 297         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 298         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 299         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 300         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 301         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 302         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 303         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 304         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 305         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 306         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 307         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 308         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 309         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 310
 311static const char *const netdev_lock_name[] =
 312        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 313         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 314         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 315         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 316         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 317         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 318         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 319         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 320         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 321         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 322         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 323         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 324         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 325         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 326         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 327
 328static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 330
 331static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 332{
 333        int i;
 334
 335        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 336                if (netdev_lock_type[i] == dev_type)
 337                        return i;
 338        /* the last key is used by default */
 339        return ARRAY_SIZE(netdev_lock_type) - 1;
 340}
 341
 342static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 343                                                 unsigned short dev_type)
 344{
 345        int i;
 346
 347        i = netdev_lock_pos(dev_type);
 348        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 349                                   netdev_lock_name[i]);
 350}
 351
 352static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 353{
 354        int i;
 355
 356        i = netdev_lock_pos(dev->type);
 357        lockdep_set_class_and_name(&dev->addr_list_lock,
 358                                   &netdev_addr_lock_key[i],
 359                                   netdev_lock_name[i]);
 360}
 361#else
 362static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 363                                                 unsigned short dev_type)
 364{
 365}
 366static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 367{
 368}
 369#endif
 370
 371/*******************************************************************************
 372
 373                Protocol management and registration routines
 374
 375*******************************************************************************/
 376
 377/*
 378 *      Add a protocol ID to the list. Now that the input handler is
 379 *      smarter we can dispense with all the messy stuff that used to be
 380 *      here.
 381 *
  382 *      BEWARE!!! Protocol handlers that mangle input packets
  383 *      MUST BE last in the hash buckets, and the walk over protocol
  384 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
  385 *      That is true now; do not change it.
  386 *      Explanation: if a handler that mangles the packet is first on
  387 *      the list, it cannot tell that the packet is cloned and should be
  388 *      copied-on-write, so it will modify it in place and subsequent
  389 *      readers will get a broken packet.
 390 *                                                      --ANK (980803)
 391 */
 392
 393static inline struct list_head *ptype_head(const struct packet_type *pt)
 394{
 395        if (pt->type == htons(ETH_P_ALL))
 396                return &ptype_all;
 397        else
 398                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399}
 400
 401/**
 402 *      dev_add_pack - add packet handler
 403 *      @pt: packet type declaration
 404 *
 405 *      Add a protocol handler to the networking stack. The passed &packet_type
 406 *      is linked into kernel lists and may not be freed until it has been
 407 *      removed from the kernel lists.
 408 *
 409 *      This call does not sleep therefore it can not
 410 *      guarantee all CPU's that are in middle of receiving packets
 411 *      will see the new packet type (until the next received packet).
 412 */
 413
 414void dev_add_pack(struct packet_type *pt)
 415{
 416        struct list_head *head = ptype_head(pt);
 417
 418        spin_lock(&ptype_lock);
 419        list_add_rcu(&pt->list, head);
 420        spin_unlock(&ptype_lock);
 421}
 422EXPORT_SYMBOL(dev_add_pack);
 423
 424/**
 425 *      __dev_remove_pack        - remove packet handler
 426 *      @pt: packet type declaration
 427 *
 428 *      Remove a protocol handler that was previously added to the kernel
 429 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430 *      from the kernel lists and can be freed or reused once this function
 431 *      returns.
 432 *
 433 *      The packet type might still be in use by receivers
  434 *      and must not be freed until after all the CPUs have gone
 435 *      through a quiescent state.
 436 */
 437void __dev_remove_pack(struct packet_type *pt)
 438{
 439        struct list_head *head = ptype_head(pt);
 440        struct packet_type *pt1;
 441
 442        spin_lock(&ptype_lock);
 443
 444        list_for_each_entry(pt1, head, list) {
 445                if (pt == pt1) {
 446                        list_del_rcu(&pt->list);
 447                        goto out;
 448                }
 449        }
 450
 451        pr_warn("dev_remove_pack: %p not found\n", pt);
 452out:
 453        spin_unlock(&ptype_lock);
 454}
 455EXPORT_SYMBOL(__dev_remove_pack);
 456
 457/**
 458 *      dev_remove_pack  - remove packet handler
 459 *      @pt: packet type declaration
 460 *
 461 *      Remove a protocol handler that was previously added to the kernel
 462 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 463 *      from the kernel lists and can be freed or reused once this function
 464 *      returns.
 465 *
 466 *      This call sleeps to guarantee that no CPU is looking at the packet
 467 *      type after return.
 468 */
 469void dev_remove_pack(struct packet_type *pt)
 470{
 471        __dev_remove_pack(pt);
 472
 473        synchronize_net();
 474}
 475EXPORT_SYMBOL(dev_remove_pack);
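/* Example (editor's illustrative sketch, not part of dev.c): a module can
 * tap every received frame by registering a &packet_type for ETH_P_ALL.
 * The handler and variable names below are hypothetical.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		   (we own this reference and must free it)
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		   (e.g. in module_init)
 *	dev_remove_pack(&my_tap);	   (in module_exit; sleeps in synchronize_net())
 */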
 476
 477
 478/**
 479 *      dev_add_offload - register offload handlers
 480 *      @po: protocol offload declaration
 481 *
 482 *      Add protocol offload handlers to the networking stack. The passed
 483 *      &proto_offload is linked into kernel lists and may not be freed until
 484 *      it has been removed from the kernel lists.
 485 *
  486 *      This call does not sleep, therefore it cannot guarantee that
  487 *      all CPUs that are in the middle of receiving packets will see
  488 *      the new offload handlers (until the next received packet).
 489 */
 490void dev_add_offload(struct packet_offload *po)
 491{
 492        struct list_head *head = &offload_base;
 493
 494        spin_lock(&offload_lock);
 495        list_add_rcu(&po->list, head);
 496        spin_unlock(&offload_lock);
 497}
 498EXPORT_SYMBOL(dev_add_offload);
 499
 500/**
 501 *      __dev_remove_offload     - remove offload handler
 502 *      @po: packet offload declaration
 503 *
 504 *      Remove a protocol offload handler that was previously added to the
 505 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 506 *      is removed from the kernel lists and can be freed or reused once this
 507 *      function returns.
 508 *
 509 *      The packet type might still be in use by receivers
  510 *      and must not be freed until after all the CPUs have gone
 511 *      through a quiescent state.
 512 */
 513void __dev_remove_offload(struct packet_offload *po)
 514{
 515        struct list_head *head = &offload_base;
 516        struct packet_offload *po1;
 517
 518        spin_lock(&offload_lock);
 519
 520        list_for_each_entry(po1, head, list) {
 521                if (po == po1) {
 522                        list_del_rcu(&po->list);
 523                        goto out;
 524                }
 525        }
 526
 527        pr_warn("dev_remove_offload: %p not found\n", po);
 528out:
 529        spin_unlock(&offload_lock);
 530}
 531EXPORT_SYMBOL(__dev_remove_offload);
 532
 533/**
 534 *      dev_remove_offload       - remove packet offload handler
 535 *      @po: packet offload declaration
 536 *
 537 *      Remove a packet offload handler that was previously added to the kernel
 538 *      offload handlers by dev_add_offload(). The passed &offload_type is
 539 *      removed from the kernel lists and can be freed or reused once this
 540 *      function returns.
 541 *
 542 *      This call sleeps to guarantee that no CPU is looking at the packet
 543 *      type after return.
 544 */
 545void dev_remove_offload(struct packet_offload *po)
 546{
 547        __dev_remove_offload(po);
 548
 549        synchronize_net();
 550}
 551EXPORT_SYMBOL(dev_remove_offload);
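/* Example (editor's sketch): GRO/GSO handlers for an ethertype are registered
 * through a &packet_offload. The callback names below are hypothetical
 * placeholders for a protocol's real implementations.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment  = my_gso_segment,
 *			.gro_receive  = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 *	dev_remove_offload(&my_offload);   (sleeps in synchronize_net())
 */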
 552
 553/******************************************************************************
 554
 555                      Device Boot-time Settings Routines
 556
 557*******************************************************************************/
 558
 559/* Boot time configuration table */
 560static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 561
 562/**
 563 *      netdev_boot_setup_add   - add new setup entry
 564 *      @name: name of the device
 565 *      @map: configured settings for the device
 566 *
  567 *      Adds a new setup entry to the dev_boot_setup list.  The function
  568 *      returns 0 on error and 1 on success.  This is a generic routine for
  569 *      all netdevices.
 570 */
 571static int netdev_boot_setup_add(char *name, struct ifmap *map)
 572{
 573        struct netdev_boot_setup *s;
 574        int i;
 575
 576        s = dev_boot_setup;
 577        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 579                        memset(s[i].name, 0, sizeof(s[i].name));
 580                        strlcpy(s[i].name, name, IFNAMSIZ);
 581                        memcpy(&s[i].map, map, sizeof(s[i].map));
 582                        break;
 583                }
 584        }
 585
 586        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 587}
 588
 589/**
 590 *      netdev_boot_setup_check - check boot time settings
 591 *      @dev: the netdevice
 592 *
  593 *      Check boot time settings for the device.
  594 *      Any settings found are copied to the device to be used
  595 *      later during device probing.
  596 *      Returns 0 if no settings are found, 1 if they are.
 597 */
 598int netdev_boot_setup_check(struct net_device *dev)
 599{
 600        struct netdev_boot_setup *s = dev_boot_setup;
 601        int i;
 602
 603        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 604                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 605                    !strcmp(dev->name, s[i].name)) {
 606                        dev->irq        = s[i].map.irq;
 607                        dev->base_addr  = s[i].map.base_addr;
 608                        dev->mem_start  = s[i].map.mem_start;
 609                        dev->mem_end    = s[i].map.mem_end;
 610                        return 1;
 611                }
 612        }
 613        return 0;
 614}
 615EXPORT_SYMBOL(netdev_boot_setup_check);
 616
 617
 618/**
 619 *      netdev_boot_base        - get address from boot time settings
 620 *      @prefix: prefix for network device
 621 *      @unit: id for network device
 622 *
  623 *      Check boot time settings for the base address of the device.
  624 *      Returns the configured base address, 1 if the device is already
  625 *      registered (so it should not be probed), or 0 if no settings
  626 *      are found.
 627 */
 628unsigned long netdev_boot_base(const char *prefix, int unit)
 629{
 630        const struct netdev_boot_setup *s = dev_boot_setup;
 631        char name[IFNAMSIZ];
 632        int i;
 633
 634        sprintf(name, "%s%d", prefix, unit);
 635
 636        /*
  637         * If the device is already registered, return a base of 1
 638         * to indicate not to probe for this interface
 639         */
 640        if (__dev_get_by_name(&init_net, name))
 641                return 1;
 642
 643        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 644                if (!strcmp(name, s[i].name))
 645                        return s[i].map.base_addr;
 646        return 0;
 647}
 648
 649/*
  650 * Saves the settings configured at boot time ("netdev=") for any netdevice.
 651 */
 652int __init netdev_boot_setup(char *str)
 653{
 654        int ints[5];
 655        struct ifmap map;
 656
 657        str = get_options(str, ARRAY_SIZE(ints), ints);
 658        if (!str || !*str)
 659                return 0;
 660
 661        /* Save settings */
 662        memset(&map, 0, sizeof(map));
 663        if (ints[0] > 0)
 664                map.irq = ints[1];
 665        if (ints[0] > 1)
 666                map.base_addr = ints[2];
 667        if (ints[0] > 2)
 668                map.mem_start = ints[3];
 669        if (ints[0] > 3)
 670                map.mem_end = ints[4];
 671
 672        /* Add new entry to the list */
 673        return netdev_boot_setup_add(str, &map);
 674}
 675
 676__setup("netdev=", netdev_boot_setup);
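/* Example (editor's note): the "netdev=" parameter is parsed above as up to
 * four integers (irq, base_addr, mem_start, mem_end) followed by the
 * interface name, e.g. on the kernel command line:
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * records irq 5 and I/O base 0x340 for "eth0"; a driver later retrieves
 * them via netdev_boot_setup_check() or netdev_boot_base().
 */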
 677
 678/*******************************************************************************
 679
 680                            Device Interface Subroutines
 681
 682*******************************************************************************/
 683
 684/**
 685 *      __dev_get_by_name       - find a device by its name
 686 *      @net: the applicable net namespace
 687 *      @name: name to find
 688 *
 689 *      Find an interface by name. Must be called under RTNL semaphore
 690 *      or @dev_base_lock. If the name is found a pointer to the device
 691 *      is returned. If the name is not found then %NULL is returned. The
 692 *      reference counters are not incremented so the caller must be
 693 *      careful with locks.
 694 */
 695
 696struct net_device *__dev_get_by_name(struct net *net, const char *name)
 697{
 698        struct hlist_node *p;
 699        struct net_device *dev;
 700        struct hlist_head *head = dev_name_hash(net, name);
 701
 702        hlist_for_each_entry(dev, p, head, name_hlist)
 703                if (!strncmp(dev->name, name, IFNAMSIZ))
 704                        return dev;
 705
 706        return NULL;
 707}
 708EXPORT_SYMBOL(__dev_get_by_name);
 709
 710/**
 711 *      dev_get_by_name_rcu     - find a device by its name
 712 *      @net: the applicable net namespace
 713 *      @name: name to find
 714 *
 715 *      Find an interface by name.
 716 *      If the name is found a pointer to the device is returned.
 717 *      If the name is not found then %NULL is returned.
 718 *      The reference counters are not incremented so the caller must be
 719 *      careful with locks. The caller must hold RCU lock.
 720 */
 721
 722struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 723{
 724        struct hlist_node *p;
 725        struct net_device *dev;
 726        struct hlist_head *head = dev_name_hash(net, name);
 727
 728        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 729                if (!strncmp(dev->name, name, IFNAMSIZ))
 730                        return dev;
 731
 732        return NULL;
 733}
 734EXPORT_SYMBOL(dev_get_by_name_rcu);
 735
 736/**
 737 *      dev_get_by_name         - find a device by its name
 738 *      @net: the applicable net namespace
 739 *      @name: name to find
 740 *
 741 *      Find an interface by name. This can be called from any
 742 *      context and does its own locking. The returned handle has
 743 *      the usage count incremented and the caller must use dev_put() to
 744 *      release it when it is no longer needed. %NULL is returned if no
 745 *      matching device is found.
 746 */
 747
 748struct net_device *dev_get_by_name(struct net *net, const char *name)
 749{
 750        struct net_device *dev;
 751
 752        rcu_read_lock();
 753        dev = dev_get_by_name_rcu(net, name);
 754        if (dev)
 755                dev_hold(dev);
 756        rcu_read_unlock();
 757        return dev;
 758}
 759EXPORT_SYMBOL(dev_get_by_name);
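/* Example (editor's sketch): dev_get_by_name() takes a reference that the
 * caller must drop with dev_put(); "eth0" is only an example name.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		...use dev; the reference keeps it alive...
 *		dev_put(dev);
 *	}
 */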
 760
 761/**
 762 *      __dev_get_by_index - find a device by its ifindex
 763 *      @net: the applicable net namespace
 764 *      @ifindex: index of device
 765 *
  766 *      Search for an interface by index. Returns a pointer to the device,
  767 *      or %NULL if the device is not found. The device has not
 768 *      had its reference counter increased so the caller must be careful
 769 *      about locking. The caller must hold either the RTNL semaphore
 770 *      or @dev_base_lock.
 771 */
 772
 773struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 774{
 775        struct hlist_node *p;
 776        struct net_device *dev;
 777        struct hlist_head *head = dev_index_hash(net, ifindex);
 778
 779        hlist_for_each_entry(dev, p, head, index_hlist)
 780                if (dev->ifindex == ifindex)
 781                        return dev;
 782
 783        return NULL;
 784}
 785EXPORT_SYMBOL(__dev_get_by_index);
 786
 787/**
 788 *      dev_get_by_index_rcu - find a device by its ifindex
 789 *      @net: the applicable net namespace
 790 *      @ifindex: index of device
 791 *
  792 *      Search for an interface by index. Returns a pointer to the device,
  793 *      or %NULL if the device is not found. The device has not
 794 *      had its reference counter increased so the caller must be careful
 795 *      about locking. The caller must hold RCU lock.
 796 */
 797
 798struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 799{
 800        struct hlist_node *p;
 801        struct net_device *dev;
 802        struct hlist_head *head = dev_index_hash(net, ifindex);
 803
 804        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 805                if (dev->ifindex == ifindex)
 806                        return dev;
 807
 808        return NULL;
 809}
 810EXPORT_SYMBOL(dev_get_by_index_rcu);
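/* Example (editor's sketch): the _rcu lookups take no reference, so the
 * returned pointer is only valid inside the rcu_read_lock() section unless
 * dev_hold() is called before unlocking (as dev_get_by_index() does below).
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		flags = dev->flags;	   (do not sleep or cache the pointer)
 *	rcu_read_unlock();
 */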
 811
 812
 813/**
 814 *      dev_get_by_index - find a device by its ifindex
 815 *      @net: the applicable net namespace
 816 *      @ifindex: index of device
 817 *
  818 *      Search for an interface by index. Returns a pointer to the device,
  819 *      or %NULL if the device is not found. The device returned has
 820 *      had a reference added and the pointer is safe until the user calls
 821 *      dev_put to indicate they have finished with it.
 822 */
 823
 824struct net_device *dev_get_by_index(struct net *net, int ifindex)
 825{
 826        struct net_device *dev;
 827
 828        rcu_read_lock();
 829        dev = dev_get_by_index_rcu(net, ifindex);
 830        if (dev)
 831                dev_hold(dev);
 832        rcu_read_unlock();
 833        return dev;
 834}
 835EXPORT_SYMBOL(dev_get_by_index);
 836
 837/**
 838 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 839 *      @net: the applicable net namespace
 840 *      @type: media type of device
 841 *      @ha: hardware address
 842 *
  843 *      Search for an interface by MAC address. Returns a pointer to the
  844 *      device, or %NULL if the device is not found.
 845 *      The caller must hold RCU or RTNL.
 846 *      The returned device has not had its ref count increased
  847 *      and the caller must therefore be careful about locking.
 848 *
 849 */
 850
 851struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 852                                       const char *ha)
 853{
 854        struct net_device *dev;
 855
 856        for_each_netdev_rcu(net, dev)
 857                if (dev->type == type &&
 858                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 859                        return dev;
 860
 861        return NULL;
 862}
 863EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 864
 865struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 866{
 867        struct net_device *dev;
 868
 869        ASSERT_RTNL();
 870        for_each_netdev(net, dev)
 871                if (dev->type == type)
 872                        return dev;
 873
 874        return NULL;
 875}
 876EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 877
 878struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 879{
 880        struct net_device *dev, *ret = NULL;
 881
 882        rcu_read_lock();
 883        for_each_netdev_rcu(net, dev)
 884                if (dev->type == type) {
 885                        dev_hold(dev);
 886                        ret = dev;
 887                        break;
 888                }
 889        rcu_read_unlock();
 890        return ret;
 891}
 892EXPORT_SYMBOL(dev_getfirstbyhwtype);
 893
 894/**
 895 *      dev_get_by_flags_rcu - find any device with given flags
 896 *      @net: the applicable net namespace
 897 *      @if_flags: IFF_* values
 898 *      @mask: bitmask of bits in if_flags to check
 899 *
  900 *      Search for any interface with the given flags. Returns a pointer to
  901 *      the device, or %NULL if no device is found. Must be called inside
  902 *      rcu_read_lock(), and the result's refcount is unchanged.
 903 */
 904
 905struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 906                                    unsigned short mask)
 907{
 908        struct net_device *dev, *ret;
 909
 910        ret = NULL;
 911        for_each_netdev_rcu(net, dev) {
 912                if (((dev->flags ^ if_flags) & mask) == 0) {
 913                        ret = dev;
 914                        break;
 915                }
 916        }
 917        return ret;
 918}
 919EXPORT_SYMBOL(dev_get_by_flags_rcu);
 920
 921/**
 922 *      dev_valid_name - check if name is okay for network device
 923 *      @name: name string
 924 *
  925 *      Network device names need to be valid file names
  926 *      to allow sysfs to work.  We also disallow any kind of
 927 *      whitespace.
 928 */
 929bool dev_valid_name(const char *name)
 930{
 931        if (*name == '\0')
 932                return false;
 933        if (strlen(name) >= IFNAMSIZ)
 934                return false;
 935        if (!strcmp(name, ".") || !strcmp(name, ".."))
 936                return false;
 937
 938        while (*name) {
 939                if (*name == '/' || isspace(*name))
 940                        return false;
 941                name++;
 942        }
 943        return true;
 944}
 945EXPORT_SYMBOL(dev_valid_name);
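/* Example (editor's note): "eth0", "wlan-1" and "a.b" pass dev_valid_name(),
 * while "", ".", "..", "eth 0", "eth/0" and any name of IFNAMSIZ (16) or
 * more characters are rejected.
 */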
 946
 947/**
 948 *      __dev_alloc_name - allocate a name for a device
 949 *      @net: network namespace to allocate the device name in
 950 *      @name: name format string
 951 *      @buf:  scratch buffer and result name string
 952 *
  953 *      Passed a format string - eg "lt%d" - it will try to find a suitable
  954 *      id. It scans the list of devices to build up a free map, then chooses
 955 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 956 *      while allocating the name and adding the device in order to avoid
 957 *      duplicates.
 958 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 959 *      Returns the number of the unit assigned or a negative errno code.
 960 */
 961
 962static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 963{
 964        int i = 0;
 965        const char *p;
 966        const int max_netdevices = 8*PAGE_SIZE;
 967        unsigned long *inuse;
 968        struct net_device *d;
 969
 970        p = strnchr(name, IFNAMSIZ-1, '%');
 971        if (p) {
 972                /*
 973                 * Verify the string as this thing may have come from
  974                 * the user.  There must be exactly one "%d" and no other "%"
 975                 * characters.
 976                 */
 977                if (p[1] != 'd' || strchr(p + 2, '%'))
 978                        return -EINVAL;
 979
 980                /* Use one page as a bit array of possible slots */
 981                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 982                if (!inuse)
 983                        return -ENOMEM;
 984
 985                for_each_netdev(net, d) {
 986                        if (!sscanf(d->name, name, &i))
 987                                continue;
 988                        if (i < 0 || i >= max_netdevices)
 989                                continue;
 990
 991                        /*  avoid cases where sscanf is not exact inverse of printf */
 992                        snprintf(buf, IFNAMSIZ, name, i);
 993                        if (!strncmp(buf, d->name, IFNAMSIZ))
 994                                set_bit(i, inuse);
 995                }
 996
 997                i = find_first_zero_bit(inuse, max_netdevices);
 998                free_page((unsigned long) inuse);
 999        }
1000
1001        if (buf != name)
1002                snprintf(buf, IFNAMSIZ, name, i);
1003        if (!__dev_get_by_name(net, buf))
1004                return i;
1005
1006        /* It is possible to run out of possible slots
1007         * when the name is long and there isn't enough space left
1008         * for the digits, or if all bits are used.
1009         */
1010        return -ENFILE;
1011}
1012
1013/**
1014 *      dev_alloc_name - allocate a name for a device
1015 *      @dev: device
1016 *      @name: name format string
1017 *
 1018 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1019 *      id. It scans the list of devices to build up a free map, then chooses
1020 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1021 *      while allocating the name and adding the device in order to avoid
1022 *      duplicates.
1023 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024 *      Returns the number of the unit assigned or a negative errno code.
1025 */
1026
1027int dev_alloc_name(struct net_device *dev, const char *name)
1028{
1029        char buf[IFNAMSIZ];
1030        struct net *net;
1031        int ret;
1032
1033        BUG_ON(!dev_net(dev));
1034        net = dev_net(dev);
1035        ret = __dev_alloc_name(net, name, buf);
1036        if (ret >= 0)
1037                strlcpy(dev->name, buf, IFNAMSIZ);
1038        return ret;
1039}
1040EXPORT_SYMBOL(dev_alloc_name);
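/* Example (editor's sketch): a driver that does not care about the exact
 * unit number passes a "%d" pattern, typically under RTNL during
 * registration; the first free unit is chosen.
 *
 *	err = dev_alloc_name(dev, "dummy%d");	   (may pick "dummy0")
 *	if (err < 0)
 *		goto fail;			   (hypothetical error path)
 */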
1041
1042static int dev_alloc_name_ns(struct net *net,
1043                             struct net_device *dev,
1044                             const char *name)
1045{
1046        char buf[IFNAMSIZ];
1047        int ret;
1048
1049        ret = __dev_alloc_name(net, name, buf);
1050        if (ret >= 0)
1051                strlcpy(dev->name, buf, IFNAMSIZ);
1052        return ret;
1053}
1054
1055static int dev_get_valid_name(struct net *net,
1056                              struct net_device *dev,
1057                              const char *name)
1058{
1059        BUG_ON(!net);
1060
1061        if (!dev_valid_name(name))
1062                return -EINVAL;
1063
1064        if (strchr(name, '%'))
1065                return dev_alloc_name_ns(net, dev, name);
1066        else if (__dev_get_by_name(net, name))
1067                return -EEXIST;
1068        else if (dev->name != name)
1069                strlcpy(dev->name, name, IFNAMSIZ);
1070
1071        return 0;
1072}
1073
1074/**
1075 *      dev_change_name - change name of a device
1076 *      @dev: device
1077 *      @newname: name (or format string) must be at least IFNAMSIZ
1078 *
 1079 *      Change the name of a device; a format string such as "eth%d"
 1080 *      can be passed for wildcarding.
1081 */
1082int dev_change_name(struct net_device *dev, const char *newname)
1083{
1084        char oldname[IFNAMSIZ];
1085        int err = 0;
1086        int ret;
1087        struct net *net;
1088
1089        ASSERT_RTNL();
1090        BUG_ON(!dev_net(dev));
1091
1092        net = dev_net(dev);
1093        if (dev->flags & IFF_UP)
1094                return -EBUSY;
1095
1096        write_seqcount_begin(&devnet_rename_seq);
1097
1098        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099                write_seqcount_end(&devnet_rename_seq);
1100                return 0;
1101        }
1102
1103        memcpy(oldname, dev->name, IFNAMSIZ);
1104
1105        err = dev_get_valid_name(net, dev, newname);
1106        if (err < 0) {
1107                write_seqcount_end(&devnet_rename_seq);
1108                return err;
1109        }
1110
1111rollback:
1112        ret = device_rename(&dev->dev, dev->name);
1113        if (ret) {
1114                memcpy(dev->name, oldname, IFNAMSIZ);
1115                write_seqcount_end(&devnet_rename_seq);
1116                return ret;
1117        }
1118
1119        write_seqcount_end(&devnet_rename_seq);
1120
1121        write_lock_bh(&dev_base_lock);
1122        hlist_del_rcu(&dev->name_hlist);
1123        write_unlock_bh(&dev_base_lock);
1124
1125        synchronize_rcu();
1126
1127        write_lock_bh(&dev_base_lock);
1128        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129        write_unlock_bh(&dev_base_lock);
1130
1131        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132        ret = notifier_to_errno(ret);
1133
1134        if (ret) {
1135                /* err >= 0 after dev_alloc_name() or stores the first errno */
1136                if (err >= 0) {
1137                        err = ret;
1138                        write_seqcount_begin(&devnet_rename_seq);
1139                        memcpy(dev->name, oldname, IFNAMSIZ);
1140                        goto rollback;
1141                } else {
1142                        pr_err("%s: name change rollback failed: %d\n",
1143                               dev->name, ret);
1144                }
1145        }
1146
1147        return err;
1148}
1149
1150/**
1151 *      dev_set_alias - change ifalias of a device
1152 *      @dev: device
1153 *      @alias: name up to IFALIASZ
 1154 *      @len: limit of bytes to copy from @alias
 1155 *
 1156 *      Set ifalias for a device.
1157 */
1158int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159{
1160        char *new_ifalias;
1161
1162        ASSERT_RTNL();
1163
1164        if (len >= IFALIASZ)
1165                return -EINVAL;
1166
1167        if (!len) {
1168                kfree(dev->ifalias);
1169                dev->ifalias = NULL;
1170                return 0;
1171        }
1172
1173        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174        if (!new_ifalias)
1175                return -ENOMEM;
1176        dev->ifalias = new_ifalias;
1177
1178        strlcpy(dev->ifalias, alias, len+1);
1179        return len;
1180}
1181
1182
1183/**
1184 *      netdev_features_change - device changes features
1185 *      @dev: device to cause notification
1186 *
1187 *      Called to indicate a device has changed features.
1188 */
1189void netdev_features_change(struct net_device *dev)
1190{
1191        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192}
1193EXPORT_SYMBOL(netdev_features_change);
1194
1195/**
1196 *      netdev_state_change - device changes state
1197 *      @dev: device to cause notification
1198 *
1199 *      Called to indicate a device has changed state. This function calls
1200 *      the notifier chains for netdev_chain and sends a NEWLINK message
1201 *      to the routing socket.
1202 */
1203void netdev_state_change(struct net_device *dev)
1204{
1205        if (dev->flags & IFF_UP) {
1206                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1208        }
1209}
1210EXPORT_SYMBOL(netdev_state_change);
1211
1212/**
1213 *      netdev_notify_peers - notify network peers about existence of @dev
1214 *      @dev: network device
1215 *
1216 * Generate traffic such that interested network peers are aware of
1217 * @dev, such as by generating a gratuitous ARP. This may be used when
1218 * a device wants to inform the rest of the network about some sort of
1219 * reconfiguration such as a failover event or virtual machine
1220 * migration.
1221 */
1222void netdev_notify_peers(struct net_device *dev)
1223{
1224        rtnl_lock();
1225        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226        rtnl_unlock();
1227}
1228EXPORT_SYMBOL(netdev_notify_peers);
1229
1230/**
1231 *      dev_load        - load a network module
1232 *      @net: the applicable net namespace
1233 *      @name: name of interface
1234 *
1235 *      If a network interface is not present and the process has suitable
1236 *      privileges this function loads the module. If module loading is not
1237 *      available in this kernel then it becomes a nop.
1238 */
1239
1240void dev_load(struct net *net, const char *name)
1241{
1242        struct net_device *dev;
1243        int no_module;
1244
1245        rcu_read_lock();
1246        dev = dev_get_by_name_rcu(net, name);
1247        rcu_read_unlock();
1248
1249        no_module = !dev;
1250        if (no_module && capable(CAP_NET_ADMIN))
1251                no_module = request_module("netdev-%s", name);
1252        if (no_module && capable(CAP_SYS_MODULE)) {
1253                if (!request_module("%s", name))
1254                        pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1255                                name);
1256        }
1257}
1258EXPORT_SYMBOL(dev_load);
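/* Example (editor's note): for dev_load() to auto-load a driver with only
 * CAP_NET_ADMIN, the module should provide a "netdev-" alias, e.g. in a
 * modprobe configuration file (names below are illustrative):
 *
 *	alias netdev-mytap0 my_tap_driver
 *
 * so that request_module("netdev-%s", name) can resolve it.
 */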
1259
1260static int __dev_open(struct net_device *dev)
1261{
1262        const struct net_device_ops *ops = dev->netdev_ops;
1263        int ret;
1264
1265        ASSERT_RTNL();
1266
1267        if (!netif_device_present(dev))
1268                return -ENODEV;
1269
1270        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1271        ret = notifier_to_errno(ret);
1272        if (ret)
1273                return ret;
1274
1275        set_bit(__LINK_STATE_START, &dev->state);
1276
1277        if (ops->ndo_validate_addr)
1278                ret = ops->ndo_validate_addr(dev);
1279
1280        if (!ret && ops->ndo_open)
1281                ret = ops->ndo_open(dev);
1282
1283        if (ret)
1284                clear_bit(__LINK_STATE_START, &dev->state);
1285        else {
1286                dev->flags |= IFF_UP;
1287                net_dmaengine_get();
1288                dev_set_rx_mode(dev);
1289                dev_activate(dev);
1290                add_device_randomness(dev->dev_addr, dev->addr_len);
1291        }
1292
1293        return ret;
1294}
1295
1296/**
1297 *      dev_open        - prepare an interface for use.
1298 *      @dev:   device to open
1299 *
1300 *      Takes a device from down to up state. The device's private open
1301 *      function is invoked and then the multicast lists are loaded. Finally
1302 *      the device is moved into the up state and a %NETDEV_UP message is
1303 *      sent to the netdev notifier chain.
1304 *
1305 *      Calling this function on an active interface is a nop. On a failure
1306 *      a negative errno code is returned.
1307 */
1308int dev_open(struct net_device *dev)
1309{
1310        int ret;
1311
1312        if (dev->flags & IFF_UP)
1313                return 0;
1314
1315        ret = __dev_open(dev);
1316        if (ret < 0)
1317                return ret;
1318
1319        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1320        call_netdevice_notifiers(NETDEV_UP, dev);
1321
1322        return ret;
1323}
1324EXPORT_SYMBOL(dev_open);
1325
1326static int __dev_close_many(struct list_head *head)
1327{
1328        struct net_device *dev;
1329
1330        ASSERT_RTNL();
1331        might_sleep();
1332
1333        list_for_each_entry(dev, head, unreg_list) {
1334                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1335
1336                clear_bit(__LINK_STATE_START, &dev->state);
1337
 1338                /* Synchronize to the scheduled poll. We cannot touch the poll list;
 1339                 * it may even be on a different cpu. So just clear netif_running().
1340                 *
 1341                 * dev->stop() will invoke napi_disable() on all of its
1342                 * napi_struct instances on this device.
1343                 */
1344                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1345        }
1346
1347        dev_deactivate_many(head);
1348
1349        list_for_each_entry(dev, head, unreg_list) {
1350                const struct net_device_ops *ops = dev->netdev_ops;
1351
1352                /*
 1353                 *      Call the device-specific close. This cannot fail.
 1354                 *      It is only called if the device is UP.
1355                 *
1356                 *      We allow it to be called even after a DETACH hot-plug
1357                 *      event.
1358                 */
1359                if (ops->ndo_stop)
1360                        ops->ndo_stop(dev);
1361
1362                dev->flags &= ~IFF_UP;
1363                net_dmaengine_put();
1364        }
1365
1366        return 0;
1367}
1368
1369static int __dev_close(struct net_device *dev)
1370{
1371        int retval;
1372        LIST_HEAD(single);
1373
1374        list_add(&dev->unreg_list, &single);
1375        retval = __dev_close_many(&single);
1376        list_del(&single);
1377        return retval;
1378}
1379
1380static int dev_close_many(struct list_head *head)
1381{
1382        struct net_device *dev, *tmp;
1383        LIST_HEAD(tmp_list);
1384
1385        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1386                if (!(dev->flags & IFF_UP))
1387                        list_move(&dev->unreg_list, &tmp_list);
1388
1389        __dev_close_many(head);
1390
1391        list_for_each_entry(dev, head, unreg_list) {
1392                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1393                call_netdevice_notifiers(NETDEV_DOWN, dev);
1394        }
1395
1396        /* rollback_registered_many needs the complete original list */
1397        list_splice(&tmp_list, head);
1398        return 0;
1399}
1400
1401/**
1402 *      dev_close - shutdown an interface.
1403 *      @dev: device to shutdown
1404 *
1405 *      This function moves an active device into down state. A
1406 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1407 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1408 *      chain.
1409 */
1410int dev_close(struct net_device *dev)
1411{
1412        if (dev->flags & IFF_UP) {
1413                LIST_HEAD(single);
1414
1415                list_add(&dev->unreg_list, &single);
1416                dev_close_many(&single);
1417                list_del(&single);
1418        }
1419        return 0;
1420}
1421EXPORT_SYMBOL(dev_close);
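/* Example (editor's sketch): dev_open() and dev_close() must run under the
 * RTNL, which is how the "ip link set DEV up/down" paths reach them.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);	   (no-op returning 0 if already up)
 *	...
 *	dev_close(dev);		   (always returns 0)
 *	rtnl_unlock();
 */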
1422
1423
1424/**
1425 *      dev_disable_lro - disable Large Receive Offload on a device
1426 *      @dev: device
1427 *
1428 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1429 *      called under RTNL.  This is needed if received packets may be
1430 *      forwarded to another interface.
1431 */
1432void dev_disable_lro(struct net_device *dev)
1433{
1434        /*
1435         * If we're trying to disable lro on a vlan device
1436         * use the underlying physical device instead
1437         */
1438        if (is_vlan_dev(dev))
1439                dev = vlan_dev_real_dev(dev);
1440
1441        dev->wanted_features &= ~NETIF_F_LRO;
1442        netdev_update_features(dev);
1443
1444        if (unlikely(dev->features & NETIF_F_LRO))
1445                netdev_WARN(dev, "failed to disable LRO!\n");
1446}
1447EXPORT_SYMBOL(dev_disable_lro);
1448
1449
1450static int dev_boot_phase = 1;
1451
1452/**
1453 *      register_netdevice_notifier - register a network notifier block
1454 *      @nb: notifier
1455 *
1456 *      Register a notifier to be called when network device events occur.
1457 *      The notifier passed is linked into the kernel structures and must
1458 *      not be reused until it has been unregistered. A negative errno code
1459 *      is returned on a failure.
1460 *
 1461 *      When registered, all registration and up events are replayed
 1462 *      to the new notifier to allow the device to have a race-free
 1463 *      view of the network device list.
1464 */
1465
1466int register_netdevice_notifier(struct notifier_block *nb)
1467{
1468        struct net_device *dev;
1469        struct net_device *last;
1470        struct net *net;
1471        int err;
1472
1473        rtnl_lock();
1474        err = raw_notifier_chain_register(&netdev_chain, nb);
1475        if (err)
1476                goto unlock;
1477        if (dev_boot_phase)
1478                goto unlock;
1479        for_each_net(net) {
1480                for_each_netdev(net, dev) {
1481                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1482                        err = notifier_to_errno(err);
1483                        if (err)
1484                                goto rollback;
1485
1486                        if (!(dev->flags & IFF_UP))
1487                                continue;
1488
1489                        nb->notifier_call(nb, NETDEV_UP, dev);
1490                }
1491        }
1492
1493unlock:
1494        rtnl_unlock();
1495        return err;
1496
1497rollback:
1498        last = dev;
1499        for_each_net(net) {
1500                for_each_netdev(net, dev) {
1501                        if (dev == last)
1502                                goto outroll;
1503
1504                        if (dev->flags & IFF_UP) {
1505                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1506                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1507                        }
1508                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1509                }
1510        }
1511
1512outroll:
1513        raw_notifier_chain_unregister(&netdev_chain, nb);
1514        goto unlock;
1515}
1516EXPORT_SYMBOL(register_netdevice_notifier);
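/* Example (editor's sketch): a minimal notifier; in this kernel the void
 * pointer handed to the callback is the struct net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */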
1517
1518/**
1519 *      unregister_netdevice_notifier - unregister a network notifier block
1520 *      @nb: notifier
1521 *
1522 *      Unregister a notifier previously registered by
 1523 *      register_netdevice_notifier(). The notifier is unlinked from the
1524 *      kernel structures and may then be reused. A negative errno code
1525 *      is returned on a failure.
1526 *
 1527 *      After unregistering, unregister and down device events are synthesized
 1528 *      for all devices on the device list and sent to the removed notifier to
 1529 *      remove the need for special-case cleanup code.
1530 */
1531
1532int unregister_netdevice_notifier(struct notifier_block *nb)
1533{
1534        struct net_device *dev;
1535        struct net *net;
1536        int err;
1537
1538        rtnl_lock();
1539        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540        if (err)
1541                goto unlock;
1542
1543        for_each_net(net) {
1544                for_each_netdev(net, dev) {
1545                        if (dev->flags & IFF_UP) {
1546                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1547                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1548                        }
1549                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1550                }
1551        }
1552unlock:
1553        rtnl_unlock();
1554        return err;
1555}
1556EXPORT_SYMBOL(unregister_netdevice_notifier);
1557
1558/**
1559 *      call_netdevice_notifiers - call all network notifier blocks
1560 *      @val: value passed unmodified to notifier function
1561 *      @dev: net_device pointer passed unmodified to notifier function
1562 *
1563 *      Call all network notifier blocks.  Parameters and return value
1564 *      are as for raw_notifier_call_chain().
1565 */
1566
1567int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1568{
1569        ASSERT_RTNL();
1570        return raw_notifier_call_chain(&netdev_chain, val, dev);
1571}
1572EXPORT_SYMBOL(call_netdevice_notifiers);
1573
1574static struct static_key netstamp_needed __read_mostly;
1575#ifdef HAVE_JUMP_LABEL
1576/* We are not allowed to call static_key_slow_dec() from irq context
1577 * If net_disable_timestamp() is called from irq context, defer the
1578 * static_key_slow_dec() calls.
1579 */
1580static atomic_t netstamp_needed_deferred;
1581#endif
1582
1583void net_enable_timestamp(void)
1584{
1585#ifdef HAVE_JUMP_LABEL
1586        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1587
1588        if (deferred) {
1589                while (--deferred)
1590                        static_key_slow_dec(&netstamp_needed);
1591                return;
1592        }
1593#endif
1594        WARN_ON(in_interrupt());
1595        static_key_slow_inc(&netstamp_needed);
1596}
1597EXPORT_SYMBOL(net_enable_timestamp);
1598
1599void net_disable_timestamp(void)
1600{
1601#ifdef HAVE_JUMP_LABEL
1602        if (in_interrupt()) {
1603                atomic_inc(&netstamp_needed_deferred);
1604                return;
1605        }
1606#endif
1607        static_key_slow_dec(&netstamp_needed);
1608}
1609EXPORT_SYMBOL(net_disable_timestamp);
1610
1611static inline void net_timestamp_set(struct sk_buff *skb)
1612{
1613        skb->tstamp.tv64 = 0;
1614        if (static_key_false(&netstamp_needed))
1615                __net_timestamp(skb);
1616}
1617
1618#define net_timestamp_check(COND, SKB)                  \
1619        if (static_key_false(&netstamp_needed)) {               \
1620                if ((COND) && !(SKB)->tstamp.tv64)      \
1621                        __net_timestamp(SKB);           \
1622        }                                               \
1623
1624static int net_hwtstamp_validate(struct ifreq *ifr)
1625{
1626        struct hwtstamp_config cfg;
1627        enum hwtstamp_tx_types tx_type;
1628        enum hwtstamp_rx_filters rx_filter;
1629        int tx_type_valid = 0;
1630        int rx_filter_valid = 0;
1631
1632        if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1633                return -EFAULT;
1634
1635        if (cfg.flags) /* reserved for future extensions */
1636                return -EINVAL;
1637
1638        tx_type = cfg.tx_type;
1639        rx_filter = cfg.rx_filter;
1640
1641        switch (tx_type) {
1642        case HWTSTAMP_TX_OFF:
1643        case HWTSTAMP_TX_ON:
1644        case HWTSTAMP_TX_ONESTEP_SYNC:
1645                tx_type_valid = 1;
1646                break;
1647        }
1648
1649        switch (rx_filter) {
1650        case HWTSTAMP_FILTER_NONE:
1651        case HWTSTAMP_FILTER_ALL:
1652        case HWTSTAMP_FILTER_SOME:
1653        case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1654        case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1655        case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1656        case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1657        case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1658        case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1659        case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1660        case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1661        case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1662        case HWTSTAMP_FILTER_PTP_V2_EVENT:
1663        case HWTSTAMP_FILTER_PTP_V2_SYNC:
1664        case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1665                rx_filter_valid = 1;
1666                break;
1667        }
1668
1669        if (!tx_type_valid || !rx_filter_valid)
1670                return -ERANGE;
1671
1672        return 0;
1673}
1674
1675static inline bool is_skb_forwardable(struct net_device *dev,
1676                                      struct sk_buff *skb)
1677{
1678        unsigned int len;
1679
1680        if (!(dev->flags & IFF_UP))
1681                return false;
1682
1683        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1684        if (skb->len <= len)
1685                return true;
1686
1687        /* if the skb is GSO, we don't care about the length, as the
1688         * packet can still be segmented before it is transmitted
1689         */
1690        if (skb_is_gso(skb))
1691                return true;
1692
1693        return false;
1694}
1695
1696/**
1697 * dev_forward_skb - loopback an skb to another netif
1698 *
1699 * @dev: destination network device
1700 * @skb: buffer to forward
1701 *
1702 * return values:
1703 *      NET_RX_SUCCESS  (no congestion)
1704 *      NET_RX_DROP     (packet was dropped, but freed)
1705 *
1706 * dev_forward_skb can be used for injecting an skb from the
1707 * start_xmit function of one device into the receive queue
1708 * of another device.
1709 *
1710 * The receiving device may be in another namespace, so
1711 * we have to clear all information in the skb that could
1712 * impact namespace isolation.
1713 */
1714int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1715{
1716        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1717                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1718                        atomic_long_inc(&dev->rx_dropped);
1719                        kfree_skb(skb);
1720                        return NET_RX_DROP;
1721                }
1722        }
1723
1724        skb_orphan(skb);
1725        nf_reset(skb);
1726
1727        if (unlikely(!is_skb_forwardable(dev, skb))) {
1728                atomic_long_inc(&dev->rx_dropped);
1729                kfree_skb(skb);
1730                return NET_RX_DROP;
1731        }
1732        skb->skb_iif = 0;
1733        skb->dev = dev;
1734        skb_dst_drop(skb);
1735        skb->tstamp.tv64 = 0;
1736        skb->pkt_type = PACKET_HOST;
1737        skb->protocol = eth_type_trans(skb, dev);
1738        skb->mark = 0;
1739        secpath_reset(skb);
1740        nf_reset(skb);
1741        return netif_rx(skb);
1742}
1743EXPORT_SYMBOL_GPL(dev_forward_skb);
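
/*
 * Illustrative sketch (not part of dev.c, compiled out): how a paired
 * software device (in the spirit of veth) might use dev_forward_skb() from
 * its ndo_start_xmit to inject frames into its peer.  struct
 * example_pair_priv and its peer pointer are hypothetical driver-private
 * data; only the dev_forward_skb() call itself is real.
 */
#if 0
struct example_pair_priv {
        struct net_device *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct example_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() scrubs the skb for namespace crossing and
         * either queues it on the peer via netif_rx() or drops/frees it.
         */
        if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}
#endif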
1744
1745static inline int deliver_skb(struct sk_buff *skb,
1746                              struct packet_type *pt_prev,
1747                              struct net_device *orig_dev)
1748{
1749        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1750                return -ENOMEM;
1751        atomic_inc(&skb->users);
1752        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1753}
1754
1755static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1756{
1757        if (!ptype->af_packet_priv || !skb->sk)
1758                return false;
1759
1760        if (ptype->id_match)
1761                return ptype->id_match(ptype, skb->sk);
1762        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1763                return true;
1764
1765        return false;
1766}
1767
1768/*
1769 *      Support routine. Sends outgoing frames to any network
1770 *      taps currently in use.
1771 */
1772
1773static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1774{
1775        struct packet_type *ptype;
1776        struct sk_buff *skb2 = NULL;
1777        struct packet_type *pt_prev = NULL;
1778
1779        rcu_read_lock();
1780        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1781                /* Never send packets back to the socket
1782                 * they originated from - MvS (miquels@drinkel.ow.org)
1783                 */
1784                if ((ptype->dev == dev || !ptype->dev) &&
1785                    (!skb_loop_sk(ptype, skb))) {
1786                        if (pt_prev) {
1787                                deliver_skb(skb2, pt_prev, skb->dev);
1788                                pt_prev = ptype;
1789                                continue;
1790                        }
1791
1792                        skb2 = skb_clone(skb, GFP_ATOMIC);
1793                        if (!skb2)
1794                                break;
1795
1796                        net_timestamp_set(skb2);
1797
1798                        /* The network header should already be correctly
1799                           set by the sender, so the check below is just
1800                           protection against buggy protocols.
1801                         */
1802                        skb_reset_mac_header(skb2);
1803
1804                        if (skb_network_header(skb2) < skb2->data ||
1805                            skb2->network_header > skb2->tail) {
1806                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1807                                                     ntohs(skb2->protocol),
1808                                                     dev->name);
1809                                skb_reset_network_header(skb2);
1810                        }
1811
1812                        skb2->transport_header = skb2->network_header;
1813                        skb2->pkt_type = PACKET_OUTGOING;
1814                        pt_prev = ptype;
1815                }
1816        }
1817        if (pt_prev)
1818                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1819        rcu_read_unlock();
1820}
1821
1822/**
1823 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1824 * @dev: Network device
1825 * @txq: number of queues available
1826 *
1827 * If real_num_tx_queues is changed the tc mappings may no longer be
1828 * valid. To resolve this verify that each tc mapping remains valid
1829 * and, if not, reset that mapping to zero. With no priorities mapping
1830 * to an offset/count pair, that pair will no longer be used. In the
1831 * worst case, if TC0 becomes invalid, nothing can be done, so priority
1832 * mappings are disabled entirely. It is expected that drivers will fix
1833 * this mapping, if they can, before calling netif_set_real_num_tx_queues.
1834 */
1835static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1836{
1837        int i;
1838        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1839
1840        /* If TC0 is invalidated disable TC mapping */
1841        if (tc->offset + tc->count > txq) {
1842                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1843                dev->num_tc = 0;
1844                return;
1845        }
1846
1847        /* Reset invalidated prio-to-tc mappings to TC0 */
1848        for (i = 1; i < TC_BITMASK + 1; i++) {
1849                int q = netdev_get_prio_tc_map(dev, i);
1850
1851                tc = &dev->tc_to_txq[q];
1852                if (tc->offset + tc->count > txq) {
1853                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1854                                i, q);
1855                        netdev_set_prio_tc_map(dev, i, 0);
1856                }
1857        }
1858}
1859
1860/*
1861 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1862 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1863 */
1864int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1865{
1866        int rc;
1867
1868        if (txq < 1 || txq > dev->num_tx_queues)
1869                return -EINVAL;
1870
1871        if (dev->reg_state == NETREG_REGISTERED ||
1872            dev->reg_state == NETREG_UNREGISTERING) {
1873                ASSERT_RTNL();
1874
1875                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1876                                                  txq);
1877                if (rc)
1878                        return rc;
1879
1880                if (dev->num_tc)
1881                        netif_setup_tc(dev, txq);
1882
1883                if (txq < dev->real_num_tx_queues)
1884                        qdisc_reset_all_tx_gt(dev, txq);
1885        }
1886
1887        dev->real_num_tx_queues = txq;
1888        return 0;
1889}
1890EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1891
1892#ifdef CONFIG_RPS
1893/**
1894 *      netif_set_real_num_rx_queues - set actual number of RX queues used
1895 *      @dev: Network device
1896 *      @rxq: Actual number of RX queues
1897 *
1898 *      This must be called either with the rtnl_lock held or before
1899 *      registration of the net device.  Returns 0 on success, or a
1900 *      negative error code.  If called before registration, it always
1901 *      succeeds.
1902 */
1903int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1904{
1905        int rc;
1906
1907        if (rxq < 1 || rxq > dev->num_rx_queues)
1908                return -EINVAL;
1909
1910        if (dev->reg_state == NETREG_REGISTERED) {
1911                ASSERT_RTNL();
1912
1913                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1914                                                  rxq);
1915                if (rc)
1916                        return rc;
1917        }
1918
1919        dev->real_num_rx_queues = rxq;
1920        return 0;
1921}
1922EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1923#endif
1924
1925/**
1926 * netif_get_num_default_rss_queues - default number of RSS queues
1927 *
1928 * This routine should set an upper limit on the number of RSS queues
1929 * used by default by multiqueue devices.
1930 */
1931int netif_get_num_default_rss_queues(void)
1932{
1933        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1934}
1935EXPORT_SYMBOL(netif_get_num_default_rss_queues);
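
/*
 * Illustrative sketch (not part of dev.c, compiled out): a driver probe
 * path might use netif_get_num_default_rss_queues() to bound how many
 * queues it enables, then report the active counts with
 * netif_set_real_num_tx_queues() / netif_set_real_num_rx_queues().
 * example_setup_queues() and "hw_queues" are hypothetical.
 */
#if 0
static int example_setup_queues(struct net_device *dev, unsigned int hw_queues)
{
        unsigned int n = min_t(unsigned int, hw_queues,
                               netif_get_num_default_rss_queues());
        int err;

        /* Before register_netdev() these never fail; afterwards they must
         * be called under rtnl_lock().
         */
        err = netif_set_real_num_tx_queues(dev, n);
        if (err)
                return err;
        return netif_set_real_num_rx_queues(dev, n);
}
#endif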
1936
1937static inline void __netif_reschedule(struct Qdisc *q)
1938{
1939        struct softnet_data *sd;
1940        unsigned long flags;
1941
1942        local_irq_save(flags);
1943        sd = &__get_cpu_var(softnet_data);
1944        q->next_sched = NULL;
1945        *sd->output_queue_tailp = q;
1946        sd->output_queue_tailp = &q->next_sched;
1947        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1948        local_irq_restore(flags);
1949}
1950
1951void __netif_schedule(struct Qdisc *q)
1952{
1953        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1954                __netif_reschedule(q);
1955}
1956EXPORT_SYMBOL(__netif_schedule);
1957
1958void dev_kfree_skb_irq(struct sk_buff *skb)
1959{
1960        if (atomic_dec_and_test(&skb->users)) {
1961                struct softnet_data *sd;
1962                unsigned long flags;
1963
1964                local_irq_save(flags);
1965                sd = &__get_cpu_var(softnet_data);
1966                skb->next = sd->completion_queue;
1967                sd->completion_queue = skb;
1968                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1969                local_irq_restore(flags);
1970        }
1971}
1972EXPORT_SYMBOL(dev_kfree_skb_irq);
1973
1974void dev_kfree_skb_any(struct sk_buff *skb)
1975{
1976        if (in_irq() || irqs_disabled())
1977                dev_kfree_skb_irq(skb);
1978        else
1979                dev_kfree_skb(skb);
1980}
1981EXPORT_SYMBOL(dev_kfree_skb_any);
1982
1983
1984/**
1985 * netif_device_detach - mark device as removed
1986 * @dev: network device
1987 *
1988 * Mark device as removed from system and therefore no longer available.
1989 */
1990void netif_device_detach(struct net_device *dev)
1991{
1992        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1993            netif_running(dev)) {
1994                netif_tx_stop_all_queues(dev);
1995        }
1996}
1997EXPORT_SYMBOL(netif_device_detach);
1998
1999/**
2000 * netif_device_attach - mark device as attached
2001 * @dev: network device
2002 *
2003 * Mark device as attached to the system and restart it if needed.
2004 */
2005void netif_device_attach(struct net_device *dev)
2006{
2007        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2008            netif_running(dev)) {
2009                netif_tx_wake_all_queues(dev);
2010                __netdev_watchdog_up(dev);
2011        }
2012}
2013EXPORT_SYMBOL(netif_device_attach);
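
/*
 * Illustrative sketch (not part of dev.c, compiled out): the usual pairing
 * of netif_device_detach()/netif_device_attach() around a driver's
 * suspend/resume handling.  example_suspend() and example_resume() are
 * hypothetical; the device-specific power management steps are elided.
 */
#if 0
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stops TX queues if the device is up */
        /* ...quiesce and power down the hardware... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ...power up and reprogram the hardware... */
        netif_device_attach(dev);       /* restarts queues and the watchdog */
        return 0;
}
#endif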
2014
2015static void skb_warn_bad_offload(const struct sk_buff *skb)
2016{
2017        static const netdev_features_t null_features = 0;
2018        struct net_device *dev = skb->dev;
2019        const char *driver = "";
2020
2021        if (dev && dev->dev.parent)
2022                driver = dev_driver_string(dev->dev.parent);
2023
2024        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2025             "gso_type=%d ip_summed=%d\n",
2026             driver, dev ? &dev->features : &null_features,
2027             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2028             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2029             skb_shinfo(skb)->gso_type, skb->ip_summed);
2030}
2031
2032/*
2033 * Invalidate hardware checksum when packet is to be mangled, and
2034 * complete checksum manually on outgoing path.
2035 */
2036int skb_checksum_help(struct sk_buff *skb)
2037{
2038        __wsum csum;
2039        int ret = 0, offset;
2040
2041        if (skb->ip_summed == CHECKSUM_COMPLETE)
2042                goto out_set_summed;
2043
2044        if (unlikely(skb_shinfo(skb)->gso_size)) {
2045                skb_warn_bad_offload(skb);
2046                return -EINVAL;
2047        }
2048
2049        offset = skb_checksum_start_offset(skb);
2050        BUG_ON(offset >= skb_headlen(skb));
2051        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2052
2053        offset += skb->csum_offset;
2054        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2055
2056        if (skb_cloned(skb) &&
2057            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2058                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2059                if (ret)
2060                        goto out;
2061        }
2062
2063        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2064out_set_summed:
2065        skb->ip_summed = CHECKSUM_NONE;
2066out:
2067        return ret;
2068}
2069EXPORT_SYMBOL(skb_checksum_help);
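
/*
 * Illustrative sketch (not part of dev.c, compiled out): the common driver
 * fallback when the hardware cannot checksum a given CHECKSUM_PARTIAL
 * packet - finish the checksum in software with skb_checksum_help() before
 * handing the frame to the hardware.  example_xmit_prep() is hypothetical.
 */
#if 0
static int example_xmit_prep(struct sk_buff *skb, bool hw_csum_ok)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_csum_ok)
                return skb_checksum_help(skb);  /* 0 on success */
        return 0;
}
#endif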
2070
2071/**
2072 *      skb_gso_segment - Perform segmentation on skb.
2073 *      @skb: buffer to segment
2074 *      @features: features for the output path (see dev->features)
2075 *
2076 *      This function segments the given skb and returns a list of segments.
2077 *
2078 *      It may return NULL if the skb requires no segmentation.  This is
2079 *      only possible when GSO is used for verifying header integrity.
2080 */
2081struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2082        netdev_features_t features)
2083{
2084        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2085        struct packet_offload *ptype;
2086        __be16 type = skb->protocol;
2087        int vlan_depth = ETH_HLEN;
2088        int err;
2089
2090        while (type == htons(ETH_P_8021Q)) {
2091                struct vlan_hdr *vh;
2092
2093                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2094                        return ERR_PTR(-EINVAL);
2095
2096                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2097                type = vh->h_vlan_encapsulated_proto;
2098                vlan_depth += VLAN_HLEN;
2099        }
2100
2101        skb_reset_mac_header(skb);
2102        skb->mac_len = skb->network_header - skb->mac_header;
2103        __skb_pull(skb, skb->mac_len);
2104
2105        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2106                skb_warn_bad_offload(skb);
2107
2108                if (skb_header_cloned(skb) &&
2109                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2110                        return ERR_PTR(err);
2111        }
2112
2113        rcu_read_lock();
2114        list_for_each_entry_rcu(ptype, &offload_base, list) {
2115                if (ptype->type == type && ptype->callbacks.gso_segment) {
2116                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2117                                err = ptype->callbacks.gso_send_check(skb);
2118                                segs = ERR_PTR(err);
2119                                if (err || skb_gso_ok(skb, features))
2120                                        break;
2121                                __skb_push(skb, (skb->data -
2122                                                 skb_network_header(skb)));
2123                        }
2124                        segs = ptype->callbacks.gso_segment(skb, features);
2125                        break;
2126                }
2127        }
2128        rcu_read_unlock();
2129
2130        __skb_push(skb, skb->data - skb_mac_header(skb));
2131
2132        return segs;
2133}
2134EXPORT_SYMBOL(skb_gso_segment);
2135
2136/* Take action when hardware reception checksum errors are detected. */
2137#ifdef CONFIG_BUG
2138void netdev_rx_csum_fault(struct net_device *dev)
2139{
2140        if (net_ratelimit()) {
2141                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2142                dump_stack();
2143        }
2144}
2145EXPORT_SYMBOL(netdev_rx_csum_fault);
2146#endif
2147
2148/* Actually, we should eliminate this check as soon as we know that:
2149 * 1. An IOMMU is present and is able to map all of the memory.
2150 * 2. No high memory really exists on this machine.
2151 */
2152
2153static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2154{
2155#ifdef CONFIG_HIGHMEM
2156        int i;
2157        if (!(dev->features & NETIF_F_HIGHDMA)) {
2158                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2159                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2160                        if (PageHighMem(skb_frag_page(frag)))
2161                                return 1;
2162                }
2163        }
2164
2165        if (PCI_DMA_BUS_IS_PHYS) {
2166                struct device *pdev = dev->dev.parent;
2167
2168                if (!pdev)
2169                        return 0;
2170                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2171                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2172                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2173                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2174                                return 1;
2175                }
2176        }
2177#endif
2178        return 0;
2179}
2180
2181struct dev_gso_cb {
2182        void (*destructor)(struct sk_buff *skb);
2183};
2184
2185#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2186
2187static void dev_gso_skb_destructor(struct sk_buff *skb)
2188{
2189        struct dev_gso_cb *cb;
2190
2191        do {
2192                struct sk_buff *nskb = skb->next;
2193
2194                skb->next = nskb->next;
2195                nskb->next = NULL;
2196                kfree_skb(nskb);
2197        } while (skb->next);
2198
2199        cb = DEV_GSO_CB(skb);
2200        if (cb->destructor)
2201                cb->destructor(skb);
2202}
2203
2204/**
2205 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2206 *      @skb: buffer to segment
2207 *      @features: device features as applicable to this skb
2208 *
2209 *      This function segments the given skb and stores the list of segments
2210 *      in skb->next.
2211 */
2212static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2213{
2214        struct sk_buff *segs;
2215
2216        segs = skb_gso_segment(skb, features);
2217
2218        /* Verifying header integrity only. */
2219        if (!segs)
2220                return 0;
2221
2222        if (IS_ERR(segs))
2223                return PTR_ERR(segs);
2224
2225        skb->next = segs;
2226        DEV_GSO_CB(skb)->destructor = skb->destructor;
2227        skb->destructor = dev_gso_skb_destructor;
2228
2229        return 0;
2230}
2231
2232static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2233{
2234        return ((features & NETIF_F_GEN_CSUM) ||
2235                ((features & NETIF_F_V4_CSUM) &&
2236                 protocol == htons(ETH_P_IP)) ||
2237                ((features & NETIF_F_V6_CSUM) &&
2238                 protocol == htons(ETH_P_IPV6)) ||
2239                ((features & NETIF_F_FCOE_CRC) &&
2240                 protocol == htons(ETH_P_FCOE)));
2241}
2242
2243static netdev_features_t harmonize_features(struct sk_buff *skb,
2244        __be16 protocol, netdev_features_t features)
2245{
2246        if (skb->ip_summed != CHECKSUM_NONE &&
2247            !can_checksum_protocol(features, protocol)) {
2248                features &= ~NETIF_F_ALL_CSUM;
2249                features &= ~NETIF_F_SG;
2250        } else if (illegal_highdma(skb->dev, skb)) {
2251                features &= ~NETIF_F_SG;
2252        }
2253
2254        return features;
2255}
2256
2257netdev_features_t netif_skb_features(struct sk_buff *skb)
2258{
2259        __be16 protocol = skb->protocol;
2260        netdev_features_t features = skb->dev->features;
2261
2262        if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2263                features &= ~NETIF_F_GSO_MASK;
2264
2265        if (protocol == htons(ETH_P_8021Q)) {
2266                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2267                protocol = veh->h_vlan_encapsulated_proto;
2268        } else if (!vlan_tx_tag_present(skb)) {
2269                return harmonize_features(skb, protocol, features);
2270        }
2271
2272        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2273
2274        if (protocol != htons(ETH_P_8021Q)) {
2275                return harmonize_features(skb, protocol, features);
2276        } else {
2277                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2278                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2279                return harmonize_features(skb, protocol, features);
2280        }
2281}
2282EXPORT_SYMBOL(netif_skb_features);
2283
2284/*
2285 * Returns true if either:
2286 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2287 *      2. skb is fragmented and the device does not support SG.
2288 */
2289static inline int skb_needs_linearize(struct sk_buff *skb,
2290                                      int features)
2291{
2292        return skb_is_nonlinear(skb) &&
2293                        ((skb_has_frag_list(skb) &&
2294                                !(features & NETIF_F_FRAGLIST)) ||
2295                        (skb_shinfo(skb)->nr_frags &&
2296                                !(features & NETIF_F_SG)));
2297}
2298
2299int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2300                        struct netdev_queue *txq)
2301{
2302        const struct net_device_ops *ops = dev->netdev_ops;
2303        int rc = NETDEV_TX_OK;
2304        unsigned int skb_len;
2305
2306        if (likely(!skb->next)) {
2307                netdev_features_t features;
2308
2309                /*
2310                 * If the device doesn't need skb->dst, release it right
2311                 * now while it's still hot in this CPU's cache
2312                 */
2313                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2314                        skb_dst_drop(skb);
2315
2316                features = netif_skb_features(skb);
2317
2318                if (vlan_tx_tag_present(skb) &&
2319                    !(features & NETIF_F_HW_VLAN_TX)) {
2320                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2321                        if (unlikely(!skb))
2322                                goto out;
2323
2324                        skb->vlan_tci = 0;
2325                }
2326
2327                /* If this is an encapsulation offload request, verify that
2328                 * we are testing the hardware encapsulation features instead
2329                 * of the standard features for the netdev
2330                 */
2331                if (skb->encapsulation)
2332                        features &= dev->hw_enc_features;
2333
2334                if (netif_needs_gso(skb, features)) {
2335                        if (unlikely(dev_gso_segment(skb, features)))
2336                                goto out_kfree_skb;
2337                        if (skb->next)
2338                                goto gso;
2339                } else {
2340                        if (skb_needs_linearize(skb, features) &&
2341                            __skb_linearize(skb))
2342                                goto out_kfree_skb;
2343
2344                        /* If packet is not checksummed and device does not
2345                         * support checksumming for this protocol, complete
2346                         * checksumming here.
2347                         */
2348                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2349                                if (skb->encapsulation)
2350                                        skb_set_inner_transport_header(skb,
2351                                                skb_checksum_start_offset(skb));
2352                                else
2353                                        skb_set_transport_header(skb,
2354                                                skb_checksum_start_offset(skb));
2355                                if (!(features & NETIF_F_ALL_CSUM) &&
2356                                     skb_checksum_help(skb))
2357                                        goto out_kfree_skb;
2358                        }
2359                }
2360
2361                if (!list_empty(&ptype_all))
2362                        dev_queue_xmit_nit(skb, dev);
2363
2364                skb_len = skb->len;
2365                rc = ops->ndo_start_xmit(skb, dev);
2366                trace_net_dev_xmit(skb, rc, dev, skb_len);
2367                if (rc == NETDEV_TX_OK)
2368                        txq_trans_update(txq);
2369                return rc;
2370        }
2371
2372gso:
2373        do {
2374                struct sk_buff *nskb = skb->next;
2375
2376                skb->next = nskb->next;
2377                nskb->next = NULL;
2378
2379                /*
2380                 * If the device doesn't need nskb->dst, release it right
2381                 * now while it's still hot in this CPU's cache
2382                 */
2383                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2384                        skb_dst_drop(nskb);
2385
2386                if (!list_empty(&ptype_all))
2387                        dev_queue_xmit_nit(nskb, dev);
2388
2389                skb_len = nskb->len;
2390                rc = ops->ndo_start_xmit(nskb, dev);
2391                trace_net_dev_xmit(nskb, rc, dev, skb_len);
2392                if (unlikely(rc != NETDEV_TX_OK)) {
2393                        if (rc & ~NETDEV_TX_MASK)
2394                                goto out_kfree_gso_skb;
2395                        nskb->next = skb->next;
2396                        skb->next = nskb;
2397                        return rc;
2398                }
2399                txq_trans_update(txq);
2400                if (unlikely(netif_xmit_stopped(txq) && skb->next))
2401                        return NETDEV_TX_BUSY;
2402        } while (skb->next);
2403
2404out_kfree_gso_skb:
2405        if (likely(skb->next == NULL))
2406                skb->destructor = DEV_GSO_CB(skb)->destructor;
2407out_kfree_skb:
2408        kfree_skb(skb);
2409out:
2410        return rc;
2411}
2412
2413static u32 hashrnd __read_mostly;
2414
2415/*
2416 * Returns a Tx hash based on the given packet descriptor and the number of
2417 * Tx queues to be used as a distribution range.
2418 */
2419u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2420                  unsigned int num_tx_queues)
2421{
2422        u32 hash;
2423        u16 qoffset = 0;
2424        u16 qcount = num_tx_queues;
2425
2426        if (skb_rx_queue_recorded(skb)) {
2427                hash = skb_get_rx_queue(skb);
2428                while (unlikely(hash >= num_tx_queues))
2429                        hash -= num_tx_queues;
2430                return hash;
2431        }
2432
2433        if (dev->num_tc) {
2434                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2435                qoffset = dev->tc_to_txq[tc].offset;
2436                qcount = dev->tc_to_txq[tc].count;
2437        }
2438
2439        if (skb->sk && skb->sk->sk_hash)
2440                hash = skb->sk->sk_hash;
2441        else
2442                hash = (__force u16) skb->protocol;
2443        hash = jhash_1word(hash, hashrnd);
2444
2445        return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2446}
2447EXPORT_SYMBOL(__skb_tx_hash);
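
/*
 * Illustrative sketch (not part of dev.c, compiled out): a driver with no
 * special queue-placement policy can simply fall back on skb_tx_hash()
 * (a wrapper around __skb_tx_hash() using dev->real_num_tx_queues) from
 * its ndo_select_queue.  example_select_queue() is hypothetical.
 */
#if 0
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
        /* Honours a queue recorded on receive, otherwise hashes the flow. */
        return skb_tx_hash(dev, skb);
}
#endif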
2448
2449static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2450{
2451        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2452                net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2453                                     dev->name, queue_index,
2454                                     dev->real_num_tx_queues);
2455                return 0;
2456        }
2457        return queue_index;
2458}
2459
2460static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2461{
2462#ifdef CONFIG_XPS
2463        struct xps_dev_maps *dev_maps;
2464        struct xps_map *map;
2465        int queue_index = -1;
2466
2467        rcu_read_lock();
2468        dev_maps = rcu_dereference(dev->xps_maps);
2469        if (dev_maps) {
2470                map = rcu_dereference(
2471                    dev_maps->cpu_map[raw_smp_processor_id()]);
2472                if (map) {
2473                        if (map->len == 1)
2474                                queue_index = map->queues[0];
2475                        else {
2476                                u32 hash;
2477                                if (skb->sk && skb->sk->sk_hash)
2478                                        hash = skb->sk->sk_hash;
2479                                else
2480                                        hash = (__force u16) skb->protocol ^
2481                                            skb->rxhash;
2482                                hash = jhash_1word(hash, hashrnd);
2483                                queue_index = map->queues[
2484                                    ((u64)hash * map->len) >> 32];
2485                        }
2486                        if (unlikely(queue_index >= dev->real_num_tx_queues))
2487                                queue_index = -1;
2488                }
2489        }
2490        rcu_read_unlock();
2491
2492        return queue_index;
2493#else
2494        return -1;
2495#endif
2496}
2497
2498struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2499                                    struct sk_buff *skb)
2500{
2501        int queue_index;
2502        const struct net_device_ops *ops = dev->netdev_ops;
2503
2504        if (dev->real_num_tx_queues == 1)
2505                queue_index = 0;
2506        else if (ops->ndo_select_queue) {
2507                queue_index = ops->ndo_select_queue(dev, skb);
2508                queue_index = dev_cap_txqueue(dev, queue_index);
2509        } else {
2510                struct sock *sk = skb->sk;
2511                queue_index = sk_tx_queue_get(sk);
2512
2513                if (queue_index < 0 || skb->ooo_okay ||
2514                    queue_index >= dev->real_num_tx_queues) {
2515                        int old_index = queue_index;
2516
2517                        queue_index = get_xps_queue(dev, skb);
2518                        if (queue_index < 0)
2519                                queue_index = skb_tx_hash(dev, skb);
2520
2521                        if (queue_index != old_index && sk) {
2522                                struct dst_entry *dst =
2523                                    rcu_dereference_check(sk->sk_dst_cache, 1);
2524
2525                                if (dst && skb_dst(skb) == dst)
2526                                        sk_tx_queue_set(sk, queue_index);
2527                        }
2528                }
2529        }
2530
2531        skb_set_queue_mapping(skb, queue_index);
2532        return netdev_get_tx_queue(dev, queue_index);
2533}
2534
2535static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2536                                 struct net_device *dev,
2537                                 struct netdev_queue *txq)
2538{
2539        spinlock_t *root_lock = qdisc_lock(q);
2540        bool contended;
2541        int rc;
2542
2543        qdisc_skb_cb(skb)->pkt_len = skb->len;
2544        qdisc_calculate_pkt_len(skb, q);
2545        /*
2546         * Heuristic to force contended enqueues to serialize on a
2547         * separate lock before trying to get qdisc main lock.
2548         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2549         * and dequeue packets faster.
2550         */
2551        contended = qdisc_is_running(q);
2552        if (unlikely(contended))
2553                spin_lock(&q->busylock);
2554
2555        spin_lock(root_lock);
2556        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2557                kfree_skb(skb);
2558                rc = NET_XMIT_DROP;
2559        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2560                   qdisc_run_begin(q)) {
2561                /*
2562                 * This is a work-conserving queue; there are no old skbs
2563                 * waiting to be sent out; and the qdisc is not running -
2564                 * xmit the skb directly.
2565                 */
2566                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2567                        skb_dst_force(skb);
2568
2569                qdisc_bstats_update(q, skb);
2570
2571                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2572                        if (unlikely(contended)) {
2573                                spin_unlock(&q->busylock);
2574                                contended = false;
2575                        }
2576                        __qdisc_run(q);
2577                } else
2578                        qdisc_run_end(q);
2579
2580                rc = NET_XMIT_SUCCESS;
2581        } else {
2582                skb_dst_force(skb);
2583                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2584                if (qdisc_run_begin(q)) {
2585                        if (unlikely(contended)) {
2586                                spin_unlock(&q->busylock);
2587                                contended = false;
2588                        }
2589                        __qdisc_run(q);
2590                }
2591        }
2592        spin_unlock(root_lock);
2593        if (unlikely(contended))
2594                spin_unlock(&q->busylock);
2595        return rc;
2596}
2597
2598#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2599static void skb_update_prio(struct sk_buff *skb)
2600{
2601        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2602
2603        if (!skb->priority && skb->sk && map) {
2604                unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2605
2606                if (prioidx < map->priomap_len)
2607                        skb->priority = map->priomap[prioidx];
2608        }
2609}
2610#else
2611#define skb_update_prio(skb)
2612#endif
2613
2614static DEFINE_PER_CPU(int, xmit_recursion);
2615#define RECURSION_LIMIT 10
2616
2617/**
2618 *      dev_loopback_xmit - loop back @skb
2619 *      @skb: buffer to transmit
2620 */
2621int dev_loopback_xmit(struct sk_buff *skb)
2622{
2623        skb_reset_mac_header(skb);
2624        __skb_pull(skb, skb_network_offset(skb));
2625        skb->pkt_type = PACKET_LOOPBACK;
2626        skb->ip_summed = CHECKSUM_UNNECESSARY;
2627        WARN_ON(!skb_dst(skb));
2628        skb_dst_force(skb);
2629        netif_rx_ni(skb);
2630        return 0;
2631}
2632EXPORT_SYMBOL(dev_loopback_xmit);
2633
2634/**
2635 *      dev_queue_xmit - transmit a buffer
2636 *      @skb: buffer to transmit
2637 *
2638 *      Queue a buffer for transmission to a network device. The caller must
2639 *      have set the device and priority and built the buffer before calling
2640 *      this function. The function can be called from an interrupt.
2641 *
2642 *      A negative errno code is returned on a failure. A success does not
2643 *      guarantee the frame will be transmitted as it may be dropped due
2644 *      to congestion or traffic shaping.
2645 *
2646 * -----------------------------------------------------------------------------------
2647 *      I notice this method can also return errors from the queue disciplines,
2648 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2649 *      be positive.
2650 *
2651 *      Regardless of the return value, the skb is consumed, so it is currently
2652 *      difficult to retry a send to this method.  (You can bump the ref count
2653 *      before sending to hold a reference for retry if you are careful.)
2654 *
2655 *      When calling this method, interrupts MUST be enabled.  This is because
2656 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2657 *          --BLG
2658 */
2659int dev_queue_xmit(struct sk_buff *skb)
2660{
2661        struct net_device *dev = skb->dev;
2662        struct netdev_queue *txq;
2663        struct Qdisc *q;
2664        int rc = -ENOMEM;
2665
2666        /* Disable soft irqs for various locks below. Also
2667         * stops preemption for RCU.
2668         */
2669        rcu_read_lock_bh();
2670
2671        skb_update_prio(skb);
2672
2673        txq = netdev_pick_tx(dev, skb);
2674        q = rcu_dereference_bh(txq->qdisc);
2675
2676#ifdef CONFIG_NET_CLS_ACT
2677        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2678#endif
2679        trace_net_dev_queue(skb);
2680        if (q->enqueue) {
2681                rc = __dev_xmit_skb(skb, q, dev, txq);
2682                goto out;
2683        }
2684
2685        /* The device has no queue. This is the common case for software
2686           devices: loopback, all sorts of tunnels...
2687
2688           Really, it is unlikely that netif_tx_lock protection is necessary
2689           here.  (Loopback and IP tunnels, for example, are clean, ignoring
2690           statistics counters.)
2691           However, it is possible that they rely on the protection
2692           provided by us here.
2693
2694           So check this and take the lock anyway; it is not prone to
2695           deadlocks.  Or take the noqueue qdisc path, which is even simpler 8)
2696         */
2697        if (dev->flags & IFF_UP) {
2698                int cpu = smp_processor_id(); /* ok because BHs are off */
2699
2700                if (txq->xmit_lock_owner != cpu) {
2701
2702                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2703                                goto recursion_alert;
2704
2705                        HARD_TX_LOCK(dev, txq, cpu);
2706
2707                        if (!netif_xmit_stopped(txq)) {
2708                                __this_cpu_inc(xmit_recursion);
2709                                rc = dev_hard_start_xmit(skb, dev, txq);
2710                                __this_cpu_dec(xmit_recursion);
2711                                if (dev_xmit_complete(rc)) {
2712                                        HARD_TX_UNLOCK(dev, txq);
2713                                        goto out;
2714                                }
2715                        }
2716                        HARD_TX_UNLOCK(dev, txq);
2717                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2718                                             dev->name);
2719                } else {
2720                        /* Recursion is detected! It is possible,
2721                         * unfortunately
2722                         */
2723recursion_alert:
2724                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2725                                             dev->name);
2726                }
2727        }
2728
2729        rc = -ENETDOWN;
2730        rcu_read_unlock_bh();
2731
2732        kfree_skb(skb);
2733        return rc;
2734out:
2735        rcu_read_unlock_bh();
2736        return rc;
2737}
2738EXPORT_SYMBOL(dev_queue_xmit);
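
/*
 * Illustrative sketch (not part of dev.c, compiled out): the minimal steps
 * a kernel sender takes before calling dev_queue_xmit().  It assumes "buf"
 * already holds a complete link-layer frame for "dev"; example_send_frame()
 * is hypothetical.  Note that the skb is consumed regardless of the return
 * value, which may be a positive NET_XMIT_* code as well as a negative errno.
 */
#if 0
static int example_send_frame(struct net_device *dev,
                              const void *buf, unsigned int len)
{
        struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

        if (!skb)
                return -ENOMEM;

        memcpy(skb_put(skb, len), buf, len);
        skb->dev = dev;
        skb_reset_mac_header(skb);
        skb->protocol = eth_hdr(skb)->h_proto;

        return dev_queue_xmit(skb);
}
#endif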
2739
2740
2741/*=======================================================================
2742                        Receiver routines
2743  =======================================================================*/
2744
2745int netdev_max_backlog __read_mostly = 1000;
2746EXPORT_SYMBOL(netdev_max_backlog);
2747
2748int netdev_tstamp_prequeue __read_mostly = 1;
2749int netdev_budget __read_mostly = 300;
2750int weight_p __read_mostly = 64;            /* old backlog weight */
2751
2752/* Called with irq disabled */
2753static inline void ____napi_schedule(struct softnet_data *sd,
2754                                     struct napi_struct *napi)
2755{
2756        list_add_tail(&napi->poll_list, &sd->poll_list);
2757        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2758}
2759
2760/*
2761 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2762 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2763 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2764 * if hash is a canonical 4-tuple hash over transport ports.
2765 */
2766void __skb_get_rxhash(struct sk_buff *skb)
2767{
2768        struct flow_keys keys;
2769        u32 hash;
2770
2771        if (!skb_flow_dissect(skb, &keys))
2772                return;
2773
2774        if (keys.ports)
2775                skb->l4_rxhash = 1;
2776
2777        /* get a consistent hash (same value on both flow directions) */
2778        if (((__force u32)keys.dst < (__force u32)keys.src) ||
2779            (((__force u32)keys.dst == (__force u32)keys.src) &&
2780             ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2781                swap(keys.dst, keys.src);
2782                swap(keys.port16[0], keys.port16[1]);
2783        }
2784
2785        hash = jhash_3words((__force u32)keys.dst,
2786                            (__force u32)keys.src,
2787                            (__force u32)keys.ports, hashrnd);
2788        if (!hash)
2789                hash = 1;
2790
2791        skb->rxhash = hash;
2792}
2793EXPORT_SYMBOL(__skb_get_rxhash);
2794
2795#ifdef CONFIG_RPS
2796
2797/* One global table that all flow-based protocols share. */
2798struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2799EXPORT_SYMBOL(rps_sock_flow_table);
2800
2801struct static_key rps_needed __read_mostly;
2802
2803static struct rps_dev_flow *
2804set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2805            struct rps_dev_flow *rflow, u16 next_cpu)
2806{
2807        if (next_cpu != RPS_NO_CPU) {
2808#ifdef CONFIG_RFS_ACCEL
2809                struct netdev_rx_queue *rxqueue;
2810                struct rps_dev_flow_table *flow_table;
2811                struct rps_dev_flow *old_rflow;
2812                u32 flow_id;
2813                u16 rxq_index;
2814                int rc;
2815
2816                /* Should we steer this flow to a different hardware queue? */
2817                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2818                    !(dev->features & NETIF_F_NTUPLE))
2819                        goto out;
2820                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2821                if (rxq_index == skb_get_rx_queue(skb))
2822                        goto out;
2823
2824                rxqueue = dev->_rx + rxq_index;
2825                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2826                if (!flow_table)
2827                        goto out;
2828                flow_id = skb->rxhash & flow_table->mask;
2829                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2830                                                        rxq_index, flow_id);
2831                if (rc < 0)
2832                        goto out;
2833                old_rflow = rflow;
2834                rflow = &flow_table->flows[flow_id];
2835                rflow->filter = rc;
2836                if (old_rflow->filter == rflow->filter)
2837                        old_rflow->filter = RPS_NO_FILTER;
2838        out:
2839#endif
2840                rflow->last_qtail =
2841                        per_cpu(softnet_data, next_cpu).input_queue_head;
2842        }
2843
2844        rflow->cpu = next_cpu;
2845        return rflow;
2846}
2847
2848/*
2849 * get_rps_cpu is called from netif_receive_skb and returns the target
2850 * CPU from the RPS map of the receiving queue for a given skb.
2851 * rcu_read_lock must be held on entry.
2852 */
2853static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2854                       struct rps_dev_flow **rflowp)
2855{
2856        struct netdev_rx_queue *rxqueue;
2857        struct rps_map *map;
2858        struct rps_dev_flow_table *flow_table;
2859        struct rps_sock_flow_table *sock_flow_table;
2860        int cpu = -1;
2861        u16 tcpu;
2862
2863        if (skb_rx_queue_recorded(skb)) {
2864                u16 index = skb_get_rx_queue(skb);
2865                if (unlikely(index >= dev->real_num_rx_queues)) {
2866                        WARN_ONCE(dev->real_num_rx_queues > 1,
2867                                  "%s received packet on queue %u, but number "
2868                                  "of RX queues is %u\n",
2869                                  dev->name, index, dev->real_num_rx_queues);
2870                        goto done;
2871                }
2872                rxqueue = dev->_rx + index;
2873        } else
2874                rxqueue = dev->_rx;
2875
2876        map = rcu_dereference(rxqueue->rps_map);
2877        if (map) {
2878                if (map->len == 1 &&
2879                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2880                        tcpu = map->cpus[0];
2881                        if (cpu_online(tcpu))
2882                                cpu = tcpu;
2883                        goto done;
2884                }
2885        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2886                goto done;
2887        }
2888
2889        skb_reset_network_header(skb);
2890        if (!skb_get_rxhash(skb))
2891                goto done;
2892
2893        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2894        sock_flow_table = rcu_dereference(rps_sock_flow_table);
2895        if (flow_table && sock_flow_table) {
2896                u16 next_cpu;
2897                struct rps_dev_flow *rflow;
2898
2899                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2900                tcpu = rflow->cpu;
2901
2902                next_cpu = sock_flow_table->ents[skb->rxhash &
2903                    sock_flow_table->mask];
2904
2905                /*
2906                 * If the desired CPU (where last recvmsg was done) is
2907                 * different from current CPU (one in the rx-queue flow
2908                 * table entry), switch if one of the following holds:
2909                 *   - Current CPU is unset (equal to RPS_NO_CPU).
2910                 *   - Current CPU is offline.
2911                 *   - The current CPU's queue tail has advanced beyond the
2912                 *     last packet that was enqueued using this table entry.
2913                 *     This guarantees that all previous packets for the flow
2914                 *     have been dequeued, thus preserving in order delivery.
2915                 */
2916                if (unlikely(tcpu != next_cpu) &&
2917                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2918                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2919                      rflow->last_qtail)) >= 0)) {
2920                        tcpu = next_cpu;
2921                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2922                }
2923
2924                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2925                        *rflowp = rflow;
2926                        cpu = tcpu;
2927                        goto done;
2928                }
2929        }
2930
2931        if (map) {
2932                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2933
2934                if (cpu_online(tcpu)) {
2935                        cpu = tcpu;
2936                        goto done;
2937                }
2938        }
2939
2940done:
2941        return cpu;
2942}
2943
2944#ifdef CONFIG_RFS_ACCEL
2945
2946/**
2947 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2948 * @dev: Device on which the filter was set
2949 * @rxq_index: RX queue index
2950 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2951 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2952 *
2953 * Drivers that implement ndo_rx_flow_steer() should periodically call
2954 * this function for each installed filter and remove the filters for
2955 * which it returns %true.
2956 */
2957bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2958                         u32 flow_id, u16 filter_id)
2959{
2960        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2961        struct rps_dev_flow_table *flow_table;
2962        struct rps_dev_flow *rflow;
2963        bool expire = true;
2964        int cpu;
2965
2966        rcu_read_lock();
2967        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968        if (flow_table && flow_id <= flow_table->mask) {
2969                rflow = &flow_table->flows[flow_id];
2970                cpu = ACCESS_ONCE(rflow->cpu);
2971                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2972                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2973                           rflow->last_qtail) <
2974                     (int)(10 * flow_table->mask)))
2975                        expire = false;
2976        }
2977        rcu_read_unlock();
2978        return expire;
2979}
2980EXPORT_SYMBOL(rps_may_expire_flow);
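
/*
 * Illustrative sketch (not part of dev.c, compiled out): the kind of
 * periodic scan the kerneldoc above expects from drivers implementing
 * ndo_rx_flow_steer().  struct example_filter and example_expire_filters()
 * are hypothetical driver-private bookkeeping; only the
 * rps_may_expire_flow() call itself is real.
 */
#if 0
struct example_filter {
        bool    in_use;
        u16     rxq_index;      /* queue the filter steers to */
        u32     flow_id;        /* as passed to ndo_rx_flow_steer() */
        u16     filter_id;      /* as returned from ndo_rx_flow_steer() */
};

static void example_expire_filters(struct net_device *dev,
                                   struct example_filter *filters, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!filters[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, filters[i].rxq_index,
                                        filters[i].flow_id,
                                        filters[i].filter_id)) {
                        /* ...remove the hardware filter here... */
                        filters[i].in_use = false;
                }
        }
}
#endif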
2981
2982#endif /* CONFIG_RFS_ACCEL */
2983
2984/* Called from hardirq (IPI) context */
2985static void rps_trigger_softirq(void *data)
2986{
2987        struct softnet_data *sd = data;
2988
2989        ____napi_schedule(sd, &sd->backlog);
2990        sd->received_rps++;
2991}
2992
2993#endif /* CONFIG_RPS */
2994
2995/*
2996 * Check if this softnet_data structure belongs to another CPU.
2997 * If yes, queue it to our IPI list and return 1.
2998 * If no, return 0.
2999 */
3000static int rps_ipi_queued(struct softnet_data *sd)
3001{
3002#ifdef CONFIG_RPS
3003        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3004
3005        if (sd != mysd) {
3006                sd->rps_ipi_next = mysd->rps_ipi_list;
3007                mysd->rps_ipi_list = sd;
3008
3009                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3010                return 1;
3011        }
3012#endif /* CONFIG_RPS */
3013        return 0;
3014}
3015
3016/*
3017 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3018 * queue (may be a remote CPU queue).
3019 */
3020static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3021                              unsigned int *qtail)
3022{
3023        struct softnet_data *sd;
3024        unsigned long flags;
3025
3026        sd = &per_cpu(softnet_data, cpu);
3027
3028        local_irq_save(flags);
3029
3030        rps_lock(sd);
3031        if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3032                if (skb_queue_len(&sd->input_pkt_queue)) {
3033enqueue:
3034                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3035                        input_queue_tail_incr_save(sd, qtail);
3036                        rps_unlock(sd);
3037                        local_irq_restore(flags);
3038                        return NET_RX_SUCCESS;
3039                }
3040
3041                /* Schedule NAPI for the backlog device.
3042                 * We can use a non-atomic operation since we own the queue lock.
3043                 */
3044                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3045                        if (!rps_ipi_queued(sd))
3046                                ____napi_schedule(sd, &sd->backlog);
3047                }
3048                goto enqueue;
3049        }
3050
3051        sd->dropped++;
3052        rps_unlock(sd);
3053
3054        local_irq_restore(flags);
3055
3056        atomic_long_inc(&skb->dev->rx_dropped);
3057        kfree_skb(skb);
3058        return NET_RX_DROP;
3059}
3060
3061/**
3062 *      netif_rx        -       post buffer to the network code
3063 *      @skb: buffer to post
3064 *
3065 *      This function receives a packet from a device driver and queues it for
3066 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3067 *      may be dropped during processing for congestion control or by the
3068 *      protocol layers.
3069 *
3070 *      return values:
3071 *      NET_RX_SUCCESS  (no congestion)
3072 *      NET_RX_DROP     (packet was dropped)
3073 *
3074 */
3075
3076int netif_rx(struct sk_buff *skb)
3077{
3078        int ret;
3079
3080        /* if netpoll wants it, pretend we never saw it */
3081        if (netpoll_rx(skb))
3082                return NET_RX_DROP;
3083
3084        net_timestamp_check(netdev_tstamp_prequeue, skb);
3085
3086        trace_netif_rx(skb);
3087#ifdef CONFIG_RPS
3088        if (static_key_false(&rps_needed)) {
3089                struct rps_dev_flow voidflow, *rflow = &voidflow;
3090                int cpu;
3091
3092                preempt_disable();
3093                rcu_read_lock();
3094
3095                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3096                if (cpu < 0)
3097                        cpu = smp_processor_id();
3098
3099                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3100
3101                rcu_read_unlock();
3102                preempt_enable();
3103        } else
3104#endif
3105        {
3106                unsigned int qtail;
3107                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3108                put_cpu();
3109        }
3110        return ret;
3111}
3112EXPORT_SYMBOL(netif_rx);
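
/*
 * Example (illustrative sketch): a non-NAPI driver receive path typically
 * builds an skb and hands it to netif_rx().  "my_dev", "rx_buf" and "len"
 * are placeholders for driver state.
 *
 *	struct sk_buff *skb;
 *
 *	skb = netdev_alloc_skb_ip_align(my_dev, len);
 *	if (!skb) {
 *		my_dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, len), rx_buf, len);
 *	skb->protocol = eth_type_trans(skb, my_dev);
 *	netif_rx(skb);
 *
 * From process context (e.g. a workqueue) netif_rx_ni() below should be used
 * instead, so that any raised NET_RX_SOFTIRQ is run promptly.
 */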
3113
3114int netif_rx_ni(struct sk_buff *skb)
3115{
3116        int err;
3117
3118        preempt_disable();
3119        err = netif_rx(skb);
3120        if (local_softirq_pending())
3121                do_softirq();
3122        preempt_enable();
3123
3124        return err;
3125}
3126EXPORT_SYMBOL(netif_rx_ni);
3127
3128static void net_tx_action(struct softirq_action *h)
3129{
3130        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3131
3132        if (sd->completion_queue) {
3133                struct sk_buff *clist;
3134
3135                local_irq_disable();
3136                clist = sd->completion_queue;
3137                sd->completion_queue = NULL;
3138                local_irq_enable();
3139
3140                while (clist) {
3141                        struct sk_buff *skb = clist;
3142                        clist = clist->next;
3143
3144                        WARN_ON(atomic_read(&skb->users));
3145                        trace_kfree_skb(skb, net_tx_action);
3146                        __kfree_skb(skb);
3147                }
3148        }
3149
3150        if (sd->output_queue) {
3151                struct Qdisc *head;
3152
3153                local_irq_disable();
3154                head = sd->output_queue;
3155                sd->output_queue = NULL;
3156                sd->output_queue_tailp = &sd->output_queue;
3157                local_irq_enable();
3158
3159                while (head) {
3160                        struct Qdisc *q = head;
3161                        spinlock_t *root_lock;
3162
3163                        head = head->next_sched;
3164
3165                        root_lock = qdisc_lock(q);
3166                        if (spin_trylock(root_lock)) {
3167                                smp_mb__before_clear_bit();
3168                                clear_bit(__QDISC_STATE_SCHED,
3169                                          &q->state);
3170                                qdisc_run(q);
3171                                spin_unlock(root_lock);
3172                        } else {
3173                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3174                                              &q->state)) {
3175                                        __netif_reschedule(q);
3176                                } else {
3177                                        smp_mb__before_clear_bit();
3178                                        clear_bit(__QDISC_STATE_SCHED,
3179                                                  &q->state);
3180                                }
3181                        }
3182                }
3183        }
3184}
3185
3186#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3187    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3188/* This hook is defined here for ATM LANE */
3189int (*br_fdb_test_addr_hook)(struct net_device *dev,
3190                             unsigned char *addr) __read_mostly;
3191EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3192#endif
3193
3194#ifdef CONFIG_NET_CLS_ACT
3195/* TODO: Maybe we should just force sch_ingress to be compiled in
3196 * whenever CONFIG_NET_CLS_ACT is?  Otherwise we pay for a compare and
3197 * two extra stores when CONFIG_NET_CLS_ACT is enabled but the ingress
3198 * scheduler is not built in.
3199 * NOTE: This doesn't stop any functionality; if you don't have
3200 * the ingress scheduler, you just can't add policies on ingress.
3201 *
3202 */
3203static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3204{
3205        struct net_device *dev = skb->dev;
3206        u32 ttl = G_TC_RTTL(skb->tc_verd);
3207        int result = TC_ACT_OK;
3208        struct Qdisc *q;
3209
3210        if (unlikely(MAX_RED_LOOP < ttl++)) {
3211                net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3212                                     skb->skb_iif, dev->ifindex);
3213                return TC_ACT_SHOT;
3214        }
3215
3216        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3217        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3218
3219        q = rxq->qdisc;
3220        if (q != &noop_qdisc) {
3221                spin_lock(qdisc_lock(q));
3222                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3223                        result = qdisc_enqueue_root(skb, q);
3224                spin_unlock(qdisc_lock(q));
3225        }
3226
3227        return result;
3228}
3229
3230static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3231                                         struct packet_type **pt_prev,
3232                                         int *ret, struct net_device *orig_dev)
3233{
3234        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3235
3236        if (!rxq || rxq->qdisc == &noop_qdisc)
3237                goto out;
3238
3239        if (*pt_prev) {
3240                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3241                *pt_prev = NULL;
3242        }
3243
3244        switch (ing_filter(skb, rxq)) {
3245        case TC_ACT_SHOT:
3246        case TC_ACT_STOLEN:
3247                kfree_skb(skb);
3248                return NULL;
3249        }
3250
3251out:
3252        skb->tc_verd = 0;
3253        return skb;
3254}
3255#endif
3256
3257/**
3258 *      netdev_rx_handler_register - register receive handler
3259 *      @dev: device to register a handler for
3260 *      @rx_handler: receive handler to register
3261 *      @rx_handler_data: data pointer that is used by rx handler
3262 *
3263 *      Register a receive handler for a device. This handler will then be
3264 *      called from __netif_receive_skb. A negative errno code is returned
3265 *      on a failure.
3266 *
3267 *      The caller must hold the rtnl_mutex.
3268 *
3269 *      For a general description of rx_handler, see enum rx_handler_result.
3270 */
3271int netdev_rx_handler_register(struct net_device *dev,
3272                               rx_handler_func_t *rx_handler,
3273                               void *rx_handler_data)
3274{
3275        ASSERT_RTNL();
3276
3277        if (dev->rx_handler)
3278                return -EBUSY;
3279
3280        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3281        rcu_assign_pointer(dev->rx_handler, rx_handler);
3282
3283        return 0;
3284}
3285EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3286
3287/**
3288 *      netdev_rx_handler_unregister - unregister receive handler
3289 *      @dev: device to unregister a handler from
3290 *
3291 *      Unregister a receive handler from a device.
3292 *
3293 *      The caller must hold the rtnl_mutex.
3294 */
3295void netdev_rx_handler_unregister(struct net_device *dev)
3296{
3297
3298        ASSERT_RTNL();
3299        RCU_INIT_POINTER(dev->rx_handler, NULL);
3300        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3301}
3302EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
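
/*
 * Example (illustrative sketch): how a bridge/bonding-style module might
 * claim a port's traffic with an rx_handler.  my_port_rx(), "struct my_port"
 * and my_port_wants() are placeholders; the handler itself runs under
 * rcu_read_lock() from __netif_receive_skb().
 *
 *	static rx_handler_result_t my_port_rx(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!my_port_wants(port, skb))
 *			return RX_HANDLER_PASS;		// normal delivery continues
 *
 *		skb->dev = port->upper_dev;		// steer to the aggregate device
 *		*pskb = skb;
 *		return RX_HANDLER_ANOTHER;		// re-run reception on the new dev
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(port_dev, my_port_rx, port);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	netdev_rx_handler_unregister(port_dev);
 *	rtnl_unlock();
 *	synchronize_net();	// wait for in-flight handlers before freeing "port"
 */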
3303
3304/*
3305 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3306 * the special handling of PFMEMALLOC skbs.
3307 */
3308static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3309{
3310        switch (skb->protocol) {
3311        case __constant_htons(ETH_P_ARP):
3312        case __constant_htons(ETH_P_IP):
3313        case __constant_htons(ETH_P_IPV6):
3314        case __constant_htons(ETH_P_8021Q):
3315                return true;
3316        default:
3317                return false;
3318        }
3319}
3320
3321static int __netif_receive_skb(struct sk_buff *skb)
3322{
3323        struct packet_type *ptype, *pt_prev;
3324        rx_handler_func_t *rx_handler;
3325        struct net_device *orig_dev;
3326        struct net_device *null_or_dev;
3327        bool deliver_exact = false;
3328        int ret = NET_RX_DROP;
3329        __be16 type;
3330        unsigned long pflags = current->flags;
3331
3332        net_timestamp_check(!netdev_tstamp_prequeue, skb);
3333
3334        trace_netif_receive_skb(skb);
3335
3336        /*
3337         * PFMEMALLOC skbs are special, they should
3338         * - be delivered to SOCK_MEMALLOC sockets only
3339         * - stay away from userspace
3340         * - have bounded memory usage
3341         *
3342         * Use PF_MEMALLOC as this saves us from propagating the allocation
3343         * context down to all allocation sites.
3344         */
3345        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3346                current->flags |= PF_MEMALLOC;
3347
3348        /* if we've gotten here through NAPI, check netpoll */
3349        if (netpoll_receive_skb(skb))
3350                goto out;
3351
3352        orig_dev = skb->dev;
3353
3354        skb_reset_network_header(skb);
3355        skb_reset_transport_header(skb);
3356        skb_reset_mac_len(skb);
3357
3358        pt_prev = NULL;
3359
3360        rcu_read_lock();
3361
3362another_round:
3363        skb->skb_iif = skb->dev->ifindex;
3364
3365        __this_cpu_inc(softnet_data.processed);
3366
3367        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3368                skb = vlan_untag(skb);
3369                if (unlikely(!skb))
3370                        goto unlock;
3371        }
3372
3373#ifdef CONFIG_NET_CLS_ACT
3374        if (skb->tc_verd & TC_NCLS) {
3375                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3376                goto ncls;
3377        }
3378#endif
3379
3380        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3381                goto skip_taps;
3382
3383        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3384                if (!ptype->dev || ptype->dev == skb->dev) {
3385                        if (pt_prev)
3386                                ret = deliver_skb(skb, pt_prev, orig_dev);
3387                        pt_prev = ptype;
3388                }
3389        }
3390
3391skip_taps:
3392#ifdef CONFIG_NET_CLS_ACT
3393        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3394        if (!skb)
3395                goto unlock;
3396ncls:
3397#endif
3398
3399        if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3400                                && !skb_pfmemalloc_protocol(skb))
3401                goto drop;
3402
3403        if (vlan_tx_tag_present(skb)) {
3404                if (pt_prev) {
3405                        ret = deliver_skb(skb, pt_prev, orig_dev);
3406                        pt_prev = NULL;
3407                }
3408                if (vlan_do_receive(&skb))
3409                        goto another_round;
3410                else if (unlikely(!skb))
3411                        goto unlock;
3412        }
3413
3414        rx_handler = rcu_dereference(skb->dev->rx_handler);
3415        if (rx_handler) {
3416                if (pt_prev) {
3417                        ret = deliver_skb(skb, pt_prev, orig_dev);
3418                        pt_prev = NULL;
3419                }
3420                switch (rx_handler(&skb)) {
3421                case RX_HANDLER_CONSUMED:
3422                        goto unlock;
3423                case RX_HANDLER_ANOTHER:
3424                        goto another_round;
3425                case RX_HANDLER_EXACT:
3426                        deliver_exact = true;
3427                case RX_HANDLER_PASS:
3428                        break;
3429                default:
3430                        BUG();
3431                }
3432        }
3433
3434        if (vlan_tx_nonzero_tag_present(skb))
3435                skb->pkt_type = PACKET_OTHERHOST;
3436
3437        /* deliver only exact match when indicated */
3438        null_or_dev = deliver_exact ? skb->dev : NULL;
3439
3440        type = skb->protocol;
3441        list_for_each_entry_rcu(ptype,
3442                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3443                if (ptype->type == type &&
3444                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3445                     ptype->dev == orig_dev)) {
3446                        if (pt_prev)
3447                                ret = deliver_skb(skb, pt_prev, orig_dev);
3448                        pt_prev = ptype;
3449                }
3450        }
3451
3452        if (pt_prev) {
3453                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3454                        goto drop;
3455                else
3456                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3457        } else {
3458drop:
3459                atomic_long_inc(&skb->dev->rx_dropped);
3460                kfree_skb(skb);
3461                /* Jamal, now you will not be able to escape explaining
3462                 * to me how you were going to use this. :-)
3463                 */
3464                ret = NET_RX_DROP;
3465        }
3466
3467unlock:
3468        rcu_read_unlock();
3469out:
3470        tsk_restore_flags(current, pflags, PF_MEMALLOC);
3471        return ret;
3472}
3473
3474/**
3475 *      netif_receive_skb - process receive buffer from network
3476 *      @skb: buffer to process
3477 *
3478 *      netif_receive_skb() is the main receive data processing function.
3479 *      It always succeeds. The buffer may be dropped during processing
3480 *      for congestion control or by the protocol layers.
3481 *
3482 *      This function may only be called from softirq context and interrupts
3483 *      should be enabled.
3484 *
3485 *      Return values (usually ignored):
3486 *      NET_RX_SUCCESS: no congestion
3487 *      NET_RX_DROP: packet was dropped
3488 */
3489int netif_receive_skb(struct sk_buff *skb)
3490{
3491        net_timestamp_check(netdev_tstamp_prequeue, skb);
3492
3493        if (skb_defer_rx_timestamp(skb))
3494                return NET_RX_SUCCESS;
3495
3496#ifdef CONFIG_RPS
3497        if (static_key_false(&rps_needed)) {
3498                struct rps_dev_flow voidflow, *rflow = &voidflow;
3499                int cpu, ret;
3500
3501                rcu_read_lock();
3502
3503                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3504
3505                if (cpu >= 0) {
3506                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3507                        rcu_read_unlock();
3508                        return ret;
3509                }
3510                rcu_read_unlock();
3511        }
3512#endif
3513        return __netif_receive_skb(skb);
3514}
3515EXPORT_SYMBOL(netif_receive_skb);
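
/*
 * Example (illustrative sketch): a NAPI driver delivers a completed receive
 * buffer from its ->poll() callback, which runs in softirq context with
 * interrupts enabled as required above.  "my_dev" and "desc" are
 * placeholders for driver state.
 *
 *	skb_put(skb, desc->len);
 *	skb->protocol = eth_type_trans(skb, my_dev);
 *	netif_receive_skb(skb);
 *
 * NAPI drivers that want receive offload call napi_gro_receive() further
 * down instead.
 */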
3516
3517/* Network device is going away; flush any packets still pending.
3518 * Called with irqs disabled.
3519 */
3520static void flush_backlog(void *arg)
3521{
3522        struct net_device *dev = arg;
3523        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3524        struct sk_buff *skb, *tmp;
3525
3526        rps_lock(sd);
3527        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3528                if (skb->dev == dev) {
3529                        __skb_unlink(skb, &sd->input_pkt_queue);
3530                        kfree_skb(skb);
3531                        input_queue_head_incr(sd);
3532                }
3533        }
3534        rps_unlock(sd);
3535
3536        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3537                if (skb->dev == dev) {
3538                        __skb_unlink(skb, &sd->process_queue);
3539                        kfree_skb(skb);
3540                        input_queue_head_incr(sd);
3541                }
3542        }
3543}
3544
3545static int napi_gro_complete(struct sk_buff *skb)
3546{
3547        struct packet_offload *ptype;
3548        __be16 type = skb->protocol;
3549        struct list_head *head = &offload_base;
3550        int err = -ENOENT;
3551
3552        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3553
3554        if (NAPI_GRO_CB(skb)->count == 1) {
3555                skb_shinfo(skb)->gso_size = 0;
3556                goto out;
3557        }
3558
3559        rcu_read_lock();
3560        list_for_each_entry_rcu(ptype, head, list) {
3561                if (ptype->type != type || !ptype->callbacks.gro_complete)
3562                        continue;
3563
3564                err = ptype->callbacks.gro_complete(skb);
3565                break;
3566        }
3567        rcu_read_unlock();
3568
3569        if (err) {
3570                WARN_ON(&ptype->list == head);
3571                kfree_skb(skb);
3572                return NET_RX_SUCCESS;
3573        }
3574
3575out:
3576        return netif_receive_skb(skb);
3577}
3578
3579/* napi->gro_list contains packets ordered by age, with the
3580 * youngest packets at the head of the list.
3581 * Complete skbs in reverse order to reduce latencies.
3582 */
3583void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3584{
3585        struct sk_buff *skb, *prev = NULL;
3586
3587        /* scan list and build reverse chain */
3588        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3589                skb->prev = prev;
3590                prev = skb;
3591        }
3592
3593        for (skb = prev; skb; skb = prev) {
3594                skb->next = NULL;
3595
3596                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3597                        return;
3598
3599                prev = skb->prev;
3600                napi_gro_complete(skb);
3601                napi->gro_count--;
3602        }
3603
3604        napi->gro_list = NULL;
3605}
3606EXPORT_SYMBOL(napi_gro_flush);
3607
3608static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3609{
3610        struct sk_buff *p;
3611        unsigned int maclen = skb->dev->hard_header_len;
3612
3613        for (p = napi->gro_list; p; p = p->next) {
3614                unsigned long diffs;
3615
3616                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3617                diffs |= p->vlan_tci ^ skb->vlan_tci;
3618                if (maclen == ETH_HLEN)
3619                        diffs |= compare_ether_header(skb_mac_header(p),
3620                                                      skb_gro_mac_header(skb));
3621                else if (!diffs)
3622                        diffs = memcmp(skb_mac_header(p),
3623                                       skb_gro_mac_header(skb),
3624                                       maclen);
3625                NAPI_GRO_CB(p)->same_flow = !diffs;
3626                NAPI_GRO_CB(p)->flush = 0;
3627        }
3628}
3629
3630static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3631{
3632        struct sk_buff **pp = NULL;
3633        struct packet_offload *ptype;
3634        __be16 type = skb->protocol;
3635        struct list_head *head = &offload_base;
3636        int same_flow;
3637        int mac_len;
3638        enum gro_result ret;
3639
3640        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3641                goto normal;
3642
3643        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3644                goto normal;
3645
3646        gro_list_prepare(napi, skb);
3647
3648        rcu_read_lock();
3649        list_for_each_entry_rcu(ptype, head, list) {
3650                if (ptype->type != type || !ptype->callbacks.gro_receive)
3651                        continue;
3652
3653                skb_set_network_header(skb, skb_gro_offset(skb));
3654                mac_len = skb->network_header - skb->mac_header;
3655                skb->mac_len = mac_len;
3656                NAPI_GRO_CB(skb)->same_flow = 0;
3657                NAPI_GRO_CB(skb)->flush = 0;
3658                NAPI_GRO_CB(skb)->free = 0;
3659
3660                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3661                break;
3662        }
3663        rcu_read_unlock();
3664
3665        if (&ptype->list == head)
3666                goto normal;
3667
3668        same_flow = NAPI_GRO_CB(skb)->same_flow;
3669        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3670
3671        if (pp) {
3672                struct sk_buff *nskb = *pp;
3673
3674                *pp = nskb->next;
3675                nskb->next = NULL;
3676                napi_gro_complete(nskb);
3677                napi->gro_count--;
3678        }
3679
3680        if (same_flow)
3681                goto ok;
3682
3683        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3684                goto normal;
3685
3686        napi->gro_count++;
3687        NAPI_GRO_CB(skb)->count = 1;
3688        NAPI_GRO_CB(skb)->age = jiffies;
3689        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3690        skb->next = napi->gro_list;
3691        napi->gro_list = skb;
3692        ret = GRO_HELD;
3693
3694pull:
3695        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3696                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3697
3698                BUG_ON(skb->end - skb->tail < grow);
3699
3700                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3701
3702                skb->tail += grow;
3703                skb->data_len -= grow;
3704
3705                skb_shinfo(skb)->frags[0].page_offset += grow;
3706                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3707
3708                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3709                        skb_frag_unref(skb, 0);
3710                        memmove(skb_shinfo(skb)->frags,
3711                                skb_shinfo(skb)->frags + 1,
3712                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3713                }
3714        }
3715
3716ok:
3717        return ret;
3718
3719normal:
3720        ret = GRO_NORMAL;
3721        goto pull;
3722}
3723
3724
3725static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3726{
3727        switch (ret) {
3728        case GRO_NORMAL:
3729                if (netif_receive_skb(skb))
3730                        ret = GRO_DROP;
3731                break;
3732
3733        case GRO_DROP:
3734                kfree_skb(skb);
3735                break;
3736
3737        case GRO_MERGED_FREE:
3738                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3739                        kmem_cache_free(skbuff_head_cache, skb);
3740                else
3741                        __kfree_skb(skb);
3742                break;
3743
3744        case GRO_HELD:
3745        case GRO_MERGED:
3746                break;
3747        }
3748
3749        return ret;
3750}
3751
3752static void skb_gro_reset_offset(struct sk_buff *skb)
3753{
3754        const struct skb_shared_info *pinfo = skb_shinfo(skb);
3755        const skb_frag_t *frag0 = &pinfo->frags[0];
3756
3757        NAPI_GRO_CB(skb)->data_offset = 0;
3758        NAPI_GRO_CB(skb)->frag0 = NULL;
3759        NAPI_GRO_CB(skb)->frag0_len = 0;
3760
3761        if (skb->mac_header == skb->tail &&
3762            pinfo->nr_frags &&
3763            !PageHighMem(skb_frag_page(frag0))) {
3764                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3765                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3766        }
3767}
3768
3769gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3770{
3771        skb_gro_reset_offset(skb);
3772
3773        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3774}
3775EXPORT_SYMBOL(napi_gro_receive);
3776
3777static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3778{
3779        __skb_pull(skb, skb_headlen(skb));
3780        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3781        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3782        skb->vlan_tci = 0;
3783        skb->dev = napi->dev;
3784        skb->skb_iif = 0;
3785
3786        napi->skb = skb;
3787}
3788
3789struct sk_buff *napi_get_frags(struct napi_struct *napi)
3790{
3791        struct sk_buff *skb = napi->skb;
3792
3793        if (!skb) {
3794                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3795                if (skb)
3796                        napi->skb = skb;
3797        }
3798        return skb;
3799}
3800EXPORT_SYMBOL(napi_get_frags);
3801
3802static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3803                               gro_result_t ret)
3804{
3805        switch (ret) {
3806        case GRO_NORMAL:
3807        case GRO_HELD:
3808                skb->protocol = eth_type_trans(skb, skb->dev);
3809
3810                if (ret == GRO_HELD)
3811                        skb_gro_pull(skb, -ETH_HLEN);
3812                else if (netif_receive_skb(skb))
3813                        ret = GRO_DROP;
3814                break;
3815
3816        case GRO_DROP:
3817        case GRO_MERGED_FREE:
3818                napi_reuse_skb(napi, skb);
3819                break;
3820
3821        case GRO_MERGED:
3822                break;
3823        }
3824
3825        return ret;
3826}
3827
3828static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3829{
3830        struct sk_buff *skb = napi->skb;
3831        struct ethhdr *eth;
3832        unsigned int hlen;
3833        unsigned int off;
3834
3835        napi->skb = NULL;
3836
3837        skb_reset_mac_header(skb);
3838        skb_gro_reset_offset(skb);
3839
3840        off = skb_gro_offset(skb);
3841        hlen = off + sizeof(*eth);
3842        eth = skb_gro_header_fast(skb, off);
3843        if (skb_gro_header_hard(skb, hlen)) {
3844                eth = skb_gro_header_slow(skb, hlen, off);
3845                if (unlikely(!eth)) {
3846                        napi_reuse_skb(napi, skb);
3847                        skb = NULL;
3848                        goto out;
3849                }
3850        }
3851
3852        skb_gro_pull(skb, sizeof(*eth));
3853
3854        /*
3855         * This works because the only protocols we care about don't require
3856         * special handling.  We'll fix it up properly at the end.
3857         */
3858        skb->protocol = eth->h_proto;
3859
3860out:
3861        return skb;
3862}
3863
3864gro_result_t napi_gro_frags(struct napi_struct *napi)
3865{
3866        struct sk_buff *skb = napi_frags_skb(napi);
3867
3868        if (!skb)
3869                return GRO_DROP;
3870
3871        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3872}
3873EXPORT_SYMBOL(napi_gro_frags);
3874
3875/*
3876 * net_rps_action sends any pending IPIs for RPS.
3877 * Note: called with local irq disabled, but exits with local irq enabled.
3878 */
3879static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3880{
3881#ifdef CONFIG_RPS
3882        struct softnet_data *remsd = sd->rps_ipi_list;
3883
3884        if (remsd) {
3885                sd->rps_ipi_list = NULL;
3886
3887                local_irq_enable();
3888
3889                /* Send pending IPIs to kick RPS processing on remote cpus. */
3890                while (remsd) {
3891                        struct softnet_data *next = remsd->rps_ipi_next;
3892
3893                        if (cpu_online(remsd->cpu))
3894                                __smp_call_function_single(remsd->cpu,
3895                                                           &remsd->csd, 0);
3896                        remsd = next;
3897                }
3898        } else
3899#endif
3900                local_irq_enable();
3901}
3902
3903static int process_backlog(struct napi_struct *napi, int quota)
3904{
3905        int work = 0;
3906        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3907
3908#ifdef CONFIG_RPS
3909        /* Check if we have pending IPIs; it is better to send them now
3910         * rather than wait for net_rx_action() to finish.
3911         */
3912        if (sd->rps_ipi_list) {
3913                local_irq_disable();
3914                net_rps_action_and_irq_enable(sd);
3915        }
3916#endif
3917        napi->weight = weight_p;
3918        local_irq_disable();
3919        while (work < quota) {
3920                struct sk_buff *skb;
3921                unsigned int qlen;
3922
3923                while ((skb = __skb_dequeue(&sd->process_queue))) {
3924                        local_irq_enable();
3925                        __netif_receive_skb(skb);
3926                        local_irq_disable();
3927                        input_queue_head_incr(sd);
3928                        if (++work >= quota) {
3929                                local_irq_enable();
3930                                return work;
3931                        }
3932                }
3933
3934                rps_lock(sd);
3935                qlen = skb_queue_len(&sd->input_pkt_queue);
3936                if (qlen)
3937                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
3938                                                   &sd->process_queue);
3939
3940                if (qlen < quota - work) {
3941                        /*
3942                         * Inline a custom version of __napi_complete().
3943                         * Only the current cpu owns and manipulates this napi,
3944                         * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3945                         * so we can use a plain write instead of clear_bit(),
3946                         * and we don't need an smp_mb() memory barrier.
3947                         */
3948                        list_del(&napi->poll_list);
3949                        napi->state = 0;
3950
3951                        quota = work + qlen;
3952                }
3953                rps_unlock(sd);
3954        }
3955        local_irq_enable();
3956
3957        return work;
3958}
3959
3960/**
3961 * __napi_schedule - schedule for receive
3962 * @n: entry to schedule
3963 *
3964 * The entry's receive function will be scheduled to run
3965 */
3966void __napi_schedule(struct napi_struct *n)
3967{
3968        unsigned long flags;
3969
3970        local_irq_save(flags);
3971        ____napi_schedule(&__get_cpu_var(softnet_data), n);
3972        local_irq_restore(flags);
3973}
3974EXPORT_SYMBOL(__napi_schedule);
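
/*
 * Example (illustrative sketch): the usual pattern in a device interrupt
 * handler is to mask further RX interrupts and schedule the NAPI instance.
 * "struct my_priv" and my_mask_rx_irq() are placeholders.
 *
 *	static irqreturn_t my_isr(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_mask_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 *
 * For the common case, napi_schedule() wraps the napi_schedule_prep() /
 * __napi_schedule() pair.
 */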
3975
3976void __napi_complete(struct napi_struct *n)
3977{
3978        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3979        BUG_ON(n->gro_list);
3980
3981        list_del(&n->poll_list);
3982        smp_mb__before_clear_bit();
3983        clear_bit(NAPI_STATE_SCHED, &n->state);
3984}
3985EXPORT_SYMBOL(__napi_complete);
3986
3987void napi_complete(struct napi_struct *n)
3988{
3989        unsigned long flags;
3990
3991        /*
3992         * Don't let napi dequeue from the cpu poll list
3993         * just in case it's running on a different cpu.
3994         */
3995        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3996                return;
3997
3998        napi_gro_flush(n, false);
3999        local_irq_save(flags);
4000        __napi_complete(n);
4001        local_irq_restore(flags);
4002}
4003EXPORT_SYMBOL(napi_complete);
4004
4005void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4006                    int (*poll)(struct napi_struct *, int), int weight)
4007{
4008        INIT_LIST_HEAD(&napi->poll_list);
4009        napi->gro_count = 0;
4010        napi->gro_list = NULL;
4011        napi->skb = NULL;
4012        napi->poll = poll;
4013        napi->weight = weight;
4014        list_add(&napi->dev_list, &dev->napi_list);
4015        napi->dev = dev;
4016#ifdef CONFIG_NETPOLL
4017        spin_lock_init(&napi->poll_lock);
4018        napi->poll_owner = -1;
4019#endif
4020        set_bit(NAPI_STATE_SCHED, &napi->state);
4021}
4022EXPORT_SYMBOL(netif_napi_add);
4023
4024void netif_napi_del(struct napi_struct *napi)
4025{
4026        struct sk_buff *skb, *next;
4027
4028        list_del_init(&napi->dev_list);
4029        napi_free_frags(napi);
4030
4031        for (skb = napi->gro_list; skb; skb = next) {
4032                next = skb->next;
4033                skb->next = NULL;
4034                kfree_skb(skb);
4035        }
4036
4037        napi->gro_list = NULL;
4038        napi->gro_count = 0;
4039}
4040EXPORT_SYMBOL(netif_napi_del);
4041
4042static void net_rx_action(struct softirq_action *h)
4043{
4044        struct softnet_data *sd = &__get_cpu_var(softnet_data);
4045        unsigned long time_limit = jiffies + 2;
4046        int budget = netdev_budget;
4047        void *have;
4048
4049        local_irq_disable();
4050
4051        while (!list_empty(&sd->poll_list)) {
4052                struct napi_struct *n;
4053                int work, weight;
4054
4055                /* If the softirq window is exhausted then punt.
4056                 * Allow this to run for 2 jiffies, which allows
4057                 * an average latency of 1.5/HZ.
4058                 */
4059                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4060                        goto softnet_break;
4061
4062                local_irq_enable();
4063
4064                /* Even though interrupts have been re-enabled, this
4065                 * access is safe because interrupts can only add new
4066                 * entries to the tail of this list, and only ->poll()
4067                 * calls can remove this head entry from the list.
4068                 */
4069                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4070
4071                have = netpoll_poll_lock(n);
4072
4073                weight = n->weight;
4074
4075                /* This NAPI_STATE_SCHED test is for avoiding a race
4076                 * with netpoll's poll_napi().  Only the entity which
4077                 * obtains the lock and sees NAPI_STATE_SCHED set will
4078                 * actually make the ->poll() call.  Therefore we avoid
4079                 * accidentally calling ->poll() when NAPI is not scheduled.
4080                 */
4081                work = 0;
4082                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4083                        work = n->poll(n, weight);
4084                        trace_napi_poll(n);
4085                }
4086
4087                WARN_ON_ONCE(work > weight);
4088
4089                budget -= work;
4090
4091                local_irq_disable();
4092
4093                /* Drivers must not modify the NAPI state if they
4094                 * consume the entire weight.  In such cases this code
4095                 * still "owns" the NAPI instance and therefore can
4096                 * move the instance around on the list at-will.
4097                 */
4098                if (unlikely(work == weight)) {
4099                        if (unlikely(napi_disable_pending(n))) {
4100                                local_irq_enable();
4101                                napi_complete(n);
4102                                local_irq_disable();
4103                        } else {
4104                                if (n->gro_list) {
4105                                        /* Flush packets that are too old.
4106                                         * If HZ < 1000, flush all packets.
4107                                         */
4108                                        local_irq_enable();
4109                                        napi_gro_flush(n, HZ >= 1000);
4110                                        local_irq_disable();
4111                                }
4112                                list_move_tail(&n->poll_list, &sd->poll_list);
4113                        }
4114                }
4115
4116                netpoll_poll_unlock(have);
4117        }
4118out:
4119        net_rps_action_and_irq_enable(sd);
4120
4121#ifdef CONFIG_NET_DMA
4122        /*
4123         * There may not be any more sk_buffs coming right now, so push
4124         * any pending DMA copies to hardware
4125         */
4126        dma_issue_pending_all();
4127#endif
4128
4129        return;
4130
4131softnet_break:
4132        sd->time_squeeze++;
4133        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4134        goto out;
4135}
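
/*
 * Example (illustrative sketch): a driver ->poll() callback that honours the
 * contract above - napi_complete() is only called when less than the full
 * weight/budget was consumed.  my_clean_rx() and my_unmask_rx_irq() are
 * placeholders.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		int work = my_clean_rx(priv, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);
 *			my_unmask_rx_irq(priv);
 *		}
 *		return work;
 *	}
 */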
4136
4137static gifconf_func_t *gifconf_list[NPROTO];
4138
4139/**
4140 *      register_gifconf        -       register a SIOCGIF handler
4141 *      @family: Address family
4142 *      @gifconf: Function handler
4143 *
4144 *      Register protocol dependent address dumping routines. The handler
4145 *      that is passed must not be freed or reused until it has been replaced
4146 *      by another handler.
4147 */
4148int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4149{
4150        if (family >= NPROTO)
4151                return -EINVAL;
4152        gifconf_list[family] = gifconf;
4153        return 0;
4154}
4155EXPORT_SYMBOL(register_gifconf);
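
/*
 * Example (illustrative sketch): an address family registers its SIOCGIFCONF
 * helper once at init time; IPv4 does this with inet_gifconf().  The
 * my_family_gifconf() handler and PF_MYFAMILY below are placeholders.
 *
 *	static int my_family_gifconf(struct net_device *dev,
 *				     char __user *buf, int len)
 *	{
 *		struct ifreq ifr;
 *
 *		if (!buf)
 *			return sizeof(ifr);	// space probe only
 *		if (len < (int)sizeof(ifr))
 *			return 0;
 *		memset(&ifr, 0, sizeof(ifr));
 *		strcpy(ifr.ifr_name, dev->name);
 *		// fill in ifr.ifr_addr for this family here
 *		if (copy_to_user(buf, &ifr, sizeof(ifr)))
 *			return -EFAULT;
 *		return sizeof(ifr);
 *	}
 *
 *	register_gifconf(PF_MYFAMILY, my_family_gifconf);
 */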
4156
4157
4158/*
4159 *      Map an interface index to its name (SIOCGIFNAME)
4160 */
4161
4162/*
4163 *      We need this ioctl for efficient implementation of the
4164 *      if_indextoname() function required by the IPv6 API.  Without
4165 *      it, we would have to search all the interfaces to find a
4166 *      match.  --pb
4167 */
4168
4169static int dev_ifname(struct net *net, struct ifreq __user *arg)
4170{
4171        struct net_device *dev;
4172        struct ifreq ifr;
4173        unsigned seq;
4174
4175        /*
4176         *      Fetch the caller's info block.
4177         */
4178
4179        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4180                return -EFAULT;
4181
4182retry:
4183        seq = read_seqcount_begin(&devnet_rename_seq);
4184        rcu_read_lock();
4185        dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4186        if (!dev) {
4187                rcu_read_unlock();
4188                return -ENODEV;
4189        }
4190
4191        strcpy(ifr.ifr_name, dev->name);
4192        rcu_read_unlock();
4193        if (read_seqcount_retry(&devnet_rename_seq, seq))
4194                goto retry;
4195
4196        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4197                return -EFAULT;
4198        return 0;
4199}
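
/*
 * Example (illustrative sketch): how user space typically drives this ioctl,
 * which is essentially what if_indextoname() does (standard <net/if.h> and
 * <sys/ioctl.h> usage).
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = idx;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("index %d is %s\n", idx, ifr.ifr_name);
 *	close(fd);
 */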
4200
4201/*
4202 *      Perform a SIOCGIFCONF call. This structure will change
4203 *      size eventually, and there is nothing I can do about it.
4204 *      Thus we will need a 'compatibility mode'.
4205 */
4206
4207static int dev_ifconf(struct net *net, char __user *arg)
4208{
4209        struct ifconf ifc;
4210        struct net_device *dev;
4211        char __user *pos;
4212        int len;
4213        int total;
4214        int i;
4215
4216        /*
4217         *      Fetch the caller's info block.
4218         */
4219
4220        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4221                return -EFAULT;
4222
4223        pos = ifc.ifc_buf;
4224        len = ifc.ifc_len;
4225
4226        /*
4227         *      Loop over the interfaces, and write an info block for each.
4228         */
4229
4230        total = 0;
4231        for_each_netdev(net, dev) {
4232                for (i = 0; i < NPROTO; i++) {
4233                        if (gifconf_list[i]) {
4234                                int done;
4235                                if (!pos)
4236                                        done = gifconf_list[i](dev, NULL, 0);
4237                                else
4238                                        done = gifconf_list[i](dev, pos + total,
4239                                                               len - total);
4240                                if (done < 0)
4241                                        return -EFAULT;
4242                                total += done;
4243                        }
4244                }
4245        }
4246
4247        /*
4248         *      All done.  Write the updated control block back to the caller.
4249         */
4250        ifc.ifc_len = total;
4251
4252        /*
4253         *      Both BSD and Solaris return 0 here, so we do too.
4254         */
4255        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4256}
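
/*
 * Example (illustrative sketch): the classic user-space pattern for
 * SIOCGIFCONF - supply a buffer, then walk the returned array of ifreq
 * records.
 *
 *	struct ifreq reqs[32];
 *	struct ifconf ifc;
 *	int i, n, fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof(reqs);
 *	ifc.ifc_req = reqs;
 *	if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
 *		n = ifc.ifc_len / sizeof(struct ifreq);
 *		for (i = 0; i < n; i++)
 *			printf("%s\n", reqs[i].ifr_name);
 *	}
 *	close(fd);
 */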
4257
4258#ifdef CONFIG_PROC_FS
4259
4260#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4261
4262#define get_bucket(x) ((x) >> BUCKET_SPACE)
4263#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4264#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4265
4266static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4267{
4268        struct net *net = seq_file_net(seq);
4269        struct net_device *dev;
4270        struct hlist_node *p;
4271        struct hlist_head *h;
4272        unsigned int count = 0, offset = get_offset(*pos);
4273
4274        h = &net->dev_name_head[get_bucket(*pos)];
4275        hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4276                if (++count == offset)
4277                        return dev;
4278        }
4279
4280        return NULL;
4281}
4282
4283static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4284{
4285        struct net_device *dev;
4286        unsigned int bucket;
4287
4288        do {
4289                dev = dev_from_same_bucket(seq, pos);
4290                if (dev)
4291                        return dev;
4292
4293                bucket = get_bucket(*pos) + 1;
4294                *pos = set_bucket_offset(bucket, 1);
4295        } while (bucket < NETDEV_HASHENTRIES);
4296
4297        return NULL;
4298}
4299
4300/*
4301 *      This is invoked by the /proc filesystem handler to display a device
4302 *      in detail.
4303 */
4304void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4305        __acquires(RCU)
4306{
4307        rcu_read_lock();
4308        if (!*pos)
4309                return SEQ_START_TOKEN;
4310
4311        if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4312                return NULL;
4313
4314        return dev_from_bucket(seq, pos);
4315}
4316
4317void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4318{
4319        ++*pos;
4320        return dev_from_bucket(seq, pos);
4321}
4322
4323void dev_seq_stop(struct seq_file *seq, void *v)
4324        __releases(RCU)
4325{
4326        rcu_read_unlock();
4327}
4328
4329static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4330{
4331        struct rtnl_link_stats64 temp;
4332        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4333
4334        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4335                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4336                   dev->name, stats->rx_bytes, stats->rx_packets,
4337                   stats->rx_errors,
4338                   stats->rx_dropped + stats->rx_missed_errors,
4339                   stats->rx_fifo_errors,
4340                   stats->rx_length_errors + stats->rx_over_errors +
4341                    stats->rx_crc_errors + stats->rx_frame_errors,
4342                   stats->rx_compressed, stats->multicast,
4343                   stats->tx_bytes, stats->tx_packets,
4344                   stats->tx_errors, stats->tx_dropped,
4345                   stats->tx_fifo_errors, stats->collisions,
4346                   stats->tx_carrier_errors +
4347                    stats->tx_aborted_errors +
4348                    stats->tx_window_errors +
4349                    stats->tx_heartbeat_errors,
4350                   stats->tx_compressed);
4351}
4352
4353/*
4354 *      Called from the PROCfs module. This now uses the new arbitrary sized
4355 *      /proc/net interface to create /proc/net/dev
4356 */
4357static int dev_seq_show(struct seq_file *seq, void *v)
4358{
4359        if (v == SEQ_START_TOKEN)
4360                seq_puts(seq, "Inter-|   Receive                            "
4361                              "                    |  Transmit\n"
4362                              " face |bytes    packets errs drop fifo frame "
4363                              "compressed multicast|bytes    packets errs "
4364                              "drop fifo colls carrier compressed\n");
4365        else
4366                dev_seq_printf_stats(seq, v);
4367        return 0;
4368}
4369
4370static struct softnet_data *softnet_get_online(loff_t *pos)
4371{
4372        struct softnet_data *sd = NULL;
4373
4374        while (*pos < nr_cpu_ids)
4375                if (cpu_online(*pos)) {
4376                        sd = &per_cpu(softnet_data, *pos);
4377                        break;
4378                } else
4379                        ++*pos;
4380        return sd;
4381}
4382
4383static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4384{
4385        return softnet_get_online(pos);
4386}
4387
4388static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4389{
4390        ++*pos;
4391        return softnet_get_online(pos);
4392}
4393
4394static void softnet_seq_stop(struct seq_file *seq, void *v)
4395{
4396}
4397
4398static int softnet_seq_show(struct seq_file *seq, void *v)
4399{
4400        struct softnet_data *sd = v;
4401
4402        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4403                   sd->processed, sd->dropped, sd->time_squeeze, 0,
4404                   0, 0, 0, 0, /* was fastroute */
4405                   sd->cpu_collision, sd->received_rps);
4406        return 0;
4407}
4408
4409static const struct seq_operations dev_seq_ops = {
4410        .start = dev_seq_start,
4411        .next  = dev_seq_next,
4412        .stop  = dev_seq_stop,
4413        .show  = dev_seq_show,
4414};
4415
4416static int dev_seq_open(struct inode *inode, struct file *file)
4417{
4418        return seq_open_net(inode, file, &dev_seq_ops,
4419                            sizeof(struct seq_net_private));
4420}
4421
4422static const struct file_operations dev_seq_fops = {
4423        .owner   = THIS_MODULE,
4424        .open    = dev_seq_open,
4425        .read    = seq_read,
4426        .llseek  = seq_lseek,
4427        .release = seq_release_net,
4428};
4429
4430static const struct seq_operations softnet_seq_ops = {
4431        .start = softnet_seq_start,
4432        .next  = softnet_seq_next,
4433        .stop  = softnet_seq_stop,
4434        .show  = softnet_seq_show,
4435};
4436
4437static int softnet_seq_open(struct inode *inode, struct file *file)
4438{
4439        return seq_open(file, &softnet_seq_ops);
4440}
4441
4442static const struct file_operations softnet_seq_fops = {
4443        .owner   = THIS_MODULE,
4444        .open    = softnet_seq_open,
4445        .read    = seq_read,
4446        .llseek  = seq_lseek,
4447        .release = seq_release,
4448};
4449
4450static void *ptype_get_idx(loff_t pos)
4451{
4452        struct packet_type *pt = NULL;
4453        loff_t i = 0;
4454        int t;
4455
4456        list_for_each_entry_rcu(pt, &ptype_all, list) {
4457                if (i == pos)
4458                        return pt;
4459                ++i;
4460        }
4461
4462        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4463                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4464                        if (i == pos)
4465                                return pt;
4466                        ++i;
4467                }
4468        }
4469        return NULL;
4470}
4471
4472static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4473        __acquires(RCU)
4474{
4475        rcu_read_lock();
4476        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4477}
4478
4479static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4480{
4481        struct packet_type *pt;
4482        struct list_head *nxt;
4483        int hash;
4484
4485        ++*pos;
4486        if (v == SEQ_START_TOKEN)
4487                return ptype_get_idx(0);
4488
4489        pt = v;
4490        nxt = pt->list.next;
4491        if (pt->type == htons(ETH_P_ALL)) {
4492                if (nxt != &ptype_all)
4493                        goto found;
4494                hash = 0;
4495                nxt = ptype_base[0].next;
4496        } else
4497                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4498
4499        while (nxt == &ptype_base[hash]) {
4500                if (++hash >= PTYPE_HASH_SIZE)
4501                        return NULL;
4502                nxt = ptype_base[hash].next;
4503        }
4504found:
4505        return list_entry(nxt, struct packet_type, list);
4506}
4507
4508static void ptype_seq_stop(struct seq_file *seq, void *v)
4509        __releases(RCU)
4510{
4511        rcu_read_unlock();
4512}
4513
4514static int ptype_seq_show(struct seq_file *seq, void *v)
4515{
4516        struct packet_type *pt = v;
4517
4518        if (v == SEQ_START_TOKEN)
4519                seq_puts(seq, "Type Device      Function\n");
4520        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4521                if (pt->type == htons(ETH_P_ALL))
4522                        seq_puts(seq, "ALL ");
4523                else
4524                        seq_printf(seq, "%04x", ntohs(pt->type));
4525
4526                seq_printf(seq, " %-8s %pF\n",
4527                           pt->dev ? pt->dev->name : "", pt->func);
4528        }
4529
4530        return 0;
4531}
4532
4533static const struct seq_operations ptype_seq_ops = {
4534        .start = ptype_seq_start,
4535        .next  = ptype_seq_next,
4536        .stop  = ptype_seq_stop,
4537        .show  = ptype_seq_show,
4538};
4539
4540static int ptype_seq_open(struct inode *inode, struct file *file)
4541{
4542        return seq_open_net(inode, file, &ptype_seq_ops,
4543                        sizeof(struct seq_net_private));
4544}
4545
4546static const struct file_operations ptype_seq_fops = {
4547        .owner   = THIS_MODULE,
4548        .open    = ptype_seq_open,
4549        .read    = seq_read,
4550        .llseek  = seq_lseek,
4551        .release = seq_release_net,
4552};
4553
4554
4555static int __net_init dev_proc_net_init(struct net *net)
4556{
4557        int rc = -ENOMEM;
4558
4559        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4560                goto out;
4561        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4562                goto out_dev;
4563        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4564                goto out_softnet;
4565
4566        if (wext_proc_init(net))
4567                goto out_ptype;
4568        rc = 0;
4569out:
4570        return rc;
4571out_ptype:
4572        proc_net_remove(net, "ptype");
4573out_softnet:
4574        proc_net_remove(net, "softnet_stat");
4575out_dev:
4576        proc_net_remove(net, "dev");
4577        goto out;
4578}
4579
4580static void __net_exit dev_proc_net_exit(struct net *net)
4581{
4582        wext_proc_exit(net);
4583
4584        proc_net_remove(net, "ptype");
4585        proc_net_remove(net, "softnet_stat");
4586        proc_net_remove(net, "dev");
4587}
4588
4589static struct pernet_operations __net_initdata dev_proc_ops = {
4590        .init = dev_proc_net_init,
4591        .exit = dev_proc_net_exit,
4592};
4593
4594static int __init dev_proc_init(void)
4595{
4596        return register_pernet_subsys(&dev_proc_ops);
4597}
4598#else
4599#define dev_proc_init() 0
4600#endif  /* CONFIG_PROC_FS */
4601
4602
4603/**
4604 *      netdev_set_master       -       set up master pointer
4605 *      @slave: slave device
4606 *      @master: new master device
4607 *
4608 *      Changes the master device of the slave. Pass %NULL to break the
4609 *      bonding. The caller must hold the RTNL semaphore. On a failure
4610 *      a negative errno code is returned. On success the reference counts
4611 *      are adjusted and the function returns zero.
4612 */
4613int netdev_set_master(struct net_device *slave, struct net_device *master)
4614{
4615        struct net_device *old = slave->master;
4616
4617        ASSERT_RTNL();
4618
4619        if (master) {
4620                if (old)
4621                        return -EBUSY;
4622                dev_hold(master);
4623        }
4624
4625        slave->master = master;
4626
4627        if (old)
4628                dev_put(old);
4629        return 0;
4630}
4631EXPORT_SYMBOL(netdev_set_master);
4632
4633/**
4634 *      netdev_set_bond_master  -       set up bonding master/slave pair
4635 *      @slave: slave device
4636 *      @master: new master device
4637 *
4638 *      Changes the master device of the slave. Pass %NULL to break the
4639 *      bonding. The caller must hold the RTNL semaphore. On a failure
4640 *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4641 *      to the routing socket and the function returns zero.
4642 */
4643int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4644{
4645        int err;
4646
4647        ASSERT_RTNL();
4648
4649        err = netdev_set_master(slave, master);
4650        if (err)
4651                return err;
4652        if (master)
4653                slave->flags |= IFF_SLAVE;
4654        else
4655                slave->flags &= ~IFF_SLAVE;
4656
4657        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4658        return 0;
4659}
4660EXPORT_SYMBOL(netdev_set_bond_master);
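
/*
 * Example (illustrative sketch): how a bonding-style driver would bind and
 * later release a slave; "bond_dev" and "slave_dev" are placeholders and
 * RTNL must be held as documented above.
 *
 *	rtnl_lock();
 *	err = netdev_set_bond_master(slave_dev, bond_dev);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	netdev_set_bond_master(slave_dev, NULL);	// break the binding
 *	rtnl_unlock();
 */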
4661
4662static void dev_change_rx_flags(struct net_device *dev, int flags)
4663{
4664        const struct net_device_ops *ops = dev->netdev_ops;
4665
4666        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4667                ops->ndo_change_rx_flags(dev, flags);
4668}
4669
4670static int __dev_set_promiscuity(struct net_device *dev, int inc)
4671{
4672        unsigned int old_flags = dev->flags;
4673        kuid_t uid;
4674        kgid_t gid;
4675
4676        ASSERT_RTNL();
4677
4678        dev->flags |= IFF_PROMISC;
4679        dev->promiscuity += inc;
4680        if (dev->promiscuity == 0) {
4681                /*
4682                 * Avoid overflow.
4683                 * If inc causes overflow, leave promisc untouched and return an error.
4684                 */
4685                if (inc < 0)
4686                        dev->flags &= ~IFF_PROMISC;
4687                else {
4688                        dev->promiscuity -= inc;
4689                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4690                                dev->name);
4691                        return -EOVERFLOW;
4692                }
4693        }
4694        if (dev->flags != old_flags) {
4695                pr_info("device %s %s promiscuous mode\n",
4696                        dev->name,
4697                        dev->flags & IFF_PROMISC ? "entered" : "left");
4698                if (audit_enabled) {
4699                        current_uid_gid(&uid, &gid);
4700                        audit_log(current->audit_context, GFP_ATOMIC,
4701                                AUDIT_ANOM_PROMISCUOUS,
4702                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4703                                dev->name, (dev->flags & IFF_PROMISC),
4704                                (old_flags & IFF_PROMISC),
4705                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
4706                                from_kuid(&init_user_ns, uid),
4707                                from_kgid(&init_user_ns, gid),
4708                                audit_get_sessionid(current));
4709                }
4710
4711                dev_change_rx_flags(dev, IFF_PROMISC);
4712        }
4713        return 0;
4714}
4715
4716/**
4717 *      dev_set_promiscuity     - update promiscuity count on a device
4718 *      @dev: device
4719 *      @inc: modifier
4720 *
4721 *      Add or remove promiscuity from a device. While the count in the device
4722 *      remains above zero the interface remains promiscuous. Once it hits zero
4723 *      the device reverts back to normal filtering operation. A negative inc
4724 *      the device reverts to normal filtering operation. A negative inc
4725 *      Return 0 if successful or a negative errno code on error.
4726 */
4727int dev_set_promiscuity(struct net_device *dev, int inc)
4728{
4729        unsigned int old_flags = dev->flags;
4730        int err;
4731
4732        err = __dev_set_promiscuity(dev, inc);
4733        if (err < 0)
4734                return err;
4735        if (dev->flags != old_flags)
4736                dev_set_rx_mode(dev);
4737        return err;
4738}
4739EXPORT_SYMBOL(dev_set_promiscuity);
4740
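/*
 * Editor's note: usage sketch, not part of dev.c; "dev" stands for a device
 * reference that a hypothetical packet-capture user already holds. The count
 * is bumped while all frames are needed and dropped again afterwards, with
 * the RTNL lock held for both calls:
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */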
4741/**
4742 *      dev_set_allmulti        - update allmulti count on a device
4743 *      @dev: device
4744 *      @inc: modifier
4745 *
4746 *      Add or remove reception of all multicast frames on a device. While the
4747 *      count in the device remains above zero the interface keeps receiving
4748 *      all multicast frames. Once it hits zero the device reverts to normal
4749 *      filtering operation. A negative @inc value is used to drop the counter
4750 *      when releasing a resource needing all multicasts.
4751 *      Return 0 if successful or a negative errno code on error.
4752 */
4753
4754int dev_set_allmulti(struct net_device *dev, int inc)
4755{
4756        unsigned int old_flags = dev->flags;
4757
4758        ASSERT_RTNL();
4759
4760        dev->flags |= IFF_ALLMULTI;
4761        dev->allmulti += inc;
4762        if (dev->allmulti == 0) {
4763                /*
4764                 * Avoid overflow.
4765                 * If inc causes an overflow, leave allmulti untouched and return an error.
4766                 */
4767                if (inc < 0)
4768                        dev->flags &= ~IFF_ALLMULTI;
4769                else {
4770                        dev->allmulti -= inc;
4771                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4772                                dev->name);
4773                        return -EOVERFLOW;
4774                }
4775        }
4776        if (dev->flags ^ old_flags) {
4777                dev_change_rx_flags(dev, IFF_ALLMULTI);
4778                dev_set_rx_mode(dev);
4779        }
4780        return 0;
4781}
4782EXPORT_SYMBOL(dev_set_allmulti);
4783
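/*
 * Editor's note: usage sketch, not part of dev.c; the calling driver or
 * protocol is hypothetical. A caller that needs every multicast frame (for
 * example a tunnel or routing helper) increments the counter on setup and
 * decrements it on teardown, under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_set_allmulti(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_allmulti(dev, -1);
 *	rtnl_unlock();
 */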
4784/*
4785 *      Upload unicast and multicast address lists to device and
4786 *      configure RX filtering. When the device doesn't support unicast
4787 *      filtering it is put in promiscuous mode while unicast addresses
4788 *      are present.
4789 */
4790void __dev_set_rx_mode(struct net_device *dev)
4791{
4792        const struct net_device_ops *ops = dev->netdev_ops;
4793
4794        /* dev_open will call this function so the list will stay sane. */
4795        if (!(dev->flags&IFF_UP))
4796                return;
4797
4798        if (!netif_device_present(dev))
4799                return;
4800
4801        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4802                /* Unicast address changes may only happen under the rtnl,
4803                 * therefore calling __dev_set_promiscuity here is safe.
4804                 */
4805                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4806                        __dev_set_promiscuity(dev, 1);
4807                        dev->uc_promisc = true;
4808                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4809                        __dev_set_promiscuity(dev, -1);
4810                        dev->uc_promisc = false;
4811                }
4812        }
4813
4814        if (ops->ndo_set_rx_mode)
4815                ops->ndo_set_rx_mode(dev);
4816}
4817
4818void dev_set_rx_mode(struct net_device *dev)
4819{
4820        netif_addr_lock_bh(dev);
4821        __dev_set_rx_mode(dev);
4822        netif_addr_unlock_bh(dev);
4823}
4824
4825/**
4826 *      dev_get_flags - get flags reported to userspace
4827 *      @dev: device
4828 *
4829 *      Get the combination of flag bits exported through APIs to userspace.
4830 */
4831unsigned int dev_get_flags(const struct net_device *dev)
4832{
4833        unsigned int flags;
4834
4835        flags = (dev->flags & ~(IFF_PROMISC |
4836                                IFF_ALLMULTI |
4837                                IFF_RUNNING |
4838                                IFF_LOWER_UP |
4839                                IFF_DORMANT)) |
4840                (dev->gflags & (IFF_PROMISC |
4841                                IFF_ALLMULTI));
4842
4843        if (netif_running(dev)) {
4844                if (netif_oper_up(dev))
4845                        flags |= IFF_RUNNING;
4846                if (netif_carrier_ok(dev))
4847                        flags |= IFF_LOWER_UP;
4848                if (netif_dormant(dev))
4849                        flags |= IFF_DORMANT;
4850        }
4851
4852        return flags;
4853}
4854EXPORT_SYMBOL(dev_get_flags);
4855
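/*
 * Editor's note: sketch of reading the userspace view of the flags, not part
 * of dev.c; the caller is hypothetical. This is the value that SIOCGIFFLAGS
 * hands back via dev_ifsioc_locked() below:
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & IFF_UP) && (flags & IFF_RUNNING))
 *		pr_info("%s is administratively and operationally up\n",
 *			dev->name);
 */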
4856int __dev_change_flags(struct net_device *dev, unsigned int flags)
4857{
4858        unsigned int old_flags = dev->flags;
4859        int ret;
4860
4861        ASSERT_RTNL();
4862
4863        /*
4864         *      Set the flags on our device.
4865         */
4866
4867        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4868                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4869                               IFF_AUTOMEDIA)) |
4870                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4871                                    IFF_ALLMULTI));
4872
4873        /*
4874         *      Load in the correct multicast list now that the flags have changed.
4875         */
4876
4877        if ((old_flags ^ flags) & IFF_MULTICAST)
4878                dev_change_rx_flags(dev, IFF_MULTICAST);
4879
4880        dev_set_rx_mode(dev);
4881
4882        /*
4883         *      Have we downed the interface? We handle IFF_UP ourselves
4884         *      according to user attempts to set it, rather than blindly
4885         *      setting it.
4886         */
4887
4888        ret = 0;
4889        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4890                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4891
4892                if (!ret)
4893                        dev_set_rx_mode(dev);
4894        }
4895
4896        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4897                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4898
4899                dev->gflags ^= IFF_PROMISC;
4900                dev_set_promiscuity(dev, inc);
4901        }
4902
4903        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4904           is important. Some (broken) drivers set IFF_PROMISC when
4905           IFF_ALLMULTI is requested, without asking us and without reporting it.
4906         */
4907        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4908                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4909
4910                dev->gflags ^= IFF_ALLMULTI;
4911                dev_set_allmulti(dev, inc);
4912        }
4913
4914        return ret;
4915}
4916
4917void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4918{
4919        unsigned int changes = dev->flags ^ old_flags;
4920
4921        if (changes & IFF_UP) {
4922                if (dev->flags & IFF_UP)
4923                        call_netdevice_notifiers(NETDEV_UP, dev);
4924                else
4925                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4926        }
4927
4928        if (dev->flags & IFF_UP &&
4929            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4930                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4931}
4932
4933/**
4934 *      dev_change_flags - change device settings
4935 *      @dev: device
4936 *      @flags: device state flags
4937 *
4938 *      Change settings on a device based on the supplied state flags. The flags are
4939 *      in the userspace exported format.
4940 */
4941int dev_change_flags(struct net_device *dev, unsigned int flags)
4942{
4943        int ret;
4944        unsigned int changes, old_flags = dev->flags;
4945
4946        ret = __dev_change_flags(dev, flags);
4947        if (ret < 0)
4948                return ret;
4949
4950        changes = old_flags ^ dev->flags;
4951        if (changes)
4952                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4953
4954        __dev_notify_flags(dev, old_flags);
4955        return ret;
4956}
4957EXPORT_SYMBOL(dev_change_flags);
4958
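/*
 * Editor's note: sketch mirroring what the SIOCSIFFLAGS path does, not part
 * of dev.c; the caller is hypothetical. Bringing an interface up amounts to
 * OR-ing IFF_UP into the userspace-format flags under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	rtnl_unlock();
 */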
4959/**
4960 *      dev_set_mtu - Change maximum transfer unit
4961 *      @dev: device
4962 *      @new_mtu: new transfer unit
4963 *
4964 *      Change the maximum transfer size of the network device.
4965 */
4966int dev_set_mtu(struct net_device *dev, int new_mtu)
4967{
4968        const struct net_device_ops *ops = dev->netdev_ops;
4969        int err;
4970
4971        if (new_mtu == dev->mtu)
4972                return 0;
4973
4974        /*      MTU must be positive.    */
4975        if (new_mtu < 0)
4976                return -EINVAL;
4977
4978        if (!netif_device_present(dev))
4979                return -ENODEV;
4980
4981        err = 0;
4982        if (ops->ndo_change_mtu)
4983                err = ops->ndo_change_mtu(dev, new_mtu);
4984        else
4985                dev->mtu = new_mtu;
4986
4987        if (!err)
4988                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4989        return err;
4990}
4991EXPORT_SYMBOL(dev_set_mtu);
4992
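/*
 * Editor's note: sketch, not part of dev.c; the caller is hypothetical and
 * the MTU value is only an example. dev_set_mtu() is normally invoked with
 * the RTNL lock held, as the SIOCSIFMTU path in dev_ifsioc() does:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_err("MTU change rejected: %d\n", err);
 */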
4993/**
4994 *      dev_set_group - Change group this device belongs to
4995 *      @dev: device
4996 *      @new_group: group this device should belong to
4997 */
4998void dev_set_group(struct net_device *dev, int new_group)
4999{
5000        dev->group = new_group;
5001}
5002EXPORT_SYMBOL(dev_set_group);
5003
5004/**
5005 *      dev_set_mac_address - Change Media Access Control Address
5006 *      @dev: device
5007 *      @sa: new address
5008 *
5009 *      Change the hardware (MAC) address of the device
5010 */
5011int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5012{
5013        const struct net_device_ops *ops = dev->netdev_ops;
5014        int err;
5015
5016        if (!ops->ndo_set_mac_address)
5017                return -EOPNOTSUPP;
5018        if (sa->sa_family != dev->type)
5019                return -EINVAL;
5020        if (!netif_device_present(dev))
5021                return -ENODEV;
5022        err = ops->ndo_set_mac_address(dev, sa);
5023        if (!err)
5024                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5025        add_device_randomness(dev->dev_addr, dev->addr_len);
5026        return err;
5027}
5028EXPORT_SYMBOL(dev_set_mac_address);
5029
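/*
 * Editor's note: sketch, not part of dev.c; new_addr is a hypothetical
 * buffer of dev->addr_len bytes. The address family carried in the sockaddr
 * must match dev->type (ARPHRD_ETHER for Ethernet devices), and the call is
 * normally made under the RTNL lock, as the SIOCSIFHWADDR path does:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */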
5030/*
5031 *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5032 */
5033static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5034{
5035        int err;
5036        struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5037
5038        if (!dev)
5039                return -ENODEV;
5040
5041        switch (cmd) {
5042        case SIOCGIFFLAGS:      /* Get interface flags */
5043                ifr->ifr_flags = (short) dev_get_flags(dev);
5044                return 0;
5045
5046        case SIOCGIFMETRIC:     /* Get the metric on the interface
5047                                   (currently unused) */
5048                ifr->ifr_metric = 0;
5049                return 0;
5050
5051        case SIOCGIFMTU:        /* Get the MTU of a device */
5052                ifr->ifr_mtu = dev->mtu;
5053                return 0;
5054
5055        case SIOCGIFHWADDR:
5056                if (!dev->addr_len)
5057                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5058                else
5059                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5060                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5061                ifr->ifr_hwaddr.sa_family = dev->type;
5062                return 0;
5063
5064        case SIOCGIFSLAVE:
5065                err = -EINVAL;
5066                break;
5067
5068        case SIOCGIFMAP:
5069                ifr->ifr_map.mem_start = dev->mem_start;
5070                ifr->ifr_map.mem_end   = dev->mem_end;
5071                ifr->ifr_map.base_addr = dev->base_addr;
5072                ifr->ifr_map.irq       = dev->irq;
5073                ifr->ifr_map.dma       = dev->dma;
5074                ifr->ifr_map.port      = dev->if_port;
5075                return 0;
5076
5077        case SIOCGIFINDEX:
5078                ifr->ifr_ifindex = dev->ifindex;
5079                return 0;
5080
5081        case SIOCGIFTXQLEN:
5082                ifr->ifr_qlen = dev->tx_queue_len;
5083                return 0;
5084
5085        default:
5086                /* dev_ioctl() should ensure this case
5087                 * is never reached
5088                 */
5089                WARN_ON(1);
5090                err = -ENOTTY;
5091                break;
5092
5093        }
5094        return err;
5095}
5096
5097/*
5098 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5099 */
5100static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5101{
5102        int err;
5103        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5104        const struct net_device_ops *ops;
5105
5106        if (!dev)
5107                return -ENODEV;
5108
5109        ops = dev->netdev_ops;
5110
5111        switch (cmd) {
5112        case SIOCSIFFLAGS:      /* Set interface flags */
5113                return dev_change_flags(dev, ifr->ifr_flags);
5114
5115        case SIOCSIFMETRIC:     /* Set the metric on the interface
5116                                   (currently unused) */
5117                return -EOPNOTSUPP;
5118
5119        case SIOCSIFMTU:        /* Set the MTU of a device */
5120                return dev_set_mtu(dev, ifr->ifr_mtu);
5121
5122        case SIOCSIFHWADDR:
5123                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5124
5125        case SIOCSIFHWBROADCAST:
5126                if (ifr->ifr_hwaddr.sa_family != dev->type)
5127                        return -EINVAL;
5128                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5129                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5130                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5131                return 0;
5132
5133        case SIOCSIFMAP:
5134                if (ops->ndo_set_config) {
5135                        if (!netif_device_present(dev))
5136                                return -ENODEV;
5137                        return ops->ndo_set_config(dev, &ifr->ifr_map);
5138                }
5139                return -EOPNOTSUPP;
5140
5141        case SIOCADDMULTI:
5142                if (!ops->ndo_set_rx_mode ||
5143                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5144                        return -EINVAL;
5145                if (!netif_device_present(dev))
5146                        return -ENODEV;
5147                return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5148
5149        case SIOCDELMULTI:
5150                if (!ops->ndo_set_rx_mode ||
5151                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5152                        return -EINVAL;
5153                if (!netif_device_present(dev))
5154                        return -ENODEV;
5155                return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5156
5157        case SIOCSIFTXQLEN:
5158                if (ifr->ifr_qlen < 0)
5159                        return -EINVAL;
5160                dev->tx_queue_len = ifr->ifr_qlen;
5161                return 0;
5162
5163        case SIOCSIFNAME:
5164                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5165                return dev_change_name(dev, ifr->ifr_newname);
5166
5167        case SIOCSHWTSTAMP:
5168                err = net_hwtstamp_validate(ifr);
5169                if (err)
5170                        return err;
5171                /* fall through */
5172
5173        /*
5174         *      Unknown or private ioctl
5175         */
5176        default:
5177                if ((cmd >= SIOCDEVPRIVATE &&
5178                    cmd <= SIOCDEVPRIVATE + 15) ||
5179                    cmd == SIOCBONDENSLAVE ||
5180                    cmd == SIOCBONDRELEASE ||
5181                    cmd == SIOCBONDSETHWADDR ||
5182                    cmd == SIOCBONDSLAVEINFOQUERY ||
5183                    cmd == SIOCBONDINFOQUERY ||
5184                    cmd == SIOCBONDCHANGEACTIVE ||
5185                    cmd == SIOCGMIIPHY ||
5186                    cmd == SIOCGMIIREG ||
5187                    cmd == SIOCSMIIREG ||
5188                    cmd == SIOCBRADDIF ||
5189                    cmd == SIOCBRDELIF ||
5190                    cmd == SIOCSHWTSTAMP ||
5191                    cmd == SIOCWANDEV) {
5192                        err = -EOPNOTSUPP;
5193                        if (ops->ndo_do_ioctl) {
5194                                if (netif_device_present(dev))
5195                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
5196                                else
5197                                        err = -ENODEV;
5198                        }
5199                } else
5200                        err = -EINVAL;
5201
5202        }
5203        return err;
5204}
5205
5206/*
5207 *      This function handles all "interface"-type I/O control requests. The actual
5208 *      'doing' part of this is dev_ifsioc above.
5209 */
5210
5211/**
5212 *      dev_ioctl       -       network device ioctl
5213 *      @net: the applicable net namespace
5214 *      @cmd: command to issue
5215 *      @arg: pointer to a struct ifreq in user space
5216 *
5217 *      Issue ioctl functions to devices. This is normally called by the
5218 *      user space syscall interfaces but can sometimes be useful for
5219 *      other purposes. The return value is the return from the syscall if
5220 *      positive or a negative errno code on error.
5221 */
5222
5223int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5224{
5225        struct ifreq ifr;
5226        int ret;
5227        char *colon;
5228
5229        /* One special case: SIOCGIFCONF takes ifconf argument
5230           and requires shared lock, because it sleeps writing
5231           to user space.
5232         */
5233
5234        if (cmd == SIOCGIFCONF) {
5235                rtnl_lock();
5236                ret = dev_ifconf(net, (char __user *) arg);
5237                rtnl_unlock();
5238                return ret;
5239        }
5240        if (cmd == SIOCGIFNAME)
5241                return dev_ifname(net, (struct ifreq __user *)arg);
5242
5243        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5244                return -EFAULT;
5245
5246        ifr.ifr_name[IFNAMSIZ-1] = 0;
5247
5248        colon = strchr(ifr.ifr_name, ':');
5249        if (colon)
5250                *colon = 0;
5251
5252        /*
5253         *      See which interface the caller is talking about.
5254         */
5255
5256        switch (cmd) {
5257        /*
5258         *      These ioctl calls:
5259         *      - can be done by all.
5260         *      - atomic and do not require locking.
5261         *      - return a value
5262         */
5263        case SIOCGIFFLAGS:
5264        case SIOCGIFMETRIC:
5265        case SIOCGIFMTU:
5266        case SIOCGIFHWADDR:
5267        case SIOCGIFSLAVE:
5268        case SIOCGIFMAP:
5269        case SIOCGIFINDEX:
5270        case SIOCGIFTXQLEN:
5271                dev_load(net, ifr.ifr_name);
5272                rcu_read_lock();
5273                ret = dev_ifsioc_locked(net, &ifr, cmd);
5274                rcu_read_unlock();
5275                if (!ret) {
5276                        if (colon)
5277                                *colon = ':';
5278                        if (copy_to_user(arg, &ifr,
5279                                         sizeof(struct ifreq)))
5280                                ret = -EFAULT;
5281                }
5282                return ret;
5283
5284        case SIOCETHTOOL:
5285                dev_load(net, ifr.ifr_name);
5286                rtnl_lock();
5287                ret = dev_ethtool(net, &ifr);
5288                rtnl_unlock();
5289                if (!ret) {
5290                        if (colon)
5291                                *colon = ':';
5292                        if (copy_to_user(arg, &ifr,
5293                                         sizeof(struct ifreq)))
5294                                ret = -EFAULT;
5295                }
5296                return ret;
5297
5298        /*
5299         *      These ioctl calls:
5300         *      - require superuser power.
5301         *      - require strict serialization.
5302         *      - return a value
5303         */
5304        case SIOCGMIIPHY:
5305        case SIOCGMIIREG:
5306        case SIOCSIFNAME:
5307                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5308                        return -EPERM;
5309                dev_load(net, ifr.ifr_name);
5310                rtnl_lock();
5311                ret = dev_ifsioc(net, &ifr, cmd);
5312                rtnl_unlock();
5313                if (!ret) {
5314                        if (colon)
5315                                *colon = ':';
5316                        if (copy_to_user(arg, &ifr,
5317                                         sizeof(struct ifreq)))
5318                                ret = -EFAULT;
5319                }
5320                return ret;
5321
5322        /*
5323         *      These ioctl calls:
5324         *      - require superuser power.
5325         *      - require strict serialization.
5326         *      - do not return a value
5327         */
5328        case SIOCSIFMAP:
5329        case SIOCSIFTXQLEN:
5330                if (!capable(CAP_NET_ADMIN))
5331                        return -EPERM;
5332                /* fall through */
5333        /*
5334         *      These ioctl calls:
5335         *      - require local superuser power.
5336         *      - require strict serialization.
5337         *      - do not return a value
5338         */
5339        case SIOCSIFFLAGS:
5340        case SIOCSIFMETRIC:
5341        case SIOCSIFMTU:
5342        case SIOCSIFHWADDR:
5343        case SIOCSIFSLAVE:
5344        case SIOCADDMULTI:
5345        case SIOCDELMULTI:
5346        case SIOCSIFHWBROADCAST:
5347        case SIOCSMIIREG:
5348        case SIOCBONDENSLAVE:
5349        case SIOCBONDRELEASE:
5350        case SIOCBONDSETHWADDR:
5351        case SIOCBONDCHANGEACTIVE:
5352        case SIOCBRADDIF:
5353        case SIOCBRDELIF:
5354        case SIOCSHWTSTAMP:
5355                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5356                        return -EPERM;
5357                /* fall through */
5358        case SIOCBONDSLAVEINFOQUERY:
5359        case SIOCBONDINFOQUERY:
5360                dev_load(net, ifr.ifr_name);
5361                rtnl_lock();
5362                ret = dev_ifsioc(net, &ifr, cmd);
5363                rtnl_unlock();
5364                return ret;
5365
5366        case SIOCGIFMEM:
5367                /* Get the per device memory space. We can add this but
5368                 * currently do not support it */
5369        case SIOCSIFMEM:
5370                /* Set the per device memory buffer space.
5371                 * Not applicable in our case */
5372        case SIOCSIFLINK:
5373                return -ENOTTY;
5374
5375        /*
5376         *      Unknown or private ioctl.
5377         */
5378        default:
5379                if (cmd == SIOCWANDEV ||
5380                    (cmd >= SIOCDEVPRIVATE &&
5381                     cmd <= SIOCDEVPRIVATE + 15)) {
5382                        dev_load(net, ifr.ifr_name);
5383                        rtnl_lock();
5384                        ret = dev_ifsioc(net, &ifr, cmd);
5385                        rtnl_unlock();
5386                        if (!ret && copy_to_user(arg, &ifr,
5387                                                 sizeof(struct ifreq)))
5388                                ret = -EFAULT;
5389                        return ret;
5390                }
5391                /* Take care of Wireless Extensions */
5392                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5393                        return wext_handle_ioctl(net, &ifr, cmd, arg);
5394                return -ENOTTY;
5395        }
5396}
5397
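/*
 * Editor's note: the userspace side of this interface, for reference only;
 * hypothetical program using interface name "eth0" (needs <sys/socket.h>,
 * <sys/ioctl.h>, <net/if.h>, <string.h> and <stdio.h>). A SIOCGIFMTU request
 * ends up in dev_ifsioc_locked() above:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */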
5398
5399/**
5400 *      dev_new_index   -       allocate an ifindex
5401 *      @net: the applicable net namespace
5402 *
5403 *      Returns a suitable unique value for a new device interface
5404 *      number.  The caller must hold the rtnl semaphore or the
5405 *      dev_base_lock to be sure it remains unique.
5406 */
5407static int dev_new_index(struct net *net)
5408{
5409        int ifindex = net->ifindex;
5410        for (;;) {
5411                if (++ifindex <= 0)
5412                        ifindex = 1;
5413                if (!__dev_get_by_index(net, ifindex))
5414                        return net->ifindex = ifindex;
5415        }
5416}
5417
5418/* Delayed registration/unregistration */
5419static LIST_HEAD(net_todo_list);
5420
5421static void net_set_todo(struct net_device *dev)
5422{
5423        list_add_tail(&dev->todo_list, &net_todo_list);
5424}
5425
5426static void rollback_registered_many(struct list_head *head)
5427{
5428        struct net_device *dev, *tmp;
5429
5430        BUG_ON(dev_boot_phase);
5431        ASSERT_RTNL();
5432
5433        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5434                /* Some devices call this without ever having been
5435                 * registered, as part of initialization unwind. Remove
5436                 * those devices and proceed with the remaining ones.
5437                 */
5438                if (dev->reg_state == NETREG_UNINITIALIZED) {
5439                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5440                                 dev->name, dev);
5441
5442                        WARN_ON(1);
5443                        list_del(&dev->unreg_list);
5444                        continue;
5445                }
5446                dev->dismantle = true;
5447                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5448        }
5449
5450        /* If device is running, close it first. */
5451        dev_close_many(head);
5452
5453        list_for_each_entry(dev, head, unreg_list) {
5454                /* And unlink it from device chain. */
5455                unlist_netdevice(dev);
5456
5457                dev->reg_state = NETREG_UNREGISTERING;
5458        }
5459
5460        synchronize_net();
5461
5462        list_for_each_entry(dev, head, unreg_list) {
5463                /* Shutdown queueing discipline. */
5464                dev_shutdown(dev);
5465
5466
5467                /* Notify protocols that we are about to destroy
5468                   this device. They should clean up all of their state.
5469                */
5470                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5471
5472                if (!dev->rtnl_link_ops ||
5473                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5474                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5475
5476                /*
5477                 *      Flush the unicast and multicast chains
5478                 */
5479                dev_uc_flush(dev);
5480                dev_mc_flush(dev);
5481
5482                if (dev->netdev_ops->ndo_uninit)
5483                        dev->netdev_ops->ndo_uninit(dev);
5484
5485                /* Notifier chain MUST detach us from master device. */
5486                WARN_ON(dev->master);
5487
5488                /* Remove entries from kobject tree */
5489                netdev_unregister_kobject(dev);
5490        }
5491
5492        synchronize_net();
5493
5494        list_for_each_entry(dev, head, unreg_list)
5495                dev_put(dev);
5496}
5497
5498static void rollback_registered(struct net_device *dev)
5499{
5500        LIST_HEAD(single);
5501
5502        list_add(&dev->unreg_list, &single);
5503        rollback_registered_many(&single);
5504        list_del(&single);
5505}
5506
5507static netdev_features_t netdev_fix_features(struct net_device *dev,
5508        netdev_features_t features)
5509{
5510        /* Fix illegal checksum combinations */
5511        if ((features & NETIF_F_HW_CSUM) &&
5512            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5513                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5514                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5515        }
5516
5517        /* Fix illegal SG+CSUM combinations. */
5518        if ((features & NETIF_F_SG) &&
5519            !(features & NETIF_F_ALL_CSUM)) {
5520                netdev_dbg(dev,
5521                        "Dropping NETIF_F_SG since no checksum feature.\n");
5522                features &= ~NETIF_F_SG;
5523        }
5524
5525        /* TSO requires that SG is present as well. */
5526        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5527                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5528                features &= ~NETIF_F_ALL_TSO;
5529        }
5530
5531        /* TSO ECN requires that TSO is present as well. */
5532        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5533                features &= ~NETIF_F_TSO_ECN;
5534
5535        /* Software GSO depends on SG. */
5536        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5537                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5538                features &= ~NETIF_F_GSO;
5539        }
5540
5541        /* UFO needs SG and checksumming */
5542        if (features & NETIF_F_UFO) {
5543                /* maybe split UFO into V4 and V6? */
5544                if (!((features & NETIF_F_GEN_CSUM) ||
5545                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5546                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5547                        netdev_dbg(dev,
5548                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5549                        features &= ~NETIF_F_UFO;
5550                }
5551
5552                if (!(features & NETIF_F_SG)) {
5553                        netdev_dbg(dev,
5554                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5555                        features &= ~NETIF_F_UFO;
5556                }
5557        }
5558
5559        return features;
5560}
5561
5562int __netdev_update_features(struct net_device *dev)
5563{
5564        netdev_features_t features;
5565        int err = 0;
5566
5567        ASSERT_RTNL();
5568
5569        features = netdev_get_wanted_features(dev);
5570
5571        if (dev->netdev_ops->ndo_fix_features)
5572                features = dev->netdev_ops->ndo_fix_features(dev, features);
5573
5574        /* driver might be less strict about feature dependencies */
5575        features = netdev_fix_features(dev, features);
5576
5577        if (dev->features == features)
5578                return 0;
5579
5580        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5581                &dev->features, &features);
5582
5583        if (dev->netdev_ops->ndo_set_features)
5584                err = dev->netdev_ops->ndo_set_features(dev, features);
5585
5586        if (unlikely(err < 0)) {
5587                netdev_err(dev,
5588                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
5589                        err, &features, &dev->features);
5590                return -1;
5591        }
5592
5593        if (!err)
5594                dev->features = features;
5595
5596        return 1;
5597}
5598
5599/**
5600 *      netdev_update_features - recalculate device features
5601 *      @dev: the device to check
5602 *
5603 *      Recalculate dev->features set and send notifications if it
5604 *      has changed. Should be called after driver- or hardware-dependent
5605 *      conditions that influence the features might have changed.
5606 */
5607void netdev_update_features(struct net_device *dev)
5608{
5609        if (__netdev_update_features(dev))
5610                netdev_features_change(dev);
5611}
5612EXPORT_SYMBOL(netdev_update_features);
5613
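/*
 * Editor's note: sketch of the intended calling pattern, not part of dev.c;
 * the driver and the specific feature bit are hypothetical. After a driver
 * learns that a hardware-dependent condition changed (say, an offload became
 * unavailable), it adjusts hw_features and re-runs the negotiation:
 *
 *	rtnl_lock();
 *	dev->hw_features &= ~NETIF_F_RXCSUM;
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 */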
5614/**
5615 *      netdev_change_features - recalculate device features
5616 *      @dev: the device to check
5617 *
5618 *      Recalculate dev->features set and send notifications even
5619 *      if they have not changed. Should be called instead of
5620 *      netdev_update_features() if dev->vlan_features might also have
5621 *      changed, so that the changes are propagated to stacked
5622 *      VLAN devices.
5623 */
5624void netdev_change_features(struct net_device *dev)
5625{
5626        __netdev_update_features(dev);
5627        netdev_features_change(dev);
5628}
5629EXPORT_SYMBOL(netdev_change_features);
5630
5631/**
5632 *      netif_stacked_transfer_operstate -      transfer operstate
5633 *      @rootdev: the root or lower level device to transfer state from
5634 *      @dev: the device to transfer operstate to
5635 *
5636 *      Transfer operational state from root to device. This is normally
5637 *      called when a stacking relationship exists between the root
5638 *      device and the device (a leaf device).
5639 */
5640void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5641                                        struct net_device *dev)
5642{
5643        if (rootdev->operstate == IF_OPER_DORMANT)
5644                netif_dormant_on(dev);
5645        else
5646                netif_dormant_off(dev);
5647
5648        if (netif_carrier_ok(rootdev)) {
5649                if (!netif_carrier_ok(dev))
5650                        netif_carrier_on(dev);
5651        } else {
5652                if (netif_carrier_ok(dev))
5653                        netif_carrier_off(dev);
5654        }
5655}
5656EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5657
5658#ifdef CONFIG_RPS
5659static int netif_alloc_rx_queues(struct net_device *dev)
5660{
5661        unsigned int i, count = dev->num_rx_queues;
5662        struct netdev_rx_queue *rx;
5663
5664        BUG_ON(count < 1);
5665
5666        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5667        if (!rx) {
5668                pr_err("netdev: Unable to allocate %u rx queues\n", count);
5669                return -ENOMEM;
5670        }
5671        dev->_rx = rx;
5672
5673        for (i = 0; i < count; i++)
5674                rx[i].dev = dev;
5675        return 0;
5676}
5677#endif
5678
5679static void netdev_init_one_queue(struct net_device *dev,
5680                                  struct netdev_queue *queue, void *_unused)
5681{
5682        /* Initialize queue lock */
5683        spin_lock_init(&queue->_xmit_lock);
5684        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5685        queue->xmit_lock_owner = -1;
5686        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5687        queue->dev = dev;
5688#ifdef CONFIG_BQL
5689        dql_init(&queue->dql, HZ);
5690#endif
5691}
5692
5693static int netif_alloc_netdev_queues(struct net_device *dev)
5694{
5695        unsigned int count = dev->num_tx_queues;
5696        struct netdev_queue *tx;
5697
5698        BUG_ON(count < 1);
5699
5700        tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5701        if (!tx) {
5702                pr_err("netdev: Unable to allocate %u tx queues\n", count);
5703                return -ENOMEM;
5704        }
5705        dev->_tx = tx;
5706
5707        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5708        spin_lock_init(&dev->tx_global_lock);
5709
5710        return 0;
5711}
5712
5713/**
5714 *      register_netdevice      - register a network device
5715 *      @dev: device to register
5716 *
5717 *      Take a completed network device structure and add it to the kernel
5718 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5719 *      chain. 0 is returned on success. A negative errno code is returned
5720 *      on a failure to set up the device, or if the name is a duplicate.
5721 *
5722 *      Callers must hold the rtnl semaphore. You may want
5723 *      register_netdev() instead of this.
5724 *
5725 *      BUGS:
5726 *      The locking appears insufficient to guarantee two parallel registers
5727 *      will not get the same name.
5728 */
5729
5730int register_netdevice(struct net_device *dev)
5731{
5732        int ret;
5733        struct net *net = dev_net(dev);
5734
5735        BUG_ON(dev_boot_phase);
5736        ASSERT_RTNL();
5737
5738        might_sleep();
5739
5740        /* When net_devices are persistent, this will be fatal. */
5741        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5742        BUG_ON(!net);
5743
5744        spin_lock_init(&dev->addr_list_lock);
5745        netdev_set_addr_lockdep_class(dev);
5746
5747        dev->iflink = -1;
5748
5749        ret = dev_get_valid_name(net, dev, dev->name);
5750        if (ret < 0)
5751                goto out;
5752
5753        /* Init, if this function is available */
5754        if (dev->netdev_ops->ndo_init) {
5755                ret = dev->netdev_ops->ndo_init(dev);
5756                if (ret) {
5757                        if (ret > 0)
5758                                ret = -EIO;
5759                        goto out;
5760                }
5761        }
5762
5763        ret = -EBUSY;
5764        if (!dev->ifindex)
5765                dev->ifindex = dev_new_index(net);
5766        else if (__dev_get_by_index(net, dev->ifindex))
5767                goto err_uninit;
5768
5769        if (dev->iflink == -1)
5770                dev->iflink = dev->ifindex;
5771
5772        /* Transfer changeable features to wanted_features and enable
5773         * software offloads (GSO and GRO).
5774         */
5775        dev->hw_features |= NETIF_F_SOFT_FEATURES;
5776        dev->features |= NETIF_F_SOFT_FEATURES;
5777        dev->wanted_features = dev->features & dev->hw_features;
5778
5779        /* Turn on no cache copy if HW is doing checksum */
5780        if (!(dev->flags & IFF_LOOPBACK)) {
5781                dev->hw_features |= NETIF_F_NOCACHE_COPY;
5782                if (dev->features & NETIF_F_ALL_CSUM) {
5783                        dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5784                        dev->features |= NETIF_F_NOCACHE_COPY;
5785                }
5786        }
5787
5788        /* Make NETIF_F_HIGHDMA inheritable by VLAN devices.
5789         */
5790        dev->vlan_features |= NETIF_F_HIGHDMA;
5791
5792        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5793        ret = notifier_to_errno(ret);
5794        if (ret)
5795                goto err_uninit;
5796
5797        ret = netdev_register_kobject(dev);
5798        if (ret)
5799                goto err_uninit;
5800        dev->reg_state = NETREG_REGISTERED;
5801
5802        __netdev_update_features(dev);
5803
5804        /*
5805         *      Default initial state at registry is that the
5806         *      device is present.
5807         */
5808
5809        set_bit(__LINK_STATE_PRESENT, &dev->state);
5810
5811        linkwatch_init_dev(dev);
5812
5813        dev_init_scheduler(dev);
5814        dev_hold(dev);
5815        list_netdevice(dev);
5816        add_device_randomness(dev->dev_addr, dev->addr_len);
5817
5818        /* Notify protocols that a new device appeared. */
5819        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5820        ret = notifier_to_errno(ret);
5821        if (ret) {
5822                rollback_registered(dev);
5823                dev->reg_state = NETREG_UNREGISTERED;
5824        }
5825        /*
5826         *      Prevent userspace races by waiting until the network
5827         *      device is fully setup before sending notifications.
5828         */
5829        if (!dev->rtnl_link_ops ||
5830            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5831                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5832
5833out:
5834        return ret;
5835
5836err_uninit:
5837        if (dev->netdev_ops->ndo_uninit)
5838                dev->netdev_ops->ndo_uninit(dev);
5839        goto out;
5840}
5841EXPORT_SYMBOL(register_netdevice);
5842
5843/**
5844 *      init_dummy_netdev       - init a dummy network device for NAPI
5845 *      @dev: device to init
5846 *
5847 *      This takes a network device structure and initializes the minimum
5848 *      number of fields so it can be used to schedule NAPI polls without
5849 *      registering a full-blown interface. This is to be used by drivers
5850 *      that need to tie several hardware interfaces to a single NAPI
5851 *      poll scheduler due to HW limitations.
5852 */
5853int init_dummy_netdev(struct net_device *dev)
5854{
5855        /* Clear everything. Note we don't initialize spinlocks
5856         * as they aren't supposed to be taken by any of the
5857         * NAPI code and this dummy netdev is supposed to be
5858         * only ever used for NAPI polls
5859         */
5860        memset(dev, 0, sizeof(struct net_device));
5861
5862        /* make sure we BUG if trying to hit standard
5863         * register/unregister code path
5864         */
5865        dev->reg_state = NETREG_DUMMY;
5866
5867        /* NAPI wants this */
5868        INIT_LIST_HEAD(&dev->napi_list);
5869
5870        /* a dummy interface is started by default */
5871        set_bit(__LINK_STATE_PRESENT, &dev->state);
5872        set_bit(__LINK_STATE_START, &dev->state);
5873
5874        /* Note : We don't allocate pcpu_refcnt for dummy devices,
5875         * because users of this 'device' don't need to change
5876         * its refcount.
5877         */
5878
5879        return 0;
5880}
5881EXPORT_SYMBOL_GPL(init_dummy_netdev);
5882
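/*
 * Editor's note: sketch of the intended use, not part of dev.c; my_poll and
 * the weight of 64 are hypothetical. One dummy netdev serves purely as a
 * NAPI anchor for hardware with no one-to-one netdev mapping:
 *
 *	static struct net_device dummy_dev;
 *	static struct napi_struct napi;
 *
 *	init_dummy_netdev(&dummy_dev);
 *	netif_napi_add(&dummy_dev, &napi, my_poll, 64);
 *	napi_enable(&napi);
 */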
5883
5884/**
5885 *      register_netdev - register a network device
5886 *      @dev: device to register
5887 *
5888 *      Take a completed network device structure and add it to the kernel
5889 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5890 *      chain. 0 is returned on success. A negative errno code is returned
5891 *      on a failure to set up the device, or if the name is a duplicate.
5892 *
5893 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5894 *      and expands the device name if you passed a format string to
5895 *      alloc_netdev.
5896 */
5897int register_netdev(struct net_device *dev)
5898{
5899        int err;
5900
5901        rtnl_lock();
5902        err = register_netdevice(dev);
5903        rtnl_unlock();
5904        return err;
5905}
5906EXPORT_SYMBOL(register_netdev);
5907
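/*
 * Editor's note: minimal registration sketch, not part of dev.c; struct
 * my_priv is a hypothetical driver-private structure. alloc_etherdev()
 * eventually calls alloc_netdev_mqs() below with an "eth%d" name format,
 * which register_netdev() resolves to a real name:
 *
 *	struct net_device *dev = alloc_etherdev(sizeof(struct my_priv));
 *
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */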
5908int netdev_refcnt_read(const struct net_device *dev)
5909{
5910        int i, refcnt = 0;
5911
5912        for_each_possible_cpu(i)
5913                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5914        return refcnt;
5915}
5916EXPORT_SYMBOL(netdev_refcnt_read);
5917
5918/**
5919 * netdev_wait_allrefs - wait until all references are gone.
5920 * @dev: target net_device
5921 *
5922 * This is called when unregistering network devices.
5923 *
5924 * Any protocol or device that holds a reference should register
5925 * for netdevice notification, and clean up and put back the
5926 * reference if they receive an UNREGISTER event.
5927 * We can get stuck here if buggy protocols don't correctly
5928 * call dev_put.
5929 */
5930static void netdev_wait_allrefs(struct net_device *dev)
5931{
5932        unsigned long rebroadcast_time, warning_time;
5933        int refcnt;
5934
5935        linkwatch_forget_dev(dev);
5936
5937        rebroadcast_time = warning_time = jiffies;
5938        refcnt = netdev_refcnt_read(dev);
5939
5940        while (refcnt != 0) {
5941                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5942                        rtnl_lock();
5943
5944                        /* Rebroadcast unregister notification */
5945                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5946
5947                        __rtnl_unlock();
5948                        rcu_barrier();
5949                        rtnl_lock();
5950
5951                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5952                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5953                                     &dev->state)) {
5954                                /* We must not have linkwatch events
5955                                 * pending on unregister. If this
5956                                 * happens, we simply run the queue
5957                                 * unscheduled, resulting in a noop
5958                                 * for this device.
5959                                 */
5960                                linkwatch_run_queue();
5961                        }
5962
5963                        __rtnl_unlock();
5964
5965                        rebroadcast_time = jiffies;
5966                }
5967
5968                msleep(250);
5969
5970                refcnt = netdev_refcnt_read(dev);
5971
5972                if (time_after(jiffies, warning_time + 10 * HZ)) {
5973                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5974                                 dev->name, refcnt);
5975                        warning_time = jiffies;
5976                }
5977        }
5978}
5979
5980/* The sequence is:
5981 *
5982 *      rtnl_lock();
5983 *      ...
5984 *      register_netdevice(x1);
5985 *      register_netdevice(x2);
5986 *      ...
5987 *      unregister_netdevice(y1);
5988 *      unregister_netdevice(y2);
5989 *      ...
5990 *      rtnl_unlock();
5991 *      free_netdev(y1);
5992 *      free_netdev(y2);
5993 *
5994 * We are invoked by rtnl_unlock().
5995 * This allows us to deal with problems:
5996 * 1) We can delete sysfs objects which invoke hotplug
5997 *    without deadlocking with linkwatch via keventd.
5998 * 2) Since we run with the RTNL semaphore not held, we can sleep
5999 *    safely in order to wait for the netdev refcnt to drop to zero.
6000 *
6001 * We must not return until all unregister events added during
6002 * the interval the lock was held have been completed.
6003 */
6004void netdev_run_todo(void)
6005{
6006        struct list_head list;
6007
6008        /* Snapshot list, allow later requests */
6009        list_replace_init(&net_todo_list, &list);
6010
6011        __rtnl_unlock();
6012
6013
6014        /* Wait for rcu callbacks to finish before next phase */
6015        if (!list_empty(&list))
6016                rcu_barrier();
6017
6018        while (!list_empty(&list)) {
6019                struct net_device *dev
6020                        = list_first_entry(&list, struct net_device, todo_list);
6021                list_del(&dev->todo_list);
6022
6023                rtnl_lock();
6024                call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6025                __rtnl_unlock();
6026
6027                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6028                        pr_err("network todo '%s' but state %d\n",
6029                               dev->name, dev->reg_state);
6030                        dump_stack();
6031                        continue;
6032                }
6033
6034                dev->reg_state = NETREG_UNREGISTERED;
6035
6036                on_each_cpu(flush_backlog, dev, 1);
6037
6038                netdev_wait_allrefs(dev);
6039
6040                /* paranoia */
6041                BUG_ON(netdev_refcnt_read(dev));
6042                WARN_ON(rcu_access_pointer(dev->ip_ptr));
6043                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6044                WARN_ON(dev->dn_ptr);
6045
6046                if (dev->destructor)
6047                        dev->destructor(dev);
6048
6049                /* Free network device */
6050                kobject_put(&dev->dev.kobj);
6051        }
6052}
6053
6054/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6055 * fields in the same order, with only the type differing.
6056 */
6057void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6058                             const struct net_device_stats *netdev_stats)
6059{
6060#if BITS_PER_LONG == 64
6061        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6062        memcpy(stats64, netdev_stats, sizeof(*stats64));
6063#else
6064        size_t i, n = sizeof(*stats64) / sizeof(u64);
6065        const unsigned long *src = (const unsigned long *)netdev_stats;
6066        u64 *dst = (u64 *)stats64;
6067
6068        BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6069                     sizeof(*stats64) / sizeof(u64));
6070        for (i = 0; i < n; i++)
6071                dst[i] = src[i];
6072#endif
6073}
6074EXPORT_SYMBOL(netdev_stats_to_stats64);
6075
6076/**
6077 *      dev_get_stats   - get network device statistics
6078 *      @dev: device to get statistics from
6079 *      @storage: place to store stats
6080 *
6081 *      Get network statistics from device. Return @storage.
6082 *      The device driver may provide its own method by setting
6083 *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6084 *      otherwise the internal statistics structure is used.
6085 */
6086struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6087                                        struct rtnl_link_stats64 *storage)
6088{
6089        const struct net_device_ops *ops = dev->netdev_ops;
6090
6091        if (ops->ndo_get_stats64) {
6092                memset(storage, 0, sizeof(*storage));
6093                ops->ndo_get_stats64(dev, storage);
6094        } else if (ops->ndo_get_stats) {
6095                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6096        } else {
6097                netdev_stats_to_stats64(storage, &dev->stats);
6098        }
6099        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6100        return storage;
6101}
6102EXPORT_SYMBOL(dev_get_stats);
6103
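/*
 * Editor's note: sketch of a driver-side ndo_get_stats64 implementation that
 * dev_get_stats() would call, not part of dev.c; struct my_priv and its
 * counters are hypothetical:
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		storage->rx_packets = priv->rx_packets;
 *		storage->tx_packets = priv->tx_packets;
 *		return storage;
 *	}
 */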
6104struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6105{
6106        struct netdev_queue *queue = dev_ingress_queue(dev);
6107
6108#ifdef CONFIG_NET_CLS_ACT
6109        if (queue)
6110                return queue;
6111        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6112        if (!queue)
6113                return NULL;
6114        netdev_init_one_queue(dev, queue, NULL);
6115        queue->qdisc = &noop_qdisc;
6116        queue->qdisc_sleeping = &noop_qdisc;
6117        rcu_assign_pointer(dev->ingress_queue, queue);
6118#endif
6119        return queue;
6120}
6121
6122static const struct ethtool_ops default_ethtool_ops;
6123
6124void netdev_set_default_ethtool_ops(struct net_device *dev,
6125                                    const struct ethtool_ops *ops)
6126{
6127        if (dev->ethtool_ops == &default_ethtool_ops)
6128                dev->ethtool_ops = ops;
6129}
6130EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6131
6132/**
6133 *      alloc_netdev_mqs - allocate network device
6134 *      @sizeof_priv:   size of private data to allocate space for
6135 *      @name:          device name format string
6136 *      @setup:         callback to initialize device
6137 *      @txqs:          the number of TX subqueues to allocate
6138 *      @rxqs:          the number of RX subqueues to allocate
6139 *
6140 *      Allocates a struct net_device with private data area for driver use
6141 *      and performs basic initialization.  Also allocates subqueue structs
6142 *      for each queue on the device.
6143 */
6144struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6145                void (*setup)(struct net_device *),
6146                unsigned int txqs, unsigned int rxqs)
6147{
6148        struct net_device *dev;
6149        size_t alloc_size;
6150        struct net_device *p;
6151
6152        BUG_ON(strlen(name) >= sizeof(dev->name));
6153
6154        if (txqs < 1) {
6155                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6156                return NULL;
6157        }
6158
6159#ifdef CONFIG_RPS
6160        if (rxqs < 1) {
6161                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6162                return NULL;
6163        }
6164#endif
6165
6166        alloc_size = sizeof(struct net_device);
6167        if (sizeof_priv) {
6168                /* ensure 32-byte alignment of private area */
6169                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6170                alloc_size += sizeof_priv;
6171        }
6172        /* ensure 32-byte alignment of whole construct */
6173        alloc_size += NETDEV_ALIGN - 1;
6174
6175        p = kzalloc(alloc_size, GFP_KERNEL);
6176        if (!p) {
6177                pr_err("alloc_netdev: Unable to allocate device\n");
6178                return NULL;
6179        }
6180
6181        dev = PTR_ALIGN(p, NETDEV_ALIGN);
6182        dev->padded = (char *)dev - (char *)p;
6183
6184        dev->pcpu_refcnt = alloc_percpu(int);
6185        if (!dev->pcpu_refcnt)
6186                goto free_p;
6187
6188        if (dev_addr_init(dev))
6189                goto free_pcpu;
6190
6191        dev_mc_init(dev);
6192        dev_uc_init(dev);
6193
6194        dev_net_set(dev, &init_net);
6195
6196        dev->gso_max_size = GSO_MAX_SIZE;
6197        dev->gso_max_segs = GSO_MAX_SEGS;
6198
6199        INIT_LIST_HEAD(&dev->napi_list);
6200        INIT_LIST_HEAD(&dev->unreg_list);
6201        INIT_LIST_HEAD(&dev->link_watch_list);
6202        dev->priv_flags = IFF_XMIT_DST_RELEASE;
6203        setup(dev);
6204
6205        dev->num_tx_queues = txqs;
6206        dev->real_num_tx_queues = txqs;
6207        if (netif_alloc_netdev_queues(dev))
6208                goto free_all;
6209
6210#ifdef CONFIG_RPS
6211        dev->num_rx_queues = rxqs;
6212        dev->real_num_rx_queues = rxqs;
6213        if (netif_alloc_rx_queues(dev))
6214                goto free_all;
6215#endif
6216
6217        strcpy(dev->name, name);
6218        dev->group = INIT_NETDEV_GROUP;
6219        if (!dev->ethtool_ops)
6220                dev->ethtool_ops = &default_ethtool_ops;
6221        return dev;
6222
6223free_all:
6224        free_netdev(dev);
6225        return NULL;
6226
6227free_pcpu:
6228        free_percpu(dev->pcpu_refcnt);
6229        kfree(dev->_tx);
6230#ifdef CONFIG_RPS
6231        kfree(dev->_rx);
6232#endif
6233
6234free_p:
6235        kfree(p);
6236        return NULL;
6237}
6238EXPORT_SYMBOL(alloc_netdev_mqs);
6239
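/*
 * Editor's note: allocation sketch, not part of dev.c; my_setup, struct
 * my_priv, the "myif%d" name format and the queue counts are hypothetical.
 * Ethernet drivers usually reach this function through the
 * alloc_etherdev_mqs() wrapper instead:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myif%d",
 *			       my_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */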
6240/**
6241 *      free_netdev - free network device
6242 *      @dev: device
6243 *
6244 *      This function does the last stage of destroying an allocated device
6245 *      interface. The reference to the device object is released.
6246 *      If this is the last reference then it will be freed.
6247 */
6248void free_netdev(struct net_device *dev)
6249{
6250        struct napi_struct *p, *n;
6251
6252        release_net(dev_net(dev));
6253
6254        kfree(dev->_tx);
6255#ifdef CONFIG_RPS
6256        kfree(dev->_rx);
6257#endif
6258
6259        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6260
6261        /* Flush device addresses */
6262        dev_addr_flush(dev);
6263
6264        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6265                netif_napi_del(p);
6266
6267        free_percpu(dev->pcpu_refcnt);
6268        dev->pcpu_refcnt = NULL;
6269
6270        /*  Compatibility with error handling in drivers */
6271        if (dev->reg_state == NETREG_UNINITIALIZED) {
6272                kfree((char *)dev - dev->padded);
6273                return;
6274        }
6275
6276        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6277        dev->reg_state = NETREG_RELEASED;
6278
6279        /* will free via device release */
6280        put_device(&dev->dev);
6281}
6282EXPORT_SYMBOL(free_netdev);
6283
6284/**
6285 *      synchronize_net -  Synchronize with packet receive processing
6286 *
6287 *      Wait for packets currently being received to be done.
6288 *      Does not block later packets from starting.
6289 */
6290void synchronize_net(void)
6291{
6292        might_sleep();
6293        if (rtnl_is_locked())
6294                synchronize_rcu_expedited();
6295        else
6296                synchronize_rcu();
6297}
6298EXPORT_SYMBOL(synchronize_net);
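
/*
 * Illustrative sketch: the usual pattern around synchronize_net().  After a
 * pointer that the softirq receive path may still dereference under
 * rcu_read_lock() has been unpublished, wait for in-flight receives before
 * freeing the old object.  "struct my_cfg" and "my_cfg_ptr" are hypothetical;
 * the caller is assumed to hold RTNL (hence rtnl_dereference()).
 */
struct my_cfg {
	u32 flags;
};

static void my_cfg_teardown(struct my_cfg __rcu **my_cfg_ptr)
{
	struct my_cfg *old = rtnl_dereference(*my_cfg_ptr);

	RCU_INIT_POINTER(*my_cfg_ptr, NULL);
	synchronize_net();	/* all readers on the receive path are done */
	kfree(old);
}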
6299
6300/**
6301 *      unregister_netdevice_queue - remove device from the kernel
6302 *      @dev: device
6303 *      @head: list
6304 *
6305 *      This function shuts down a device interface and removes it
6306 *      from the kernel tables.
6307 *      If head is not NULL, the device is queued to be unregistered later.
6308 *
6309 *      Callers must hold the rtnl semaphore.  You may want
6310 *      unregister_netdev() instead of this.
6311 */
6312
6313void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6314{
6315        ASSERT_RTNL();
6316
6317        if (head) {
6318                list_move_tail(&dev->unreg_list, head);
6319        } else {
6320                rollback_registered(dev);
6321                /* Finish processing unregister after unlock */
6322                net_set_todo(dev);
6323        }
6324}
6325EXPORT_SYMBOL(unregister_netdevice_queue);
6326
6327/**
6328 *      unregister_netdevice_many - unregister many devices
6329 *      @head: list of devices
6330 */
6331void unregister_netdevice_many(struct list_head *head)
6332{
6333        struct net_device *dev;
6334
6335        if (!list_empty(head)) {
6336                rollback_registered_many(head);
6337                list_for_each_entry(dev, head, unreg_list)
6338                        net_set_todo(dev);
6339        }
6340}
6341EXPORT_SYMBOL(unregister_netdevice_many);
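
/*
 * Illustrative sketch: batching several unregistrations under a single RTNL
 * section, mirroring default_device_exit_batch() further down in this file.
 * "devs" and "n" are hypothetical parameters.
 */
static void my_destroy_all(struct net_device **devs, int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	list_del(&kill_list);	/* detach the on-stack list head */
	rtnl_unlock();
}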
6342
6343/**
6344 *      unregister_netdev - remove device from the kernel
6345 *      @dev: device
6346 *
6347 *      This function shuts down a device interface and removes it
6348 *      from the kernel tables.
6349 *
6350 *      This is just a wrapper for unregister_netdevice that takes
6351 *      the rtnl semaphore.  In general you want to use this and not
6352 *      unregister_netdevice.
6353 */
6354void unregister_netdev(struct net_device *dev)
6355{
6356        rtnl_lock();
6357        unregister_netdevice(dev);
6358        rtnl_unlock();
6359}
6360EXPORT_SYMBOL(unregister_netdev);
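
/*
 * Illustrative sketch: the usual driver remove path.  unregister_netdev()
 * takes RTNL itself; free_netdev() may only run once unregistration has
 * completed.  "my_remove" is hypothetical.
 */
static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* detach from the stack */
	free_netdev(dev);	/* release the net_device memory */
}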
6361
6362/**
6363 *      dev_change_net_namespace - move device to a different network namespace
6364 *      @dev: device
6365 *      @net: network namespace
6366 *      @pat: if not NULL, name pattern to try if the current device name
6367 *            is already taken in the destination network namespace.
6368 *
6369 *      This function shuts down a device interface and moves it
6370 *      to a new network namespace. On success 0 is returned, on
6371 *      failure a negative errno code is returned.
6372 *
6373 *      Callers must hold the rtnl semaphore.
6374 */
6375
6376int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6377{
6378        int err;
6379
6380        ASSERT_RTNL();
6381
6382        /* Don't allow namespace local devices to be moved. */
6383        err = -EINVAL;
6384        if (dev->features & NETIF_F_NETNS_LOCAL)
6385                goto out;
6386
6387        /* Ensure the device has been registered */
6388        if (dev->reg_state != NETREG_REGISTERED)
6389                goto out;
6390
6391        /* Get out if there is nothing to do */
6392        err = 0;
6393        if (net_eq(dev_net(dev), net))
6394                goto out;
6395
6396        /* Pick the destination device name, and ensure
6397         * we can use it in the destination network namespace.
6398         */
6399        err = -EEXIST;
6400        if (__dev_get_by_name(net, dev->name)) {
6401                /* We get here if we can't use the current device name */
6402                if (!pat)
6403                        goto out;
6404                if (dev_get_valid_name(net, dev, pat) < 0)
6405                        goto out;
6406        }
6407
6408        /*
6409         * And now a mini version of register_netdevice and unregister_netdevice.
6410         */
6411
6412        /* If the device is running, close it first. */
6413        dev_close(dev);
6414
6415        /* And unlink it from device chain */
6416        err = -ENODEV;
6417        unlist_netdevice(dev);
6418
6419        synchronize_net();
6420
6421        /* Shutdown queueing discipline. */
6422        dev_shutdown(dev);
6423
6424        /* Notify protocols that we are about to destroy
6425           this device. They should clean up all of their state.
6426
6427           Note that dev->reg_state stays at NETREG_REGISTERED.
6428           This is intentional, so that 8021q and macvlan know
6429           the device is just moving and can keep their slaves up.
6430        */
6431        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6432        rcu_barrier();
6433        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6434        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6435
6436        /*
6437         *      Flush the unicast and multicast chains
6438         */
6439        dev_uc_flush(dev);
6440        dev_mc_flush(dev);
6441
6442        /* Send a netdev-removed uevent to the old namespace */
6443        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6444
6445        /* Actually switch the network namespace */
6446        dev_net_set(dev, net);
6447
6448        /* If there is an ifindex conflict assign a new one */
6449        if (__dev_get_by_index(net, dev->ifindex)) {
6450                int iflink = (dev->iflink == dev->ifindex);
6451                dev->ifindex = dev_new_index(net);
6452                if (iflink)
6453                        dev->iflink = dev->ifindex;
6454        }
6455
6456        /* Send a netdev-add uevent to the new namespace */
6457        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6458
6459        /* Fixup kobjects */
6460        err = device_rename(&dev->dev, dev->name);
6461        WARN_ON(err);
6462
6463        /* Add the device back in the hashes */
6464        list_netdevice(dev);
6465
6466        /* Notify protocols, that a new device appeared. */
6467        call_netdevice_notifiers(NETDEV_REGISTER, dev);
6468
6469        /*
6470         *      Prevent userspace races by waiting until the network
6471         *      device is fully setup before sending notifications.
6472         */
6473        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6474
6475        synchronize_net();
6476        err = 0;
6477out:
6478        return err;
6479}
6480EXPORT_SYMBOL_GPL(dev_change_net_namespace);
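
/*
 * Illustrative sketch: moving a device into another namespace with a
 * "dev%d" fallback name in case its current name is already taken there.
 * The caller is assumed to hold a reference on "net"; "my_move_dev" is
 * hypothetical.
 */
static int my_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}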
6481
6482static int dev_cpu_callback(struct notifier_block *nfb,
6483                            unsigned long action,
6484                            void *ocpu)
6485{
6486        struct sk_buff **list_skb;
6487        struct sk_buff *skb;
6488        unsigned int cpu, oldcpu = (unsigned long)ocpu;
6489        struct softnet_data *sd, *oldsd;
6490
6491        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6492                return NOTIFY_OK;
6493
6494        local_irq_disable();
6495        cpu = smp_processor_id();
6496        sd = &per_cpu(softnet_data, cpu);
6497        oldsd = &per_cpu(softnet_data, oldcpu);
6498
6499        /* Find end of our completion_queue. */
6500        list_skb = &sd->completion_queue;
6501        while (*list_skb)
6502                list_skb = &(*list_skb)->next;
6503        /* Append completion queue from offline CPU. */
6504        *list_skb = oldsd->completion_queue;
6505        oldsd->completion_queue = NULL;
6506
6507        /* Append output queue from offline CPU. */
6508        if (oldsd->output_queue) {
6509                *sd->output_queue_tailp = oldsd->output_queue;
6510                sd->output_queue_tailp = oldsd->output_queue_tailp;
6511                oldsd->output_queue = NULL;
6512                oldsd->output_queue_tailp = &oldsd->output_queue;
6513        }
6514        /* Append NAPI poll list from offline CPU. */
6515        if (!list_empty(&oldsd->poll_list)) {
6516                list_splice_init(&oldsd->poll_list, &sd->poll_list);
6517                raise_softirq_irqoff(NET_RX_SOFTIRQ);
6518        }
6519
6520        raise_softirq_irqoff(NET_TX_SOFTIRQ);
6521        local_irq_enable();
6522
6523        /* Process offline CPU's input_pkt_queue */
6524        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6525                netif_rx(skb);
6526                input_queue_head_incr(oldsd);
6527        }
6528        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6529                netif_rx(skb);
6530                input_queue_head_incr(oldsd);
6531        }
6532
6533        return NOTIFY_OK;
6534}
6535
6536
6537/**
6538 *      netdev_increment_features - increment feature set by one
6539 *      @all: current feature set
6540 *      @one: new feature set
6541 *      @mask: mask feature set
6542 *
6543 *      Computes a new feature set after adding a device with feature set
6544 *      @one to the master device with current feature set @all.  Will not
6545 *      enable anything that is off in @mask. Returns the new feature set.
6546 */
6547netdev_features_t netdev_increment_features(netdev_features_t all,
6548        netdev_features_t one, netdev_features_t mask)
6549{
6550        if (mask & NETIF_F_GEN_CSUM)
6551                mask |= NETIF_F_ALL_CSUM;
6552        mask |= NETIF_F_VLAN_CHALLENGED;
6553
6554        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6555        all &= one | ~NETIF_F_ALL_FOR_ALL;
6556
6557        /* If one device supports hw checksumming, set for all. */
6558        if (all & NETIF_F_GEN_CSUM)
6559                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6560
6561        return all;
6562}
6563EXPORT_SYMBOL(netdev_increment_features);
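
/*
 * Illustrative sketch: how an aggregating (master) driver might fold the
 * features of its lower devices together, in the spirit of what bonding
 * does.  "struct my_slave", the slave list and the choice of the master's
 * own features as @mask are simplified, hypothetical details.
 */
struct my_slave {
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t my_compute_features(struct net_device *master,
					      struct list_head *slaves)
{
	struct my_slave *s;
	netdev_features_t all = NETIF_F_ALL_FOR_ALL;

	list_for_each_entry(s, slaves, list)
		all = netdev_increment_features(all, s->dev->features,
						master->features);
	return all;
}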
6564
6565static struct hlist_head *netdev_create_hash(void)
6566{
6567        int i;
6568        struct hlist_head *hash;
6569
6570        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6571        if (hash != NULL)
6572                for (i = 0; i < NETDEV_HASHENTRIES; i++)
6573                        INIT_HLIST_HEAD(&hash[i]);
6574
6575        return hash;
6576}
6577
6578/* Initialize per network namespace state */
6579static int __net_init netdev_init(struct net *net)
6580{
6581        if (net != &init_net)
6582                INIT_LIST_HEAD(&net->dev_base_head);
6583
6584        net->dev_name_head = netdev_create_hash();
6585        if (net->dev_name_head == NULL)
6586                goto err_name;
6587
6588        net->dev_index_head = netdev_create_hash();
6589        if (net->dev_index_head == NULL)
6590                goto err_idx;
6591
6592        return 0;
6593
6594err_idx:
6595        kfree(net->dev_name_head);
6596err_name:
6597        return -ENOMEM;
6598}
6599
6600/**
6601 *      netdev_drivername - network driver for the device
6602 *      @dev: network device
6603 *
6604 *      Determine the name of the network driver for the device.
6605 */
6606const char *netdev_drivername(const struct net_device *dev)
6607{
6608        const struct device_driver *driver;
6609        const struct device *parent;
6610        const char *empty = "";
6611
6612        parent = dev->dev.parent;
6613        if (!parent)
6614                return empty;
6615
6616        driver = parent->driver;
6617        if (driver && driver->name)
6618                return driver->name;
6619        return empty;
6620}
6621
6622static int __netdev_printk(const char *level, const struct net_device *dev,
6623                           struct va_format *vaf)
6624{
6625        int r;
6626
6627        if (dev && dev->dev.parent) {
6628                r = dev_printk_emit(level[1] - '0',
6629                                    dev->dev.parent,
6630                                    "%s %s %s: %pV",
6631                                    dev_driver_string(dev->dev.parent),
6632                                    dev_name(dev->dev.parent),
6633                                    netdev_name(dev), vaf);
6634        } else if (dev) {
6635                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6636        } else {
6637                r = printk("%s(NULL net_device): %pV", level, vaf);
6638        }
6639
6640        return r;
6641}
6642
6643int netdev_printk(const char *level, const struct net_device *dev,
6644                  const char *format, ...)
6645{
6646        struct va_format vaf;
6647        va_list args;
6648        int r;
6649
6650        va_start(args, format);
6651
6652        vaf.fmt = format;
6653        vaf.va = &args;
6654
6655        r = __netdev_printk(level, dev, &vaf);
6656
6657        va_end(args);
6658
6659        return r;
6660}
6661EXPORT_SYMBOL(netdev_printk);
6662
6663#define define_netdev_printk_level(func, level)                 \
6664int func(const struct net_device *dev, const char *fmt, ...)    \
6665{                                                               \
6666        int r;                                                  \
6667        struct va_format vaf;                                   \
6668        va_list args;                                           \
6669                                                                \
6670        va_start(args, fmt);                                    \
6671                                                                \
6672        vaf.fmt = fmt;                                          \
6673        vaf.va = &args;                                         \
6674                                                                \
6675        r = __netdev_printk(level, dev, &vaf);                  \
6676                                                                \
6677        va_end(args);                                           \
6678                                                                \
6679        return r;                                               \
6680}                                                               \
6681EXPORT_SYMBOL(func);
6682
6683define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6684define_netdev_printk_level(netdev_alert, KERN_ALERT);
6685define_netdev_printk_level(netdev_crit, KERN_CRIT);
6686define_netdev_printk_level(netdev_err, KERN_ERR);
6687define_netdev_printk_level(netdev_warn, KERN_WARNING);
6688define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6689define_netdev_printk_level(netdev_info, KERN_INFO);
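
/*
 * Illustrative sketch: the level-specific helpers generated above are used
 * like dev_err()/dev_info(), but also prefix the interface name.  "my_open"
 * is a hypothetical ndo_open implementation.
 */
static int my_open(struct net_device *dev)
{
	if (dev->real_num_tx_queues < dev->num_tx_queues)
		netdev_warn(dev, "only %u of %u tx queues in use\n",
			    dev->real_num_tx_queues, dev->num_tx_queues);

	netdev_info(dev, "opened\n");
	return 0;
}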
6690
6691static void __net_exit netdev_exit(struct net *net)
6692{
6693        kfree(net->dev_name_head);
6694        kfree(net->dev_index_head);
6695}
6696
6697static struct pernet_operations __net_initdata netdev_net_ops = {
6698        .init = netdev_init,
6699        .exit = netdev_exit,
6700};
6701
6702static void __net_exit default_device_exit(struct net *net)
6703{
6704        struct net_device *dev, *aux;
6705        /*
6706         * Push all migratable network devices back to the
6707         * initial network namespace
6708         */
6709        rtnl_lock();
6710        for_each_netdev_safe(net, dev, aux) {
6711                int err;
6712                char fb_name[IFNAMSIZ];
6713
6714                /* Ignore unmovable devices (e.g. loopback) */
6715                if (dev->features & NETIF_F_NETNS_LOCAL)
6716                        continue;
6717
6718                /* Leave virtual devices for the generic cleanup */
6719                if (dev->rtnl_link_ops)
6720                        continue;
6721
6722                /* Push remaining network devices to init_net */
6723                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6724                err = dev_change_net_namespace(dev, &init_net, fb_name);
6725                if (err) {
6726                        pr_emerg("%s: failed to move %s to init_net: %d\n",
6727                                 __func__, dev->name, err);
6728                        BUG();
6729                }
6730        }
6731        rtnl_unlock();
6732}
6733
6734static void __net_exit default_device_exit_batch(struct list_head *net_list)
6735{
6736        /* At exit all network devices must be removed from a network
6737         * namespace.  Do this in the reverse order of registration.
6738         * Do this across as many network namespaces as possible to
6739         * improve batching efficiency.
6740         */
6741        struct net_device *dev;
6742        struct net *net;
6743        LIST_HEAD(dev_kill_list);
6744
6745        rtnl_lock();
6746        list_for_each_entry(net, net_list, exit_list) {
6747                for_each_netdev_reverse(net, dev) {
6748                        if (dev->rtnl_link_ops)
6749                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6750                        else
6751                                unregister_netdevice_queue(dev, &dev_kill_list);
6752                }
6753        }
6754        unregister_netdevice_many(&dev_kill_list);
6755        list_del(&dev_kill_list);
6756        rtnl_unlock();
6757}
6758
6759static struct pernet_operations __net_initdata default_device_ops = {
6760        .exit = default_device_exit,
6761        .exit_batch = default_device_exit_batch,
6762};
6763
6764/*
6765 *      Initialize the DEV module. At boot time this walks the device list and
6766 *      unhooks any devices that fail to initialise (normally hardware not
6767 *      present) and leaves us with a valid list of present and active devices.
6768 *
6769 */
6770
6771/*
6772 *       This is called single-threaded during boot, so there is no need
6773 *       to take the rtnl semaphore.
6774 */
6775static int __init net_dev_init(void)
6776{
6777        int i, rc = -ENOMEM;
6778
6779        BUG_ON(!dev_boot_phase);
6780
6781        if (dev_proc_init())
6782                goto out;
6783
6784        if (netdev_kobject_init())
6785                goto out;
6786
6787        INIT_LIST_HEAD(&ptype_all);
6788        for (i = 0; i < PTYPE_HASH_SIZE; i++)
6789                INIT_LIST_HEAD(&ptype_base[i]);
6790
6791        INIT_LIST_HEAD(&offload_base);
6792
6793        if (register_pernet_subsys(&netdev_net_ops))
6794                goto out;
6795
6796        /*
6797         *      Initialise the packet receive queues.
6798         */
6799
6800        for_each_possible_cpu(i) {
6801                struct softnet_data *sd = &per_cpu(softnet_data, i);
6802
6803                memset(sd, 0, sizeof(*sd));
6804                skb_queue_head_init(&sd->input_pkt_queue);
6805                skb_queue_head_init(&sd->process_queue);
6806                sd->completion_queue = NULL;
6807                INIT_LIST_HEAD(&sd->poll_list);
6808                sd->output_queue = NULL;
6809                sd->output_queue_tailp = &sd->output_queue;
6810#ifdef CONFIG_RPS
6811                sd->csd.func = rps_trigger_softirq;
6812                sd->csd.info = sd;
6813                sd->csd.flags = 0;
6814                sd->cpu = i;
6815#endif
6816
6817                sd->backlog.poll = process_backlog;
6818                sd->backlog.weight = weight_p;
6819                sd->backlog.gro_list = NULL;
6820                sd->backlog.gro_count = 0;
6821        }
6822
6823        dev_boot_phase = 0;
6824
6825        /* The loopback device is special: if any other network device
6826         * is present in a network namespace, the loopback device must
6827         * be present too. Since we now dynamically allocate and free
6828         * the loopback device, ensure this invariant is maintained by
6829         * keeping the loopback device as the first device on the
6830         * list of network devices, so that it is the first device
6831         * that appears and the last network device
6832         * that disappears.
6833         */
6834        if (register_pernet_device(&loopback_net_ops))
6835                goto out;
6836
6837        if (register_pernet_device(&default_device_ops))
6838                goto out;
6839
6840        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6841        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6842
6843        hotcpu_notifier(dev_cpu_callback, 0);
6844        dst_init();
6845        dev_mcast_init();
6846        rc = 0;
6847out:
6848        return rc;
6849}
6850
6851subsys_initcall(net_dev_init);
6852
6853static int __init initialize_hashrnd(void)
6854{
6855        get_random_bytes(&hashrnd, sizeof(hashrnd));
6856        return 0;
6857}
6858
6859late_initcall_sync(initialize_hashrnd);
6860
6861