LXR linux/net/core/dev.c

   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/ethtool.h>
  94#include <linux/notifier.h>
  95#include <linux/skbuff.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/stat.h>
 102#include <linux/if_bridge.h>
 103#include <linux/if_macvlan.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130
 131#include "net-sysfs.h"
 132
/*
 * Instead of increasing this, you should create a hash table.
 * NOTE(review): the user of this limit is outside this section; presumably
 * it caps the number of packets GRO keeps pending at once - confirm at the
 * point of use.
 */
#define MAX_GRO_SKBS 8

/*
 * GRO head-room: MAX_HEADER plus 128 extra bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 138
 139/*
 140 *      The list of packet types we will receive (as opposed to discard)
 141 *      and the routines to invoke.
 142 *
 143 *      Why 16. Because with 16 the only overlap we get on a hash of the
 144 *      low nibble of the protocol value is RARP/SNAP/X.25.
 145 *
 146 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 147 *             sure which should go first, but I bet it won't make much
 148 *             difference if we are running VLANs.  The good news is that
 149 *             this protocol won't be in the list unless compiled in, so
 150 *             the average user (w/out VLANs) will not be adversely affected.
 151 *             --BLG
 152 *
 153 *              0800    IP
 154 *              8100    802.1Q VLAN
 155 *              0001    802.3
 156 *              0002    AX.25
 157 *              0004    802.2
 158 *              8035    RARP
 159 *              0005    SNAP
 160 *              0805    X.25
 161 *              0806    ARP
 162 *              8137    IPX
 163 *              0009    Localtalk
 164 *              86DD    IPv6
 165 */
 166
/*
 * Number of buckets in ptype_base[].  PTYPE_HASH_MASK is derived as
 * SIZE - 1, so the size has to remain a power of two.
 */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Taken (with BHs disabled) around every update of ptype_base[]/ptype_all. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;        /* Taps */
 173
 174/*
 175 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 176 * semaphore.
 177 *
 178 * Pure readers hold dev_base_lock for reading.
 179 *
 180 * Writers must hold the rtnl semaphore while they loop through the
 181 * dev_base_head list, and hold dev_base_lock for writing when they do the
 182 * actual updates.  This allows pure readers to access the list even
 183 * while a writer is preparing to update it.
 184 *
 185 * To put it another way, dev_base_lock is held for writing only to
 186 * protect against pure readers; the rtnl semaphore provides the
 187 * protection against other writers.
 188 *
 189 * See, for example usages, register_netdevice() and
 190 * unregister_netdevice(), which must be called with the rtnl
 191 * semaphore held.
 192 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/*
 * The per-namespace name and ifindex hash tables are indexed with the low
 * NETDEV_HASHBITS bits of the hash, i.e. they have NETDEV_HASHENTRIES buckets.
 */
#define NETDEV_HASHBITS 8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200{
 201        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202        return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207        return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208}
 209
 210/* Device list insertion */
 211static int list_netdevice(struct net_device *dev)
 212{
 213        struct net *net = dev_net(dev);
 214
 215        ASSERT_RTNL();
 216
 217        write_lock_bh(&dev_base_lock);
 218        list_add_tail(&dev->dev_list, &net->dev_base_head);
 219        hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220        hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221        write_unlock_bh(&dev_base_lock);
 222        return 0;
 223}
 224
 225/* Device list removal */
 226static void unlist_netdevice(struct net_device *dev)
 227{
 228        ASSERT_RTNL();
 229
 230        /* Unlink dev from the device chain */
 231        write_lock_bh(&dev_base_lock);
 232        list_del(&dev->dev_list);
 233        hlist_del(&dev->name_hlist);
 234        hlist_del(&dev->index_hlist);
 235        write_unlock_bh(&dev_base_lock);
 236}
 237
/*
 *      Our notifier list
 *
 *      A raw notifier head, private to this file.
 *      NOTE(review): the register/unregister/call sites are outside this
 *      section; presumably they are serialised by RTNL - confirm there.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 243
/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      There is one softnet_data instance per CPU; the per-CPU symbol is
 *      exported.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 251
 252#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] lists the device types that get a lock class of their
 * own.  netdev_lock_pos() searches it linearly and falls back to the LAST
 * entry for any type that is not listed, so ARPHRD_NONE must stay last.
 * The index found here is also used for netdev_lock_name[] and the key
 * arrays, so all of them have to keep the same order and length.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};
 274
/*
 * Lockdep class names, indexed by the position returned from
 * netdev_lock_pos().  Entry N names the device type at netdev_lock_type[N],
 * so the two tables must be kept in the same order.
 */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};
 292
/*
 * One lockdep class key per entry of netdev_lock_type[]: the first array is
 * used for the xmit lock, the second for dev->addr_list_lock.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 295
 296static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 297{
 298        int i;
 299
 300        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 301                if (netdev_lock_type[i] == dev_type)
 302                        return i;
 303        /* the last key is used by default */
 304        return ARRAY_SIZE(netdev_lock_type) - 1;
 305}
 306
 307static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 308                                                 unsigned short dev_type)
 309{
 310        int i;
 311
 312        i = netdev_lock_pos(dev_type);
 313        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 314                                   netdev_lock_name[i]);
 315}
 316
 317static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 318{
 319        int i;
 320
 321        i = netdev_lock_pos(dev->type);
 322        lockdep_set_class_and_name(&dev->addr_list_lock,
 323                                   &netdev_addr_lock_key[i],
 324                                   netdev_lock_name[i]);
 325}
 326#else
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 334#endif
 335
 336/*******************************************************************************
 337
 338                Protocol management and registration routines
 339
 340*******************************************************************************/
 341
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets MUST BE
 *      last in their hash buckets, and the walk over protocol handlers
 *      MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a handler that mangles the packet were first on
 *      the list, it could not tell that the packet is cloned and should
 *      be copied on write, so it would modify the packet in place and
 *      subsequent readers would get a broken packet.
 *                                                      --ANK (980803)
 */
 357
 358/**
 359 *      dev_add_pack - add packet handler
 360 *      @pt: packet type declaration
 361 *
 362 *      Add a protocol handler to the networking stack. The passed &packet_type
 363 *      is linked into kernel lists and may not be freed until it has been
 364 *      removed from the kernel lists.
 365 *
 366 *      This call does not sleep therefore it can not
 367 *      guarantee all CPU's that are in middle of receiving packets
 368 *      will see the new packet type (until the next received packet).
 369 */
 370
 371void dev_add_pack(struct packet_type *pt)
 372{
 373        int hash;
 374
 375        spin_lock_bh(&ptype_lock);
 376        if (pt->type == htons(ETH_P_ALL))
 377                list_add_rcu(&pt->list, &ptype_all);
 378        else {
 379                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 380                list_add_rcu(&pt->list, &ptype_base[hash]);
 381        }
 382        spin_unlock_bh(&ptype_lock);
 383}
 384EXPORT_SYMBOL(dev_add_pack);
 385
 386/**
 387 *      __dev_remove_pack        - remove packet handler
 388 *      @pt: packet type declaration
 389 *
 390 *      Remove a protocol handler that was previously added to the kernel
 391 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 392 *      from the kernel lists and can be freed or reused once this function
 393 *      returns.
 394 *
 395 *      The packet type might still be in use by receivers
 396 *      and must not be freed until after all the CPU's have gone
 397 *      through a quiescent state.
 398 */
 399void __dev_remove_pack(struct packet_type *pt)
 400{
 401        struct list_head *head;
 402        struct packet_type *pt1;
 403
 404        spin_lock_bh(&ptype_lock);
 405
 406        if (pt->type == htons(ETH_P_ALL))
 407                head = &ptype_all;
 408        else
 409                head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 410
 411        list_for_each_entry(pt1, head, list) {
 412                if (pt == pt1) {
 413                        list_del_rcu(&pt->list);
 414                        goto out;
 415                }
 416        }
 417
 418        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 419out:
 420        spin_unlock_bh(&ptype_lock);
 421}
 422EXPORT_SYMBOL(__dev_remove_pack);
 423
 424/**
 425 *      dev_remove_pack  - remove packet handler
 426 *      @pt: packet type declaration
 427 *
 428 *      Remove a protocol handler that was previously added to the kernel
 429 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430 *      from the kernel lists and can be freed or reused once this function
 431 *      returns.
 432 *
 433 *      This call sleeps to guarantee that no CPU is looking at the packet
 434 *      type after return.
 435 */
 436void dev_remove_pack(struct packet_type *pt)
 437{
 438        __dev_remove_pack(pt);
 439
 440        synchronize_net();
 441}
 442EXPORT_SYMBOL(dev_remove_pack);
 443
 444/******************************************************************************
 445
 446                      Device Boot-time Settings Routines
 447
 448*******************************************************************************/
 449
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free by
 * netdev_boot_setup_add() and skipped by netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 452
 453/**
 454 *      netdev_boot_setup_add   - add new setup entry
 455 *      @name: name of the device
 456 *      @map: configured settings for the device
 457 *
 458 *      Adds new setup entry to the dev_boot_setup list.  The function
 459 *      returns 0 on error and 1 on success.  This is a generic routine to
 460 *      all netdevices.
 461 */
 462static int netdev_boot_setup_add(char *name, struct ifmap *map)
 463{
 464        struct netdev_boot_setup *s;
 465        int i;
 466
 467        s = dev_boot_setup;
 468        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 469                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 470                        memset(s[i].name, 0, sizeof(s[i].name));
 471                        strlcpy(s[i].name, name, IFNAMSIZ);
 472                        memcpy(&s[i].map, map, sizeof(s[i].map));
 473                        break;
 474                }
 475        }
 476
 477        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 478}
 479
 480/**
 481 *      netdev_boot_setup_check - check boot time settings
 482 *      @dev: the netdevice
 483 *
 484 *      Check boot time settings for the device.
 485 *      The found settings are set for the device to be used
 486 *      later in the device probing.
 487 *      Returns 0 if no settings found, 1 if they are.
 488 */
 489int netdev_boot_setup_check(struct net_device *dev)
 490{
 491        struct netdev_boot_setup *s = dev_boot_setup;
 492        int i;
 493
 494        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 495                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 496                    !strcmp(dev->name, s[i].name)) {
 497                        dev->irq        = s[i].map.irq;
 498                        dev->base_addr  = s[i].map.base_addr;
 499                        dev->mem_start  = s[i].map.mem_start;
 500                        dev->mem_end    = s[i].map.mem_end;
 501                        return 1;
 502                }
 503        }
 504        return 0;
 505}
 506EXPORT_SYMBOL(netdev_boot_setup_check);
 507
 508
 509/**
 510 *      netdev_boot_base        - get address from boot time settings
 511 *      @prefix: prefix for network device
 512 *      @unit: id for network device
 513 *
 514 *      Check boot time settings for the base address of device.
 515 *      The found settings are set for the device to be used
 516 *      later in the device probing.
 517 *      Returns 0 if no settings found.
 518 */
 519unsigned long netdev_boot_base(const char *prefix, int unit)
 520{
 521        const struct netdev_boot_setup *s = dev_boot_setup;
 522        char name[IFNAMSIZ];
 523        int i;
 524
 525        sprintf(name, "%s%d", prefix, unit);
 526
 527        /*
 528         * If device already registered then return base of 1
 529         * to indicate not to probe for this interface
 530         */
 531        if (__dev_get_by_name(&init_net, name))
 532                return 1;
 533
 534        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                if (!strcmp(name, s[i].name))
 536                        return s[i].map.base_addr;
 537        return 0;
 538}
 539
 540/*
 541 * Saves at boot time configured settings for any netdevice.
 542 */
 543int __init netdev_boot_setup(char *str)
 544{
 545        int ints[5];
 546        struct ifmap map;
 547
 548        str = get_options(str, ARRAY_SIZE(ints), ints);
 549        if (!str || !*str)
 550                return 0;
 551
 552        /* Save settings */
 553        memset(&map, 0, sizeof(map));
 554        if (ints[0] > 0)
 555                map.irq = ints[1];
 556        if (ints[0] > 1)
 557                map.base_addr = ints[2];
 558        if (ints[0] > 2)
 559                map.mem_start = ints[3];
 560        if (ints[0] > 3)
 561                map.mem_end = ints[4];
 562
 563        /* Add new entry to the list */
 564        return netdev_boot_setup_add(str, &map);
 565}
 566
 567__setup("netdev=", netdev_boot_setup);
 568
 569/*******************************************************************************
 570
 571                            Device Interface Subroutines
 572
 573*******************************************************************************/
 574
 575/**
 576 *      __dev_get_by_name       - find a device by its name
 577 *      @net: the applicable net namespace
 578 *      @name: name to find
 579 *
 580 *      Find an interface by name. Must be called under RTNL semaphore
 581 *      or @dev_base_lock. If the name is found a pointer to the device
 582 *      is returned. If the name is not found then %NULL is returned. The
 583 *      reference counters are not incremented so the caller must be
 584 *      careful with locks.
 585 */
 586
 587struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588{
 589        struct hlist_node *p;
 590
 591        hlist_for_each(p, dev_name_hash(net, name)) {
 592                struct net_device *dev
 593                        = hlist_entry(p, struct net_device, name_hlist);
 594                if (!strncmp(dev->name, name, IFNAMSIZ))
 595                        return dev;
 596        }
 597        return NULL;
 598}
 599EXPORT_SYMBOL(__dev_get_by_name);
 600
 601/**
 602 *      dev_get_by_name         - find a device by its name
 603 *      @net: the applicable net namespace
 604 *      @name: name to find
 605 *
 606 *      Find an interface by name. This can be called from any
 607 *      context and does its own locking. The returned handle has
 608 *      the usage count incremented and the caller must use dev_put() to
 609 *      release it when it is no longer needed. %NULL is returned if no
 610 *      matching device is found.
 611 */
 612
 613struct net_device *dev_get_by_name(struct net *net, const char *name)
 614{
 615        struct net_device *dev;
 616
 617        read_lock(&dev_base_lock);
 618        dev = __dev_get_by_name(net, name);
 619        if (dev)
 620                dev_hold(dev);
 621        read_unlock(&dev_base_lock);
 622        return dev;
 623}
 624EXPORT_SYMBOL(dev_get_by_name);
 625
 626/**
 627 *      __dev_get_by_index - find a device by its ifindex
 628 *      @net: the applicable net namespace
 629 *      @ifindex: index of device
 630 *
 631 *      Search for an interface by index. Returns %NULL if the device
 632 *      is not found or a pointer to the device. The device has not
 633 *      had its reference counter increased so the caller must be careful
 634 *      about locking. The caller must hold either the RTNL semaphore
 635 *      or @dev_base_lock.
 636 */
 637
 638struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 639{
 640        struct hlist_node *p;
 641
 642        hlist_for_each(p, dev_index_hash(net, ifindex)) {
 643                struct net_device *dev
 644                        = hlist_entry(p, struct net_device, index_hlist);
 645                if (dev->ifindex == ifindex)
 646                        return dev;
 647        }
 648        return NULL;
 649}
 650EXPORT_SYMBOL(__dev_get_by_index);
 651
 652
 653/**
 654 *      dev_get_by_index - find a device by its ifindex
 655 *      @net: the applicable net namespace
 656 *      @ifindex: index of device
 657 *
 658 *      Search for an interface by index. Returns NULL if the device
 659 *      is not found or a pointer to the device. The device returned has
 660 *      had a reference added and the pointer is safe until the user calls
 661 *      dev_put to indicate they have finished with it.
 662 */
 663
 664struct net_device *dev_get_by_index(struct net *net, int ifindex)
 665{
 666        struct net_device *dev;
 667
 668        read_lock(&dev_base_lock);
 669        dev = __dev_get_by_index(net, ifindex);
 670        if (dev)
 671                dev_hold(dev);
 672        read_unlock(&dev_base_lock);
 673        return dev;
 674}
 675EXPORT_SYMBOL(dev_get_by_index);
 676
 677/**
 678 *      dev_getbyhwaddr - find a device by its hardware address
 679 *      @net: the applicable net namespace
 680 *      @type: media type of device
 681 *      @ha: hardware address
 682 *
 683 *      Search for an interface by MAC address. Returns NULL if the device
 684 *      is not found or a pointer to the device. The caller must hold the
 685 *      rtnl semaphore. The returned device has not had its ref count increased
 686 *      and the caller must therefore be careful about locking
 687 *
 688 *      BUGS:
 689 *      If the API was consistent this would be __dev_get_by_hwaddr
 690 */
 691
 692struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 693{
 694        struct net_device *dev;
 695
 696        ASSERT_RTNL();
 697
 698        for_each_netdev(net, dev)
 699                if (dev->type == type &&
 700                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 701                        return dev;
 702
 703        return NULL;
 704}
 705EXPORT_SYMBOL(dev_getbyhwaddr);
 706
 707struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 708{
 709        struct net_device *dev;
 710
 711        ASSERT_RTNL();
 712        for_each_netdev(net, dev)
 713                if (dev->type == type)
 714                        return dev;
 715
 716        return NULL;
 717}
 718EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 719
 720struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 721{
 722        struct net_device *dev;
 723
 724        rtnl_lock();
 725        dev = __dev_getfirstbyhwtype(net, type);
 726        if (dev)
 727                dev_hold(dev);
 728        rtnl_unlock();
 729        return dev;
 730}
 731EXPORT_SYMBOL(dev_getfirstbyhwtype);
 732
 733/**
 734 *      dev_get_by_flags - find any device with given flags
 735 *      @net: the applicable net namespace
 736 *      @if_flags: IFF_* values
 737 *      @mask: bitmask of bits in if_flags to check
 738 *
 739 *      Search for any interface with the given flags. Returns NULL if a device
 740 *      is not found or a pointer to the device. The device returned has
 741 *      had a reference added and the pointer is safe until the user calls
 742 *      dev_put to indicate they have finished with it.
 743 */
 744
 745struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 746                                    unsigned short mask)
 747{
 748        struct net_device *dev, *ret;
 749
 750        ret = NULL;
 751        read_lock(&dev_base_lock);
 752        for_each_netdev(net, dev) {
 753                if (((dev->flags ^ if_flags) & mask) == 0) {
 754                        dev_hold(dev);
 755                        ret = dev;
 756                        break;
 757                }
 758        }
 759        read_unlock(&dev_base_lock);
 760        return ret;
 761}
 762EXPORT_SYMBOL(dev_get_by_flags);
 763
 764/**
 765 *      dev_valid_name - check if name is okay for network device
 766 *      @name: name string
 767 *
 768 *      Network device names need to be valid file names to
 769 *      to allow sysfs to work.  We also disallow any kind of
 770 *      whitespace.
 771 */
 772int dev_valid_name(const char *name)
 773{
 774        if (*name == '\0')
 775                return 0;
 776        if (strlen(name) >= IFNAMSIZ)
 777                return 0;
 778        if (!strcmp(name, ".") || !strcmp(name, ".."))
 779                return 0;
 780
 781        while (*name) {
 782                if (*name == '/' || isspace(*name))
 783                        return 0;
 784                name++;
 785        }
 786        return 1;
 787}
 788EXPORT_SYMBOL(dev_valid_name);
 789
 790/**
 791 *      __dev_alloc_name - allocate a name for a device
 792 *      @net: network namespace to allocate the device name in
 793 *      @name: name format string
 794 *      @buf:  scratch buffer and result name string
 795 *
 796 *      Passed a format string - eg "lt%d" it will try and find a suitable
 797 *      id. It scans list of devices to build up a free map, then chooses
 798 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 799 *      while allocating the name and adding the device in order to avoid
 800 *      duplicates.
 801 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 802 *      Returns the number of the unit assigned or a negative errno code.
 803 */
 804
 805static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 806{
 807        int i = 0;
 808        const char *p;
 809        const int max_netdevices = 8*PAGE_SIZE;
 810        unsigned long *inuse;
 811        struct net_device *d;
 812
 813        p = strnchr(name, IFNAMSIZ-1, '%');
 814        if (p) {
 815                /*
 816                 * Verify the string as this thing may have come from
 817                 * the user.  There must be either one "%d" and no other "%"
 818                 * characters.
 819                 */
 820                if (p[1] != 'd' || strchr(p + 2, '%'))
 821                        return -EINVAL;
 822
 823                /* Use one page as a bit array of possible slots */
 824                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 825                if (!inuse)
 826                        return -ENOMEM;
 827
 828                for_each_netdev(net, d) {
 829                        if (!sscanf(d->name, name, &i))
 830                                continue;
 831                        if (i < 0 || i >= max_netdevices)
 832                                continue;
 833
 834                        /*  avoid cases where sscanf is not exact inverse of printf */
 835                        snprintf(buf, IFNAMSIZ, name, i);
 836                        if (!strncmp(buf, d->name, IFNAMSIZ))
 837                                set_bit(i, inuse);
 838                }
 839
 840                i = find_first_zero_bit(inuse, max_netdevices);
 841                free_page((unsigned long) inuse);
 842        }
 843
 844        snprintf(buf, IFNAMSIZ, name, i);
 845        if (!__dev_get_by_name(net, buf))
 846                return i;
 847
 848        /* It is possible to run out of possible slots
 849         * when the name is long and there isn't enough space left
 850         * for the digits, or if all bits are used.
 851         */
 852        return -ENFILE;
 853}
 854
 855/**
 856 *      dev_alloc_name - allocate a name for a device
 857 *      @dev: device
 858 *      @name: name format string
 859 *
 860 *      Passed a format string - eg "lt%d" it will try and find a suitable
 861 *      id. It scans list of devices to build up a free map, then chooses
 862 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863 *      while allocating the name and adding the device in order to avoid
 864 *      duplicates.
 865 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866 *      Returns the number of the unit assigned or a negative errno code.
 867 */
 868
 869int dev_alloc_name(struct net_device *dev, const char *name)
 870{
 871        char buf[IFNAMSIZ];
 872        struct net *net;
 873        int ret;
 874
 875        BUG_ON(!dev_net(dev));
 876        net = dev_net(dev);
 877        ret = __dev_alloc_name(net, name, buf);
 878        if (ret >= 0)
 879                strlcpy(dev->name, buf, IFNAMSIZ);
 880        return ret;
 881}
 882EXPORT_SYMBOL(dev_alloc_name);
 883
 884
 885/**
 886 *      dev_change_name - change name of a device
 887 *      @dev: device
 888 *      @newname: name (or format string) must be at least IFNAMSIZ
 889 *
 890 *      Change name of a device, can pass format strings "eth%d".
 891 *      for wildcarding.
 892 */
 893int dev_change_name(struct net_device *dev, const char *newname)
 894{
 895        char oldname[IFNAMSIZ];
 896        int err = 0;
 897        int ret;
 898        struct net *net;
 899
 900        ASSERT_RTNL();
 901        BUG_ON(!dev_net(dev));
 902
 903        net = dev_net(dev);
 904        if (dev->flags & IFF_UP)
 905                return -EBUSY;
 906
 907        if (!dev_valid_name(newname))
 908                return -EINVAL;
 909
 910        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 911                return 0;
 912
 913        memcpy(oldname, dev->name, IFNAMSIZ);
 914
 915        if (strchr(newname, '%')) {
 916                err = dev_alloc_name(dev, newname);
 917                if (err < 0)
 918                        return err;
 919        } else if (__dev_get_by_name(net, newname))
 920                return -EEXIST;
 921        else
 922                strlcpy(dev->name, newname, IFNAMSIZ);
 923
 924rollback:
 925        /* For now only devices in the initial network namespace
 926         * are in sysfs.
 927         */
 928        if (net == &init_net) {
 929                ret = device_rename(&dev->dev, dev->name);
 930                if (ret) {
 931                        memcpy(dev->name, oldname, IFNAMSIZ);
 932                        return ret;
 933                }
 934        }
 935
 936        write_lock_bh(&dev_base_lock);
 937        hlist_del(&dev->name_hlist);
 938        hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 939        write_unlock_bh(&dev_base_lock);
 940
 941        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 942        ret = notifier_to_errno(ret);
 943
 944        if (ret) {
 945                /* err >= 0 after dev_alloc_name() or stores the first errno */
 946                if (err >= 0) {
 947                        err = ret;
 948                        memcpy(dev->name, oldname, IFNAMSIZ);
 949                        goto rollback;
 950                } else {
 951                        printk(KERN_ERR
 952                               "%s: name change rollback failed: %d.\n",
 953                               dev->name, ret);
 954                }
 955        }
 956
 957        return err;
 958}
 959
 960/**
 961 *      dev_set_alias - change ifalias of a device
 962 *      @dev: device
 963 *      @alias: name up to IFALIASZ
 964 *      @len: limit of bytes to copy from info
 965 *
 966 *      Set ifalias for a device,
 967 */
 968int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 969{
 970        ASSERT_RTNL();
 971
 972        if (len >= IFALIASZ)
 973                return -EINVAL;
 974
 975        if (!len) {
 976                if (dev->ifalias) {
 977                        kfree(dev->ifalias);
 978                        dev->ifalias = NULL;
 979                }
 980                return 0;
 981        }
 982
 983        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
 984        if (!dev->ifalias)
 985                return -ENOMEM;
 986
 987        strlcpy(dev->ifalias, alias, len+1);
 988        return len;
 989}
 990
 991
 992/**
 993 *      netdev_features_change - device changes features
 994 *      @dev: device to cause notification
 995 *
 996 *      Called to indicate a device has changed features.
 997 */
 998void netdev_features_change(struct net_device *dev)
 999{
1000        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);

1001}
1002EXPORT_SYMBOL(netdev_features_change);
1003
1004/**
1005 *      netdev_state_change - device changes state
1006 *      @dev: device to cause notification
1007 *
1008 *      Called to indicate a device has changed state. This function calls
1009 *      the notifier chains for netdev_chain and sends a NEWLINK message
1010 *      to the routing socket.
1011 */
1012void netdev_state_change(struct net_device *dev)
1013{
1014        if (dev->flags & IFF_UP) {
1015                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1016                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1017        }
1018}
1019EXPORT_SYMBOL(netdev_state_change);
1020
/*
 * Forward @event for @dev to every registered netdev notifier; a thin
 * exported wrapper around call_netdevice_notifiers().
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1026
1027/**
1028 *      dev_load        - load a network module
1029 *      @net: the applicable net namespace
1030 *      @name: name of interface
1031 *
1032 *      If a network interface is not present and the process has suitable
1033 *      privileges this function loads the module. If module loading is not
1034 *      available in this kernel then it becomes a nop.
1035 */
1036
1037void dev_load(struct net *net, const char *name)
1038{
1039        struct net_device *dev;
1040
1041        read_lock(&dev_base_lock);
1042        dev = __dev_get_by_name(net, name);
1043        read_unlock(&dev_base_lock);
1044
1045        if (!dev && capable(CAP_NET_ADMIN))
1046                request_module("%s", name);
1047}
1048EXPORT_SYMBOL(dev_load);
1049
1050/**
1051 *      dev_open        - prepare an interface for use.
1052 *      @dev:   device to open
1053 *
1054 *      Takes a device from down to up state. The device's private open
1055 *      function is invoked and then the multicast lists are loaded. Finally
1056 *      the device is moved into the up state and a %NETDEV_UP message is
1057 *      sent to the netdev notifier chain.
1058 *
1059 *      Calling this function on an active interface is a nop. On a failure
1060 *      a negative errno code is returned.
1061 */
1062int dev_open(struct net_device *dev)
1063{
1064        const struct net_device_ops *ops = dev->netdev_ops;
1065        int ret;
1066
1067        ASSERT_RTNL();
1068
1069        /*
1070         *      Is it already up?
1071         */
1072
1073        if (dev->flags & IFF_UP)
1074                return 0;
1075
1076        /*
1077         *      Is it even present?
1078         */
1079        if (!netif_device_present(dev))
1080                return -ENODEV;
1081
1082        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1083        ret = notifier_to_errno(ret);
1084        if (ret)
1085                return ret;
1086
1087        /*
1088         *      Call device private open method
1089         */
1090        set_bit(__LINK_STATE_START, &dev->state);
1091
1092        if (ops->ndo_validate_addr)
1093                ret = ops->ndo_validate_addr(dev);
1094
1095        if (!ret && ops->ndo_open)
1096                ret = ops->ndo_open(dev);
1097
1098        /*
1099         *      If it went open OK then:
1100         */
1101
1102        if (ret)
1103                clear_bit(__LINK_STATE_START, &dev->state);
1104        else {
1105                /*
1106                 *      Set the flags.
1107                 */
1108                dev->flags |= IFF_UP;
1109
1110                /*
1111                 *      Enable NET_DMA
1112                 */
1113                net_dmaengine_get();
1114
1115                /*
1116                 *      Initialize multicasting status
1117                 */
1118                dev_set_rx_mode(dev);
1119
1120                /*
1121                 *      Wakeup transmit queue engine
1122                 */
1123                dev_activate(dev);
1124
1125                /*
1126                 *      ... and announce new interface.
1127                 */
1128                call_netdevice_notifiers(NETDEV_UP, dev);
1129        }
1130
1131        return ret;
1132}
1133EXPORT_SYMBOL(dev_open);
1134
1135/**
1136 *      dev_close - shutdown an interface.
1137 *      @dev: device to shutdown
1138 *
1139 *      This function moves an active device into down state. A
1140 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1141 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1142 *      chain.
1143 */
1144int dev_close(struct net_device *dev)
1145{
1146        const struct net_device_ops *ops = dev->netdev_ops;
1147        ASSERT_RTNL();
1148
1149        might_sleep();
1150
1151        if (!(dev->flags & IFF_UP))
1152                return 0;
1153
1154        /*
1155         *      Tell people we are going down, so that they can
1156         *      prepare to death, when device is still operating.
1157         */
1158        call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1159
1160        clear_bit(__LINK_STATE_START, &dev->state);
1161
1162        /* Synchronize to scheduled poll. We cannot touch poll list,
1163         * it can be even on different cpu. So just clear netif_running().
1164         *
1165         * dev->stop() will invoke napi_disable() on all of it's
1166         * napi_struct instances on this device.
1167         */
1168        smp_mb__after_clear_bit(); /* Commit netif_running(). */
1169
1170        dev_deactivate(dev);
1171
1172        /*
1173         *      Call the device specific close. This cannot fail.
1174         *      Only if device is UP
1175         *
1176         *      We allow it to be called even after a DETACH hot-plug
1177         *      event.
1178         */
1179        if (ops->ndo_stop)
1180                ops->ndo_stop(dev);
1181
1182        /*
1183         *      Device is now down.
1184         */
1185
1186        dev->flags &= ~IFF_UP;
1187
1188        /*
1189         * Tell people we are down
1190         */
1191        call_netdevice_notifiers(NETDEV_DOWN, dev);
1192
1193        /*
1194         *      Shutdown NET_DMA
1195         */
1196        net_dmaengine_put();
1197
1198        return 0;
1199}
1200EXPORT_SYMBOL(dev_close);
1201
1202
1203/**
1204 *      dev_disable_lro - disable Large Receive Offload on a device
1205 *      @dev: device
1206 *
1207 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1208 *      called under RTNL.  This is needed if received packets may be
1209 *      forwarded to another interface.
1210 */
1211void dev_disable_lro(struct net_device *dev)
1212{
1213        if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1214            dev->ethtool_ops->set_flags) {
1215                u32 flags = dev->ethtool_ops->get_flags(dev);
1216                if (flags & ETH_FLAG_LRO) {
1217                        flags &= ~ETH_FLAG_LRO;
1218                        dev->ethtool_ops->set_flags(dev, flags);
1219                }
1220        }
1221        WARN_ON(dev->features & NETIF_F_LRO);
1222}
1223EXPORT_SYMBOL(dev_disable_lro);
1224
1225
/*
 * While nonzero, register_netdevice_notifier() does not replay
 * NETDEV_REGISTER/NETDEV_UP events for existing devices to a newly
 * registered notifier.  Starts out as 1; it is not cleared in this part
 * of the file (presumably reset once boot-time init is done - confirm).
 */
static int dev_boot_phase = 1;
1227
1228/*
1229 *      Device change register/unregister. These are not inline or static
1230 *      as we export them to the world.
1231 */
1232
1233/**
1234 *      register_netdevice_notifier - register a network notifier block
1235 *      @nb: notifier
1236 *
1237 *      Register a notifier to be called when network device events occur.
1238 *      The notifier passed is linked into the kernel structures and must
1239 *      not be reused until it has been unregistered. A negative errno code
1240 *      is returned on a failure.
1241 *
1242 *      When registered all registration and up events are replayed
1243 *      to the new notifier to allow device to have a race free
1244 *      view of the network device list.
1245 */
1246
1247int register_netdevice_notifier(struct notifier_block *nb)
1248{
1249        struct net_device *dev;
1250        struct net_device *last;
1251        struct net *net;
1252        int err;
1253
1254        rtnl_lock();
1255        err = raw_notifier_chain_register(&netdev_chain, nb);
1256        if (err)
1257                goto unlock;
1258        if (dev_boot_phase)
1259                goto unlock;
1260        for_each_net(net) {
1261                for_each_netdev(net, dev) {
1262                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1263                        err = notifier_to_errno(err);
1264                        if (err)
1265                                goto rollback;
1266
1267                        if (!(dev->flags & IFF_UP))
1268                                continue;
1269
1270                        nb->notifier_call(nb, NETDEV_UP, dev);
1271                }
1272        }
1273
1274unlock:
1275        rtnl_unlock();
1276        return err;
1277
1278rollback:
1279        last = dev;
1280        for_each_net(net) {
1281                for_each_netdev(net, dev) {
1282                        if (dev == last)
1283                                break;
1284
1285                        if (dev->flags & IFF_UP) {
1286                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1287                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1288                        }
1289                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1290                }
1291        }
1292
1293        raw_notifier_chain_unregister(&netdev_chain, nb);
1294        goto unlock;
1295}
1296EXPORT_SYMBOL(register_netdevice_notifier);
1297
1298/**
1299 *      unregister_netdevice_notifier - unregister a network notifier block
1300 *      @nb: notifier
1301 *
1302 *      Unregister a notifier previously registered by
1303 *      register_netdevice_notifier(). The notifier is unlinked into the
1304 *      kernel structures and may then be reused. A negative errno code
1305 *      is returned on a failure.
1306 */
1307
1308int unregister_netdevice_notifier(struct notifier_block *nb)
1309{
1310        int err;
1311
1312        rtnl_lock();
1313        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1314        rtnl_unlock();
1315        return err;
1316}
1317EXPORT_SYMBOL(unregister_netdevice_notifier);
1318
1319/**
1320 *      call_netdevice_notifiers - call all network notifier blocks
1321 *      @val: value passed unmodified to notifier function
1322 *      @dev: net_device pointer passed unmodified to notifier function
1323 *
1324 *      Call all network notifier blocks.  Parameters and return value
1325 *      are as for raw_notifier_call_chain().
1326 */
1327
1328int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1329{
1330        return raw_notifier_call_chain(&netdev_chain, val, dev);
1331}
1332
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp(); net_timestamp() only stamps an skb while the
 * counter is nonzero.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1335
1336void net_enable_timestamp(void)
1337{
1338        atomic_inc(&netstamp_needed);
1339}
1340EXPORT_SYMBOL(net_enable_timestamp);
1341
1342void net_disable_timestamp(void)
1343{
1344        atomic_dec(&netstamp_needed);
1345}
1346EXPORT_SYMBOL(net_disable_timestamp);
1347
1348static inline void net_timestamp(struct sk_buff *skb)
1349{
1350        if (atomic_read(&netstamp_needed))
1351                __net_timestamp(skb);
1352        else
1353                skb->tstamp.tv64 = 0;
1354}
1355
1356/*
1357 *      Support routine. Sends outgoing frames to any network
1358 *      taps currently in use.
1359 */
1360
1361static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1362{
1363        struct packet_type *ptype;
1364
1365#ifdef CONFIG_NET_CLS_ACT
1366        if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1367                net_timestamp(skb);
1368#else
1369        net_timestamp(skb);
1370#endif
1371
1372        rcu_read_lock();
1373        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1374                /* Never send packets back to the socket
1375                 * they originated from - MvS (miquels@drinkel.ow.org)
1376                 */
1377                if ((ptype->dev == dev || !ptype->dev) &&
1378                    (ptype->af_packet_priv == NULL ||
1379                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1380                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1381                        if (!skb2)
1382                                break;
1383
1384                        /* skb->nh should be correctly
1385                           set by sender, so that the second statement is
1386                           just protection against buggy protocols.
1387                         */
1388                        skb_reset_mac_header(skb2);
1389
1390                        if (skb_network_header(skb2) < skb2->data ||
1391                            skb2->network_header > skb2->tail) {
1392                                if (net_ratelimit())
1393                                        printk(KERN_CRIT "protocol %04x is "
1394                                               "buggy, dev %s\n",
1395                                               skb2->protocol, dev->name);
1396                                skb_reset_network_header(skb2);
1397                        }
1398
1399                        skb2->transport_header = skb2->network_header;
1400                        skb2->pkt_type = PACKET_OUTGOING;
1401                        ptype->func(skb2, skb->dev, ptype, skb->dev);
1402                }
1403        }
1404        rcu_read_unlock();
1405}
1406
1407
1408static inline void __netif_reschedule(struct Qdisc *q)
1409{
1410        struct softnet_data *sd;
1411        unsigned long flags;
1412
1413        local_irq_save(flags);
1414        sd = &__get_cpu_var(softnet_data);
1415        q->next_sched = sd->output_queue;
1416        sd->output_queue = q;
1417        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1418        local_irq_restore(flags);
1419}
1420
1421void __netif_schedule(struct Qdisc *q)
1422{
1423        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1424                __netif_reschedule(q);
1425}
1426EXPORT_SYMBOL(__netif_schedule);
1427
1428void dev_kfree_skb_irq(struct sk_buff *skb)
1429{
1430        if (atomic_dec_and_test(&skb->users)) {
1431                struct softnet_data *sd;
1432                unsigned long flags;
1433
1434                local_irq_save(flags);
1435                sd = &__get_cpu_var(softnet_data);
1436                skb->next = sd->completion_queue;
1437                sd->completion_queue = skb;
1438                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1439                local_irq_restore(flags);
1440        }
1441}
1442EXPORT_SYMBOL(dev_kfree_skb_irq);
1443
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() in hard irq
 * context or with interrupts disabled, and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (in_irq() || irqs_disabled()) {
                dev_kfree_skb_irq(skb);
                return;
        }

        dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1452
1453
1454/**
1455 * netif_device_detach - mark device as removed
1456 * @dev: network device
1457 *
1458 * Mark device as removed from system and therefore no longer available.
1459 */
1460void netif_device_detach(struct net_device *dev)
1461{
1462        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1463            netif_running(dev)) {
1464                netif_tx_stop_all_queues(dev);
1465        }
1466}
1467EXPORT_SYMBOL(netif_device_detach);
1468
1469/**
1470 * netif_device_attach - mark device as attached
1471 * @dev: network device
1472 *
1473 * Mark device as attached from system and restart if needed.
1474 */
1475void netif_device_attach(struct net_device *dev)
1476{
1477        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1478            netif_running(dev)) {
1479                netif_tx_wake_all_queues(dev);
1480                __netdev_watchdog_up(dev);
1481        }
1482}
1483EXPORT_SYMBOL(netif_device_attach);
1484
1485static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1486{
1487        return ((features & NETIF_F_GEN_CSUM) ||
1488                ((features & NETIF_F_IP_CSUM) &&
1489                 protocol == htons(ETH_P_IP)) ||
1490                ((features & NETIF_F_IPV6_CSUM) &&
1491                 protocol == htons(ETH_P_IPV6)) ||
1492                ((features & NETIF_F_FCOE_CRC) &&
1493                 protocol == htons(ETH_P_FCOE)));
1494}
1495
1496static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1497{
1498        if (can_checksum_protocol(dev->features, skb->protocol))
1499                return true;
1500
1501        if (skb->protocol == htons(ETH_P_8021Q)) {
1502                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1503                if (can_checksum_protocol(dev->features & dev->vlan_features,
1504                                          veh->h_vlan_encapsulated_proto))
1505                        return true;
1506        }
1507
1508        return false;
1509}
1510
1511/*
1512 * Invalidate hardware checksum when packet is to be mangled, and
1513 * complete checksum manually on outgoing path.
1514 */
1515int skb_checksum_help(struct sk_buff *skb)
1516{
1517        __wsum csum;
1518        int ret = 0, offset;
1519
1520        if (skb->ip_summed == CHECKSUM_COMPLETE)
1521                goto out_set_summed;
1522
1523        if (unlikely(skb_shinfo(skb)->gso_size)) {
1524                /* Let GSO fix up the checksum. */
1525                goto out_set_summed;
1526        }
1527
1528        offset = skb->csum_start - skb_headroom(skb);
1529        BUG_ON(offset >= skb_headlen(skb));
1530        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1531
1532        offset += skb->csum_offset;
1533        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1534
1535        if (skb_cloned(skb) &&
1536            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1537                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1538                if (ret)
1539                        goto out;
1540        }
1541
1542        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1543out_set_summed:
1544        skb->ip_summed = CHECKSUM_NONE;
1545out:
1546        return ret;
1547}
1548EXPORT_SYMBOL(skb_checksum_help);
1549
1550/**
1551 *      skb_gso_segment - Perform segmentation on skb.
1552 *      @skb: buffer to segment
1553 *      @features: features for the output path (see dev->features)
1554 *
1555 *      This function segments the given skb and returns a list of segments.
1556 *
1557 *      It may return NULL if the skb requires no segmentation.  This is
1558 *      only possible when GSO is used for verifying header integrity.
1559 */
1560struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1561{
1562        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1563        struct packet_type *ptype;
1564        __be16 type = skb->protocol;
1565        int err;
1566
1567        skb_reset_mac_header(skb);
1568        skb->mac_len = skb->network_header - skb->mac_header;
1569        __skb_pull(skb, skb->mac_len);
1570
1571        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1572                struct net_device *dev = skb->dev;
1573                struct ethtool_drvinfo info = {};
1574
1575                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1576                        dev->ethtool_ops->get_drvinfo(dev, &info);
1577
1578                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1579                        "ip_summed=%d",
1580                     info.driver, dev ? dev->features : 0L,
1581                     skb->sk ? skb->sk->sk_route_caps : 0L,
1582                     skb->len, skb->data_len, skb->ip_summed);
1583
1584                if (skb_header_cloned(skb) &&
1585                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1586                        return ERR_PTR(err);
1587        }
1588
1589        rcu_read_lock();
1590        list_for_each_entry_rcu(ptype,
1591                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1592                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1593                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1594                                err = ptype->gso_send_check(skb);
1595                                segs = ERR_PTR(err);
1596                                if (err || skb_gso_ok(skb, features))
1597                                        break;
1598                                __skb_push(skb, (skb->data -
1599                                                 skb_network_header(skb)));
1600                        }
1601                        segs = ptype->gso_segment(skb, features);
1602                        break;
1603                }
1604        }
1605        rcu_read_unlock();
1606
1607        __skb_push(skb, skb->data - skb_mac_header(skb));
1608
1609        return segs;
1610}
1611EXPORT_SYMBOL(skb_gso_segment);
1612
/*
 * Report a hardware receive checksum error: rate-limited log line naming
 * the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        printk(KERN_ERR "%s: hw csum failure.\n",
                dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1625
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int frag;

        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                        return 1;
        }

#endif
        return 0;
}
1646
/*
 * Per-skb state kept in skb->cb while software GSO segments hang off
 * skb->next.  @destructor holds the skb's original destructor: it is
 * saved by dev_gso_segment(), called from dev_gso_skb_destructor() and
 * put back by dev_hard_start_xmit() once all segments were sent.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1652
1653static void dev_gso_skb_destructor(struct sk_buff *skb)
1654{
1655        struct dev_gso_cb *cb;
1656
1657        do {
1658                struct sk_buff *nskb = skb->next;
1659
1660                skb->next = nskb->next;
1661                nskb->next = NULL;
1662                kfree_skb(nskb);
1663        } while (skb->next);
1664
1665        cb = DEV_GSO_CB(skb);
1666        if (cb->destructor)
1667                cb->destructor(skb);
1668}
1669
1670/**
1671 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1672 *      @skb: buffer to segment
1673 *
1674 *      This function segments the given skb and stores the list of segments
1675 *      in skb->next.
1676 */
1677static int dev_gso_segment(struct sk_buff *skb)
1678{
1679        struct net_device *dev = skb->dev;
1680        struct sk_buff *segs;
1681        int features = dev->features & ~(illegal_highdma(dev, skb) ?
1682                                         NETIF_F_SG : 0);
1683
1684        segs = skb_gso_segment(skb, features);
1685
1686        /* Verifying header integrity only. */
1687        if (!segs)
1688                return 0;
1689
1690        if (IS_ERR(segs))
1691                return PTR_ERR(segs);
1692
1693        skb->next = segs;
1694        DEV_GSO_CB(skb)->destructor = skb->destructor;
1695        skb->destructor = dev_gso_skb_destructor;
1696
1697        return 0;
1698}
1699
/*
 * Hand @skb to the driver of @dev for transmission on @txq.
 *
 * An skb without chained segments is first shown to the network taps (if
 * any are registered), segmented in software when netif_needs_gso() says
 * so, and otherwise passed to ndo_start_xmit() as is; the driver's return
 * code is returned.  An skb carrying a segment list on skb->next has its
 * segments transmitted one by one.
 *
 * Returns the driver's code for a failed transmit, NETDEV_TX_BUSY when the
 * queue stopped with segments still pending, and NETDEV_TX_OK otherwise -
 * including the case where software segmentation failed and the skb was
 * dropped.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc;

        if (likely(!skb->next)) {
                /* Give registered taps a copy of the outgoing frame. */
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                if (netif_needs_gso(dev, skb)) {
                        /* Segmentation error: drop the packet. */
                        if (unlikely(dev_gso_segment(skb)))
                                goto out_kfree_skb;
                        /* Segments were produced: send them one by one. */
                        if (skb->next)
                                goto gso;
                }

                /*
                 * If device doesnt need skb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(skb);

                rc = ops->ndo_start_xmit(skb, dev);
                if (rc == NETDEV_TX_OK)
                        txq_trans_update(txq);
                /*
                 * TODO: if skb_orphan() was called by
                 * dev->hard_start_xmit() (for example, the unmodified
                 * igb driver does that; bnx2 doesn't), then
                 * skb_tx_software_timestamp() will be unable to send
                 * back the time stamp.
                 *
                 * How can this be prevented? Always create another
                 * reference to the socket before calling
                 * dev->hard_start_xmit()? Prevent that skb_orphan()
                 * does anything in dev->hard_start_xmit() by clearing
                 * the skb destructor before the call and restoring it
                 * afterwards, then doing the skb_orphan() ourselves?
                 */
                return rc;
        }

gso:
        do {
                struct sk_buff *nskb = skb->next;

                /* Detach the head segment from the list before sending it. */
                skb->next = nskb->next;
                nskb->next = NULL;
                rc = ops->ndo_start_xmit(nskb, dev);
                if (unlikely(rc != NETDEV_TX_OK)) {
                        /* Not sent: put the segment back at the head. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                txq_trans_update(txq);
                /* Queue stopped with segments left: caller must retry. */
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

        /* All segments sent: restore the destructor saved for GSO. */
        skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
        kfree_skb(skb);
        return NETDEV_TX_OK;
}
1767
/* Seed handed to jhash_1word() by skb_tx_hash() when hashing a packet to a tx queue. */
1768static u32 skb_tx_hashrnd;
1769
1770u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1771{
1772        u32 hash;
1773
1774        if (skb_rx_queue_recorded(skb)) {
1775                hash = skb_get_rx_queue(skb);
1776                while (unlikely(hash >= dev->real_num_tx_queues))
1777                        hash -= dev->real_num_tx_queues;
1778                return hash;
1779        }
1780
1781        if (skb->sk && skb->sk->sk_hash)
1782                hash = skb->sk->sk_hash;
1783        else
1784                hash = skb->protocol;
1785
1786        hash = jhash_1word(hash, skb_tx_hashrnd);
1787
1788        return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1789}
1790EXPORT_SYMBOL(skb_tx_hash);
1791
1792static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1793                                        struct sk_buff *skb)
1794{
1795        const struct net_device_ops *ops = dev->netdev_ops;
1796        u16 queue_index = 0;
1797
1798        if (ops->ndo_select_queue)
1799                queue_index = ops->ndo_select_queue(dev, skb);
1800        else if (dev->real_num_tx_queues > 1)
1801                queue_index = skb_tx_hash(dev, skb);
1802
1803        skb_set_queue_mapping(skb, queue_index);
1804        return netdev_get_tx_queue(dev, queue_index);
1805}
1806
1807static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1808                                 struct net_device *dev,
1809                                 struct netdev_queue *txq)
1810{
1811        spinlock_t *root_lock = qdisc_lock(q);
1812        int rc;
1813
1814        spin_lock(root_lock);
1815        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1816                kfree_skb(skb);
1817                rc = NET_XMIT_DROP;
1818        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1819                   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1820                /*
1821                 * This is a work-conserving queue; there are no old skbs
1822                 * waiting to be sent out; and the qdisc is not running -
1823                 * xmit the skb directly.
1824                 */
1825                __qdisc_update_bstats(q, skb->len);
1826                if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1827                        __qdisc_run(q);
1828                else
1829                        clear_bit(__QDISC_STATE_RUNNING, &q->state);
1830
1831                rc = NET_XMIT_SUCCESS;
1832        } else {
1833                rc = qdisc_enqueue_root(skb, q);
1834                qdisc_run(q);
1835        }
1836        spin_unlock(root_lock);
1837
1838        return rc;
1839}
1840
1841/**
1842 *      dev_queue_xmit - transmit a buffer
1843 *      @skb: buffer to transmit
1844 *
1845 *      Queue a buffer for transmission to a network device. The caller must
1846 *      have set the device and priority and built the buffer before calling
1847 *      this function. The function can be called from an interrupt.
1848 *
1849 *      A negative errno code is returned on a failure. A success does not
1850 *      guarantee the frame will be transmitted as it may be dropped due
1851 *      to congestion or traffic shaping.
1852 *
1853 * -----------------------------------------------------------------------------------
1854 *      I notice this method can also return errors from the queue disciplines,
1855 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1856 *      be positive.
1857 *
1858 *      Regardless of the return value, the skb is consumed, so it is currently
1859 *      difficult to retry a send to this method.  (You can bump the ref count
1860 *      before sending to hold a reference for retry if you are careful.)
1861 *
1862 *      When calling this method, interrupts MUST be enabled.  This is because
1863 *      the BH enable code must have IRQs enabled so that it will not deadlock.
1864 *          --BLG
1865 */
1866int dev_queue_xmit(struct sk_buff *skb)
1867{
1868        struct net_device *dev = skb->dev;
1869        struct netdev_queue *txq;
1870        struct Qdisc *q;
/* rc stays -ENOMEM for the early "goto out_kfree_skb" exits (linearize / checksum-help failure). */
1871        int rc = -ENOMEM;
1872
1873        /* GSO will handle the following emulations directly. */
1874        if (netif_needs_gso(dev, skb))
1875                goto gso;
1876
/* A frag-list skb is linearized when the device does not advertise NETIF_F_FRAGLIST. */
1877        if (skb_has_frags(skb) &&
1878            !(dev->features & NETIF_F_FRAGLIST) &&
1879            __skb_linearize(skb))
1880                goto out_kfree_skb;
1881
1882        /* Fragmented skb is linearized if device does not support SG,
1883         * or if at least one of fragments is in highmem and device
1884         * does not support DMA from it.
1885         */
1886        if (skb_shinfo(skb)->nr_frags &&
1887            (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1888            __skb_linearize(skb))
1889                goto out_kfree_skb;
1890
1891        /* If packet is not checksummed and device does not support
1892         * checksumming for this protocol, complete checksumming here.
1893         */
1894        if (skb->ip_summed == CHECKSUM_PARTIAL) {
1895                skb_set_transport_header(skb, skb->csum_start -
1896                                              skb_headroom(skb));
1897                if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1898                        goto out_kfree_skb;
1899        }
1900
1901gso:
1902        /* Disable soft irqs for various locks below. Also
1903         * stops preemption for RCU.
1904         */
1905        rcu_read_lock_bh();
1906
1907        txq = dev_pick_tx(dev, skb);
1908        q = rcu_dereference(txq->qdisc);
1909
1910#ifdef CONFIG_NET_CLS_ACT
1911        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1912#endif
/* A qdisc with an enqueue op takes the packet; rc is whatever __dev_xmit_skb() reports. */
1913        if (q->enqueue) {
1914                rc = __dev_xmit_skb(skb, q, dev, txq);
1915                goto out;
1916        }
1917
1918        /* The device has no queue. Common case for software devices:
1919           loopback, all the sorts of tunnels...
1920
1921           Really, it is unlikely that netif_tx_lock protection is necessary
1922           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1923           counters.)
1924           However, it is possible, that they rely on protection
1925           made by us here.
1926
1927           Check this and shot the lock. It is not prone from deadlocks.
1928           Either shot noqueue qdisc, it is even simpler 8)
1929         */
1930        if (dev->flags & IFF_UP) {
1931                int cpu = smp_processor_id(); /* ok because BHs are off */
1932
/* Only take the tx lock when this CPU is not already recorded as its owner. */
1933                if (txq->xmit_lock_owner != cpu) {
1934
1935                        HARD_TX_LOCK(dev, txq, cpu);
1936
1937                        if (!netif_tx_queue_stopped(txq)) {
1938                                rc = NET_XMIT_SUCCESS;
/* A zero return from dev_hard_start_xmit() is the success exit; the skb is not freed here. */
1939                                if (!dev_hard_start_xmit(skb, dev, txq)) {
1940                                        HARD_TX_UNLOCK(dev, txq);
1941                                        goto out;
1942                                }
1943                        }
1944                        HARD_TX_UNLOCK(dev, txq);
1945                        if (net_ratelimit())
1946                                printk(KERN_CRIT "Virtual device %s asks to "
1947                                       "queue packet!\n", dev->name);
1948                } else {
1949                        /* Recursion is detected! It is possible,
1950                         * unfortunately */
1951                        if (net_ratelimit())
1952                                printk(KERN_CRIT "Dead loop on virtual device "
1953                                       "%s, fix it urgently!\n", dev->name);
1954                }
1955        }
1956
/*
 * Reached when the device is not IFF_UP, the queue was stopped, the driver
 * returned non-zero, or recursion was detected: the packet is freed below
 * and -ENETDOWN is returned.
 */
1957        rc = -ENETDOWN;
1958        rcu_read_unlock_bh();
1959
/* Also entered directly by the early failures above, before rcu_read_lock_bh() was taken. */
1960out_kfree_skb:
1961        kfree_skb(skb);
1962        return rc;
1963out:
1964        rcu_read_unlock_bh();
1965        return rc;
1966}
1967EXPORT_SYMBOL(dev_queue_xmit);
1968
1969
1970/*=======================================================================
1971                        Receiver routines
1972  =======================================================================*/
1973
/* netif_rx() drops a packet once the per-CPU input_pkt_queue holds more than this many skbs. */
1974int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not referenced in this part of the file; presumably the rx softirq packet budget — confirm. */
1975int netdev_budget __read_mostly = 300;
/* Copied into napi->weight each time process_backlog() runs. */
1976int weight_p __read_mostly = 64;            /* old backlog weight */
1977
/* Per-CPU receive counters: .total is bumped by netif_rx() and netif_receive_skb(), .dropped by netif_rx(). */
1978DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1979
1980
1981/**
1982 *      netif_rx        -       post buffer to the network code
1983 *      @skb: buffer to post
1984 *
1985 *      This function receives a packet from a device driver and queues it for
1986 *      the upper (protocol) levels to process.  It always succeeds. The buffer
1987 *      may be dropped during processing for congestion control or by the
1988 *      protocol layers.
1989 *
1990 *      return values:
1991 *      NET_RX_SUCCESS  (no congestion)
1992 *      NET_RX_DROP     (packet was dropped)
1993 *
1994 */
1995
1996int netif_rx(struct sk_buff *skb)
1997{
1998        struct softnet_data *queue;
1999        unsigned long flags;
2000

2001        /* if netpoll wants it, pretend we never saw it */
2002        if (netpoll_rx(skb))
2003                return NET_RX_DROP;
2004
2005        if (!skb->tstamp.tv64)
2006                net_timestamp(skb);
2007
2008        /*
2009         * The code is rearranged so that the path is the most
2010         * short when CPU is congested, but is still operating.
2011         */
2012        local_irq_save(flags);
2013        queue = &__get_cpu_var(softnet_data);
2014
2015        __get_cpu_var(netdev_rx_stat).total++;
2016        if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2017                if (queue->input_pkt_queue.qlen) {
2018enqueue:
2019                        __skb_queue_tail(&queue->input_pkt_queue, skb);
2020                        local_irq_restore(flags);
2021                        return NET_RX_SUCCESS;
2022                }
2023
2024                napi_schedule(&queue->backlog);
2025                goto enqueue;
2026        }
2027
2028        __get_cpu_var(netdev_rx_stat).dropped++;
2029        local_irq_restore(flags);
2030
2031        kfree_skb(skb);
2032        return NET_RX_DROP;
2033}
2034EXPORT_SYMBOL(netif_rx);
2035
/*
 * netif_rx_ni - netif_rx() followed by an immediate softirq run
 * @skb: buffer to post
 *
 * With preemption disabled, queues the packet through netif_rx() and then
 * calls do_softirq() if any softirq is pending on this CPU.
 *
 * Returns the value netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int verdict;

        preempt_disable();

        verdict = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return verdict;
}
EXPORT_SYMBOL(netif_rx_ni);
2049
2050static void net_tx_action(struct softirq_action *h)
2051{
2052        struct softnet_data *sd = &__get_cpu_var(softnet_data);
2053
2054        if (sd->completion_queue) {
2055                struct sk_buff *clist;
2056
2057                local_irq_disable();
2058                clist = sd->completion_queue;
2059                sd->completion_queue = NULL;
2060                local_irq_enable();
2061
2062                while (clist) {
2063                        struct sk_buff *skb = clist;
2064                        clist = clist->next;
2065
2066                        WARN_ON(atomic_read(&skb->users));
2067                        __kfree_skb(skb);
2068                }
2069        }
2070
2071        if (sd->output_queue) {
2072                struct Qdisc *head;
2073
2074                local_irq_disable();
2075                head = sd->output_queue;
2076                sd->output_queue = NULL;
2077                local_irq_enable();
2078
2079                while (head) {
2080                        struct Qdisc *q = head;
2081                        spinlock_t *root_lock;
2082
2083                        head = head->next_sched;
2084
2085                        root_lock = qdisc_lock(q);
2086                        if (spin_trylock(root_lock)) {
2087                                smp_mb__before_clear_bit();
2088                                clear_bit(__QDISC_STATE_SCHED,
2089                                          &q->state);
2090                                qdisc_run(q);
2091                                spin_unlock(root_lock);
2092                        } else {
2093                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
2094                                              &q->state)) {
2095                                        __netif_reschedule(q);
2096                                } else {
2097                                        smp_mb__before_clear_bit();
2098                                        clear_bit(__QDISC_STATE_SCHED,
2099                                                  &q->state);
2100                                }
2101                        }
2102                }
2103        }
2104}
2105
2106static inline int deliver_skb(struct sk_buff *skb,
2107                              struct packet_type *pt_prev,
2108                              struct net_device *orig_dev)
2109{
2110        atomic_inc(&skb->users);
2111        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2112}
2113
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* Hook pointer defined here on behalf of ATM LANE. */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * Bridging hook, called by handle_bridge() for packets that arrive on a
 * device with a bridge port attached.
 * Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
                                        struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * handle_bridge - divert a received packet to the bridge code
 *
 * Loopback packets and packets whose device has no br_port are returned
 * untouched.  Otherwise any handler pending in *pt_prev is delivered first
 * (its result stored in *ret, *pt_prev cleared) and the packet is given to
 * br_handle_frame_hook(), whose return value is passed back to the caller.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
                                            struct packet_type **pt_prev, int *ret,
                                            struct net_device *orig_dev)
{
        struct net_bridge_port *port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return skb;

        port = rcu_dereference(skb->dev->br_port);
        if (port == NULL)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
#endif
2151
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Hook called by handle_macvlan() for packets on a device with a macvlan_port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * handle_macvlan - divert a received packet to the macvlan code
 *
 * Packets whose device has no macvlan_port are returned untouched.
 * Otherwise any handler pending in *pt_prev is delivered first (its result
 * stored in *ret, *pt_prev cleared) and the packet is given to
 * macvlan_handle_frame_hook(), whose return value is passed back.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
                                             struct packet_type **pt_prev,
                                             int *ret,
                                             struct net_device *orig_dev)
{
        if (skb->dev->macvlan_port != NULL) {
                if (*pt_prev) {
                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
                        *pt_prev = NULL;
                }
                return macvlan_handle_frame_hook(skb);
        }

        return skb;
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
#endif
2173
2174#ifdef CONFIG_NET_CLS_ACT
2175/* TODO: Maybe we should just force sch_ingress to be compiled in
2176 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2177 * a compare and 2 stores extra right now if we dont have it on
2178 * but have CONFIG_NET_CLS_ACT
2179 * NOTE: This doesnt stop any functionality; if you dont have
2180 * the ingress scheduler, you just cant add policies on ingress.
2181 *
2182 */
2183static int ing_filter(struct sk_buff *skb)
2184{
2185        struct net_device *dev = skb->dev;
2186        u32 ttl = G_TC_RTTL(skb->tc_verd);
2187        struct netdev_queue *rxq;
2188        int result = TC_ACT_OK;
2189        struct Qdisc *q;
2190
2191        if (MAX_RED_LOOP < ttl++) {
2192                printk(KERN_WARNING
2193                       "Redir loop detected Dropping packet (%d->%d)\n",
2194                       skb->iif, dev->ifindex);
2195                return TC_ACT_SHOT;
2196        }
2197
2198        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2199        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2200
2201        rxq = &dev->rx_queue;
2202
2203        q = rxq->qdisc;
2204        if (q != &noop_qdisc) {
2205                spin_lock(qdisc_lock(q));
2206                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2207                        result = qdisc_enqueue_root(skb, q);
2208                spin_unlock(qdisc_lock(q));
2209        }
2210
2211        return result;
2212}
2213
2214static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2215                                         struct packet_type **pt_prev,
2216                                         int *ret, struct net_device *orig_dev)
2217{
2218        if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2219                goto out;
2220
2221        if (*pt_prev) {
2222                *ret = deliver_skb(skb, *pt_prev, orig_dev);
2223                *pt_prev = NULL;
2224        } else {
2225                /* Huh? Why does turning on AF_PACKET affect this? */
2226                skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2227        }
2228
2229        switch (ing_filter(skb)) {
2230        case TC_ACT_SHOT:
2231        case TC_ACT_STOLEN:
2232                kfree_skb(skb);
2233                return NULL;
2234        }
2235
2236out:
2237        skb->tc_verd = 0;
2238        return skb;
2239}
2240#endif
2241
2242/*
2243 *      netif_nit_deliver - deliver received packets to network taps
2244 *      @skb: buffer
2245 *
2246 *      This function is used to deliver incoming packets to network
2247 *      taps. It should be used when the normal netif_receive_skb path
2248 *      is bypassed, for example because of VLAN acceleration.
2249 */
2250void netif_nit_deliver(struct sk_buff *skb)
2251{
2252        struct packet_type *ptype;
2253
2254        if (list_empty(&ptype_all))
2255                return;
2256
2257        skb_reset_network_header(skb);
2258        skb_reset_transport_header(skb);
2259        skb->mac_len = skb->network_header - skb->mac_header;
2260
2261        rcu_read_lock();
2262        list_for_each_entry_rcu(ptype, &ptype_all, list) {
2263                if (!ptype->dev || ptype->dev == skb->dev)
2264                        deliver_skb(skb, ptype, skb->dev);
2265        }
2266        rcu_read_unlock();
2267}
2268
2269/**
2270 *      netif_receive_skb - process receive buffer from network
2271 *      @skb: buffer to process
2272 *
2273 *      netif_receive_skb() is the main receive data processing function.
2274 *      It always succeeds. The buffer may be dropped during processing
2275 *      for congestion control or by the protocol layers.
2276 *
2277 *      This function may only be called from softirq context and interrupts
2278 *      should be enabled.
2279 *
2280 *      Return values (usually ignored):
2281 *      NET_RX_SUCCESS: no congestion
2282 *      NET_RX_DROP: packet was dropped
2283 */
2284int netif_receive_skb(struct sk_buff *skb)
2285{
2286        struct packet_type *ptype, *pt_prev;
2287        struct net_device *orig_dev;
2288        struct net_device *null_or_orig;
2289        int ret = NET_RX_DROP;
2290        __be16 type;
2291
2292        if (!skb->tstamp.tv64)
2293                net_timestamp(skb);
2294
2295        if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2296                return NET_RX_SUCCESS;
2297
2298        /* if we've gotten here through NAPI, check netpoll */
2299        if (netpoll_receive_skb(skb))
2300                return NET_RX_DROP;
2301
2302        if (!skb->iif)
2303                skb->iif = skb->dev->ifindex;
2304
/*
 * null_or_orig is compared against ptype->dev below.  While it is NULL,
 * handlers not bound to any device match; when skb_bond_should_drop() says
 * so it becomes orig_dev, leaving only handlers bound to the receiving
 * device (skb->dev is not switched to the master in that case).
 */
2305        null_or_orig = NULL;
2306        orig_dev = skb->dev;
2307        if (orig_dev->master) {
2308                if (skb_bond_should_drop(skb))
2309                        null_or_orig = orig_dev; /* deliver only exact match */
2310                else
2311                        skb->dev = orig_dev->master;
2312        }
2313
2314        __get_cpu_var(netdev_rx_stat).total++;
2315
2316        skb_reset_network_header(skb);
2317        skb_reset_transport_header(skb);
2318        skb->mac_len = skb->network_header - skb->mac_header;
2319
/*
 * Delivery runs one handler behind: each new match first delivers the
 * previously matched handler through deliver_skb() (which bumps skb->users)
 * and then becomes pt_prev itself.  The last pt_prev is called directly at
 * the end, without the extra reference.
 */
2320        pt_prev = NULL;
2321
2322        rcu_read_lock();
2323
2324#ifdef CONFIG_NET_CLS_ACT
/* TC_NCLS set: clear it and skip both the ptype_all taps and handle_ing(). */
2325        if (skb->tc_verd & TC_NCLS) {
2326                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2327                goto ncls;
2328        }
2329#endif
2330
2331        list_for_each_entry_rcu(ptype, &ptype_all, list) {
2332                if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2333                    ptype->dev == orig_dev) {
2334                        if (pt_prev)
2335                                ret = deliver_skb(skb, pt_prev, orig_dev);
2336                        pt_prev = ptype;
2337                }
2338        }
2339
2340#ifdef CONFIG_NET_CLS_ACT
2341        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2342        if (!skb)
2343                goto out;
2344ncls:
2345#endif
2346
/* Each hook below returns NULL once it has taken the packet; ret then holds the last delivery result. */
2347        skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2348        if (!skb)
2349                goto out;
2350        skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2351        if (!skb)
2352                goto out;
2353
/* Protocol handlers: only the ptype_base bucket selected by skb->protocol is walked. */
2354        type = skb->protocol;
2355        list_for_each_entry_rcu(ptype,
2356                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2357                if (ptype->type == type &&
2358                    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2359                     ptype->dev == orig_dev)) {
2360                        if (pt_prev)
2361                                ret = deliver_skb(skb, pt_prev, orig_dev);
2362                        pt_prev = ptype;
2363                }
2364        }
2365
2366        if (pt_prev) {
2367                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2368        } else {
2369                kfree_skb(skb);
2370                /* Jamal, now you will not able to escape explaining
2371                 * me how you were going to use this. :-)
2372                 */
2373                ret = NET_RX_DROP;
2374        }
2375
2376out:
2377        rcu_read_unlock();
2378        return ret;
2379}
2380EXPORT_SYMBOL(netif_receive_skb);
2381
2382/* Network device is going away, flush any packets still pending  */
2383static void flush_backlog(void *arg)
2384{
2385        struct net_device *dev = arg;
2386        struct softnet_data *queue = &__get_cpu_var(softnet_data);
2387        struct sk_buff *skb, *tmp;
2388
2389        skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2390                if (skb->dev == dev) {
2391                        __skb_unlink(skb, &queue->input_pkt_queue);
2392                        kfree_skb(skb);
2393                }
2394}
2395
2396static int napi_gro_complete(struct sk_buff *skb)
2397{
2398        struct packet_type *ptype;
2399        __be16 type = skb->protocol;
2400        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2401        int err = -ENOENT;
2402
2403        if (NAPI_GRO_CB(skb)->count == 1) {
2404                skb_shinfo(skb)->gso_size = 0;
2405                goto out;
2406        }
2407
2408        rcu_read_lock();
2409        list_for_each_entry_rcu(ptype, head, list) {
2410                if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2411                        continue;
2412
2413                err = ptype->gro_complete(skb);
2414                break;
2415        }
2416        rcu_read_unlock();
2417
2418        if (err) {
2419                WARN_ON(&ptype->list == head);
2420                kfree_skb(skb);
2421                return NET_RX_SUCCESS;
2422        }
2423
2424out:
2425        return netif_receive_skb(skb);
2426}
2427
2428void napi_gro_flush(struct napi_struct *napi)
2429{
2430        struct sk_buff *skb, *next;
2431
2432        for (skb = napi->gro_list; skb; skb = next) {
2433                next = skb->next;
2434                skb->next = NULL;
2435                napi_gro_complete(skb);
2436        }
2437
2438        napi->gro_count = 0;
2439        napi->gro_list = NULL;
2440}
2441EXPORT_SYMBOL(napi_gro_flush);
2442
2443int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2444{
2445        struct sk_buff **pp = NULL;
2446        struct packet_type *ptype;
2447        __be16 type = skb->protocol;
2448        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2449        int same_flow;
2450        int mac_len;
2451        int ret;
2452
2453        if (!(skb->dev->features & NETIF_F_GRO))
2454                goto normal;
2455
2456        if (skb_is_gso(skb) || skb_has_frags(skb))
2457                goto normal;
2458
2459        rcu_read_lock();
2460        list_for_each_entry_rcu(ptype, head, list) {
2461                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2462                        continue;
2463
2464                skb_set_network_header(skb, skb_gro_offset(skb));
2465                mac_len = skb->network_header - skb->mac_header;
2466                skb->mac_len = mac_len;
2467                NAPI_GRO_CB(skb)->same_flow = 0;
2468                NAPI_GRO_CB(skb)->flush = 0;
2469                NAPI_GRO_CB(skb)->free = 0;
2470
2471                pp = ptype->gro_receive(&napi->gro_list, skb);
2472                break;
2473        }
2474        rcu_read_unlock();
2475
2476        if (&ptype->list == head)
2477                goto normal;
2478
2479        same_flow = NAPI_GRO_CB(skb)->same_flow;
2480        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2481
2482        if (pp) {
2483                struct sk_buff *nskb = *pp;
2484
2485                *pp = nskb->next;
2486                nskb->next = NULL;
2487                napi_gro_complete(nskb);
2488                napi->gro_count--;
2489        }
2490
2491        if (same_flow)
2492                goto ok;
2493
2494        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2495                goto normal;
2496
2497        napi->gro_count++;
2498        NAPI_GRO_CB(skb)->count = 1;
2499        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2500        skb->next = napi->gro_list;
2501        napi->gro_list = skb;
2502        ret = GRO_HELD;
2503
2504pull:
2505        if (skb_headlen(skb) < skb_gro_offset(skb)) {
2506                int grow = skb_gro_offset(skb) - skb_headlen(skb);
2507
2508                BUG_ON(skb->end - skb->tail < grow);
2509
2510                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2511
2512                skb->tail += grow;
2513                skb->data_len -= grow;
2514
2515                skb_shinfo(skb)->frags[0].page_offset += grow;
2516                skb_shinfo(skb)->frags[0].size -= grow;
2517
2518                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2519                        put_page(skb_shinfo(skb)->frags[0].page);
2520                        memmove(skb_shinfo(skb)->frags,
2521                                skb_shinfo(skb)->frags + 1,
2522                                --skb_shinfo(skb)->nr_frags);
2523                }
2524        }
2525
2526ok:
2527        return ret;
2528
2529normal:
2530        ret = GRO_NORMAL;
2531        goto pull;
2532}
2533EXPORT_SYMBOL(dev_gro_receive);
2534
2535static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2536{
2537        struct sk_buff *p;
2538
2539        if (netpoll_rx_on(skb))
2540                return GRO_NORMAL;
2541
2542        for (p = napi->gro_list; p; p = p->next) {
2543                NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2544                        && !compare_ether_header(skb_mac_header(p),
2545                                                 skb_gro_mac_header(skb));
2546                NAPI_GRO_CB(p)->flush = 0;
2547        }
2548
2549        return dev_gro_receive(napi, skb);
2550}
2551
2552int napi_skb_finish(int ret, struct sk_buff *skb)
2553{
2554        int err = NET_RX_SUCCESS;
2555
2556        switch (ret) {
2557        case GRO_NORMAL:
2558                return netif_receive_skb(skb);
2559
2560        case GRO_DROP:
2561                err = NET_RX_DROP;
2562                /* fall through */
2563
2564        case GRO_MERGED_FREE:
2565                kfree_skb(skb);
2566                break;
2567        }
2568
2569        return err;
2570}
2571EXPORT_SYMBOL(napi_skb_finish);
2572
2573void skb_gro_reset_offset(struct sk_buff *skb)
2574{
2575        NAPI_GRO_CB(skb)->data_offset = 0;
2576        NAPI_GRO_CB(skb)->frag0 = NULL;
2577        NAPI_GRO_CB(skb)->frag0_len = 0;
2578
2579        if (skb->mac_header == skb->tail &&
2580            !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2581                NAPI_GRO_CB(skb)->frag0 =
2582                        page_address(skb_shinfo(skb)->frags[0].page) +
2583                        skb_shinfo(skb)->frags[0].page_offset;
2584                NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2585        }
2586}
2587EXPORT_SYMBOL(skb_gro_reset_offset);
2588
/*
 * napi_gro_receive - GRO entry point for an ordinary skb
 * @napi: NAPI instance doing the receive
 * @skb: received packet
 *
 * Resets the GRO offsets, obtains a verdict from __napi_gro_receive() and
 * lets napi_skb_finish() act on it.  Returns napi_skb_finish()'s result.
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        int verdict;

        skb_gro_reset_offset(skb);
        verdict = __napi_gro_receive(napi, skb);

        return napi_skb_finish(verdict, skb);
}
EXPORT_SYMBOL(napi_gro_receive);
2596
2597void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2598{
2599        __skb_pull(skb, skb_headlen(skb));
2600        skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2601
2602        napi->skb = skb;
2603}
2604EXPORT_SYMBOL(napi_reuse_skb);
2605
2606struct sk_buff *napi_get_frags(struct napi_struct *napi)
2607{
2608        struct net_device *dev = napi->dev;
2609        struct sk_buff *skb = napi->skb;
2610
2611        if (!skb) {
2612                skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2613                if (!skb)
2614                        goto out;
2615
2616                skb_reserve(skb, NET_IP_ALIGN);
2617
2618                napi->skb = skb;
2619        }
2620
2621out:
2622        return skb;
2623}
2624EXPORT_SYMBOL(napi_get_frags);
2625
2626int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2627{
2628        int err = NET_RX_SUCCESS;
2629
2630        switch (ret) {
2631        case GRO_NORMAL:
2632        case GRO_HELD:
2633                skb->protocol = eth_type_trans(skb, napi->dev);
2634
2635                if (ret == GRO_NORMAL)
2636                        return netif_receive_skb(skb);
2637
2638                skb_gro_pull(skb, -ETH_HLEN);
2639                break;
2640
2641        case GRO_DROP:
2642                err = NET_RX_DROP;
2643                /* fall through */
2644
2645        case GRO_MERGED_FREE:
2646                napi_reuse_skb(napi, skb);
2647                break;
2648        }
2649
2650        return err;
2651}
2652EXPORT_SYMBOL(napi_frags_finish);
2653
2654struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2655{
2656        struct sk_buff *skb = napi->skb;
2657        struct ethhdr *eth;
2658        unsigned int hlen;
2659        unsigned int off;
2660
2661        napi->skb = NULL;
2662
2663        skb_reset_mac_header(skb);
2664        skb_gro_reset_offset(skb);
2665
2666        off = skb_gro_offset(skb);
2667        hlen = off + sizeof(*eth);
2668        eth = skb_gro_header_fast(skb, off);
2669        if (skb_gro_header_hard(skb, hlen)) {
2670                eth = skb_gro_header_slow(skb, hlen, off);
2671                if (unlikely(!eth)) {
2672                        napi_reuse_skb(napi, skb);
2673                        skb = NULL;
2674                        goto out;
2675                }
2676        }
2677
2678        skb_gro_pull(skb, sizeof(*eth));
2679
2680        /*
2681         * This works because the only protocols we care about don't require
2682         * special handling.  We'll fix it up properly at the end.
2683         */
2684        skb->protocol = eth->h_proto;
2685
2686out:
2687        return skb;
2688}
2689EXPORT_SYMBOL(napi_frags_skb);
2690
2691int napi_gro_frags(struct napi_struct *napi)
2692{
2693        struct sk_buff *skb = napi_frags_skb(napi);
2694
2695        if (!skb)
2696                return NET_RX_DROP;
2697
2698        return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2699}
2700EXPORT_SYMBOL(napi_gro_frags);
2701
2702static int process_backlog(struct napi_struct *napi, int quota)
2703{
2704        int work = 0;
2705        struct softnet_data *queue = &__get_cpu_var(softnet_data);
2706        unsigned long start_time = jiffies;
2707
2708        napi->weight = weight_p;
2709        do {
2710                struct sk_buff *skb;
2711
2712                local_irq_disable();
2713                skb = __skb_dequeue(&queue->input_pkt_queue);
2714                if (!skb) {
2715                        __napi_complete(napi);
2716                        local_irq_enable();
2717                        break;
2718                }
2719                local_irq_enable();
2720
2721                netif_receive_skb(skb);
2722        } while (++work < quota && jiffies == start_time);
2723
2724        return work;
2725}
2726
2727/**
2728 * __napi_schedule - schedule for receive
2729 * @n: entry to schedule
2730 *
2731 * The entry's receive function will be scheduled to run
2732 */
2733void __napi_schedule(struct napi_struct *n)
2734{
2735        unsigned long flags;
2736
2737        local_irq_save(flags);
2738        list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2739        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2740        local_irq_restore(flags);
2741}
2742EXPORT_SYMBOL(__napi_schedule);
2743
2744void __napi_complete(struct napi_struct *n)
2745{
2746        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2747        BUG_ON(n->gro_list);
2748
2749        list_del(&n->poll_list);
2750        smp_mb__before_clear_bit();
2751        clear_bit(NAPI_STATE_SCHED, &n->state);
2752}
2753EXPORT_SYMBOL(__napi_complete);
2754
2755void napi_complete(struct napi_struct *n)
2756{
2757        unsigned long flags;
2758
2759        /*
2760         * don't let napi dequeue from the cpu poll list
2761         * just in case its running on a different cpu
2762         */
2763        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2764                return;
2765
2766        napi_gro_flush(n);
2767        local_irq_save(flags);
2768        __napi_complete(n);
2769        local_irq_restore(flags);
2770}
2771EXPORT_SYMBOL(napi_complete);
2772
2773void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2774                    int (*poll)(struct napi_struct *, int), int weight)
2775{
2776        INIT_LIST_HEAD(&napi->poll_list);
2777        napi->gro_count = 0;
2778        napi->gro_list = NULL;
2779        napi->skb = NULL;
2780        napi->poll = poll;
2781        napi->weight = weight;
2782        list_add(&napi->dev_list, &dev->napi_list);
2783        napi->dev = dev;
2784#ifdef CONFIG_NETPOLL
2785        spin_lock_init(&napi->poll_lock);
2786        napi->poll_owner = -1;
2787#endif
2788        set_bit(NAPI_STATE_SCHED, &napi->state);
2789}
2790EXPORT_SYMBOL(netif_napi_add);
2791
2792void netif_napi_del(struct napi_struct *napi)
2793{
2794        struct sk_buff *skb, *next;
2795
2796        list_del_init(&napi->dev_list);
2797        napi_free_frags(napi);
2798
2799        for (skb = napi->gro_list; skb; skb = next) {
2800                next = skb->next;
2801                skb->next = NULL;
2802                kfree_skb(skb);
2803        }
2804
2805        napi->gro_list = NULL;
2806        napi->gro_count = 0;
2807}
2808EXPORT_SYMBOL(netif_napi_del);
2809
2810
2811static void net_rx_action(struct softirq_action *h)
2812{
2813        struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2814        unsigned long time_limit = jiffies + 2;
2815        int budget = netdev_budget;
2816        void *have;
2817
2818        local_irq_disable();
2819
2820        while (!list_empty(list)) {
2821                struct napi_struct *n;
2822                int work, weight;
2823
2824                /* If softirq window is exhuasted then punt.
2825                 * Allow this to run for 2 jiffies since which will allow
2826                 * an average latency of 1.5/HZ.
2827                 */
2828                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2829                        goto softnet_break;
2830
2831                local_irq_enable();
2832
2833                /* Even though interrupts have been re-enabled, this
2834                 * access is safe because interrupts can only add new
2835                 * entries to the tail of this list, and only ->poll()
2836                 * calls can remove this head entry from the list.
2837                 */
2838                n = list_entry(list->next, struct napi_struct, poll_list);
2839
2840                have = netpoll_poll_lock(n);
2841
2842                weight = n->weight;
2843
2844                /* This NAPI_STATE_SCHED test is for avoiding a race
2845                 * with netpoll's poll_napi().  Only the entity which
2846                 * obtains the lock and sees NAPI_STATE_SCHED set will
2847                 * actually make the ->poll() call.  Therefore we avoid
2848                 * accidently calling ->poll() when NAPI is not scheduled.
2849                 */
2850                work = 0;
2851                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2852                        work = n->poll(n, weight);
2853                        trace_napi_poll(n);
2854                }
2855
2856                WARN_ON_ONCE(work > weight);
2857
2858                budget -= work;
2859
2860                local_irq_disable();
2861
2862                /* Drivers must not modify the NAPI state if they
2863                 * consume the entire weight.  In such cases this code
2864                 * still "owns" the NAPI instance and therefore can
2865                 * move the instance around on the list at-will.
2866                 */
2867                if (unlikely(work == weight)) {
2868                        if (unlikely(napi_disable_pending(n))) {
2869                                local_irq_enable();
2870                                napi_complete(n);
2871                                local_irq_disable();
2872                        } else
2873                                list_move_tail(&n->poll_list, list);
2874                }
2875
2876                netpoll_poll_unlock(have);
2877        }
2878out:
2879        local_irq_enable();
2880
2881#ifdef CONFIG_NET_DMA
2882        /*
2883         * There may not be any more sk_buffs coming right now, so push
2884         * any pending DMA copies to hardware
2885         */
2886        dma_issue_pending_all();
2887#endif
2888
2889        return;
2890
2891softnet_break:
2892        __get_cpu_var(netdev_rx_stat).time_squeeze++;
2893        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2894        goto out;
2895}
2896
2897static gifconf_func_t *gifconf_list[NPROTO];
2898
2899/**
2900 *      register_gifconf        -       register a SIOCGIF handler
2901 *      @family: Address family
2902 *      @gifconf: Function handler
2903 *
2904 *      Register protocol dependent address dumping routines. The handler
2905 *      that is passed must not be freed or reused until it has been replaced
2906 *      by another handler.
2907 */
2908int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2909{
2910        if (family >= NPROTO)
2911                return -EINVAL;
2912        gifconf_list[family] = gifconf;
2913        return 0;
2914}
2915EXPORT_SYMBOL(register_gifconf);
2916
2917
2918/*
2919 *      Map an interface index to its name (SIOCGIFNAME)
2920 */
2921
2922/*
2923 *      We need this ioctl for efficient implementation of the
2924 *      if_indextoname() function required by the IPv6 API.  Without
2925 *      it, we would have to search all the interfaces to find a
2926 *      match.  --pb
2927 */
2928
2929static int dev_ifname(struct net *net, struct ifreq __user *arg)
2930{
2931        struct net_device *dev;
2932        struct ifreq ifr;
2933
2934        /*
2935         *      Fetch the caller's info block.
2936         */
2937
2938        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2939                return -EFAULT;
2940
2941        read_lock(&dev_base_lock);
2942        dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2943        if (!dev) {
2944                read_unlock(&dev_base_lock);
2945                return -ENODEV;
2946        }
2947
2948        strcpy(ifr.ifr_name, dev->name);
2949        read_unlock(&dev_base_lock);
2950
2951        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2952                return -EFAULT;
2953        return 0;
2954}
2955
2956/*
2957 *      Perform a SIOCGIFCONF call. This structure will change
2958 *      size eventually, and there is nothing I can do about it.
2959 *      Thus we will need a 'compatibility mode'.
2960 */
2961
2962static int dev_ifconf(struct net *net, char __user *arg)
2963{
2964        struct ifconf ifc;
2965        struct net_device *dev;
2966        char __user *pos;
2967        int len;
2968        int total;
2969        int i;
2970
2971        /*
2972         *      Fetch the caller's info block.
2973         */
2974
2975        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2976                return -EFAULT;
2977
2978        pos = ifc.ifc_buf;
2979        len = ifc.ifc_len;
2980
2981        /*
2982         *      Loop over the interfaces, and write an info block for each.
2983         */
2984
2985        total = 0;
2986        for_each_netdev(net, dev) {
2987                for (i = 0; i < NPROTO; i++) {
2988                        if (gifconf_list[i]) {
2989                                int done;
2990                                if (!pos)
2991                                        done = gifconf_list[i](dev, NULL, 0);
2992                                else
2993                                        done = gifconf_list[i](dev, pos + total,
2994                                                               len - total);
2995                                if (done < 0)
2996                                        return -EFAULT;
2997                                total += done;
2998                        }
2999                }
3000        }

3001
3002        /*
3003         *      All done.  Write the updated control block back to the caller.
3004         */
3005        ifc.ifc_len = total;
3006
3007        /*
3008         *      Both BSD and Solaris return 0 here, so we do too.
3009         */
3010        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3011}
3012
3013#ifdef CONFIG_PROC_FS
3014/*
3015 *      This is invoked by the /proc filesystem handler to display a device
3016 *      in detail.
3017 */
3018void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3019        __acquires(dev_base_lock)
3020{
3021        struct net *net = seq_file_net(seq);
3022        loff_t off;
3023        struct net_device *dev;
3024
3025        read_lock(&dev_base_lock);
3026        if (!*pos)
3027                return SEQ_START_TOKEN;
3028
3029        off = 1;
3030        for_each_netdev(net, dev)
3031                if (off++ == *pos)
3032                        return dev;
3033
3034        return NULL;
3035}
3036
3037void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3038{
3039        struct net *net = seq_file_net(seq);
3040        ++*pos;
3041        return v == SEQ_START_TOKEN ?
3042                first_net_device(net) : next_net_device((struct net_device *)v);
3043}
3044
3045void dev_seq_stop(struct seq_file *seq, void *v)
3046        __releases(dev_base_lock)
3047{
3048        read_unlock(&dev_base_lock);
3049}
3050
3051static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3052{
3053        const struct net_device_stats *stats = dev_get_stats(dev);
3054
3055        seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3056                   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3057                   dev->name, stats->rx_bytes, stats->rx_packets,
3058                   stats->rx_errors,
3059                   stats->rx_dropped + stats->rx_missed_errors,
3060                   stats->rx_fifo_errors,
3061                   stats->rx_length_errors + stats->rx_over_errors +
3062                    stats->rx_crc_errors + stats->rx_frame_errors,
3063                   stats->rx_compressed, stats->multicast,
3064                   stats->tx_bytes, stats->tx_packets,
3065                   stats->tx_errors, stats->tx_dropped,
3066                   stats->tx_fifo_errors, stats->collisions,
3067                   stats->tx_carrier_errors +
3068                    stats->tx_aborted_errors +
3069                    stats->tx_window_errors +
3070                    stats->tx_heartbeat_errors,
3071                   stats->tx_compressed);
3072}
3073
3074/*
3075 *      Called from the PROCfs module. This now uses the new arbitrary sized
3076 *      /proc/net interface to create /proc/net/dev
3077 */
3078static int dev_seq_show(struct seq_file *seq, void *v)
3079{
3080        if (v == SEQ_START_TOKEN)
3081                seq_puts(seq, "Inter-|   Receive                            "
3082                              "                    |  Transmit\n"
3083                              " face |bytes    packets errs drop fifo frame "
3084                              "compressed multicast|bytes    packets errs "
3085                              "drop fifo colls carrier compressed\n");
3086        else
3087                dev_seq_printf_stats(seq, v);
3088        return 0;
3089}
3090
3091static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3092{
3093        struct netif_rx_stats *rc = NULL;
3094
3095        while (*pos < nr_cpu_ids)
3096                if (cpu_online(*pos)) {
3097                        rc = &per_cpu(netdev_rx_stat, *pos);
3098                        break;
3099                } else
3100                        ++*pos;
3101        return rc;
3102}
3103
3104static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3105{
3106        return softnet_get_online(pos);
3107}
3108
3109static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3110{
3111        ++*pos;
3112        return softnet_get_online(pos);
3113}
3114
/* seq_file ->stop: the softnet iterator holds nothing that needs releasing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
3118
3119static int softnet_seq_show(struct seq_file *seq, void *v)
3120{
3121        struct netif_rx_stats *s = v;
3122
3123        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3124                   s->total, s->dropped, s->time_squeeze, 0,
3125                   0, 0, 0, 0, /* was fastroute */
3126                   s->cpu_collision);
3127        return 0;
3128}
3129
3130static const struct seq_operations dev_seq_ops = {
3131        .start = dev_seq_start,
3132        .next  = dev_seq_next,
3133        .stop  = dev_seq_stop,
3134        .show  = dev_seq_show,
3135};
3136
3137static int dev_seq_open(struct inode *inode, struct file *file)
3138{
3139        return seq_open_net(inode, file, &dev_seq_ops,
3140                            sizeof(struct seq_net_private));
3141}
3142
3143static const struct file_operations dev_seq_fops = {
3144        .owner   = THIS_MODULE,
3145        .open    = dev_seq_open,
3146        .read    = seq_read,
3147        .llseek  = seq_lseek,
3148        .release = seq_release_net,
3149};
3150
3151static const struct seq_operations softnet_seq_ops = {
3152        .start = softnet_seq_start,
3153        .next  = softnet_seq_next,
3154        .stop  = softnet_seq_stop,
3155        .show  = softnet_seq_show,
3156};
3157
3158static int softnet_seq_open(struct inode *inode, struct file *file)
3159{
3160        return seq_open(file, &softnet_seq_ops);
3161}
3162
3163static const struct file_operations softnet_seq_fops = {
3164        .owner   = THIS_MODULE,
3165        .open    = softnet_seq_open,
3166        .read    = seq_read,
3167        .llseek  = seq_lseek,
3168        .release = seq_release,
3169};
3170
3171static void *ptype_get_idx(loff_t pos)
3172{
3173        struct packet_type *pt = NULL;
3174        loff_t i = 0;
3175        int t;
3176
3177        list_for_each_entry_rcu(pt, &ptype_all, list) {
3178                if (i == pos)
3179                        return pt;
3180                ++i;
3181        }
3182
3183        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3184                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3185                        if (i == pos)
3186                                return pt;
3187                        ++i;
3188                }
3189        }
3190        return NULL;
3191}
3192
3193static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3194        __acquires(RCU)
3195{
3196        rcu_read_lock();
3197        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3198}
3199
3200static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3201{
3202        struct packet_type *pt;
3203        struct list_head *nxt;
3204        int hash;
3205
3206        ++*pos;
3207        if (v == SEQ_START_TOKEN)
3208                return ptype_get_idx(0);
3209
3210        pt = v;
3211        nxt = pt->list.next;
3212        if (pt->type == htons(ETH_P_ALL)) {
3213                if (nxt != &ptype_all)
3214                        goto found;
3215                hash = 0;
3216                nxt = ptype_base[0].next;
3217        } else
3218                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3219
3220        while (nxt == &ptype_base[hash]) {
3221                if (++hash >= PTYPE_HASH_SIZE)
3222                        return NULL;
3223                nxt = ptype_base[hash].next;
3224        }
3225found:
3226        return list_entry(nxt, struct packet_type, list);
3227}
3228
3229static void ptype_seq_stop(struct seq_file *seq, void *v)
3230        __releases(RCU)
3231{
3232        rcu_read_unlock();
3233}
3234
3235static int ptype_seq_show(struct seq_file *seq, void *v)
3236{
3237        struct packet_type *pt = v;
3238
3239        if (v == SEQ_START_TOKEN)
3240                seq_puts(seq, "Type Device      Function\n");
3241        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3242                if (pt->type == htons(ETH_P_ALL))
3243                        seq_puts(seq, "ALL ");
3244                else
3245                        seq_printf(seq, "%04x", ntohs(pt->type));
3246
3247                seq_printf(seq, " %-8s %pF\n",
3248                           pt->dev ? pt->dev->name : "", pt->func);
3249        }
3250
3251        return 0;
3252}
3253
3254static const struct seq_operations ptype_seq_ops = {
3255        .start = ptype_seq_start,
3256        .next  = ptype_seq_next,
3257        .stop  = ptype_seq_stop,
3258        .show  = ptype_seq_show,
3259};
3260
3261static int ptype_seq_open(struct inode *inode, struct file *file)
3262{
3263        return seq_open_net(inode, file, &ptype_seq_ops,
3264                        sizeof(struct seq_net_private));
3265}
3266
3267static const struct file_operations ptype_seq_fops = {
3268        .owner   = THIS_MODULE,
3269        .open    = ptype_seq_open,
3270        .read    = seq_read,
3271        .llseek  = seq_lseek,
3272        .release = seq_release_net,
3273};
3274
3275
3276static int __net_init dev_proc_net_init(struct net *net)
3277{
3278        int rc = -ENOMEM;
3279
3280        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3281                goto out;
3282        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3283                goto out_dev;
3284        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3285                goto out_softnet;
3286
3287        if (wext_proc_init(net))
3288                goto out_ptype;
3289        rc = 0;
3290out:
3291        return rc;
3292out_ptype:
3293        proc_net_remove(net, "ptype");
3294out_softnet:
3295        proc_net_remove(net, "softnet_stat");
3296out_dev:
3297        proc_net_remove(net, "dev");
3298        goto out;
3299}
3300
3301static void __net_exit dev_proc_net_exit(struct net *net)
3302{
3303        wext_proc_exit(net);
3304
3305        proc_net_remove(net, "ptype");
3306        proc_net_remove(net, "softnet_stat");
3307        proc_net_remove(net, "dev");
3308}
3309
3310static struct pernet_operations __net_initdata dev_proc_ops = {
3311        .init = dev_proc_net_init,
3312        .exit = dev_proc_net_exit,
3313};
3314
3315static int __init dev_proc_init(void)
3316{
3317        return register_pernet_subsys(&dev_proc_ops);
3318}
3319#else
3320#define dev_proc_init() 0
3321#endif  /* CONFIG_PROC_FS */
3322
3323
3324/**
3325 *      netdev_set_master       -       set up master/slave pair
3326 *      @slave: slave device
3327 *      @master: new master device
3328 *
3329 *      Changes the master device of the slave. Pass %NULL to break the
3330 *      bonding. The caller must hold the RTNL semaphore. On a failure
3331 *      a negative errno code is returned. On success the reference counts
3332 *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3333 *      function returns zero.
3334 */
3335int netdev_set_master(struct net_device *slave, struct net_device *master)
3336{
3337        struct net_device *old = slave->master;
3338
3339        ASSERT_RTNL();
3340
3341        if (master) {
3342                if (old)
3343                        return -EBUSY;
3344                dev_hold(master);
3345        }
3346
3347        slave->master = master;
3348
3349        synchronize_net();
3350
3351        if (old)
3352                dev_put(old);
3353
3354        if (master)
3355                slave->flags |= IFF_SLAVE;
3356        else
3357                slave->flags &= ~IFF_SLAVE;
3358
3359        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3360        return 0;
3361}
3362EXPORT_SYMBOL(netdev_set_master);
3363
3364static void dev_change_rx_flags(struct net_device *dev, int flags)
3365{
3366        const struct net_device_ops *ops = dev->netdev_ops;
3367
3368        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3369                ops->ndo_change_rx_flags(dev, flags);
3370}
3371
3372static int __dev_set_promiscuity(struct net_device *dev, int inc)
3373{
3374        unsigned short old_flags = dev->flags;
3375        uid_t uid;
3376        gid_t gid;
3377
3378        ASSERT_RTNL();
3379
3380        dev->flags |= IFF_PROMISC;
3381        dev->promiscuity += inc;
3382        if (dev->promiscuity == 0) {
3383                /*
3384                 * Avoid overflow.
3385                 * If inc causes overflow, untouch promisc and return error.
3386                 */
3387                if (inc < 0)
3388                        dev->flags &= ~IFF_PROMISC;
3389                else {
3390                        dev->promiscuity -= inc;
3391                        printk(KERN_WARNING "%s: promiscuity touches roof, "
3392                                "set promiscuity failed, promiscuity feature "
3393                                "of device might be broken.\n", dev->name);
3394                        return -EOVERFLOW;
3395                }
3396        }
3397        if (dev->flags != old_flags) {
3398                printk(KERN_INFO "device %s %s promiscuous mode\n",
3399                       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3400                                                               "left");
3401                if (audit_enabled) {
3402                        current_uid_gid(&uid, &gid);
3403                        audit_log(current->audit_context, GFP_ATOMIC,
3404                                AUDIT_ANOM_PROMISCUOUS,
3405                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3406                                dev->name, (dev->flags & IFF_PROMISC),
3407                                (old_flags & IFF_PROMISC),
3408                                audit_get_loginuid(current),
3409                                uid, gid,
3410                                audit_get_sessionid(current));
3411                }
3412
3413                dev_change_rx_flags(dev, IFF_PROMISC);
3414        }
3415        return 0;
3416}
3417
3418/**
3419 *      dev_set_promiscuity     - update promiscuity count on a device
3420 *      @dev: device
3421 *      @inc: modifier
3422 *
3423 *      Add or remove promiscuity from a device. While the count in the device
3424 *      remains above zero the interface remains promiscuous. Once it hits zero
3425 *      the device reverts back to normal filtering operation. A negative inc
3426 *      value is used to drop promiscuity on the device.
3427 *      Return 0 if successful or a negative errno code on error.
3428 */
3429int dev_set_promiscuity(struct net_device *dev, int inc)
3430{
3431        unsigned short old_flags = dev->flags;
3432        int err;
3433
3434        err = __dev_set_promiscuity(dev, inc);
3435        if (err < 0)
3436                return err;
3437        if (dev->flags != old_flags)
3438                dev_set_rx_mode(dev);
3439        return err;
3440}
3441EXPORT_SYMBOL(dev_set_promiscuity);
3442
3443/**
3444 *      dev_set_allmulti        - update allmulti count on a device
3445 *      @dev: device
3446 *      @inc: modifier
3447 *
3448 *      Add or remove reception of all multicast frames to a device. While the
3449 *      count in the device remains above zero the interface remains listening
3450 *      to all interfaces. Once it hits zero the device reverts back to normal
3451 *      filtering operation. A negative @inc value is used to drop the counter
3452 *      when releasing a resource needing all multicasts.
3453 *      Return 0 if successful or a negative errno code on error.
3454 */
3455
3456int dev_set_allmulti(struct net_device *dev, int inc)
3457{
3458        unsigned short old_flags = dev->flags;
3459
3460        ASSERT_RTNL();
3461
3462        dev->flags |= IFF_ALLMULTI;
3463        dev->allmulti += inc;
3464        if (dev->allmulti == 0) {
3465                /*
3466                 * Avoid overflow.
3467                 * If inc causes overflow, untouch allmulti and return error.
3468                 */
3469                if (inc < 0)
3470                        dev->flags &= ~IFF_ALLMULTI;
3471                else {
3472                        dev->allmulti -= inc;
3473                        printk(KERN_WARNING "%s: allmulti touches roof, "
3474                                "set allmulti failed, allmulti feature of "
3475                                "device might be broken.\n", dev->name);
3476                        return -EOVERFLOW;
3477                }
3478        }
3479        if (dev->flags ^ old_flags) {
3480                dev_change_rx_flags(dev, IFF_ALLMULTI);
3481                dev_set_rx_mode(dev);
3482        }
3483        return 0;
3484}
3485EXPORT_SYMBOL(dev_set_allmulti);
3486
3487/*
3488 *      Upload unicast and multicast address lists to device and
3489 *      configure RX filtering. When the device doesn't support unicast
3490 *      filtering it is put in promiscuous mode while unicast addresses
3491 *      are present.
3492 */
3493void __dev_set_rx_mode(struct net_device *dev)
3494{
3495        const struct net_device_ops *ops = dev->netdev_ops;
3496
3497        /* dev_open will call this function so the list will stay sane. */
3498        if (!(dev->flags&IFF_UP))
3499                return;
3500
3501        if (!netif_device_present(dev))
3502                return;
3503
3504        if (ops->ndo_set_rx_mode)
3505                ops->ndo_set_rx_mode(dev);
3506        else {
3507                /* Unicast addresses changes may only happen under the rtnl,
3508                 * therefore calling __dev_set_promiscuity here is safe.
3509                 */
3510                if (dev->uc.count > 0 && !dev->uc_promisc) {
3511                        __dev_set_promiscuity(dev, 1);
3512                        dev->uc_promisc = 1;
3513                } else if (dev->uc.count == 0 && dev->uc_promisc) {
3514                        __dev_set_promiscuity(dev, -1);
3515                        dev->uc_promisc = 0;
3516                }
3517
3518                if (ops->ndo_set_multicast_list)
3519                        ops->ndo_set_multicast_list(dev);
3520        }
3521}
3522
/*
 * Locked wrapper around __dev_set_rx_mode(): the device address lock is
 * held (bottom halves disabled) for the duration of the update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3529
3530/* hw addresses list handling functions */
3531
3532static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3533                         int addr_len, unsigned char addr_type)
3534{
3535        struct netdev_hw_addr *ha;
3536        int alloc_size;
3537
3538        if (addr_len > MAX_ADDR_LEN)
3539                return -EINVAL;
3540
3541        list_for_each_entry(ha, &list->list, list) {
3542                if (!memcmp(ha->addr, addr, addr_len) &&
3543                    ha->type == addr_type) {
3544                        ha->refcount++;
3545                        return 0;
3546                }
3547        }
3548
3549
3550        alloc_size = sizeof(*ha);
3551        if (alloc_size < L1_CACHE_BYTES)
3552                alloc_size = L1_CACHE_BYTES;
3553        ha = kmalloc(alloc_size, GFP_ATOMIC);
3554        if (!ha)
3555                return -ENOMEM;
3556        memcpy(ha->addr, addr, addr_len);
3557        ha->type = addr_type;
3558        ha->refcount = 1;
3559        ha->synced = false;
3560        list_add_tail_rcu(&ha->list, &list->list);
3561        list->count++;
3562        return 0;
3563}
3564
3565static void ha_rcu_free(struct rcu_head *head)
3566{
3567        struct netdev_hw_addr *ha;
3568
3569        ha = container_of(head, struct netdev_hw_addr, rcu_head);
3570        kfree(ha);
3571}
3572
3573static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3574                         int addr_len, unsigned char addr_type)
3575{
3576        struct netdev_hw_addr *ha;
3577
3578        list_for_each_entry(ha, &list->list, list) {
3579                if (!memcmp(ha->addr, addr, addr_len) &&
3580                    (ha->type == addr_type || !addr_type)) {
3581                        if (--ha->refcount)
3582                                return 0;
3583                        list_del_rcu(&ha->list);
3584                        call_rcu(&ha->rcu_head, ha_rcu_free);
3585                        list->count--;
3586                        return 0;
3587                }
3588        }
3589        return -ENOENT;
3590}
3591
3592static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3593                                  struct netdev_hw_addr_list *from_list,
3594                                  int addr_len,
3595                                  unsigned char addr_type)
3596{
3597        int err;
3598        struct netdev_hw_addr *ha, *ha2;
3599        unsigned char type;
3600
3601        list_for_each_entry(ha, &from_list->list, list) {
3602                type = addr_type ? addr_type : ha->type;
3603                err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3604                if (err)
3605                        goto unroll;
3606        }
3607        return 0;
3608
3609unroll:
3610        list_for_each_entry(ha2, &from_list->list, list) {
3611                if (ha2 == ha)
3612                        break;
3613                type = addr_type ? addr_type : ha2->type;
3614                __hw_addr_del(to_list, ha2->addr, addr_len, type);
3615        }
3616        return err;
3617}
3618
3619static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3620                                   struct netdev_hw_addr_list *from_list,
3621                                   int addr_len,
3622                                   unsigned char addr_type)
3623{
3624        struct netdev_hw_addr *ha;
3625        unsigned char type;
3626
3627        list_for_each_entry(ha, &from_list->list, list) {
3628                type = addr_type ? addr_type : ha->type;
3629                __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3630        }
3631}
3632
3633static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3634                          struct netdev_hw_addr_list *from_list,
3635                          int addr_len)
3636{
3637        int err = 0;
3638        struct netdev_hw_addr *ha, *tmp;
3639
3640        list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3641                if (!ha->synced) {
3642                        err = __hw_addr_add(to_list, ha->addr,
3643                                            addr_len, ha->type);
3644                        if (err)
3645                                break;
3646                        ha->synced = true;
3647                        ha->refcount++;
3648                } else if (ha->refcount == 1) {
3649                        __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3650                        __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3651                }
3652        }
3653        return err;
3654}
3655
3656static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3657                             struct netdev_hw_addr_list *from_list,
3658                             int addr_len)
3659{
3660        struct netdev_hw_addr *ha, *tmp;
3661
3662        list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3663                if (ha->synced) {
3664                        __hw_addr_del(to_list, ha->addr,
3665                                      addr_len, ha->type);
3666                        ha->synced = false;
3667                        __hw_addr_del(from_list, ha->addr,
3668                                      addr_len, ha->type);
3669                }
3670        }
3671}
3672
3673static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3674{
3675        struct netdev_hw_addr *ha, *tmp;
3676
3677        list_for_each_entry_safe(ha, tmp, &list->list, list) {
3678                list_del_rcu(&ha->list);
3679                call_rcu(&ha->rcu_head, ha_rcu_free);
3680        }
3681        list->count = 0;
3682}
3683
3684static void __hw_addr_init(struct netdev_hw_addr_list *list)
3685{
3686        INIT_LIST_HEAD(&list->list);
3687        list->count = 0;
3688}
3689
3690/* Device addresses handling functions */
3691
3692static void dev_addr_flush(struct net_device *dev)
3693{
3694        /* rtnl_mutex must be held here */
3695
3696        __hw_addr_flush(&dev->dev_addrs);
3697        dev->dev_addr = NULL;
3698}
3699
3700static int dev_addr_init(struct net_device *dev)
3701{
3702        unsigned char addr[MAX_ADDR_LEN];
3703        struct netdev_hw_addr *ha;
3704        int err;
3705
3706        /* rtnl_mutex must be held here */
3707
3708        __hw_addr_init(&dev->dev_addrs);
3709        memset(addr, 0, sizeof(addr));
3710        err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3711                            NETDEV_HW_ADDR_T_LAN);
3712        if (!err) {
3713                /*
3714                 * Get the first (previously created) address from the list
3715                 * and set dev_addr pointer to this location.
3716                 */
3717                ha = list_first_entry(&dev->dev_addrs.list,
3718                                      struct netdev_hw_addr, list);
3719                dev->dev_addr = ha->addr;
3720        }
3721        return err;
3722}
3723
3724/**
3725 *      dev_addr_add    - Add a device address
3726 *      @dev: device
3727 *      @addr: address to add
3728 *      @addr_type: address type
3729 *
3730 *      Add a device address to the device or increase the reference count if
3731 *      it already exists.
3732 *
3733 *      The caller must hold the rtnl_mutex.
3734 */
3735int dev_addr_add(struct net_device *dev, unsigned char *addr,
3736                 unsigned char addr_type)
3737{
3738        int err;
3739
3740        ASSERT_RTNL();
3741
3742        err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3743        if (!err)
3744                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3745        return err;
3746}
3747EXPORT_SYMBOL(dev_addr_add);
3748
3749/**
3750 *      dev_addr_del    - Release a device address.
3751 *      @dev: device
3752 *      @addr: address to delete
3753 *      @addr_type: address type
3754 *
3755 *      Release reference to a device address and remove it from the device
3756 *      if the reference count drops to zero.
3757 *
3758 *      The caller must hold the rtnl_mutex.
3759 */
3760int dev_addr_del(struct net_device *dev, unsigned char *addr,
3761                 unsigned char addr_type)
3762{
3763        int err;
3764        struct netdev_hw_addr *ha;
3765
3766        ASSERT_RTNL();
3767
3768        /*
3769         * We can not remove the first address from the list because
3770         * dev->dev_addr points to that.
3771         */
3772        ha = list_first_entry(&dev->dev_addrs.list,
3773                              struct netdev_hw_addr, list);
3774        if (ha->addr == dev->dev_addr && ha->refcount == 1)
3775                return -ENOENT;
3776
3777        err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3778                            addr_type);
3779        if (!err)
3780                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3781        return err;
3782}
3783EXPORT_SYMBOL(dev_addr_del);
3784
3785/**
3786 *      dev_addr_add_multiple   - Add device addresses from another device
3787 *      @to_dev: device to which addresses will be added
3788 *      @from_dev: device from which addresses will be added
3789 *      @addr_type: address type - 0 means type will be used from from_dev
3790 *
3791 *      Add device addresses of the one device to another.
3792 **
3793 *      The caller must hold the rtnl_mutex.
3794 */
3795int dev_addr_add_multiple(struct net_device *to_dev,
3796                          struct net_device *from_dev,
3797                          unsigned char addr_type)
3798{
3799        int err;
3800
3801        ASSERT_RTNL();
3802
3803        if (from_dev->addr_len != to_dev->addr_len)
3804                return -EINVAL;
3805        err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3806                                     to_dev->addr_len, addr_type);
3807        if (!err)
3808                call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3809        return err;
3810}
3811EXPORT_SYMBOL(dev_addr_add_multiple);
3812
3813/**
3814 *      dev_addr_del_multiple   - Delete device addresses by another device
3815 *      @to_dev: device where the addresses will be deleted
3816 *      @from_dev: device by which addresses the addresses will be deleted
3817 *      @addr_type: address type - 0 means type will used from from_dev
3818 *
3819 *      Deletes addresses in to device by the list of addresses in from device.
3820 *
3821 *      The caller must hold the rtnl_mutex.
3822 */
3823int dev_addr_del_multiple(struct net_device *to_dev,
3824                          struct net_device *from_dev,
3825                          unsigned char addr_type)
3826{
3827        ASSERT_RTNL();
3828
3829        if (from_dev->addr_len != to_dev->addr_len)
3830                return -EINVAL;
3831        __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3832                               to_dev->addr_len, addr_type);
3833        call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3834        return 0;
3835}
3836EXPORT_SYMBOL(dev_addr_del_multiple);
3837
3838/* multicast addresses handling functions */
3839
3840int __dev_addr_delete(struct dev_addr_list **list, int *count,
3841                      void *addr, int alen, int glbl)
3842{
3843        struct dev_addr_list *da;
3844
3845        for (; (da = *list) != NULL; list = &da->next) {
3846                if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3847                    alen == da->da_addrlen) {
3848                        if (glbl) {
3849                                int old_glbl = da->da_gusers;
3850                                da->da_gusers = 0;
3851                                if (old_glbl == 0)
3852                                        break;
3853                        }
3854                        if (--da->da_users)
3855                                return 0;
3856
3857                        *list = da->next;
3858                        kfree(da);
3859                        (*count)--;
3860                        return 0;
3861                }
3862        }
3863        return -ENOENT;
3864}
3865
3866int __dev_addr_add(struct dev_addr_list **list, int *count,
3867                   void *addr, int alen, int glbl)
3868{
3869        struct dev_addr_list *da;
3870
3871        for (da = *list; da != NULL; da = da->next) {
3872                if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3873                    da->da_addrlen == alen) {
3874                        if (glbl) {
3875                                int old_glbl = da->da_gusers;
3876                                da->da_gusers = 1;
3877                                if (old_glbl)
3878                                        return 0;
3879                        }
3880                        da->da_users++;
3881                        return 0;
3882                }
3883        }
3884
3885        da = kzalloc(sizeof(*da), GFP_ATOMIC);
3886        if (da == NULL)
3887                return -ENOMEM;
3888        memcpy(da->da_addr, addr, alen);
3889        da->da_addrlen = alen;
3890        da->da_users = 1;
3891        da->da_gusers = glbl ? 1 : 0;
3892        da->next = *list;
3893        *list = da;
3894        (*count)++;
3895        return 0;
3896}
3897
3898/**
3899 *      dev_unicast_delete      - Release secondary unicast address.
3900 *      @dev: device
3901 *      @addr: address to delete
3902 *
3903 *      Release reference to a secondary unicast address and remove it
3904 *      from the device if the reference count drops to zero.
3905 *
3906 *      The caller must hold the rtnl_mutex.
3907 */
3908int dev_unicast_delete(struct net_device *dev, void *addr)
3909{
3910        int err;
3911
3912        ASSERT_RTNL();
3913
3914        netif_addr_lock_bh(dev);
3915        err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3916                            NETDEV_HW_ADDR_T_UNICAST);
3917        if (!err)
3918                __dev_set_rx_mode(dev);
3919        netif_addr_unlock_bh(dev);
3920        return err;
3921}
3922EXPORT_SYMBOL(dev_unicast_delete);
3923
3924/**
3925 *      dev_unicast_add         - add a secondary unicast address
3926 *      @dev: device
3927 *      @addr: address to add
3928 *
3929 *      Add a secondary unicast address to the device or increase
3930 *      the reference count if it already exists.
3931 *
3932 *      The caller must hold the rtnl_mutex.
3933 */
3934int dev_unicast_add(struct net_device *dev, void *addr)
3935{
3936        int err;
3937
3938        ASSERT_RTNL();
3939
3940        netif_addr_lock_bh(dev);
3941        err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
3942                            NETDEV_HW_ADDR_T_UNICAST);
3943        if (!err)
3944                __dev_set_rx_mode(dev);
3945        netif_addr_unlock_bh(dev);
3946        return err;
3947}
3948EXPORT_SYMBOL(dev_unicast_add);
3949
3950int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3951                    struct dev_addr_list **from, int *from_count)
3952{
3953        struct dev_addr_list *da, *next;
3954        int err = 0;
3955
3956        da = *from;
3957        while (da != NULL) {
3958                next = da->next;
3959                if (!da->da_synced) {
3960                        err = __dev_addr_add(to, to_count,
3961                                             da->da_addr, da->da_addrlen, 0);
3962                        if (err < 0)
3963                                break;
3964                        da->da_synced = 1;
3965                        da->da_users++;
3966                } else if (da->da_users == 1) {
3967                        __dev_addr_delete(to, to_count,
3968                                          da->da_addr, da->da_addrlen, 0);
3969                        __dev_addr_delete(from, from_count,
3970                                          da->da_addr, da->da_addrlen, 0);
3971                }
3972                da = next;
3973        }
3974        return err;
3975}
3976EXPORT_SYMBOL_GPL(__dev_addr_sync);
3977
3978void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3979                       struct dev_addr_list **from, int *from_count)
3980{
3981        struct dev_addr_list *da, *next;
3982
3983        da = *from;
3984        while (da != NULL) {
3985                next = da->next;
3986                if (da->da_synced) {
3987                        __dev_addr_delete(to, to_count,
3988                                          da->da_addr, da->da_addrlen, 0);
3989                        da->da_synced = 0;
3990                        __dev_addr_delete(from, from_count,
3991                                          da->da_addr, da->da_addrlen, 0);
3992                }
3993                da = next;
3994        }
3995}
3996EXPORT_SYMBOL_GPL(__dev_addr_unsync);
3997
3998/**
3999 *      dev_unicast_sync - Synchronize device's unicast list to another device
4000 *      @to: destination device

4001 *      @from: source device
4002 *
4003 *      Add newly added addresses to the destination device and release
4004 *      addresses that have no users left. The source device must be
4005 *      locked by netif_tx_lock_bh.
4006 *
4007 *      This function is intended to be called from the dev->set_rx_mode
4008 *      function of layered software devices.
4009 */
4010int dev_unicast_sync(struct net_device *to, struct net_device *from)
4011{
4012        int err = 0;
4013
4014        if (to->addr_len != from->addr_len)
4015                return -EINVAL;
4016
4017        netif_addr_lock_bh(to);
4018        err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4019        if (!err)
4020                __dev_set_rx_mode(to);
4021        netif_addr_unlock_bh(to);
4022        return err;
4023}
4024EXPORT_SYMBOL(dev_unicast_sync);
4025
4026/**
4027 *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4028 *      @to: destination device
4029 *      @from: source device
4030 *
4031 *      Remove all addresses that were added to the destination device by
4032 *      dev_unicast_sync(). This function is intended to be called from the
4033 *      dev->stop function of layered software devices.
4034 */
4035void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4036{
4037        if (to->addr_len != from->addr_len)
4038                return;
4039
4040        netif_addr_lock_bh(from);
4041        netif_addr_lock(to);
4042        __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4043        __dev_set_rx_mode(to);
4044        netif_addr_unlock(to);
4045        netif_addr_unlock_bh(from);
4046}
4047EXPORT_SYMBOL(dev_unicast_unsync);
4048
4049static void dev_unicast_flush(struct net_device *dev)
4050{
4051        netif_addr_lock_bh(dev);
4052        __hw_addr_flush(&dev->uc);
4053        netif_addr_unlock_bh(dev);
4054}
4055
4056static void dev_unicast_init(struct net_device *dev)
4057{
4058        __hw_addr_init(&dev->uc);
4059}
4060
4061
4062static void __dev_addr_discard(struct dev_addr_list **list)
4063{
4064        struct dev_addr_list *tmp;
4065
4066        while (*list != NULL) {
4067                tmp = *list;
4068                *list = tmp->next;
4069                if (tmp->da_users > tmp->da_gusers)
4070                        printk("__dev_addr_discard: address leakage! "
4071                               "da_users=%d\n", tmp->da_users);
4072                kfree(tmp);
4073        }
4074}
4075
4076static void dev_addr_discard(struct net_device *dev)
4077{
4078        netif_addr_lock_bh(dev);
4079
4080        __dev_addr_discard(&dev->mc_list);
4081        dev->mc_count = 0;
4082
4083        netif_addr_unlock_bh(dev);
4084}
4085
4086/**
4087 *      dev_get_flags - get flags reported to userspace
4088 *      @dev: device
4089 *
4090 *      Get the combination of flag bits exported through APIs to userspace.
4091 */
4092unsigned dev_get_flags(const struct net_device *dev)
4093{
4094        unsigned flags;
4095
4096        flags = (dev->flags & ~(IFF_PROMISC |
4097                                IFF_ALLMULTI |
4098                                IFF_RUNNING |
4099                                IFF_LOWER_UP |
4100                                IFF_DORMANT)) |
4101                (dev->gflags & (IFF_PROMISC |
4102                                IFF_ALLMULTI));
4103
4104        if (netif_running(dev)) {
4105                if (netif_oper_up(dev))
4106                        flags |= IFF_RUNNING;
4107                if (netif_carrier_ok(dev))
4108                        flags |= IFF_LOWER_UP;
4109                if (netif_dormant(dev))
4110                        flags |= IFF_DORMANT;
4111        }
4112
4113        return flags;
4114}
4115EXPORT_SYMBOL(dev_get_flags);
4116
4117/**
4118 *      dev_change_flags - change device settings
4119 *      @dev: device
4120 *      @flags: device state flags
4121 *
4122 *      Change settings on device based state flags. The flags are
4123 *      in the userspace exported format.
4124 */
4125int dev_change_flags(struct net_device *dev, unsigned flags)
4126{
4127        int ret, changes;
4128        int old_flags = dev->flags;
4129
4130        ASSERT_RTNL();
4131
4132        /*
4133         *      Set the flags on our device.
4134         */
4135
4136        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4137                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4138                               IFF_AUTOMEDIA)) |
4139                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4140                                    IFF_ALLMULTI));
4141
4142        /*
4143         *      Load in the correct multicast list now the flags have changed.
4144         */
4145
4146        if ((old_flags ^ flags) & IFF_MULTICAST)
4147                dev_change_rx_flags(dev, IFF_MULTICAST);
4148
4149        dev_set_rx_mode(dev);
4150
4151        /*
4152         *      Have we downed the interface. We handle IFF_UP ourselves
4153         *      according to user attempts to set it, rather than blindly
4154         *      setting it.
4155         */
4156
4157        ret = 0;
4158        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4159                ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4160
4161                if (!ret)
4162                        dev_set_rx_mode(dev);
4163        }
4164
4165        if (dev->flags & IFF_UP &&
4166            ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4167                                          IFF_VOLATILE)))
4168                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4169
4170        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4171                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4172
4173                dev->gflags ^= IFF_PROMISC;
4174                dev_set_promiscuity(dev, inc);
4175        }
4176
4177        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4178           is important. Some (broken) drivers set IFF_PROMISC, when
4179           IFF_ALLMULTI is requested not asking us and not reporting.
4180         */
4181        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4182                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4183
4184                dev->gflags ^= IFF_ALLMULTI;
4185                dev_set_allmulti(dev, inc);
4186        }
4187
4188        /* Exclude state transition flags, already notified */
4189        changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4190        if (changes)
4191                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4192
4193        return ret;
4194}
4195EXPORT_SYMBOL(dev_change_flags);
4196
4197/**
4198 *      dev_set_mtu - Change maximum transfer unit
4199 *      @dev: device
4200 *      @new_mtu: new transfer unit
4201 *
4202 *      Change the maximum transfer size of the network device.
4203 */
4204int dev_set_mtu(struct net_device *dev, int new_mtu)
4205{
4206        const struct net_device_ops *ops = dev->netdev_ops;
4207        int err;
4208
4209        if (new_mtu == dev->mtu)
4210                return 0;
4211
4212        /*      MTU must be positive.    */
4213        if (new_mtu < 0)
4214                return -EINVAL;
4215
4216        if (!netif_device_present(dev))
4217                return -ENODEV;
4218
4219        err = 0;
4220        if (ops->ndo_change_mtu)
4221                err = ops->ndo_change_mtu(dev, new_mtu);
4222        else
4223                dev->mtu = new_mtu;
4224
4225        if (!err && dev->flags & IFF_UP)
4226                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4227        return err;
4228}
4229EXPORT_SYMBOL(dev_set_mtu);
4230
4231/**
4232 *      dev_set_mac_address - Change Media Access Control Address
4233 *      @dev: device
4234 *      @sa: new address
4235 *
4236 *      Change the hardware (MAC) address of the device
4237 */
4238int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4239{
4240        const struct net_device_ops *ops = dev->netdev_ops;
4241        int err;
4242
4243        if (!ops->ndo_set_mac_address)
4244                return -EOPNOTSUPP;
4245        if (sa->sa_family != dev->type)
4246                return -EINVAL;
4247        if (!netif_device_present(dev))
4248                return -ENODEV;
4249        err = ops->ndo_set_mac_address(dev, sa);
4250        if (!err)
4251                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4252        return err;
4253}
4254EXPORT_SYMBOL(dev_set_mac_address);
4255
4256/*
4257 *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4258 */
4259static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4260{
4261        int err;
4262        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4263
4264        if (!dev)
4265                return -ENODEV;
4266
4267        switch (cmd) {
4268        case SIOCGIFFLAGS:      /* Get interface flags */
4269                ifr->ifr_flags = (short) dev_get_flags(dev);
4270                return 0;
4271
4272        case SIOCGIFMETRIC:     /* Get the metric on the interface
4273                                   (currently unused) */
4274                ifr->ifr_metric = 0;
4275                return 0;
4276
4277        case SIOCGIFMTU:        /* Get the MTU of a device */
4278                ifr->ifr_mtu = dev->mtu;
4279                return 0;
4280
4281        case SIOCGIFHWADDR:
4282                if (!dev->addr_len)
4283                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4284                else
4285                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4286                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4287                ifr->ifr_hwaddr.sa_family = dev->type;
4288                return 0;
4289
4290        case SIOCGIFSLAVE:
4291                err = -EINVAL;
4292                break;
4293
4294        case SIOCGIFMAP:
4295                ifr->ifr_map.mem_start = dev->mem_start;
4296                ifr->ifr_map.mem_end   = dev->mem_end;
4297                ifr->ifr_map.base_addr = dev->base_addr;
4298                ifr->ifr_map.irq       = dev->irq;
4299                ifr->ifr_map.dma       = dev->dma;
4300                ifr->ifr_map.port      = dev->if_port;
4301                return 0;
4302
4303        case SIOCGIFINDEX:
4304                ifr->ifr_ifindex = dev->ifindex;
4305                return 0;
4306
4307        case SIOCGIFTXQLEN:
4308                ifr->ifr_qlen = dev->tx_queue_len;
4309                return 0;
4310
4311        default:
4312                /* dev_ioctl() should ensure this case
4313                 * is never reached
4314                 */
4315                WARN_ON(1);
4316                err = -EINVAL;
4317                break;
4318
4319        }
4320        return err;
4321}
4322
4323/*
4324 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4325 */
4326static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4327{
4328        int err;
4329        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4330        const struct net_device_ops *ops;
4331
4332        if (!dev)
4333                return -ENODEV;
4334
4335        ops = dev->netdev_ops;
4336
4337        switch (cmd) {
4338        case SIOCSIFFLAGS:      /* Set interface flags */
4339                return dev_change_flags(dev, ifr->ifr_flags);
4340
4341        case SIOCSIFMETRIC:     /* Set the metric on the interface
4342                                   (currently unused) */
4343                return -EOPNOTSUPP;
4344
4345        case SIOCSIFMTU:        /* Set the MTU of a device */
4346                return dev_set_mtu(dev, ifr->ifr_mtu);
4347
4348        case SIOCSIFHWADDR:
4349                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4350
4351        case SIOCSIFHWBROADCAST:
4352                if (ifr->ifr_hwaddr.sa_family != dev->type)
4353                        return -EINVAL;
4354                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4355                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4356                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4357                return 0;
4358
4359        case SIOCSIFMAP:
4360                if (ops->ndo_set_config) {
4361                        if (!netif_device_present(dev))
4362                                return -ENODEV;
4363                        return ops->ndo_set_config(dev, &ifr->ifr_map);
4364                }
4365                return -EOPNOTSUPP;
4366
4367        case SIOCADDMULTI:
4368                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4369                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4370                        return -EINVAL;
4371                if (!netif_device_present(dev))
4372                        return -ENODEV;
4373                return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4374                                  dev->addr_len, 1);
4375
4376        case SIOCDELMULTI:
4377                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4378                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4379                        return -EINVAL;
4380                if (!netif_device_present(dev))
4381                        return -ENODEV;
4382                return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4383                                     dev->addr_len, 1);
4384
4385        case SIOCSIFTXQLEN:
4386                if (ifr->ifr_qlen < 0)
4387                        return -EINVAL;
4388                dev->tx_queue_len = ifr->ifr_qlen;
4389                return 0;
4390
4391        case SIOCSIFNAME:
4392                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4393                return dev_change_name(dev, ifr->ifr_newname);
4394
4395        /*
4396         *      Unknown or private ioctl
4397         */
4398        default:
4399                if ((cmd >= SIOCDEVPRIVATE &&
4400                    cmd <= SIOCDEVPRIVATE + 15) ||
4401                    cmd == SIOCBONDENSLAVE ||
4402                    cmd == SIOCBONDRELEASE ||
4403                    cmd == SIOCBONDSETHWADDR ||
4404                    cmd == SIOCBONDSLAVEINFOQUERY ||
4405                    cmd == SIOCBONDINFOQUERY ||
4406                    cmd == SIOCBONDCHANGEACTIVE ||
4407                    cmd == SIOCGMIIPHY ||
4408                    cmd == SIOCGMIIREG ||
4409                    cmd == SIOCSMIIREG ||
4410                    cmd == SIOCBRADDIF ||
4411                    cmd == SIOCBRDELIF ||
4412                    cmd == SIOCSHWTSTAMP ||
4413                    cmd == SIOCWANDEV) {
4414                        err = -EOPNOTSUPP;
4415                        if (ops->ndo_do_ioctl) {
4416                                if (netif_device_present(dev))
4417                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
4418                                else
4419                                        err = -ENODEV;
4420                        }
4421                } else
4422                        err = -EINVAL;
4423
4424        }
4425        return err;
4426}
4427
4428/*
4429 *      This function handles all "interface"-type I/O control requests. The actual
4430 *      'doing' part of this is dev_ifsioc above.
4431 */
4432
4433/**
4434 *      dev_ioctl       -       network device ioctl
4435 *      @net: the applicable net namespace
4436 *      @cmd: command to issue
4437 *      @arg: pointer to a struct ifreq in user space
4438 *
4439 *      Issue ioctl functions to devices. This is normally called by the
4440 *      user space syscall interfaces but can sometimes be useful for
4441 *      other purposes. The return value is the return from the syscall if
4442 *      positive or a negative errno code on error.
4443 */
4444
4445int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4446{
4447        struct ifreq ifr;
4448        int ret;
4449        char *colon;
4450
4451        /* One special case: SIOCGIFCONF takes ifconf argument
4452           and requires shared lock, because it sleeps writing
4453           to user space.
4454         */
4455
4456        if (cmd == SIOCGIFCONF) {
4457                rtnl_lock();
4458                ret = dev_ifconf(net, (char __user *) arg);
4459                rtnl_unlock();
4460                return ret;
4461        }
4462        if (cmd == SIOCGIFNAME)
4463                return dev_ifname(net, (struct ifreq __user *)arg);
4464
4465        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4466                return -EFAULT;
4467
4468        ifr.ifr_name[IFNAMSIZ-1] = 0;
4469
4470        colon = strchr(ifr.ifr_name, ':');
4471        if (colon)
4472                *colon = 0;
4473
4474        /*
4475         *      See which interface the caller is talking about.
4476         */
4477
4478        switch (cmd) {
4479        /*
4480         *      These ioctl calls:
4481         *      - can be done by all.
4482         *      - atomic and do not require locking.
4483         *      - return a value
4484         */
4485        case SIOCGIFFLAGS:
4486        case SIOCGIFMETRIC:
4487        case SIOCGIFMTU:
4488        case SIOCGIFHWADDR:
4489        case SIOCGIFSLAVE:
4490        case SIOCGIFMAP:
4491        case SIOCGIFINDEX:
4492        case SIOCGIFTXQLEN:
4493                dev_load(net, ifr.ifr_name);
4494                read_lock(&dev_base_lock);
4495                ret = dev_ifsioc_locked(net, &ifr, cmd);
4496                read_unlock(&dev_base_lock);
4497                if (!ret) {
4498                        if (colon)
4499                                *colon = ':';
4500                        if (copy_to_user(arg, &ifr,
4501                                         sizeof(struct ifreq)))
4502                                ret = -EFAULT;
4503                }
4504                return ret;
4505
4506        case SIOCETHTOOL:
4507                dev_load(net, ifr.ifr_name);
4508                rtnl_lock();
4509                ret = dev_ethtool(net, &ifr);
4510                rtnl_unlock();
4511                if (!ret) {
4512                        if (colon)
4513                                *colon = ':';
4514                        if (copy_to_user(arg, &ifr,
4515                                         sizeof(struct ifreq)))
4516                                ret = -EFAULT;
4517                }
4518                return ret;
4519
4520        /*
4521         *      These ioctl calls:
4522         *      - require superuser power.
4523         *      - require strict serialization.
4524         *      - return a value
4525         */
4526        case SIOCGMIIPHY:
4527        case SIOCGMIIREG:
4528        case SIOCSIFNAME:
4529                if (!capable(CAP_NET_ADMIN))
4530                        return -EPERM;
4531                dev_load(net, ifr.ifr_name);
4532                rtnl_lock();
4533                ret = dev_ifsioc(net, &ifr, cmd);
4534                rtnl_unlock();
4535                if (!ret) {
4536                        if (colon)
4537                                *colon = ':';
4538                        if (copy_to_user(arg, &ifr,
4539                                         sizeof(struct ifreq)))
4540                                ret = -EFAULT;
4541                }
4542                return ret;
4543
4544        /*
4545         *      These ioctl calls:
4546         *      - require superuser power.
4547         *      - require strict serialization.
4548         *      - do not return a value
4549         */
4550        case SIOCSIFFLAGS:
4551        case SIOCSIFMETRIC:
4552        case SIOCSIFMTU:
4553        case SIOCSIFMAP:
4554        case SIOCSIFHWADDR:
4555        case SIOCSIFSLAVE:
4556        case SIOCADDMULTI:
4557        case SIOCDELMULTI:
4558        case SIOCSIFHWBROADCAST:
4559        case SIOCSIFTXQLEN:
4560        case SIOCSMIIREG:
4561        case SIOCBONDENSLAVE:
4562        case SIOCBONDRELEASE:
4563        case SIOCBONDSETHWADDR:
4564        case SIOCBONDCHANGEACTIVE:
4565        case SIOCBRADDIF:
4566        case SIOCBRDELIF:
4567        case SIOCSHWTSTAMP:
4568                if (!capable(CAP_NET_ADMIN))
4569                        return -EPERM;
4570                /* fall through */
4571        case SIOCBONDSLAVEINFOQUERY:
4572        case SIOCBONDINFOQUERY:
4573                dev_load(net, ifr.ifr_name);
4574                rtnl_lock();
4575                ret = dev_ifsioc(net, &ifr, cmd);
4576                rtnl_unlock();
4577                return ret;
4578
4579        case SIOCGIFMEM:
4580                /* Get the per device memory space. We can add this but
4581                 * currently do not support it */
4582        case SIOCSIFMEM:
4583                /* Set the per device memory buffer space.
4584                 * Not applicable in our case */
4585        case SIOCSIFLINK:
4586                return -EINVAL;
4587
4588        /*
4589         *      Unknown or private ioctl.
4590         */
4591        default:
4592                if (cmd == SIOCWANDEV ||
4593                    (cmd >= SIOCDEVPRIVATE &&
4594                     cmd <= SIOCDEVPRIVATE + 15)) {
4595                        dev_load(net, ifr.ifr_name);
4596                        rtnl_lock();
4597                        ret = dev_ifsioc(net, &ifr, cmd);
4598                        rtnl_unlock();
4599                        if (!ret && copy_to_user(arg, &ifr,
4600                                                 sizeof(struct ifreq)))
4601                                ret = -EFAULT;
4602                        return ret;
4603                }
4604                /* Take care of Wireless Extensions */
4605                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4606                        return wext_handle_ioctl(net, &ifr, cmd, arg);
4607                return -EINVAL;
4608        }
4609}
4610
4611
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Hands out the next interface index that no device in @net is
 *	using.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock so that the value stays unique.
 */
static int dev_new_index(struct net *net)
{
	/* Last index handed out; shared by all namespaces. */
	static int ifindex;

	do {
		/* Indices are strictly positive; restart at 1 on wrap. */
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4630
4631/* Delayed registration/unregisteration */
4632static LIST_HEAD(net_todo_list);
4633
4634static void net_set_todo(struct net_device *dev)
4635{
4636        list_add_tail(&dev->todo_list, &net_todo_list);
4637}
4638
4639static void rollback_registered(struct net_device *dev)
4640{
4641        BUG_ON(dev_boot_phase);
4642        ASSERT_RTNL();
4643
4644        /* Some devices call without registering for initialization unwind. */
4645        if (dev->reg_state == NETREG_UNINITIALIZED) {
4646                printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4647                                  "was registered\n", dev->name, dev);
4648
4649                WARN_ON(1);
4650                return;
4651        }
4652
4653        BUG_ON(dev->reg_state != NETREG_REGISTERED);
4654
4655        /* If device is running, close it first. */
4656        dev_close(dev);
4657
4658        /* And unlink it from device chain. */
4659        unlist_netdevice(dev);
4660
4661        dev->reg_state = NETREG_UNREGISTERING;
4662
4663        synchronize_net();
4664
4665        /* Shutdown queueing discipline. */
4666        dev_shutdown(dev);
4667
4668
4669        /* Notify protocols, that we are about to destroy
4670           this device. They should clean all the things.
4671        */
4672        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4673
4674        /*
4675         *      Flush the unicast and multicast chains
4676         */
4677        dev_unicast_flush(dev);
4678        dev_addr_discard(dev);
4679
4680        if (dev->netdev_ops->ndo_uninit)
4681                dev->netdev_ops->ndo_uninit(dev);
4682
4683        /* Notifier chain MUST detach us from master device. */
4684        WARN_ON(dev->master);
4685
4686        /* Remove entries from kobject tree */
4687        netdev_unregister_kobject(dev);
4688
4689        synchronize_net();
4690
4691        dev_put(dev);
4692}
4693
4694static void __netdev_init_queue_locks_one(struct net_device *dev,
4695                                          struct netdev_queue *dev_queue,
4696                                          void *_unused)
4697{
4698        spin_lock_init(&dev_queue->_xmit_lock);
4699        netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4700        dev_queue->xmit_lock_owner = -1;
4701}
4702
4703static void netdev_init_queue_locks(struct net_device *dev)
4704{
4705        netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4706        __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4707}
4708
4709unsigned long netdev_fix_features(unsigned long features, const char *name)
4710{
4711        /* Fix illegal SG+CSUM combinations. */
4712        if ((features & NETIF_F_SG) &&
4713            !(features & NETIF_F_ALL_CSUM)) {
4714                if (name)
4715                        printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4716                               "checksum feature.\n", name);
4717                features &= ~NETIF_F_SG;
4718        }
4719
4720        /* TSO requires that SG is present as well. */
4721        if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4722                if (name)
4723                        printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4724                               "SG feature.\n", name);
4725                features &= ~NETIF_F_TSO;
4726        }
4727
4728        if (features & NETIF_F_UFO) {
4729                if (!(features & NETIF_F_GEN_CSUM)) {
4730                        if (name)
4731                                printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4732                                       "since no NETIF_F_HW_CSUM feature.\n",
4733                                       name);
4734                        features &= ~NETIF_F_UFO;
4735                }
4736
4737                if (!(features & NETIF_F_SG)) {
4738                        if (name)
4739                                printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4740                                       "since no NETIF_F_SG feature.\n", name);
4741                        features &= ~NETIF_F_UFO;
4742                }
4743        }
4744
4745        return features;
4746}
4747EXPORT_SYMBOL(netdev_fix_features);
4748
4749/**
4750 *      register_netdevice      - register a network device
4751 *      @dev: device to register
4752 *
4753 *      Take a completed network device structure and add it to the kernel
4754 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4755 *      chain. 0 is returned on success. A negative errno code is returned
4756 *      on a failure to set up the device, or if the name is a duplicate.
4757 *
4758 *      Callers must hold the rtnl semaphore. You may want
4759 *      register_netdev() instead of this.
4760 *
4761 *      BUGS:
4762 *      The locking appears insufficient to guarantee two parallel registers
4763 *      will not get the same name.
4764 */
4765
4766int register_netdevice(struct net_device *dev)
4767{
4768        struct hlist_head *head;
4769        struct hlist_node *p;
4770        int ret;
4771        struct net *net = dev_net(dev);
4772
4773        BUG_ON(dev_boot_phase);
4774        ASSERT_RTNL();
4775
4776        might_sleep();
4777
4778        /* When net_device's are persistent, this will be fatal. */
4779        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4780        BUG_ON(!net);
4781
4782        spin_lock_init(&dev->addr_list_lock);
4783        netdev_set_addr_lockdep_class(dev);
4784        netdev_init_queue_locks(dev);
4785
4786        dev->iflink = -1;
4787
4788        /* Init, if this function is available */
4789        if (dev->netdev_ops->ndo_init) {
4790                ret = dev->netdev_ops->ndo_init(dev);
4791                if (ret) {
4792                        if (ret > 0)
4793                                ret = -EIO;
4794                        goto out;
4795                }
4796        }
4797
4798        if (!dev_valid_name(dev->name)) {
4799                ret = -EINVAL;
4800                goto err_uninit;
4801        }
4802
4803        dev->ifindex = dev_new_index(net);
4804        if (dev->iflink == -1)
4805                dev->iflink = dev->ifindex;
4806
4807        /* Check for existence of name */
4808        head = dev_name_hash(net, dev->name);
4809        hlist_for_each(p, head) {
4810                struct net_device *d
4811                        = hlist_entry(p, struct net_device, name_hlist);
4812                if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4813                        ret = -EEXIST;
4814                        goto err_uninit;
4815                }
4816        }
4817
4818        /* Fix illegal checksum combinations */
4819        if ((dev->features & NETIF_F_HW_CSUM) &&
4820            (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4821                printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4822                       dev->name);
4823                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4824        }
4825
4826        if ((dev->features & NETIF_F_NO_CSUM) &&
4827            (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4828                printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4829                       dev->name);
4830                dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4831        }
4832
4833        dev->features = netdev_fix_features(dev->features, dev->name);
4834
4835        /* Enable software GSO if SG is supported. */
4836        if (dev->features & NETIF_F_SG)
4837                dev->features |= NETIF_F_GSO;
4838
4839        netdev_initialize_kobject(dev);
4840        ret = netdev_register_kobject(dev);
4841        if (ret)
4842                goto err_uninit;
4843        dev->reg_state = NETREG_REGISTERED;
4844
4845        /*
4846         *      Default initial state at registry is that the
4847         *      device is present.
4848         */
4849
4850        set_bit(__LINK_STATE_PRESENT, &dev->state);
4851
4852        dev_init_scheduler(dev);
4853        dev_hold(dev);
4854        list_netdevice(dev);
4855
4856        /* Notify protocols, that a new device appeared. */
4857        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4858        ret = notifier_to_errno(ret);
4859        if (ret) {
4860                rollback_registered(dev);
4861                dev->reg_state = NETREG_UNREGISTERED;
4862        }
4863
4864out:
4865        return ret;
4866
4867err_uninit:
4868        if (dev->netdev_ops->ndo_uninit)
4869                dev->netdev_ops->ndo_uninit(dev);
4870        goto out;
4871}
4872EXPORT_SYMBOL(register_netdevice);
4873
4874/**
4875 *      init_dummy_netdev       - init a dummy network device for NAPI
4876 *      @dev: device to init
4877 *
4878 *      This takes a network device structure and initialize the minimum
4879 *      amount of fields so it can be used to schedule NAPI polls without
4880 *      registering a full blown interface. This is to be used by drivers
4881 *      that need to tie several hardware interfaces to a single NAPI
4882 *      poll scheduler due to HW limitations.
4883 */
4884int init_dummy_netdev(struct net_device *dev)
4885{
4886        /* Clear everything. Note we don't initialize spinlocks
4887         * are they aren't supposed to be taken by any of the
4888         * NAPI code and this dummy netdev is supposed to be
4889         * only ever used for NAPI polls
4890         */
4891        memset(dev, 0, sizeof(struct net_device));
4892
4893        /* make sure we BUG if trying to hit standard
4894         * register/unregister code path
4895         */
4896        dev->reg_state = NETREG_DUMMY;
4897
4898        /* initialize the ref count */
4899        atomic_set(&dev->refcnt, 1);
4900
4901        /* NAPI wants this */
4902        INIT_LIST_HEAD(&dev->napi_list);
4903
4904        /* a dummy interface is started by default */
4905        set_bit(__LINK_STATE_PRESENT, &dev->state);
4906        set_bit(__LINK_STATE_START, &dev->state);
4907
4908        return 0;
4909}
4910EXPORT_SYMBOL_GPL(init_dummy_netdev);
4911
4912
4913/**
4914 *      register_netdev - register a network device
4915 *      @dev: device to register
4916 *
4917 *      Take a completed network device structure and add it to the kernel
4918 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4919 *      chain. 0 is returned on success. A negative errno code is returned
4920 *      on a failure to set up the device, or if the name is a duplicate.
4921 *
4922 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4923 *      and expands the device name if you passed a format string to
4924 *      alloc_netdev.
4925 */
4926int register_netdev(struct net_device *dev)
4927{
4928        int err;
4929
4930        rtnl_lock();
4931
4932        /*
4933         * If the name is a format string the caller wants us to do a
4934         * name allocation.
4935         */
4936        if (strchr(dev->name, '%')) {
4937                err = dev_alloc_name(dev, dev->name);
4938                if (err < 0)
4939                        goto out;
4940        }
4941
4942        err = register_netdevice(dev);
4943out:
4944        rtnl_unlock();
4945        return err;
4946}
4947EXPORT_SYMBOL(register_netdev);
4948
4949/*
4950 * netdev_wait_allrefs - wait until all references are gone.
4951 *
4952 * This is called when unregistering network devices.
4953 *
4954 * Any protocol or device that holds a reference should register
4955 * for netdevice notification, and cleanup and put back the
4956 * reference if they receive an UNREGISTER event.
4957 * We can get stuck here if buggy protocols don't correctly
4958 * call dev_put.
4959 */
4960static void netdev_wait_allrefs(struct net_device *dev)
4961{
4962        unsigned long rebroadcast_time, warning_time;
4963
4964        rebroadcast_time = warning_time = jiffies;
4965        while (atomic_read(&dev->refcnt) != 0) {
4966                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4967                        rtnl_lock();
4968
4969                        /* Rebroadcast unregister notification */
4970                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4971
4972                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4973                                     &dev->state)) {
4974                                /* We must not have linkwatch events
4975                                 * pending on unregister. If this
4976                                 * happens, we simply run the queue
4977                                 * unscheduled, resulting in a noop
4978                                 * for this device.
4979                                 */
4980                                linkwatch_run_queue();
4981                        }
4982
4983                        __rtnl_unlock();
4984
4985                        rebroadcast_time = jiffies;
4986                }
4987
4988                msleep(250);
4989
4990                if (time_after(jiffies, warning_time + 10 * HZ)) {
4991                        printk(KERN_EMERG "unregister_netdevice: "
4992                               "waiting for %s to become free. Usage "
4993                               "count = %d\n",
4994                               dev->name, atomic_read(&dev->refcnt));
4995                        warning_time = jiffies;
4996                }
4997        }
4998}
4999
5000/* The sequence is:

5001 *
5002 *      rtnl_lock();
5003 *      ...
5004 *      register_netdevice(x1);
5005 *      register_netdevice(x2);
5006 *      ...
5007 *      unregister_netdevice(y1);
5008 *      unregister_netdevice(y2);
5009 *      ...
5010 *      rtnl_unlock();
5011 *      free_netdev(y1);
5012 *      free_netdev(y2);
5013 *
5014 * We are invoked by rtnl_unlock().
5015 * This allows us to deal with problems:
5016 * 1) We can delete sysfs objects which invoke hotplug
5017 *    without deadlocking with linkwatch via keventd.
5018 * 2) Since we run with the RTNL semaphore not held, we can sleep
5019 *    safely in order to wait for the netdev refcnt to drop to zero.
5020 *
5021 * We must not return until all unregister events added during
5022 * the interval the lock was held have been completed.
5023 */
5024void netdev_run_todo(void)
5025{
5026        struct list_head list;
5027
5028        /* Snapshot list, allow later requests */
5029        list_replace_init(&net_todo_list, &list);
5030
5031        __rtnl_unlock();
5032
5033        while (!list_empty(&list)) {
5034                struct net_device *dev
5035                        = list_entry(list.next, struct net_device, todo_list);
5036                list_del(&dev->todo_list);
5037
5038                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5039                        printk(KERN_ERR "network todo '%s' but state %d\n",
5040                               dev->name, dev->reg_state);
5041                        dump_stack();
5042                        continue;
5043                }
5044
5045                dev->reg_state = NETREG_UNREGISTERED;
5046
5047                on_each_cpu(flush_backlog, dev, 1);
5048
5049                netdev_wait_allrefs(dev);
5050
5051                /* paranoia */
5052                BUG_ON(atomic_read(&dev->refcnt));
5053                WARN_ON(dev->ip_ptr);
5054                WARN_ON(dev->ip6_ptr);
5055                WARN_ON(dev->dn_ptr);
5056
5057                if (dev->destructor)
5058                        dev->destructor(dev);
5059
5060                /* Free network device */
5061                kobject_put(&dev->dev.kobj);
5062        }
5063}
5064
5065/**
5066 *      dev_get_stats   - get network device statistics
5067 *      @dev: device to get statistics from
5068 *
5069 *      Get network statistics from device. The device driver may provide
5070 *      its own method by setting dev->netdev_ops->get_stats; otherwise
5071 *      the internal statistics structure is used.
5072 */
5073const struct net_device_stats *dev_get_stats(struct net_device *dev)
5074{
5075        const struct net_device_ops *ops = dev->netdev_ops;
5076
5077        if (ops->ndo_get_stats)
5078                return ops->ndo_get_stats(dev);
5079        else {
5080                unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5081                struct net_device_stats *stats = &dev->stats;
5082                unsigned int i;
5083                struct netdev_queue *txq;
5084
5085                for (i = 0; i < dev->num_tx_queues; i++) {
5086                        txq = netdev_get_tx_queue(dev, i);
5087                        tx_bytes   += txq->tx_bytes;
5088                        tx_packets += txq->tx_packets;
5089                        tx_dropped += txq->tx_dropped;
5090                }
5091                if (tx_bytes || tx_packets || tx_dropped) {
5092                        stats->tx_bytes   = tx_bytes;
5093                        stats->tx_packets = tx_packets;
5094                        stats->tx_dropped = tx_dropped;
5095                }
5096                return stats;
5097        }
5098}
5099EXPORT_SYMBOL(dev_get_stats);
5100
5101static void netdev_init_one_queue(struct net_device *dev,
5102                                  struct netdev_queue *queue,
5103                                  void *_unused)
5104{
5105        queue->dev = dev;
5106}
5107
5108static void netdev_init_queues(struct net_device *dev)
5109{
5110        netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5111        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5112        spin_lock_init(&dev->tx_global_lock);
5113}
5114
5115/**
5116 *      alloc_netdev_mq - allocate network device
5117 *      @sizeof_priv:   size of private data to allocate space for
5118 *      @name:          device name format string
5119 *      @setup:         callback to initialize device
5120 *      @queue_count:   the number of subqueues to allocate
5121 *
5122 *      Allocates a struct net_device with private data area for driver use
5123 *      and performs basic initialization.  Also allocates subquue structs
5124 *      for each queue on the device at the end of the netdevice.
5125 */
5126struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5127                void (*setup)(struct net_device *), unsigned int queue_count)
5128{
5129        struct netdev_queue *tx;
5130        struct net_device *dev;
5131        size_t alloc_size;
5132        struct net_device *p;
5133
5134        BUG_ON(strlen(name) >= sizeof(dev->name));
5135
5136        alloc_size = sizeof(struct net_device);
5137        if (sizeof_priv) {
5138                /* ensure 32-byte alignment of private area */
5139                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5140                alloc_size += sizeof_priv;
5141        }
5142        /* ensure 32-byte alignment of whole construct */
5143        alloc_size += NETDEV_ALIGN - 1;
5144
5145        p = kzalloc(alloc_size, GFP_KERNEL);
5146        if (!p) {
5147                printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5148                return NULL;
5149        }
5150
5151        tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5152        if (!tx) {
5153                printk(KERN_ERR "alloc_netdev: Unable to allocate "
5154                       "tx qdiscs.\n");
5155                goto free_p;
5156        }
5157
5158        dev = PTR_ALIGN(p, NETDEV_ALIGN);
5159        dev->padded = (char *)dev - (char *)p;
5160
5161        if (dev_addr_init(dev))
5162                goto free_tx;
5163
5164        dev_unicast_init(dev);
5165
5166        dev_net_set(dev, &init_net);
5167
5168        dev->_tx = tx;
5169        dev->num_tx_queues = queue_count;
5170        dev->real_num_tx_queues = queue_count;
5171
5172        dev->gso_max_size = GSO_MAX_SIZE;
5173
5174        netdev_init_queues(dev);
5175
5176        INIT_LIST_HEAD(&dev->napi_list);
5177        dev->priv_flags = IFF_XMIT_DST_RELEASE;
5178        setup(dev);
5179        strcpy(dev->name, name);
5180        return dev;
5181
5182free_tx:
5183        kfree(tx);
5184
5185free_p:
5186        kfree(p);
5187        return NULL;
5188}
5189EXPORT_SYMBOL(alloc_netdev_mq);
5190
5191/**
5192 *      free_netdev - free network device
5193 *      @dev: device
5194 *
5195 *      This function does the last stage of destroying an allocated device
5196 *      interface. The reference to the device object is released.
5197 *      If this is the last reference then it will be freed.
5198 */
5199void free_netdev(struct net_device *dev)
5200{
5201        struct napi_struct *p, *n;
5202
5203        release_net(dev_net(dev));
5204
5205        kfree(dev->_tx);
5206
5207        /* Flush device addresses */
5208        dev_addr_flush(dev);
5209
5210        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5211                netif_napi_del(p);
5212
5213        /*  Compatibility with error handling in drivers */
5214        if (dev->reg_state == NETREG_UNINITIALIZED) {
5215                kfree((char *)dev - dev->padded);
5216                return;
5217        }
5218
5219        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5220        dev->reg_state = NETREG_RELEASED;
5221
5222        /* will free via device release */
5223        put_device(&dev->dev);
5224}
5225EXPORT_SYMBOL(free_netdev);
5226
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until the packets that are being received right now are
 *	done.  Packets that start later are not held back.  May sleep.
 */
void synchronize_net(void)
{
	/* Flag callers that are in a context where sleeping is not allowed. */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5239
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and takes it out of the kernel
 *	tables.  The device is then queued on the todo list, so the
 *	remainder of the teardown is deferred.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Undo the registration now ... */
	rollback_registered(dev);

	/* ... and finish processing the unregister after the unlock. */
	net_set_todo(dev);
}
EXPORT_SYMBOL(unregister_netdevice);
5260
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and takes it out of the kernel
 *	tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the variant to use.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	/* rtnl_unlock() is what invokes netdev_run_todo(). */
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5279
5280/**
5281 *      dev_change_net_namespace - move device to different nethost namespace
5282 *      @dev: device
5283 *      @net: network namespace
5284 *      @pat: If not NULL name pattern to try if the current device name
5285 *            is already taken in the destination network namespace.
5286 *
5287 *      This function shuts down a device interface and moves it
5288 *      to a new network namespace. On success 0 is returned, on
5289 *      a failure a netagive errno code is returned.
5290 *
5291 *      Callers must hold the rtnl semaphore.
5292 */
5293
5294int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5295{
5296        char buf[IFNAMSIZ];
5297        const char *destname;
5298        int err;
5299
5300        ASSERT_RTNL();
5301
5302        /* Don't allow namespace local devices to be moved. */
5303        err = -EINVAL;
5304        if (dev->features & NETIF_F_NETNS_LOCAL)
5305                goto out;
5306
5307#ifdef CONFIG_SYSFS
5308        /* Don't allow real devices to be moved when sysfs
5309         * is enabled.
5310         */
5311        err = -EINVAL;
5312        if (dev->dev.parent)
5313                goto out;
5314#endif
5315
5316        /* Ensure the device has been registrered */
5317        err = -EINVAL;
5318        if (dev->reg_state != NETREG_REGISTERED)
5319                goto out;
5320
5321        /* Get out if there is nothing todo */
5322        err = 0;
5323        if (net_eq(dev_net(dev), net))
5324                goto out;
5325
5326        /* Pick the destination device name, and ensure
5327         * we can use it in the destination network namespace.
5328         */
5329        err = -EEXIST;
5330        destname = dev->name;
5331        if (__dev_get_by_name(net, destname)) {
5332                /* We get here if we can't use the current device name */
5333                if (!pat)
5334                        goto out;
5335                if (!dev_valid_name(pat))
5336                        goto out;
5337                if (strchr(pat, '%')) {
5338                        if (__dev_alloc_name(net, pat, buf) < 0)
5339                                goto out;
5340                        destname = buf;
5341                } else
5342                        destname = pat;
5343                if (__dev_get_by_name(net, destname))
5344                        goto out;
5345        }
5346
5347        /*
5348         * And now a mini version of register_netdevice unregister_netdevice.
5349         */
5350
5351        /* If device is running close it first. */
5352        dev_close(dev);
5353
5354        /* And unlink it from device chain */
5355        err = -ENODEV;
5356        unlist_netdevice(dev);
5357
5358        synchronize_net();
5359
5360        /* Shutdown queueing discipline. */
5361        dev_shutdown(dev);
5362
5363        /* Notify protocols, that we are about to destroy
5364           this device. They should clean all the things.
5365        */
5366        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5367
5368        /*
5369         *      Flush the unicast and multicast chains
5370         */
5371        dev_unicast_flush(dev);
5372        dev_addr_discard(dev);
5373
5374        netdev_unregister_kobject(dev);
5375
5376        /* Actually switch the network namespace */
5377        dev_net_set(dev, net);
5378
5379        /* Assign the new device name */
5380        if (destname != dev->name)
5381                strcpy(dev->name, destname);
5382
5383        /* If there is an ifindex conflict assign a new one */
5384        if (__dev_get_by_index(net, dev->ifindex)) {
5385                int iflink = (dev->iflink == dev->ifindex);
5386                dev->ifindex = dev_new_index(net);
5387                if (iflink)
5388                        dev->iflink = dev->ifindex;
5389        }
5390
5391        /* Fixup kobjects */
5392        err = netdev_register_kobject(dev);
5393        WARN_ON(err);
5394
5395        /* Add the device back in the hashes */
5396        list_netdevice(dev);
5397
5398        /* Notify protocols, that a new device appeared. */
5399        call_netdevice_notifiers(NETDEV_REGISTER, dev);
5400
5401        synchronize_net();
5402        err = 0;
5403out:
5404        return err;
5405}
5406EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5407
5408static int dev_cpu_callback(struct notifier_block *nfb,
5409                            unsigned long action,
5410                            void *ocpu)
5411{
5412        struct sk_buff **list_skb;
5413        struct Qdisc **list_net;
5414        struct sk_buff *skb;
5415        unsigned int cpu, oldcpu = (unsigned long)ocpu;
5416        struct softnet_data *sd, *oldsd;
5417
5418        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5419                return NOTIFY_OK;
5420
5421        local_irq_disable();
5422        cpu = smp_processor_id();
5423        sd = &per_cpu(softnet_data, cpu);
5424        oldsd = &per_cpu(softnet_data, oldcpu);
5425
5426        /* Find end of our completion_queue. */
5427        list_skb = &sd->completion_queue;
5428        while (*list_skb)
5429                list_skb = &(*list_skb)->next;
5430        /* Append completion queue from offline CPU. */
5431        *list_skb = oldsd->completion_queue;
5432        oldsd->completion_queue = NULL;
5433
5434        /* Find end of our output_queue. */
5435        list_net = &sd->output_queue;
5436        while (*list_net)
5437                list_net = &(*list_net)->next_sched;
5438        /* Append output queue from offline CPU. */
5439        *list_net = oldsd->output_queue;
5440        oldsd->output_queue = NULL;
5441
5442        raise_softirq_irqoff(NET_TX_SOFTIRQ);
5443        local_irq_enable();
5444
5445        /* Process offline CPU's input_pkt_queue */
5446        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5447                netif_rx(skb);
5448
5449        return NOTIFY_OK;
5450}
5451
5452
5453/**
5454 *      netdev_increment_features - increment feature set by one
5455 *      @all: current feature set
5456 *      @one: new feature set
5457 *      @mask: mask feature set
5458 *
5459 *      Computes a new feature set after adding a device with feature set
5460 *      @one to the master device with current feature set @all.  Will not
5461 *      enable anything that is off in @mask. Returns the new feature set.
5462 */
5463unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5464                                        unsigned long mask)
5465{
5466        /* If device needs checksumming, downgrade to it. */
5467        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5468                all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5469        else if (mask & NETIF_F_ALL_CSUM) {
5470                /* If one device supports v4/v6 checksumming, set for all. */
5471                if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5472                    !(all & NETIF_F_GEN_CSUM)) {
5473                        all &= ~NETIF_F_ALL_CSUM;
5474                        all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5475                }
5476
5477                /* If one device supports hw checksumming, set for all. */
5478                if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5479                        all &= ~NETIF_F_ALL_CSUM;
5480                        all |= NETIF_F_HW_CSUM;
5481                }
5482        }
5483
5484        one |= NETIF_F_ALL_CSUM;
5485
5486        one |= all & NETIF_F_ONE_FOR_ALL;
5487        all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5488        all |= one & mask & NETIF_F_ONE_FOR_ALL;
5489
5490        return all;
5491}
5492EXPORT_SYMBOL(netdev_increment_features);
5493
5494static struct hlist_head *netdev_create_hash(void)
5495{
5496        int i;
5497        struct hlist_head *hash;
5498
5499        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5500        if (hash != NULL)
5501                for (i = 0; i < NETDEV_HASHENTRIES; i++)
5502                        INIT_HLIST_HEAD(&hash[i]);
5503
5504        return hash;
5505}
5506
5507/* Initialize per network namespace state */
5508static int __net_init netdev_init(struct net *net)
5509{
5510        INIT_LIST_HEAD(&net->dev_base_head);
5511
5512        net->dev_name_head = netdev_create_hash();
5513        if (net->dev_name_head == NULL)
5514                goto err_name;
5515
5516        net->dev_index_head = netdev_create_hash();
5517        if (net->dev_index_head == NULL)
5518                goto err_idx;
5519
5520        return 0;
5521
5522err_idx:
5523        kfree(net->dev_name_head);
5524err_name:
5525        return -ENOMEM;
5526}
5527
5528/**
5529 *      netdev_drivername - network driver for the device
5530 *      @dev: network device
5531 *      @buffer: buffer for resulting name
5532 *      @len: size of buffer
5533 *
5534 *      Determine network driver for device.
5535 */
5536char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5537{
5538        const struct device_driver *driver;
5539        const struct device *parent;
5540
5541        if (len <= 0 || !buffer)
5542                return buffer;
5543        buffer[0] = 0;
5544
5545        parent = dev->dev.parent;
5546
5547        if (!parent)
5548                return buffer;
5549
5550        driver = parent->driver;
5551        if (driver && driver->name)
5552                strlcpy(buffer, driver->name, len);
5553        return buffer;
5554}
5555
5556static void __net_exit netdev_exit(struct net *net)
5557{
5558        kfree(net->dev_name_head);
5559        kfree(net->dev_index_head);
5560}
5561
/*
 * Per network namespace operations for the core device state:
 * netdev_init() runs as the .init hook and netdev_exit() as the .exit hook.
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
5566
5567static void __net_exit default_device_exit(struct net *net)
5568{
5569        struct net_device *dev;
5570        /*
5571         * Push all migratable of the network devices back to the
5572         * initial network namespace
5573         */
5574        rtnl_lock();
5575restart:
5576        for_each_netdev(net, dev) {
5577                int err;
5578                char fb_name[IFNAMSIZ];
5579
5580                /* Ignore unmoveable devices (i.e. loopback) */
5581                if (dev->features & NETIF_F_NETNS_LOCAL)
5582                        continue;
5583
5584                /* Delete virtual devices */
5585                if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5586                        dev->rtnl_link_ops->dellink(dev);
5587                        goto restart;
5588                }
5589
5590                /* Push remaing network devices to init_net */
5591                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5592                err = dev_change_net_namespace(dev, &init_net, fb_name);
5593                if (err) {
5594                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5595                                __func__, dev->name, err);
5596                        BUG();
5597                }
5598                goto restart;
5599        }
5600        rtnl_unlock();
5601}
5602
/*
 * Per network namespace operations with only an exit hook:
 * default_device_exit() is called for the namespace passed to .exit.
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
};
5606
/*
 *      Initialize the DEV module.
 *
 *      NOTE(review): the historical description said that at boot time this
 *      walks the device list and unhooks any devices that fail to initialise
 *      (normally hardware not present); no such walk is visible in the body
 *      below -- confirm before relying on it.
 *
 *      What the function does, in order: set up the /proc and kobject
 *      support, initialise the packet type lists, register the per-namespace
 *      device state, initialise the per-CPU softnet_data queues, clear
 *      dev_boot_phase, register the loopback and default-device pernet
 *      operations, open the NET_TX/NET_RX softirqs, install the CPU
 *      notifier and finally call dst_init() and dev_mcast_init().
 *
 *      Returns 0 on success; any failing step makes it return -ENOMEM
 *      without undoing the steps already performed.
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        /* rc stays -ENOMEM until every step below has succeeded. */
        int i, rc = -ENOMEM;

        /* Must only run while dev_boot_phase is still set. */
        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        /* Empty packet type lists: the catch-all list and each hash bucket. */
        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *      Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *queue;

                queue = &per_cpu(softnet_data, i);
                skb_queue_head_init(&queue->input_pkt_queue);
                queue->completion_queue = NULL;
                INIT_LIST_HEAD(&queue->poll_list);

                /* Per-CPU backlog: polled by process_backlog with weight_p. */
                queue->backlog.poll = process_backlog;
                queue->backlog.weight = weight_p;
                queue->backlog.gro_list = NULL;
                queue->backlog.gro_count = 0;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present. Since we now dynamically allocate and free the
         * loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices.  This ensures the loopback device
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);
5684
5685static int __init initialize_hashrnd(void)
5686{
5687        get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5688        return 0;
5689}
5690
5691late_initcall_sync(initialize_hashrnd);
5692
5693