linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *      The list of packet types we will receive (as opposed to discard)
 147 *      and the routines to invoke.
 148 *
  149 *      Why 16? Because with 16 the only overlap we get on a hash of the
 150 *      low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *              0800    IP
 160 *              8100    802.1Q VLAN
 161 *              0001    802.3
 162 *              0002    AX.25
 163 *              0004    802.2
 164 *              8035    RARP
 165 *              0005    SNAP
 166 *              0805    X.25
 167 *              0806    ARP
 168 *              8137    IPX
 169 *              0009    Localtalk
 170 *              86DD    IPv6
 171 */
 172
 173#define PTYPE_HASH_SIZE (16)
 174#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
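/*
 * A worked example of the hash above (ptype_head() below uses
 * "ntohs(pt->type) & PTYPE_HASH_MASK", i.e. the low nibble of the protocol
 * value), showing the RARP/SNAP/X.25 overlap and the newer VLAN/IP collision:
 *
 *      ETH_P_RARP   0x8035 & 0xf = 5
 *      ETH_P_SNAP   0x0005 & 0xf = 5
 *      ETH_P_X25    0x0805 & 0xf = 5
 *
 *      ETH_P_IP     0x0800 & 0xf = 0
 *      ETH_P_8021Q  0x8100 & 0xf = 0
 */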
 175
 176static DEFINE_SPINLOCK(ptype_lock);
 177static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
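/*
 * A minimal reader sketch under the rules above, for a given namespace net:
 * pure readers may walk the list under either dev_base_lock or
 * rcu_read_lock(), for example
 *
 *      struct net_device *dev;
 *
 *      rcu_read_lock();
 *      for_each_netdev_rcu(net, dev)
 *              use_dev(dev);           (use_dev() is a hypothetical helper)
 *      rcu_read_unlock();
 *
 * Writers additionally hold the rtnl semaphore and take dev_base_lock for
 * writing around the actual updates, as list_netdevice() below does.
 */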
 201
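/* Bump the per-namespace device-list generation counter, skipping zero;
 * readers (e.g. netlink dumps) sample it to detect concurrent list changes.
 */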
 202static inline void dev_base_seq_inc(struct net *net)
 203{
 204        while (++net->dev_base_seq == 0);
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221        spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228        spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232/* Device list insertion */
 233static int list_netdevice(struct net_device *dev)
 234{
 235        struct net *net = dev_net(dev);
 236
 237        ASSERT_RTNL();
 238
 239        write_lock_bh(&dev_base_lock);
 240        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242        hlist_add_head_rcu(&dev->index_hlist,
 243                           dev_index_hash(net, dev->ifindex));
 244        write_unlock_bh(&dev_base_lock);
 245
 246        dev_base_seq_inc(net);
 247
 248        return 0;
 249}
 250
 251/* Device list removal
  252 * caller must respect an RCU grace period before freeing/reusing dev
 253 */
 254static void unlist_netdevice(struct net_device *dev)
 255{
 256        ASSERT_RTNL();
 257
 258        /* Unlink dev from the device chain */
 259        write_lock_bh(&dev_base_lock);
 260        list_del_rcu(&dev->dev_list);
 261        hlist_del_rcu(&dev->name_hlist);
 262        hlist_del_rcu(&dev->index_hlist);
 263        write_unlock_bh(&dev_base_lock);
 264
 265        dev_base_seq_inc(dev_net(dev));
 266}
 267
 268/*
 269 *      Our notifier list
 270 */
 271
 272static RAW_NOTIFIER_HEAD(netdev_chain);
 273
 274/*
 275 *      Device drivers call our routines to queue packets here. We empty the
 276 *      queue in the local softnet handler.
 277 */
 278
 279DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 280EXPORT_PER_CPU_SYMBOL(softnet_data);
 281
 282#ifdef CONFIG_LOCKDEP
 283/*
 284 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 285 * according to dev->type
 286 */
 287static const unsigned short netdev_lock_type[] =
 288        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 289         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 290         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 291         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 292         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 293         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 294         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 295         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 296         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 297         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 298         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 299         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 300         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 301         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 302         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 303         ARPHRD_VOID, ARPHRD_NONE};
 304
 305static const char *const netdev_lock_name[] =
 306        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 307         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 308         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 309         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 310         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 311         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 312         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 313         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 314         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 315         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 316         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 317         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 318         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 319         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 320         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 321         "_xmit_VOID", "_xmit_NONE"};
 322
 323static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 324static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325
 326static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 327{
 328        int i;
 329
 330        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 331                if (netdev_lock_type[i] == dev_type)
 332                        return i;
 333        /* the last key is used by default */
 334        return ARRAY_SIZE(netdev_lock_type) - 1;
 335}
 336
 337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                 unsigned short dev_type)
 339{
 340        int i;
 341
 342        i = netdev_lock_pos(dev_type);
 343        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 344                                   netdev_lock_name[i]);
 345}
 346
 347static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 348{
 349        int i;
 350
 351        i = netdev_lock_pos(dev->type);
 352        lockdep_set_class_and_name(&dev->addr_list_lock,
 353                                   &netdev_addr_lock_key[i],
 354                                   netdev_lock_name[i]);
 355}
 356#else
 357static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 358                                                 unsigned short dev_type)
 359{
 360}
 361static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 362{
 363}
 364#endif
 365
 366/*******************************************************************************
 367
 368                Protocol management and registration routines
 369
 370*******************************************************************************/
 371
 372/*
 373 *      Add a protocol ID to the list. Now that the input handler is
 374 *      smarter we can dispense with all the messy stuff that used to be
 375 *      here.
 376 *
  377 *      BEWARE!!! Protocol handlers that mangle input packets
  378 *      MUST BE last in the hash buckets, and checking protocol handlers
  379 *      MUST start from the promiscuous ptype_all chain in net_bh.
  380 *      This is true now; do not change it.
  381 *      Explanation: if a protocol handler that mangles packets were
  382 *      the first on the list, it could not sense that the packet
  383 *      is cloned and should be copied-on-write, so it would modify
  384 *      the clone in place and subsequent readers would get a broken packet.
  385 *                                                      --ANK (980803)
 386 */
 387
 388static inline struct list_head *ptype_head(const struct packet_type *pt)
 389{
 390        if (pt->type == htons(ETH_P_ALL))
 391                return &ptype_all;
 392        else
 393                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 394}
 395
 396/**
 397 *      dev_add_pack - add packet handler
 398 *      @pt: packet type declaration
 399 *
 400 *      Add a protocol handler to the networking stack. The passed &packet_type
 401 *      is linked into kernel lists and may not be freed until it has been
 402 *      removed from the kernel lists.
 403 *
  404 *      This call does not sleep, therefore it cannot
  405 *      guarantee that all CPUs currently in the middle of receiving
  406 *      packets will see the new packet type (until the next received packet).
 407 */
 408
 409void dev_add_pack(struct packet_type *pt)
 410{
 411        struct list_head *head = ptype_head(pt);
 412
 413        spin_lock(&ptype_lock);
 414        list_add_rcu(&pt->list, head);
 415        spin_unlock(&ptype_lock);
 416}
 417EXPORT_SYMBOL(dev_add_pack);
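/*
 * A minimal usage sketch, assuming a hypothetical handler my_rcv() that
 * matches the packet_type->func signature:
 *
 *      static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *                        struct packet_type *pt, struct net_device *orig_dev);
 *
 *      static struct packet_type my_packet_type __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .func = my_rcv,
 *      };
 *
 *      dev_add_pack(&my_packet_type);
 *
 * A tap that wants every packet would use htons(ETH_P_ALL) instead, which
 * ptype_head() places on the ptype_all list.
 */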
 418
 419/**
 420 *      __dev_remove_pack        - remove packet handler
 421 *      @pt: packet type declaration
 422 *
 423 *      Remove a protocol handler that was previously added to the kernel
 424 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 425 *      from the kernel lists and can be freed or reused once this function
 426 *      returns.
 427 *
 428 *      The packet type might still be in use by receivers
  429 *      and must not be freed until after all the CPUs have gone
 430 *      through a quiescent state.
 431 */
 432void __dev_remove_pack(struct packet_type *pt)
 433{
 434        struct list_head *head = ptype_head(pt);
 435        struct packet_type *pt1;
 436
 437        spin_lock(&ptype_lock);
 438
 439        list_for_each_entry(pt1, head, list) {
 440                if (pt == pt1) {
 441                        list_del_rcu(&pt->list);
 442                        goto out;
 443                }
 444        }
 445
 446        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 447out:
 448        spin_unlock(&ptype_lock);
 449}
 450EXPORT_SYMBOL(__dev_remove_pack);
 451
 452/**
 453 *      dev_remove_pack  - remove packet handler
 454 *      @pt: packet type declaration
 455 *
 456 *      Remove a protocol handler that was previously added to the kernel
 457 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 458 *      from the kernel lists and can be freed or reused once this function
 459 *      returns.
 460 *
 461 *      This call sleeps to guarantee that no CPU is looking at the packet
 462 *      type after return.
 463 */
 464void dev_remove_pack(struct packet_type *pt)
 465{
 466        __dev_remove_pack(pt);
 467
 468        synchronize_net();
 469}
 470EXPORT_SYMBOL(dev_remove_pack);
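/*
 * A teardown sketch for the hypothetical handler above: dev_remove_pack()
 * already runs synchronize_net(), so the structure may be freed right away;
 * with __dev_remove_pack() the caller waits for the grace period itself:
 *
 *      dev_remove_pack(&my_packet_type);
 *
 * or, equivalently:
 *
 *      __dev_remove_pack(&my_packet_type);
 *      synchronize_net();
 */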
 471
 472/******************************************************************************
 473
 474                      Device Boot-time Settings Routines
 475
 476*******************************************************************************/
 477
 478/* Boot time configuration table */
 479static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 480
 481/**
 482 *      netdev_boot_setup_add   - add new setup entry
 483 *      @name: name of the device
 484 *      @map: configured settings for the device
 485 *
  486 *      Adds a new setup entry to the dev_boot_setup list.  The function
  487 *      returns 0 on error and 1 on success.  This is a generic routine
  488 *      for all netdevices.
 489 */
 490static int netdev_boot_setup_add(char *name, struct ifmap *map)
 491{
 492        struct netdev_boot_setup *s;
 493        int i;
 494
 495        s = dev_boot_setup;
 496        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 497                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 498                        memset(s[i].name, 0, sizeof(s[i].name));
 499                        strlcpy(s[i].name, name, IFNAMSIZ);
 500                        memcpy(&s[i].map, map, sizeof(s[i].map));
 501                        break;
 502                }
 503        }
 504
 505        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 506}
 507
 508/**
 509 *      netdev_boot_setup_check - check boot time settings
 510 *      @dev: the netdevice
 511 *
  512 *      Check boot time settings for the device.
  513 *      Any settings found are applied to the device, to be used
  514 *      later during device probing.
  515 *      Returns 1 if settings were found, 0 otherwise.
 516 */
 517int netdev_boot_setup_check(struct net_device *dev)
 518{
 519        struct netdev_boot_setup *s = dev_boot_setup;
 520        int i;
 521
 522        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 523                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 524                    !strcmp(dev->name, s[i].name)) {
 525                        dev->irq        = s[i].map.irq;
 526                        dev->base_addr  = s[i].map.base_addr;
 527                        dev->mem_start  = s[i].map.mem_start;
 528                        dev->mem_end    = s[i].map.mem_end;
 529                        return 1;
 530                }
 531        }
 532        return 0;
 533}
 534EXPORT_SYMBOL(netdev_boot_setup_check);
 535
 536
 537/**
 538 *      netdev_boot_base        - get address from boot time settings
 539 *      @prefix: prefix for network device
 540 *      @unit: id for network device
 541 *
  542 *      Check boot time settings for the base address of the device.
  543 *      Returns the configured base address, or 1 if a device with this
  544 *      name is already registered (so that it is not probed again).
  545 *      Returns 0 if no settings are found.
 546 */
 547unsigned long netdev_boot_base(const char *prefix, int unit)
 548{
 549        const struct netdev_boot_setup *s = dev_boot_setup;
 550        char name[IFNAMSIZ];
 551        int i;
 552
 553        sprintf(name, "%s%d", prefix, unit);
 554
 555        /*
  556         * If the device is already registered then return a base of 1
  557         * to indicate that this interface should not be probed
 558         */
 559        if (__dev_get_by_name(&init_net, name))
 560                return 1;
 561
 562        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 563                if (!strcmp(name, s[i].name))
 564                        return s[i].map.base_addr;
 565        return 0;
 566}
 567
 568/*
 569 * Saves at boot time configured settings for any netdevice.
 570 */
 571int __init netdev_boot_setup(char *str)
 572{
 573        int ints[5];
 574        struct ifmap map;
 575
 576        str = get_options(str, ARRAY_SIZE(ints), ints);
 577        if (!str || !*str)
 578                return 0;
 579
 580        /* Save settings */
 581        memset(&map, 0, sizeof(map));
 582        if (ints[0] > 0)
 583                map.irq = ints[1];
 584        if (ints[0] > 1)
 585                map.base_addr = ints[2];
 586        if (ints[0] > 2)
 587                map.mem_start = ints[3];
 588        if (ints[0] > 3)
 589                map.mem_end = ints[4];
 590
 591        /* Add new entry to the list */
 592        return netdev_boot_setup_add(str, &map);
 593}
 594
 595__setup("netdev=", netdev_boot_setup);
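/*
 * Example command line, assuming an "eth0" style legacy driver: the option
 * is parsed as up to four integers (irq, base_addr, mem_start, mem_end)
 * followed by the device name, so
 *
 *      netdev=9,0x300,eth0
 *
 * records irq 9 and I/O base 0x300 for "eth0"; netdev_boot_setup_check()
 * later copies those values into the matching net_device.
 */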
 596
 597/*******************************************************************************
 598
 599                            Device Interface Subroutines
 600
 601*******************************************************************************/
 602
 603/**
 604 *      __dev_get_by_name       - find a device by its name
 605 *      @net: the applicable net namespace
 606 *      @name: name to find
 607 *
 608 *      Find an interface by name. Must be called under RTNL semaphore
 609 *      or @dev_base_lock. If the name is found a pointer to the device
 610 *      is returned. If the name is not found then %NULL is returned. The
 611 *      reference counters are not incremented so the caller must be
 612 *      careful with locks.
 613 */
 614
 615struct net_device *__dev_get_by_name(struct net *net, const char *name)
 616{
 617        struct hlist_node *p;
 618        struct net_device *dev;
 619        struct hlist_head *head = dev_name_hash(net, name);
 620
 621        hlist_for_each_entry(dev, p, head, name_hlist)
 622                if (!strncmp(dev->name, name, IFNAMSIZ))
 623                        return dev;
 624
 625        return NULL;
 626}
 627EXPORT_SYMBOL(__dev_get_by_name);
 628
 629/**
 630 *      dev_get_by_name_rcu     - find a device by its name
 631 *      @net: the applicable net namespace
 632 *      @name: name to find
 633 *
 634 *      Find an interface by name.
 635 *      If the name is found a pointer to the device is returned.
 636 *      If the name is not found then %NULL is returned.
 637 *      The reference counters are not incremented so the caller must be
 638 *      careful with locks. The caller must hold RCU lock.
 639 */
 640
 641struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 642{
 643        struct hlist_node *p;
 644        struct net_device *dev;
 645        struct hlist_head *head = dev_name_hash(net, name);
 646
 647        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 648                if (!strncmp(dev->name, name, IFNAMSIZ))
 649                        return dev;
 650
 651        return NULL;
 652}
 653EXPORT_SYMBOL(dev_get_by_name_rcu);
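/*
 * A lookup sketch for the RCU variant: the device may only be used inside
 * the read-side critical section, and no reference is taken ("eth0" is just
 * an example name):
 *
 *      rcu_read_lock();
 *      dev = dev_get_by_name_rcu(net, "eth0");
 *      if (dev)
 *              ifindex = dev->ifindex;
 *      rcu_read_unlock();
 */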
 654
 655/**
 656 *      dev_get_by_name         - find a device by its name
 657 *      @net: the applicable net namespace
 658 *      @name: name to find
 659 *
 660 *      Find an interface by name. This can be called from any
 661 *      context and does its own locking. The returned handle has
 662 *      the usage count incremented and the caller must use dev_put() to
 663 *      release it when it is no longer needed. %NULL is returned if no
 664 *      matching device is found.
 665 */
 666
 667struct net_device *dev_get_by_name(struct net *net, const char *name)
 668{
 669        struct net_device *dev;
 670
 671        rcu_read_lock();
 672        dev = dev_get_by_name_rcu(net, name);
 673        if (dev)
 674                dev_hold(dev);
 675        rcu_read_unlock();
 676        return dev;
 677}
 678EXPORT_SYMBOL(dev_get_by_name);
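/*
 * A lookup sketch for the reference-taking variant, usable from any context;
 * the caller must balance it with dev_put():
 *
 *      struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *      if (dev) {
 *              ...                     (dev is safe to use here)
 *              dev_put(dev);
 *      }
 */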
 679
 680/**
 681 *      __dev_get_by_index - find a device by its ifindex
 682 *      @net: the applicable net namespace
 683 *      @ifindex: index of device
 684 *
  685 *      Search for an interface by index. Returns a pointer to the device,
  686 *      or %NULL if the device is not found. The device has not
 687 *      had its reference counter increased so the caller must be careful
 688 *      about locking. The caller must hold either the RTNL semaphore
 689 *      or @dev_base_lock.
 690 */
 691
 692struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 693{
 694        struct hlist_node *p;
 695        struct net_device *dev;
 696        struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698        hlist_for_each_entry(dev, p, head, index_hlist)
 699                if (dev->ifindex == ifindex)
 700                        return dev;
 701
 702        return NULL;
 703}
 704EXPORT_SYMBOL(__dev_get_by_index);
 705
 706/**
 707 *      dev_get_by_index_rcu - find a device by its ifindex
 708 *      @net: the applicable net namespace
 709 *      @ifindex: index of device
 710 *
  711 *      Search for an interface by index. Returns a pointer to the device,
  712 *      or %NULL if the device is not found. The device has not
 713 *      had its reference counter increased so the caller must be careful
 714 *      about locking. The caller must hold RCU lock.
 715 */
 716
 717struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 718{
 719        struct hlist_node *p;
 720        struct net_device *dev;
 721        struct hlist_head *head = dev_index_hash(net, ifindex);
 722
 723        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 724                if (dev->ifindex == ifindex)
 725                        return dev;
 726
 727        return NULL;
 728}
 729EXPORT_SYMBOL(dev_get_by_index_rcu);
 730
 731
 732/**
 733 *      dev_get_by_index - find a device by its ifindex
 734 *      @net: the applicable net namespace
 735 *      @ifindex: index of device
 736 *
  737 *      Search for an interface by index. Returns a pointer to the device,
  738 *      or NULL if the device is not found. The device returned has
 739 *      had a reference added and the pointer is safe until the user calls
 740 *      dev_put to indicate they have finished with it.
 741 */
 742
 743struct net_device *dev_get_by_index(struct net *net, int ifindex)
 744{
 745        struct net_device *dev;
 746
 747        rcu_read_lock();
 748        dev = dev_get_by_index_rcu(net, ifindex);
 749        if (dev)
 750                dev_hold(dev);
 751        rcu_read_unlock();
 752        return dev;
 753}
 754EXPORT_SYMBOL(dev_get_by_index);
 755
 756/**
 757 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 758 *      @net: the applicable net namespace
 759 *      @type: media type of device
 760 *      @ha: hardware address
 761 *
  762 *      Search for an interface by MAC address. Returns a pointer to the
  763 *      device, or NULL if the device is not found.
 764 *      The caller must hold RCU or RTNL.
 765 *      The returned device has not had its ref count increased
 766 *      and the caller must therefore be careful about locking
 767 *
 768 */
 769
 770struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 771                                       const char *ha)
 772{
 773        struct net_device *dev;
 774
 775        for_each_netdev_rcu(net, dev)
 776                if (dev->type == type &&
 777                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 778                        return dev;
 779
 780        return NULL;
 781}
 782EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 783
 784struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 785{
 786        struct net_device *dev;
 787
 788        ASSERT_RTNL();
 789        for_each_netdev(net, dev)
 790                if (dev->type == type)
 791                        return dev;
 792
 793        return NULL;
 794}
 795EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 796
 797struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 798{
 799        struct net_device *dev, *ret = NULL;
 800
 801        rcu_read_lock();
 802        for_each_netdev_rcu(net, dev)
 803                if (dev->type == type) {
 804                        dev_hold(dev);
 805                        ret = dev;
 806                        break;
 807                }
 808        rcu_read_unlock();
 809        return ret;
 810}
 811EXPORT_SYMBOL(dev_getfirstbyhwtype);
 812
 813/**
 814 *      dev_get_by_flags_rcu - find any device with given flags
 815 *      @net: the applicable net namespace
 816 *      @if_flags: IFF_* values
 817 *      @mask: bitmask of bits in if_flags to check
 818 *
  819 *      Search for any interface with the given flags. Returns a pointer to
  820 *      the device, or NULL if no matching device is found. Must be called inside
 821 *      rcu_read_lock(), and result refcount is unchanged.
 822 */
 823
 824struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 825                                    unsigned short mask)
 826{
 827        struct net_device *dev, *ret;
 828
 829        ret = NULL;
 830        for_each_netdev_rcu(net, dev) {
 831                if (((dev->flags ^ if_flags) & mask) == 0) {
 832                        ret = dev;
 833                        break;
 834                }
 835        }
 836        return ret;
 837}
 838EXPORT_SYMBOL(dev_get_by_flags_rcu);
 839
 840/**
 841 *      dev_valid_name - check if name is okay for network device
 842 *      @name: name string
 843 *
  844 *      Network device names need to be valid file names
 845 *      to allow sysfs to work.  We also disallow any kind of
 846 *      whitespace.
 847 */
 848int dev_valid_name(const char *name)
 849{
 850        if (*name == '\0')
 851                return 0;
 852        if (strlen(name) >= IFNAMSIZ)
 853                return 0;
 854        if (!strcmp(name, ".") || !strcmp(name, ".."))
 855                return 0;
 856
 857        while (*name) {
 858                if (*name == '/' || isspace(*name))
 859                        return 0;
 860                name++;
 861        }
 862        return 1;
 863}
 864EXPORT_SYMBOL(dev_valid_name);
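/*
 * A few examples of the checks above:
 *
 *      dev_valid_name("eth0")          returns 1
 *      dev_valid_name("eth%d")         returns 1  ('%' is allowed here)
 *      dev_valid_name("")              returns 0
 *      dev_valid_name("..")            returns 0
 *      dev_valid_name("a/b")           returns 0
 *      dev_valid_name("a b")           returns 0
 */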
 865
 866/**
 867 *      __dev_alloc_name - allocate a name for a device
 868 *      @net: network namespace to allocate the device name in
 869 *      @name: name format string
 870 *      @buf:  scratch buffer and result name string
 871 *
  872 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 873 *      id. It scans list of devices to build up a free map, then chooses
 874 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 875 *      while allocating the name and adding the device in order to avoid
 876 *      duplicates.
 877 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 878 *      Returns the number of the unit assigned or a negative errno code.
 879 */
 880
 881static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 882{
 883        int i = 0;
 884        const char *p;
 885        const int max_netdevices = 8*PAGE_SIZE;
 886        unsigned long *inuse;
 887        struct net_device *d;
 888
 889        p = strnchr(name, IFNAMSIZ-1, '%');
 890        if (p) {
 891                /*
 892                 * Verify the string as this thing may have come from
  893                 * the user.  There must be exactly one "%d" and no other "%"
 894                 * characters.
 895                 */
 896                if (p[1] != 'd' || strchr(p + 2, '%'))
 897                        return -EINVAL;
 898
 899                /* Use one page as a bit array of possible slots */
 900                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 901                if (!inuse)
 902                        return -ENOMEM;
 903
 904                for_each_netdev(net, d) {
 905                        if (!sscanf(d->name, name, &i))
 906                                continue;
 907                        if (i < 0 || i >= max_netdevices)
 908                                continue;
 909
 910                        /*  avoid cases where sscanf is not exact inverse of printf */
 911                        snprintf(buf, IFNAMSIZ, name, i);
 912                        if (!strncmp(buf, d->name, IFNAMSIZ))
 913                                set_bit(i, inuse);
 914                }
 915
 916                i = find_first_zero_bit(inuse, max_netdevices);
 917                free_page((unsigned long) inuse);
 918        }
 919
 920        if (buf != name)
 921                snprintf(buf, IFNAMSIZ, name, i);
 922        if (!__dev_get_by_name(net, buf))
 923                return i;
 924
 925        /* It is possible to run out of possible slots
 926         * when the name is long and there isn't enough space left
 927         * for the digits, or if all bits are used.
 928         */
 929        return -ENFILE;
 930}
 931
 932/**
 933 *      dev_alloc_name - allocate a name for a device
 934 *      @dev: device
 935 *      @name: name format string
 936 *
  937 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 938 *      id. It scans list of devices to build up a free map, then chooses
 939 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 940 *      while allocating the name and adding the device in order to avoid
 941 *      duplicates.
 942 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 943 *      Returns the number of the unit assigned or a negative errno code.
 944 */
 945
 946int dev_alloc_name(struct net_device *dev, const char *name)
 947{
 948        char buf[IFNAMSIZ];
 949        struct net *net;
 950        int ret;
 951
 952        BUG_ON(!dev_net(dev));
 953        net = dev_net(dev);
 954        ret = __dev_alloc_name(net, name, buf);
 955        if (ret >= 0)
 956                strlcpy(dev->name, buf, IFNAMSIZ);
 957        return ret;
 958}
 959EXPORT_SYMBOL(dev_alloc_name);
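/*
 * A usage sketch: drivers typically pass a format such as "eth%d".  If eth0
 * and eth1 already exist the first free slot is picked, so
 *
 *      err = dev_alloc_name(dev, "eth%d");
 *
 * sets dev->name to "eth2" and returns 2; a negative return is an errno.
 */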
 960
 961static int dev_get_valid_name(struct net_device *dev, const char *name)
 962{
 963        struct net *net;
 964
 965        BUG_ON(!dev_net(dev));
 966        net = dev_net(dev);
 967
 968        if (!dev_valid_name(name))
 969                return -EINVAL;
 970
 971        if (strchr(name, '%'))
 972                return dev_alloc_name(dev, name);
 973        else if (__dev_get_by_name(net, name))
 974                return -EEXIST;
 975        else if (dev->name != name)
 976                strlcpy(dev->name, name, IFNAMSIZ);
 977
 978        return 0;
 979}
 980
 981/**
 982 *      dev_change_name - change name of a device
 983 *      @dev: device
 984 *      @newname: name (or format string) must be at least IFNAMSIZ
 985 *
  986 *      Change the name of a device. A format string such as "eth%d"
  987 *      can be passed for wildcarding.
 988 */
 989int dev_change_name(struct net_device *dev, const char *newname)
 990{
 991        char oldname[IFNAMSIZ];
 992        int err = 0;
 993        int ret;
 994        struct net *net;
 995
 996        ASSERT_RTNL();
 997        BUG_ON(!dev_net(dev));
 998
 999        net = dev_net(dev);
1000        if (dev->flags & IFF_UP)
1001                return -EBUSY;
1002
1003        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004                return 0;
1005
1006        memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008        err = dev_get_valid_name(dev, newname);
1009        if (err < 0)
1010                return err;
1011
1012rollback:
1013        ret = device_rename(&dev->dev, dev->name);
1014        if (ret) {
1015                memcpy(dev->name, oldname, IFNAMSIZ);
1016                return ret;
1017        }
1018
1019        write_lock_bh(&dev_base_lock);
1020        hlist_del_rcu(&dev->name_hlist);
1021        write_unlock_bh(&dev_base_lock);
1022
1023        synchronize_rcu();
1024
1025        write_lock_bh(&dev_base_lock);
1026        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027        write_unlock_bh(&dev_base_lock);
1028
1029        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030        ret = notifier_to_errno(ret);
1031
1032        if (ret) {
1033                /* err >= 0 after dev_alloc_name() or stores the first errno */
1034                if (err >= 0) {
1035                        err = ret;
1036                        memcpy(dev->name, oldname, IFNAMSIZ);
1037                        goto rollback;
1038                } else {
1039                        printk(KERN_ERR
1040                               "%s: name change rollback failed: %d.\n",
1041                               dev->name, ret);
1042                }
1043        }
1044
1045        return err;
1046}
1047
1048/**
1049 *      dev_set_alias - change ifalias of a device
1050 *      @dev: device
1051 *      @alias: name up to IFALIASZ
 1052 *      @len: limit of bytes to copy from @alias
 1053 *
 1054 *      Set the ifalias for a device.
1055 */
1056int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057{
1058        ASSERT_RTNL();
1059
1060        if (len >= IFALIASZ)
1061                return -EINVAL;
1062
1063        if (!len) {
1064                if (dev->ifalias) {
1065                        kfree(dev->ifalias);
1066                        dev->ifalias = NULL;
1067                }
1068                return 0;
1069        }
1070
1071        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072        if (!dev->ifalias)
1073                return -ENOMEM;
1074
1075        strlcpy(dev->ifalias, alias, len+1);
1076        return len;
1077}
1078
1079
1080/**
1081 *      netdev_features_change - device changes features
1082 *      @dev: device to cause notification
1083 *
1084 *      Called to indicate a device has changed features.
1085 */
1086void netdev_features_change(struct net_device *dev)
1087{
1088        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089}
1090EXPORT_SYMBOL(netdev_features_change);
1091
1092/**
1093 *      netdev_state_change - device changes state
1094 *      @dev: device to cause notification
1095 *
1096 *      Called to indicate a device has changed state. This function calls
1097 *      the notifier chains for netdev_chain and sends a NEWLINK message
1098 *      to the routing socket.
1099 */
1100void netdev_state_change(struct net_device *dev)
1101{
1102        if (dev->flags & IFF_UP) {
1103                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105        }
1106}
1107EXPORT_SYMBOL(netdev_state_change);
1108
1109int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110{
1111        return call_netdevice_notifiers(event, dev);
1112}
1113EXPORT_SYMBOL(netdev_bonding_change);
1114
1115/**
1116 *      dev_load        - load a network module
1117 *      @net: the applicable net namespace
1118 *      @name: name of interface
1119 *
1120 *      If a network interface is not present and the process has suitable
1121 *      privileges this function loads the module. If module loading is not
1122 *      available in this kernel then it becomes a nop.
1123 */
1124
1125void dev_load(struct net *net, const char *name)
1126{
1127        struct net_device *dev;
1128        int no_module;
1129
1130        rcu_read_lock();
1131        dev = dev_get_by_name_rcu(net, name);
1132        rcu_read_unlock();
1133
1134        no_module = !dev;
1135        if (no_module && capable(CAP_NET_ADMIN))
1136                no_module = request_module("netdev-%s", name);
1137        if (no_module && capable(CAP_SYS_MODULE)) {
1138                if (!request_module("%s", name))
1139                        pr_err("Loading kernel module for a network device "
1140"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141"instead\n", name);
1142        }
1143}
1144EXPORT_SYMBOL(dev_load);
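/*
 * A usage sketch with a hypothetical interface name: ioctl paths call
 *
 *      dev_load(net, "mydev0");
 *
 * when "mydev0" does not exist, which first tries
 * request_module("netdev-mydev0").  A driver can provide that alias (the
 * MODULE_ALIAS_NETDEV() helper expands to such a MODULE_ALIAS("netdev-...")
 * entry); the CAP_SYS_MODULE fallback on the bare name is the deprecated
 * path warned about above.
 */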
1145
1146static int __dev_open(struct net_device *dev)
1147{
1148        const struct net_device_ops *ops = dev->netdev_ops;
1149        int ret;
1150
1151        ASSERT_RTNL();
1152
1153        if (!netif_device_present(dev))
1154                return -ENODEV;
1155
1156        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157        ret = notifier_to_errno(ret);
1158        if (ret)
1159                return ret;
1160
1161        set_bit(__LINK_STATE_START, &dev->state);
1162
1163        if (ops->ndo_validate_addr)
1164                ret = ops->ndo_validate_addr(dev);
1165
1166        if (!ret && ops->ndo_open)
1167                ret = ops->ndo_open(dev);
1168
1169        if (ret)
1170                clear_bit(__LINK_STATE_START, &dev->state);
1171        else {
1172                dev->flags |= IFF_UP;
1173                net_dmaengine_get();
1174                dev_set_rx_mode(dev);
1175                dev_activate(dev);
1176        }
1177
1178        return ret;
1179}
1180
1181/**
1182 *      dev_open        - prepare an interface for use.
1183 *      @dev:   device to open
1184 *
1185 *      Takes a device from down to up state. The device's private open
1186 *      function is invoked and then the multicast lists are loaded. Finally
1187 *      the device is moved into the up state and a %NETDEV_UP message is
1188 *      sent to the netdev notifier chain.
1189 *
1190 *      Calling this function on an active interface is a nop. On a failure
1191 *      a negative errno code is returned.
1192 */
1193int dev_open(struct net_device *dev)
1194{
1195        int ret;
1196
1197        if (dev->flags & IFF_UP)
1198                return 0;
1199
1200        ret = __dev_open(dev);
1201        if (ret < 0)
1202                return ret;
1203
1204        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205        call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207        return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
1210
1211static int __dev_close_many(struct list_head *head)
1212{
1213        struct net_device *dev;
1214
1215        ASSERT_RTNL();
1216        might_sleep();
1217
1218        list_for_each_entry(dev, head, unreg_list) {
1219                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220
1221                clear_bit(__LINK_STATE_START, &dev->state);
1222
 1223                /* Synchronize to the scheduled poll. We cannot touch the poll
 1224                 * list; it can even be on a different cpu. So just clear netif_running().
 1225                 *
 1226                 * dev->stop() will invoke napi_disable() on all of its
 1227                 * napi_struct instances on this device.
 1228                 */
1229                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230        }
1231
1232        dev_deactivate_many(head);
1233
1234        list_for_each_entry(dev, head, unreg_list) {
1235                const struct net_device_ops *ops = dev->netdev_ops;
1236
1237                /*
 1238                 *      Call the device specific close. This cannot fail.
 1239                 *      It is only called if the device is UP.
1240                 *
1241                 *      We allow it to be called even after a DETACH hot-plug
1242                 *      event.
1243                 */
1244                if (ops->ndo_stop)
1245                        ops->ndo_stop(dev);
1246
1247                dev->flags &= ~IFF_UP;
1248                net_dmaengine_put();
1249        }
1250
1251        return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
1256        int retval;
1257        LIST_HEAD(single);
1258
1259        list_add(&dev->unreg_list, &single);
1260        retval = __dev_close_many(&single);
1261        list_del(&single);
1262        return retval;
1263}
1264
1265static int dev_close_many(struct list_head *head)
1266{
1267        struct net_device *dev, *tmp;
1268        LIST_HEAD(tmp_list);
1269
1270        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271                if (!(dev->flags & IFF_UP))
1272                        list_move(&dev->unreg_list, &tmp_list);
1273
1274        __dev_close_many(head);
1275
1276        list_for_each_entry(dev, head, unreg_list) {
1277                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278                call_netdevice_notifiers(NETDEV_DOWN, dev);
1279        }
1280
1281        /* rollback_registered_many needs the complete original list */
1282        list_splice(&tmp_list, head);
1283        return 0;
1284}
1285
1286/**
1287 *      dev_close - shutdown an interface.
1288 *      @dev: device to shutdown
1289 *
1290 *      This function moves an active device into down state. A
1291 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 *      chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
1297        if (dev->flags & IFF_UP) {
1298                LIST_HEAD(single);
1299
1300                list_add(&dev->unreg_list, &single);
1301                dev_close_many(&single);
1302                list_del(&single);
1303        }
1304        return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *      dev_disable_lro - disable Large Receive Offload on a device
1311 *      @dev: device
1312 *
1313 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *      called under RTNL.  This is needed if received packets may be
1315 *      forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319        u32 flags;
1320
1321        /*
1322         * If we're trying to disable lro on a vlan device
1323         * use the underlying physical device instead
1324         */
1325        if (is_vlan_dev(dev))
1326                dev = vlan_dev_real_dev(dev);
1327
1328        if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329                flags = dev->ethtool_ops->get_flags(dev);
1330        else
1331                flags = ethtool_op_get_flags(dev);
1332
1333        if (!(flags & ETH_FLAG_LRO))
1334                return;
1335
1336        __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337        if (unlikely(dev->features & NETIF_F_LRO))
1338                netdev_WARN(dev, "failed to disable LRO!\n");
1339}
1340EXPORT_SYMBOL(dev_disable_lro);
1341
1342
1343static int dev_boot_phase = 1;
1344
1345/**
1346 *      register_netdevice_notifier - register a network notifier block
1347 *      @nb: notifier
1348 *
1349 *      Register a notifier to be called when network device events occur.
1350 *      The notifier passed is linked into the kernel structures and must
1351 *      not be reused until it has been unregistered. A negative errno code
1352 *      is returned on a failure.
1353 *
 1354 *      When registered, all registration and up events are replayed
 1355 *      to the new notifier to allow it to have a race-free
 1356 *      view of the network device list.
1357 */
1358
1359int register_netdevice_notifier(struct notifier_block *nb)
1360{
1361        struct net_device *dev;
1362        struct net_device *last;
1363        struct net *net;
1364        int err;
1365
1366        rtnl_lock();
1367        err = raw_notifier_chain_register(&netdev_chain, nb);
1368        if (err)
1369                goto unlock;
1370        if (dev_boot_phase)
1371                goto unlock;
1372        for_each_net(net) {
1373                for_each_netdev(net, dev) {
1374                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375                        err = notifier_to_errno(err);
1376                        if (err)
1377                                goto rollback;
1378
1379                        if (!(dev->flags & IFF_UP))
1380                                continue;
1381
1382                        nb->notifier_call(nb, NETDEV_UP, dev);
1383                }
1384        }
1385
1386unlock:
1387        rtnl_unlock();
1388        return err;
1389
1390rollback:
1391        last = dev;
1392        for_each_net(net) {
1393                for_each_netdev(net, dev) {
1394                        if (dev == last)
1395                                break;
1396
1397                        if (dev->flags & IFF_UP) {
1398                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1400                        }
1401                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402                        nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403                }
1404        }
1405
1406        raw_notifier_chain_unregister(&netdev_chain, nb);
1407        goto unlock;
1408}
1409EXPORT_SYMBOL(register_netdevice_notifier);
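/*
 * A registration sketch with hypothetical names, reacting to devices coming
 * and going; as the replay code above shows, the notifier's data pointer is
 * the net_device itself in this kernel:
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = ptr;
 *
 *              switch (event) {
 *              case NETDEV_UP:
 *                      handle_up(dev);         (hypothetical)
 *                      break;
 *              case NETDEV_UNREGISTER:
 *                      handle_gone(dev);       (hypothetical)
 *                      break;
 *              }
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_netdev_notifier = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&my_netdev_notifier);
 */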
1410
1411/**
1412 *      unregister_netdevice_notifier - unregister a network notifier block
1413 *      @nb: notifier
1414 *
1415 *      Unregister a notifier previously registered by
 1416 *      register_netdevice_notifier(). The notifier is unlinked from the
1417 *      kernel structures and may then be reused. A negative errno code
1418 *      is returned on a failure.
1419 */
1420
1421int unregister_netdevice_notifier(struct notifier_block *nb)
1422{
1423        int err;
1424
1425        rtnl_lock();
1426        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427        rtnl_unlock();
1428        return err;
1429}
1430EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
1432/**
1433 *      call_netdevice_notifiers - call all network notifier blocks
1434 *      @val: value passed unmodified to notifier function
1435 *      @dev: net_device pointer passed unmodified to notifier function
1436 *
1437 *      Call all network notifier blocks.  Parameters and return value
1438 *      are as for raw_notifier_call_chain().
1439 */
1440
1441int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442{
1443        ASSERT_RTNL();
1444        return raw_notifier_call_chain(&netdev_chain, val, dev);
1445}
1446EXPORT_SYMBOL(call_netdevice_notifiers);
1447
1448/* When > 0 there are consumers of rx skb time stamps */
1449static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
1451void net_enable_timestamp(void)
1452{
1453        atomic_inc(&netstamp_needed);
1454}
1455EXPORT_SYMBOL(net_enable_timestamp);
1456
1457void net_disable_timestamp(void)
1458{
1459        atomic_dec(&netstamp_needed);
1460}
1461EXPORT_SYMBOL(net_disable_timestamp);
1462
1463static inline void net_timestamp_set(struct sk_buff *skb)
1464{
1465        if (atomic_read(&netstamp_needed))
1466                __net_timestamp(skb);
1467        else
1468                skb->tstamp.tv64 = 0;
1469}
1470
1471static inline void net_timestamp_check(struct sk_buff *skb)
1472{
1473        if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474                __net_timestamp(skb);
1475}
1476
1477static inline bool is_skb_forwardable(struct net_device *dev,
1478                                      struct sk_buff *skb)
1479{
1480        unsigned int len;
1481
1482        if (!(dev->flags & IFF_UP))
1483                return false;
1484
1485        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486        if (skb->len <= len)
1487                return true;
1488
 1489        /* if TSO is enabled, we don't care about the length, as the packet
 1490         * may be forwarded without having been segmented beforehand
1491         */
1492        if (skb_is_gso(skb))
1493                return true;
1494
1495        return false;
1496}
1497
1498/**
1499 * dev_forward_skb - loopback an skb to another netif
1500 *
1501 * @dev: destination network device
1502 * @skb: buffer to forward
1503 *
1504 * return values:
1505 *      NET_RX_SUCCESS  (no congestion)
1506 *      NET_RX_DROP     (packet was dropped, but freed)
1507 *
1508 * dev_forward_skb can be used for injecting an skb from the
1509 * start_xmit function of one device into the receive queue
1510 * of another device.
1511 *
1512 * The receiving device may be in another namespace, so
1513 * we have to clear all information in the skb that could
1514 * impact namespace isolation.
1515 */
1516int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517{
1518        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1519                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1520                        atomic_long_inc(&dev->rx_dropped);
1521                        kfree_skb(skb);
1522                        return NET_RX_DROP;
1523                }
1524        }
1525
1526        skb_orphan(skb);
1527        nf_reset(skb);
1528
1529        if (unlikely(!is_skb_forwardable(dev, skb))) {
1530                atomic_long_inc(&dev->rx_dropped);
1531                kfree_skb(skb);
1532                return NET_RX_DROP;
1533        }
1534        skb_set_dev(skb, dev);
1535        skb->tstamp.tv64 = 0;
1536        skb->pkt_type = PACKET_HOST;
1537        skb->protocol = eth_type_trans(skb, dev);
1538        return netif_rx(skb);
1539}
1540EXPORT_SYMBOL_GPL(dev_forward_skb);
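/*
 * A transmit-path sketch: virtual drivers (veth-style pairs, for instance)
 * call this from ndo_start_xmit to inject the frame into a peer device's
 * receive path; my_get_peer() is a hypothetical lookup:
 *
 *      static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *      {
 *              struct net_device *peer = my_get_peer(dev);
 *
 *              if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *                      dev->stats.tx_dropped++;
 *              return NETDEV_TX_OK;
 *      }
 */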
1541
1542static inline int deliver_skb(struct sk_buff *skb,
1543                              struct packet_type *pt_prev,
1544                              struct net_device *orig_dev)
1545{
1546        atomic_inc(&skb->users);
1547        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1548}
1549
1550/*
1551 *      Support routine. Sends outgoing frames to any network
1552 *      taps currently in use.
1553 */
1554
1555static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1556{
1557        struct packet_type *ptype;
1558        struct sk_buff *skb2 = NULL;
1559        struct packet_type *pt_prev = NULL;
1560
1561        rcu_read_lock();
1562        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1563                /* Never send packets back to the socket
1564                 * they originated from - MvS (miquels@drinkel.ow.org)
1565                 */
1566                if ((ptype->dev == dev || !ptype->dev) &&
1567                    (ptype->af_packet_priv == NULL ||
1568                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1569                        if (pt_prev) {
1570                                deliver_skb(skb2, pt_prev, skb->dev);
1571                                pt_prev = ptype;
1572                                continue;
1573                        }
1574
1575                        skb2 = skb_clone(skb, GFP_ATOMIC);
1576                        if (!skb2)
1577                                break;
1578
1579                        net_timestamp_set(skb2);
1580
1581                        /* The network header should already be set
1582                           correctly by the sender, so the check below
1583                           is just protection against buggy protocols.
1584                         */
1585                        skb_reset_mac_header(skb2);
1586
1587                        if (skb_network_header(skb2) < skb2->data ||
1588                            skb2->network_header > skb2->tail) {
1589                                if (net_ratelimit())
1590                                        printk(KERN_CRIT "protocol %04x is "
1591                                               "buggy, dev %s\n",
1592                                               ntohs(skb2->protocol),
1593                                               dev->name);
1594                                skb_reset_network_header(skb2);
1595                        }
1596
1597                        skb2->transport_header = skb2->network_header;
1598                        skb2->pkt_type = PACKET_OUTGOING;
1599                        pt_prev = ptype;
1600                }
1601        }
1602        if (pt_prev)
1603                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1604        rcu_read_unlock();
1605}
1606
1607/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1608 * @dev: Network device
1609 * @txq: number of queues available
1610 *
1611 * If real_num_tx_queues is changed the tc mappings may no longer be
1612 * valid. To resolve this verify that each tc mapping remains valid
1613 * and, if not, zero the mapping. Once no priorities map to an
1614 * offset/count pair it will no longer be used. In the worst case, if
1615 * TC0 is invalid, nothing can be done, so priority mappings are
1616 * disabled entirely. It is expected that drivers will fix this mapping
1617 * if they can before calling netif_set_real_num_tx_queues.
1618 */
1619static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1620{
1621        int i;
1622        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1623
1624        /* If TC0 is invalidated disable TC mapping */
1625        if (tc->offset + tc->count > txq) {
1626                pr_warning("Number of in use tx queues changed, "
1627                           "invalidating tc mappings. Priority "
1628                           "traffic classification disabled!\n");
1629                dev->num_tc = 0;
1630                return;
1631        }
1632
1633        /* Invalidated prio to tc mappings set to TC0 */
1634        for (i = 1; i < TC_BITMASK + 1; i++) {
1635                int q = netdev_get_prio_tc_map(dev, i);
1636
1637                tc = &dev->tc_to_txq[q];
1638                if (tc->offset + tc->count > txq) {
1639                        pr_warning("Number of in use tx queues "
1640                                   "changed. Priority %i to tc "
1641                                   "mapping %i is no longer valid; "
1642                                   "setting map to 0\n",
1643                                   i, q);
1644                        netdev_set_prio_tc_map(dev, i, 0);
1645                }
1646        }
1647}
1648
1649/*
1650 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1651 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1652 */
1653int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1654{
1655        int rc;
1656
1657        if (txq < 1 || txq > dev->num_tx_queues)
1658                return -EINVAL;
1659
1660        if (dev->reg_state == NETREG_REGISTERED ||
1661            dev->reg_state == NETREG_UNREGISTERING) {
1662                ASSERT_RTNL();
1663
1664                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1665                                                  txq);
1666                if (rc)
1667                        return rc;
1668
1669                if (dev->num_tc)
1670                        netif_setup_tc(dev, txq);
1671
1672                if (txq < dev->real_num_tx_queues)
1673                        qdisc_reset_all_tx_gt(dev, txq);
1674        }
1675
1676        dev->real_num_tx_queues = txq;
1677        return 0;
1678}
1679EXPORT_SYMBOL(netif_set_real_num_tx_queues);
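
/*
 * Illustrative sketch, not part of this file: how a multiqueue driver might
 * shrink or grow its active TX queue count at runtime, e.g. after a channel
 * reconfiguration.  example_reconfigure() and new_count are assumptions.
 *
 *	static int example_reconfigure(struct net_device *dev,
 *				       unsigned int new_count)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = netif_set_real_num_tx_queues(dev, new_count);
 *		rtnl_unlock();
 *		return err;
 *	}
 */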
1680
1681#ifdef CONFIG_RPS
1682/**
1683 *      netif_set_real_num_rx_queues - set actual number of RX queues used
1684 *      @dev: Network device
1685 *      @rxq: Actual number of RX queues
1686 *
1687 *      This must be called either with the rtnl_lock held or before
1688 *      registration of the net device.  Returns 0 on success, or a
1689 *      negative error code.  If called before registration, it always
1690 *      succeeds.
1691 */
1692int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1693{
1694        int rc;
1695
1696        if (rxq < 1 || rxq > dev->num_rx_queues)
1697                return -EINVAL;
1698
1699        if (dev->reg_state == NETREG_REGISTERED) {
1700                ASSERT_RTNL();
1701
1702                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1703                                                  rxq);
1704                if (rc)
1705                        return rc;
1706        }
1707
1708        dev->real_num_rx_queues = rxq;
1709        return 0;
1710}
1711EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1712#endif
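
/*
 * Illustrative sketch, not part of this file: a driver typically sets both
 * the RX and TX queue counts it actually enabled, from a configuration
 * handler that already runs under RTNL.  The names here are assumptions.
 *
 *	static int example_set_channels(struct net_device *dev,
 *					unsigned int count)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netif_set_real_num_rx_queues(dev, count);
 *		if (err)
 *			return err;
 *		return netif_set_real_num_tx_queues(dev, count);
 *	}
 */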
1713
1714static inline void __netif_reschedule(struct Qdisc *q)
1715{
1716        struct softnet_data *sd;
1717        unsigned long flags;
1718
1719        local_irq_save(flags);
1720        sd = &__get_cpu_var(softnet_data);
1721        q->next_sched = NULL;
1722        *sd->output_queue_tailp = q;
1723        sd->output_queue_tailp = &q->next_sched;
1724        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1725        local_irq_restore(flags);
1726}
1727
1728void __netif_schedule(struct Qdisc *q)
1729{
1730        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1731                __netif_reschedule(q);
1732}
1733EXPORT_SYMBOL(__netif_schedule);
1734
1735void dev_kfree_skb_irq(struct sk_buff *skb)
1736{
1737        if (atomic_dec_and_test(&skb->users)) {
1738                struct softnet_data *sd;
1739                unsigned long flags;
1740
1741                local_irq_save(flags);
1742                sd = &__get_cpu_var(softnet_data);
1743                skb->next = sd->completion_queue;
1744                sd->completion_queue = skb;
1745                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1746                local_irq_restore(flags);
1747        }
1748}
1749EXPORT_SYMBOL(dev_kfree_skb_irq);
1750
1751void dev_kfree_skb_any(struct sk_buff *skb)
1752{
1753        if (in_irq() || irqs_disabled())
1754                dev_kfree_skb_irq(skb);
1755        else
1756                dev_kfree_skb(skb);
1757}
1758EXPORT_SYMBOL(dev_kfree_skb_any);
1759
1760
1761/**
1762 * netif_device_detach - mark device as removed
1763 * @dev: network device
1764 *
1765 * Mark device as removed from the system and therefore no longer available.
1766 */
1767void netif_device_detach(struct net_device *dev)
1768{
1769        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1770            netif_running(dev)) {
1771                netif_tx_stop_all_queues(dev);
1772        }
1773}
1774EXPORT_SYMBOL(netif_device_detach);
1775
1776/**
1777 * netif_device_attach - mark device as attached
1778 * @dev: network device
1779 *
1780 * Mark device as attached to the system and restart if needed.
1781 */
1782void netif_device_attach(struct net_device *dev)
1783{
1784        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1785            netif_running(dev)) {
1786                netif_tx_wake_all_queues(dev);
1787                __netdev_watchdog_up(dev);
1788        }
1789}
1790EXPORT_SYMBOL(netif_device_attach);
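
/*
 * Illustrative sketch, not part of this file: the usual suspend/resume
 * pairing for these helpers in a PCI network driver.  example_hw_suspend()
 * and example_hw_resume() are assumptions standing in for device-specific
 * code.
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		example_hw_suspend(pdev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		example_hw_resume(pdev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */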
1791
1792/**
1793 * skb_set_dev - assign a new device to a buffer
1794 * @skb: buffer for the new device
1795 * @dev: network device
1796 *
1797 * If an skb is owned by a device already, we have to reset
1798 * all data private to the namespace a device belongs to
1799 * before assigning it a new device.
1800 */
1801#ifdef CONFIG_NET_NS
1802void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1803{
1804        skb_dst_drop(skb);
1805        if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1806                secpath_reset(skb);
1807                nf_reset(skb);
1808                skb_init_secmark(skb);
1809                skb->mark = 0;
1810                skb->priority = 0;
1811                skb->nf_trace = 0;
1812                skb->ipvs_property = 0;
1813#ifdef CONFIG_NET_SCHED
1814                skb->tc_index = 0;
1815#endif
1816        }
1817        skb->dev = dev;
1818}
1819EXPORT_SYMBOL(skb_set_dev);
1820#endif /* CONFIG_NET_NS */
1821
1822/*
1823 * Invalidate hardware checksum when packet is to be mangled, and
1824 * complete checksum manually on outgoing path.
1825 */
1826int skb_checksum_help(struct sk_buff *skb)
1827{
1828        __wsum csum;
1829        int ret = 0, offset;
1830
1831        if (skb->ip_summed == CHECKSUM_COMPLETE)
1832                goto out_set_summed;
1833
1834        if (unlikely(skb_shinfo(skb)->gso_size)) {
1835                /* Let GSO fix up the checksum. */
1836                goto out_set_summed;
1837        }
1838
1839        offset = skb_checksum_start_offset(skb);
1840        BUG_ON(offset >= skb_headlen(skb));
1841        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1842
1843        offset += skb->csum_offset;
1844        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1845
1846        if (skb_cloned(skb) &&
1847            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1848                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1849                if (ret)
1850                        goto out;
1851        }
1852
1853        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1854out_set_summed:
1855        skb->ip_summed = CHECKSUM_NONE;
1856out:
1857        return ret;
1858}
1859EXPORT_SYMBOL(skb_checksum_help);
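
/*
 * Illustrative sketch, not part of this file: a driver whose hardware cannot
 * checksum a given frame can fall back to software checksumming before
 * handing the skb to the hardware.  example_hw_can_csum() is an assumption.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(dev, skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;	// could not complete the checksum in software
 */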
1860
1861/**
1862 *      skb_gso_segment - Perform segmentation on skb.
1863 *      @skb: buffer to segment
1864 *      @features: features for the output path (see dev->features)
1865 *
1866 *      This function segments the given skb and returns a list of segments.
1867 *
1868 *      It may return NULL if the skb requires no segmentation.  This is
1869 *      only possible when GSO is used for verifying header integrity.
1870 */
1871struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1872{
1873        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1874        struct packet_type *ptype;
1875        __be16 type = skb->protocol;
1876        int vlan_depth = ETH_HLEN;
1877        int err;
1878
1879        while (type == htons(ETH_P_8021Q)) {
1880                struct vlan_hdr *vh;
1881
1882                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1883                        return ERR_PTR(-EINVAL);
1884
1885                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1886                type = vh->h_vlan_encapsulated_proto;
1887                vlan_depth += VLAN_HLEN;
1888        }
1889
1890        skb_reset_mac_header(skb);
1891        skb->mac_len = skb->network_header - skb->mac_header;
1892        __skb_pull(skb, skb->mac_len);
1893
1894        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1895                struct net_device *dev = skb->dev;
1896                struct ethtool_drvinfo info = {};
1897
1898                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1899                        dev->ethtool_ops->get_drvinfo(dev, &info);
1900
1901                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1902                     info.driver, dev ? dev->features : 0L,
1903                     skb->sk ? skb->sk->sk_route_caps : 0L,
1904                     skb->len, skb->data_len, skb->ip_summed);
1905
1906                if (skb_header_cloned(skb) &&
1907                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1908                        return ERR_PTR(err);
1909        }
1910
1911        rcu_read_lock();
1912        list_for_each_entry_rcu(ptype,
1913                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1914                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1915                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1916                                err = ptype->gso_send_check(skb);
1917                                segs = ERR_PTR(err);
1918                                if (err || skb_gso_ok(skb, features))
1919                                        break;
1920                                __skb_push(skb, (skb->data -
1921                                                 skb_network_header(skb)));
1922                        }
1923                        segs = ptype->gso_segment(skb, features);
1924                        break;
1925                }
1926        }
1927        rcu_read_unlock();
1928
1929        __skb_push(skb, skb->data - skb_mac_header(skb));
1930
1931        return segs;
1932}
1933EXPORT_SYMBOL(skb_gso_segment);
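
/*
 * Illustrative sketch, not part of this file: a caller that cannot pass a GSO
 * skb through (e.g. a software forwarding or tunnel path) can segment it and
 * handle each resulting segment separately.  example_xmit_one() is an
 * assumption; the features argument would come from the output device.
 *
 *	struct sk_buff *segs, *nskb;
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		kfree_skb(skb);		// original is no longer needed
 *		skb = segs;
 *	}
 *	do {
 *		nskb = skb->next;
 *		skb->next = NULL;
 *		example_xmit_one(skb);
 *		skb = nskb;
 *	} while (skb);
 */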
1934
1935/* Take action when hardware reception checksum errors are detected. */
1936#ifdef CONFIG_BUG
1937void netdev_rx_csum_fault(struct net_device *dev)
1938{
1939        if (net_ratelimit()) {
1940                printk(KERN_ERR "%s: hw csum failure.\n",
1941                        dev ? dev->name : "<unknown>");
1942                dump_stack();
1943        }
1944}
1945EXPORT_SYMBOL(netdev_rx_csum_fault);
1946#endif
1947
1948/* Actually, we should eliminate this check as soon as we know that:
1949 * 1. An IOMMU is present and can map all the memory.
1950 * 2. No high memory really exists on this machine.
1951 */
1952
1953static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1954{
1955#ifdef CONFIG_HIGHMEM
1956        int i;
1957        if (!(dev->features & NETIF_F_HIGHDMA)) {
1958                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1959                        if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1960                                return 1;
1961        }
1962
1963        if (PCI_DMA_BUS_IS_PHYS) {
1964                struct device *pdev = dev->dev.parent;
1965
1966                if (!pdev)
1967                        return 0;
1968                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1969                        dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1970                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1971                                return 1;
1972                }
1973        }
1974#endif
1975        return 0;
1976}
1977
1978struct dev_gso_cb {
1979        void (*destructor)(struct sk_buff *skb);
1980};
1981
1982#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1983
1984static void dev_gso_skb_destructor(struct sk_buff *skb)
1985{
1986        struct dev_gso_cb *cb;
1987
1988        do {
1989                struct sk_buff *nskb = skb->next;
1990
1991                skb->next = nskb->next;
1992                nskb->next = NULL;
1993                kfree_skb(nskb);
1994        } while (skb->next);
1995
1996        cb = DEV_GSO_CB(skb);
1997        if (cb->destructor)
1998                cb->destructor(skb);
1999}
2000
2001/**
2002 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2003 *      @skb: buffer to segment
2004 *      @features: device features as applicable to this skb
2005 *
2006 *      This function segments the given skb and stores the list of segments
2007 *      in skb->next.
2008 */
2009static int dev_gso_segment(struct sk_buff *skb, int features)
2010{
2011        struct sk_buff *segs;
2012
2013        segs = skb_gso_segment(skb, features);
2014
2015        /* Verifying header integrity only. */
2016        if (!segs)
2017                return 0;
2018
2019        if (IS_ERR(segs))
2020                return PTR_ERR(segs);
2021
2022        skb->next = segs;
2023        DEV_GSO_CB(skb)->destructor = skb->destructor;
2024        skb->destructor = dev_gso_skb_destructor;
2025
2026        return 0;
2027}
2028
2029/*
2030 * Try to orphan skb early, right before transmission by the device.
2031 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2032 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2033 */
2034static inline void skb_orphan_try(struct sk_buff *skb)
2035{
2036        struct sock *sk = skb->sk;
2037
2038        if (sk && !skb_shinfo(skb)->tx_flags) {
2039                /* skb_tx_hash() won't be able to get sk,
2040                 * so we copy sk_hash into skb->rxhash.
2041                 */
2042                if (!skb->rxhash)
2043                        skb->rxhash = sk->sk_hash;
2044                skb_orphan(skb);
2045        }
2046}
2047
2048static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2049{
2050        return ((features & NETIF_F_GEN_CSUM) ||
2051                ((features & NETIF_F_V4_CSUM) &&
2052                 protocol == htons(ETH_P_IP)) ||
2053                ((features & NETIF_F_V6_CSUM) &&
2054                 protocol == htons(ETH_P_IPV6)) ||
2055                ((features & NETIF_F_FCOE_CRC) &&
2056                 protocol == htons(ETH_P_FCOE)));
2057}
2058
2059static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2060{
2061        if (!can_checksum_protocol(features, protocol)) {
2062                features &= ~NETIF_F_ALL_CSUM;
2063                features &= ~NETIF_F_SG;
2064        } else if (illegal_highdma(skb->dev, skb)) {
2065                features &= ~NETIF_F_SG;
2066        }
2067
2068        return features;
2069}
2070
2071u32 netif_skb_features(struct sk_buff *skb)
2072{
2073        __be16 protocol = skb->protocol;
2074        u32 features = skb->dev->features;
2075
2076        if (protocol == htons(ETH_P_8021Q)) {
2077                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2078                protocol = veh->h_vlan_encapsulated_proto;
2079        } else if (!vlan_tx_tag_present(skb)) {
2080                return harmonize_features(skb, protocol, features);
2081        }
2082
2083        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2084
2085        if (protocol != htons(ETH_P_8021Q)) {
2086                return harmonize_features(skb, protocol, features);
2087        } else {
2088                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2089                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2090                return harmonize_features(skb, protocol, features);
2091        }
2092}
2093EXPORT_SYMBOL(netif_skb_features);
2094
2095/*
2096 * Returns true if either:
2097 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2098 *      2. skb is fragmented and the device does not support SG, or if
2099 *         at least one of the fragments is in highmem and the device
2100 *         does not support DMA from it.
2101 */
2102static inline int skb_needs_linearize(struct sk_buff *skb,
2103                                      int features)
2104{
2105        return skb_is_nonlinear(skb) &&
2106                        ((skb_has_frag_list(skb) &&
2107                                !(features & NETIF_F_FRAGLIST)) ||
2108                        (skb_shinfo(skb)->nr_frags &&
2109                                !(features & NETIF_F_SG)));
2110}
2111
2112int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2113                        struct netdev_queue *txq)
2114{
2115        const struct net_device_ops *ops = dev->netdev_ops;
2116        int rc = NETDEV_TX_OK;
2117        unsigned int skb_len;
2118
2119        if (likely(!skb->next)) {
2120                u32 features;
2121
2122                /*
2123                 * If the device doesn't need skb->dst, release it right now
2124                 * while it's hot in this CPU's cache.
2125                 */
2126                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2127                        skb_dst_drop(skb);
2128
2129                if (!list_empty(&ptype_all))
2130                        dev_queue_xmit_nit(skb, dev);
2131
2132                skb_orphan_try(skb);
2133
2134                features = netif_skb_features(skb);
2135
2136                if (vlan_tx_tag_present(skb) &&
2137                    !(features & NETIF_F_HW_VLAN_TX)) {
2138                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2139                        if (unlikely(!skb))
2140                                goto out;
2141
2142                        skb->vlan_tci = 0;
2143                }
2144
2145                if (netif_needs_gso(skb, features)) {
2146                        if (unlikely(dev_gso_segment(skb, features)))
2147                                goto out_kfree_skb;
2148                        if (skb->next)
2149                                goto gso;
2150                } else {
2151                        if (skb_needs_linearize(skb, features) &&
2152                            __skb_linearize(skb))
2153                                goto out_kfree_skb;
2154
2155                        /* If packet is not checksummed and device does not
2156                         * support checksumming for this protocol, complete
2157                         * checksumming here.
2158                         */
2159                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2160                                skb_set_transport_header(skb,
2161                                        skb_checksum_start_offset(skb));
2162                                if (!(features & NETIF_F_ALL_CSUM) &&
2163                                     skb_checksum_help(skb))
2164                                        goto out_kfree_skb;
2165                        }
2166                }
2167
2168                skb_len = skb->len;
2169                rc = ops->ndo_start_xmit(skb, dev);
2170                trace_net_dev_xmit(skb, rc, dev, skb_len);
2171                if (rc == NETDEV_TX_OK)
2172                        txq_trans_update(txq);
2173                return rc;
2174        }
2175
2176gso:
2177        do {
2178                struct sk_buff *nskb = skb->next;
2179
2180                skb->next = nskb->next;
2181                nskb->next = NULL;
2182
2183                /*
2184                 * If the device doesn't need nskb->dst, release it right now
2185                 * while it's hot in this CPU's cache.
2186                 */
2187                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188                        skb_dst_drop(nskb);
2189
2190                skb_len = nskb->len;
2191                rc = ops->ndo_start_xmit(nskb, dev);
2192                trace_net_dev_xmit(nskb, rc, dev, skb_len);
2193                if (unlikely(rc != NETDEV_TX_OK)) {
2194                        if (rc & ~NETDEV_TX_MASK)
2195                                goto out_kfree_gso_skb;
2196                        nskb->next = skb->next;
2197                        skb->next = nskb;
2198                        return rc;
2199                }
2200                txq_trans_update(txq);
2201                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2202                        return NETDEV_TX_BUSY;
2203        } while (skb->next);
2204
2205out_kfree_gso_skb:
2206        if (likely(skb->next == NULL))
2207                skb->destructor = DEV_GSO_CB(skb)->destructor;
2208out_kfree_skb:
2209        kfree_skb(skb);
2210out:
2211        return rc;
2212}
2213
2214static u32 hashrnd __read_mostly;
2215
2216/*
2217 * Returns a Tx hash based on the given packet descriptor and the number of
2218 * Tx queues to be used as a distribution range.
2219 */
2220u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2221                  unsigned int num_tx_queues)
2222{
2223        u32 hash;
2224        u16 qoffset = 0;
2225        u16 qcount = num_tx_queues;
2226
2227        if (skb_rx_queue_recorded(skb)) {
2228                hash = skb_get_rx_queue(skb);
2229                while (unlikely(hash >= num_tx_queues))
2230                        hash -= num_tx_queues;
2231                return hash;
2232        }
2233
2234        if (dev->num_tc) {
2235                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2236                qoffset = dev->tc_to_txq[tc].offset;
2237                qcount = dev->tc_to_txq[tc].count;
2238        }
2239
2240        if (skb->sk && skb->sk->sk_hash)
2241                hash = skb->sk->sk_hash;
2242        else
2243                hash = (__force u16) skb->protocol ^ skb->rxhash;
2244        hash = jhash_1word(hash, hashrnd);
2245
2246        return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2247}
2248EXPORT_SYMBOL(__skb_tx_hash);
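
/*
 * Illustrative sketch, not part of this file: a driver implementing
 * ndo_select_queue() can use skb_tx_hash() (the wrapper around
 * __skb_tx_hash() that passes dev->real_num_tx_queues) for everything it
 * does not steer explicitly.  example_is_mgmt_frame() is an assumption.
 *
 *	static u16 example_select_queue(struct net_device *dev,
 *					struct sk_buff *skb)
 *	{
 *		if (example_is_mgmt_frame(skb))
 *			return 0;		// reserve queue 0 for management
 *		return skb_tx_hash(dev, skb);
 *	}
 */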
2249
2250static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2251{
2252        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2253                if (net_ratelimit()) {
2254                        pr_warning("%s selects TX queue %d, but "
2255                                "real number of TX queues is %d\n",
2256                                dev->name, queue_index, dev->real_num_tx_queues);
2257                }
2258                return 0;
2259        }
2260        return queue_index;
2261}
2262
2263static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2264{
2265#ifdef CONFIG_XPS
2266        struct xps_dev_maps *dev_maps;
2267        struct xps_map *map;
2268        int queue_index = -1;
2269
2270        rcu_read_lock();
2271        dev_maps = rcu_dereference(dev->xps_maps);
2272        if (dev_maps) {
2273                map = rcu_dereference(
2274                    dev_maps->cpu_map[raw_smp_processor_id()]);
2275                if (map) {
2276                        if (map->len == 1)
2277                                queue_index = map->queues[0];
2278                        else {
2279                                u32 hash;
2280                                if (skb->sk && skb->sk->sk_hash)
2281                                        hash = skb->sk->sk_hash;
2282                                else
2283                                        hash = (__force u16) skb->protocol ^
2284                                            skb->rxhash;
2285                                hash = jhash_1word(hash, hashrnd);
2286                                queue_index = map->queues[
2287                                    ((u64)hash * map->len) >> 32];
2288                        }
2289                        if (unlikely(queue_index >= dev->real_num_tx_queues))
2290                                queue_index = -1;
2291                }
2292        }
2293        rcu_read_unlock();
2294
2295        return queue_index;
2296#else
2297        return -1;
2298#endif
2299}
2300
2301static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2302                                        struct sk_buff *skb)
2303{
2304        int queue_index;
2305        const struct net_device_ops *ops = dev->netdev_ops;
2306
2307        if (dev->real_num_tx_queues == 1)
2308                queue_index = 0;
2309        else if (ops->ndo_select_queue) {
2310                queue_index = ops->ndo_select_queue(dev, skb);
2311                queue_index = dev_cap_txqueue(dev, queue_index);
2312        } else {
2313                struct sock *sk = skb->sk;
2314                queue_index = sk_tx_queue_get(sk);
2315
2316                if (queue_index < 0 || skb->ooo_okay ||
2317                    queue_index >= dev->real_num_tx_queues) {
2318                        int old_index = queue_index;
2319
2320                        queue_index = get_xps_queue(dev, skb);
2321                        if (queue_index < 0)
2322                                queue_index = skb_tx_hash(dev, skb);
2323
2324                        if (queue_index != old_index && sk) {
2325                                struct dst_entry *dst =
2326                                    rcu_dereference_check(sk->sk_dst_cache, 1);
2327
2328                                if (dst && skb_dst(skb) == dst)
2329                                        sk_tx_queue_set(sk, queue_index);
2330                        }
2331                }
2332        }
2333
2334        skb_set_queue_mapping(skb, queue_index);
2335        return netdev_get_tx_queue(dev, queue_index);
2336}
2337
2338static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2339                                 struct net_device *dev,
2340                                 struct netdev_queue *txq)
2341{
2342        spinlock_t *root_lock = qdisc_lock(q);
2343        bool contended;
2344        int rc;
2345
2346        qdisc_skb_cb(skb)->pkt_len = skb->len;
2347        qdisc_calculate_pkt_len(skb, q);
2348        /*
2349         * Heuristic to force contended enqueues to serialize on a
2350         * separate lock before trying to get qdisc main lock.
2351         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2352         * and dequeue packets faster.
2353         */
2354        contended = qdisc_is_running(q);
2355        if (unlikely(contended))
2356                spin_lock(&q->busylock);
2357
2358        spin_lock(root_lock);
2359        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2360                kfree_skb(skb);
2361                rc = NET_XMIT_DROP;
2362        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2363                   qdisc_run_begin(q)) {
2364                /*
2365                 * This is a work-conserving queue; there are no old skbs
2366                 * waiting to be sent out; and the qdisc is not running -
2367                 * xmit the skb directly.
2368                 */
2369                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2370                        skb_dst_force(skb);
2371
2372                qdisc_bstats_update(q, skb);
2373
2374                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2375                        if (unlikely(contended)) {
2376                                spin_unlock(&q->busylock);
2377                                contended = false;
2378                        }
2379                        __qdisc_run(q);
2380                } else
2381                        qdisc_run_end(q);
2382
2383                rc = NET_XMIT_SUCCESS;
2384        } else {
2385                skb_dst_force(skb);
2386                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2387                if (qdisc_run_begin(q)) {
2388                        if (unlikely(contended)) {
2389                                spin_unlock(&q->busylock);
2390                                contended = false;
2391                        }
2392                        __qdisc_run(q);
2393                }
2394        }
2395        spin_unlock(root_lock);
2396        if (unlikely(contended))
2397                spin_unlock(&q->busylock);
2398        return rc;
2399}
2400
2401static DEFINE_PER_CPU(int, xmit_recursion);
2402#define RECURSION_LIMIT 10
2403
2404/**
2405 *      dev_queue_xmit - transmit a buffer
2406 *      @skb: buffer to transmit
2407 *
2408 *      Queue a buffer for transmission to a network device. The caller must
2409 *      have set the device and priority and built the buffer before calling
2410 *      this function. The function can be called from an interrupt.
2411 *
2412 *      A negative errno code is returned on a failure. A success does not
2413 *      guarantee the frame will be transmitted as it may be dropped due
2414 *      to congestion or traffic shaping.
2415 *
2416 * -----------------------------------------------------------------------------------
2417 *      I notice this method can also return errors from the queue disciplines,
2418 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2419 *      be positive.
2420 *
2421 *      Regardless of the return value, the skb is consumed, so it is currently
2422 *      difficult to retry a send to this method.  (You can bump the ref count
2423 *      before sending to hold a reference for retry if you are careful.)
2424 *
2425 *      When calling this method, interrupts MUST be enabled.  This is because
2426 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2427 *          --BLG
2428 */
2429int dev_queue_xmit(struct sk_buff *skb)
2430{
2431        struct net_device *dev = skb->dev;
2432        struct netdev_queue *txq;
2433        struct Qdisc *q;
2434        int rc = -ENOMEM;
2435
2436        /* Disable soft irqs for various locks below. Also
2437         * stops preemption for RCU.
2438         */
2439        rcu_read_lock_bh();
2440
2441        txq = dev_pick_tx(dev, skb);
2442        q = rcu_dereference_bh(txq->qdisc);
2443
2444#ifdef CONFIG_NET_CLS_ACT
2445        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2446#endif
2447        trace_net_dev_queue(skb);
2448        if (q->enqueue) {
2449                rc = __dev_xmit_skb(skb, q, dev, txq);
2450                goto out;
2451        }
2452
2453        /* The device has no queue. Common case for software devices:
2454           loopback, all the sorts of tunnels...
2455
2456           Really, it is unlikely that netif_tx_lock protection is necessary
2457           here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2458           counters.)
2459           However, it is possible that they rely on the protection
2460           provided by us here.
2461
2462           Check this and take the lock; it is not prone to deadlocks.
2463           Or use the noqueue qdisc instead, which is even simpler 8)
2464         */
2465        if (dev->flags & IFF_UP) {
2466                int cpu = smp_processor_id(); /* ok because BHs are off */
2467
2468                if (txq->xmit_lock_owner != cpu) {
2469
2470                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2471                                goto recursion_alert;
2472
2473                        HARD_TX_LOCK(dev, txq, cpu);
2474
2475                        if (!netif_tx_queue_stopped(txq)) {
2476                                __this_cpu_inc(xmit_recursion);
2477                                rc = dev_hard_start_xmit(skb, dev, txq);
2478                                __this_cpu_dec(xmit_recursion);
2479                                if (dev_xmit_complete(rc)) {
2480                                        HARD_TX_UNLOCK(dev, txq);
2481                                        goto out;
2482                                }
2483                        }
2484                        HARD_TX_UNLOCK(dev, txq);
2485                        if (net_ratelimit())
2486                                printk(KERN_CRIT "Virtual device %s asks to "
2487                                       "queue packet!\n", dev->name);
2488                } else {
2489                        /* Recursion is detected! It is possible,
2490                         * unfortunately
2491                         */
2492recursion_alert:
2493                        if (net_ratelimit())
2494                                printk(KERN_CRIT "Dead loop on virtual device "
2495                                       "%s, fix it urgently!\n", dev->name);
2496                }
2497        }
2498
2499        rc = -ENETDOWN;
2500        rcu_read_unlock_bh();
2501
2502        kfree_skb(skb);
2503        return rc;
2504out:
2505        rcu_read_unlock_bh();
2506        return rc;
2507}
2508EXPORT_SYMBOL(dev_queue_xmit);
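
/*
 * Illustrative sketch, not part of this file: a kernel user that builds its
 * own frame and sends it out a specific device.  The protocol value, the
 * destination address and the bookkeeping helper are assumptions; note that
 * the skb is consumed whatever dev_queue_xmit() returns.
 *
 *	skb->dev = dev;
 *	skb->protocol = proto;			// __be16 ethertype
 *	if (dev_hard_header(skb, dev, ntohs(proto), dest, NULL, skb->len) < 0) {
 *		kfree_skb(skb);
 *		return -EINVAL;
 *	}
 *	rc = dev_queue_xmit(skb);
 *	if (rc != NET_XMIT_SUCCESS)
 *		example_count_tx_error(rc);	// hypothetical bookkeeping
 */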
2509
2510
2511/*=======================================================================
2512                        Receiver routines
2513  =======================================================================*/
2514
2515int netdev_max_backlog __read_mostly = 1000;
2516int netdev_tstamp_prequeue __read_mostly = 1;
2517int netdev_budget __read_mostly = 300;
2518int weight_p __read_mostly = 64;            /* old backlog weight */
2519
2520/* Called with irq disabled */
2521static inline void ____napi_schedule(struct softnet_data *sd,
2522                                     struct napi_struct *napi)
2523{
2524        list_add_tail(&napi->poll_list, &sd->poll_list);
2525        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2526}
2527
2528/*
2529 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2530 * and src/dst port numbers. Returns a non-zero hash number on success
2531 * and 0 on failure.
2532 */
2533__u32 __skb_get_rxhash(struct sk_buff *skb)
2534{
2535        int nhoff, hash = 0, poff;
2536        const struct ipv6hdr *ip6;
2537        const struct iphdr *ip;
2538        u8 ip_proto;
2539        u32 addr1, addr2, ihl;
2540        union {
2541                u32 v32;
2542                u16 v16[2];
2543        } ports;
2544
2545        nhoff = skb_network_offset(skb);
2546
2547        switch (skb->protocol) {
2548        case __constant_htons(ETH_P_IP):
2549                if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2550                        goto done;
2551
2552                ip = (const struct iphdr *) (skb->data + nhoff);
2553                if (ip_is_fragment(ip))
2554                        ip_proto = 0;
2555                else
2556                        ip_proto = ip->protocol;
2557                addr1 = (__force u32) ip->saddr;
2558                addr2 = (__force u32) ip->daddr;
2559                ihl = ip->ihl;
2560                break;
2561        case __constant_htons(ETH_P_IPV6):
2562                if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2563                        goto done;
2564
2565                ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2566                ip_proto = ip6->nexthdr;
2567                addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2568                addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2569                ihl = (40 >> 2);
2570                break;
2571        default:
2572                goto done;
2573        }
2574
2575        ports.v32 = 0;
2576        poff = proto_ports_offset(ip_proto);
2577        if (poff >= 0) {
2578                nhoff += ihl * 4 + poff;
2579                if (pskb_may_pull(skb, nhoff + 4)) {
2580                        ports.v32 = * (__force u32 *) (skb->data + nhoff);
2581                        if (ports.v16[1] < ports.v16[0])
2582                                swap(ports.v16[0], ports.v16[1]);
2583                }
2584        }
2585
2586        /* get a consistent hash (same value on both flow directions) */
2587        if (addr2 < addr1)
2588                swap(addr1, addr2);
2589
2590        hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2591        if (!hash)
2592                hash = 1;
2593
2594done:
2595        return hash;
2596}
2597EXPORT_SYMBOL(__skb_get_rxhash);
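
/*
 * Illustrative sketch, not part of this file: most callers go through the
 * skb_get_rxhash() wrapper, which returns the cached skb->rxhash when a
 * driver has already filled it in from a hardware-computed hash and only
 * falls back to __skb_get_rxhash() otherwise.
 *
 *	u32 hash = skb_get_rxhash(skb);
 *	if (hash)
 *		example_record_flow(hash);	// hypothetical consumer
 */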
2598
2599#ifdef CONFIG_RPS
2600
2601/* One global table that all flow-based protocols share. */
2602struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2603EXPORT_SYMBOL(rps_sock_flow_table);
2604
2605static struct rps_dev_flow *
2606set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607            struct rps_dev_flow *rflow, u16 next_cpu)
2608{
2609        u16 tcpu;
2610
2611        tcpu = rflow->cpu = next_cpu;
2612        if (tcpu != RPS_NO_CPU) {
2613#ifdef CONFIG_RFS_ACCEL
2614                struct netdev_rx_queue *rxqueue;
2615                struct rps_dev_flow_table *flow_table;
2616                struct rps_dev_flow *old_rflow;
2617                u32 flow_id;
2618                u16 rxq_index;
2619                int rc;
2620
2621                /* Should we steer this flow to a different hardware queue? */
2622                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2623                    !(dev->features & NETIF_F_NTUPLE))
2624                        goto out;
2625                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2626                if (rxq_index == skb_get_rx_queue(skb))
2627                        goto out;
2628
2629                rxqueue = dev->_rx + rxq_index;
2630                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2631                if (!flow_table)
2632                        goto out;
2633                flow_id = skb->rxhash & flow_table->mask;
2634                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2635                                                        rxq_index, flow_id);
2636                if (rc < 0)
2637                        goto out;
2638                old_rflow = rflow;
2639                rflow = &flow_table->flows[flow_id];
2640                rflow->cpu = next_cpu;
2641                rflow->filter = rc;
2642                if (old_rflow->filter == rflow->filter)
2643                        old_rflow->filter = RPS_NO_FILTER;
2644        out:
2645#endif
2646                rflow->last_qtail =
2647                        per_cpu(softnet_data, tcpu).input_queue_head;
2648        }
2649
2650        return rflow;
2651}
2652
2653/*
2654 * get_rps_cpu is called from netif_receive_skb and returns the target
2655 * CPU from the RPS map of the receiving queue for a given skb.
2656 * rcu_read_lock must be held on entry.
2657 */
2658static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2659                       struct rps_dev_flow **rflowp)
2660{
2661        struct netdev_rx_queue *rxqueue;
2662        struct rps_map *map;
2663        struct rps_dev_flow_table *flow_table;
2664        struct rps_sock_flow_table *sock_flow_table;
2665        int cpu = -1;
2666        u16 tcpu;
2667
2668        if (skb_rx_queue_recorded(skb)) {
2669                u16 index = skb_get_rx_queue(skb);
2670                if (unlikely(index >= dev->real_num_rx_queues)) {
2671                        WARN_ONCE(dev->real_num_rx_queues > 1,
2672                                  "%s received packet on queue %u, but number "
2673                                  "of RX queues is %u\n",
2674                                  dev->name, index, dev->real_num_rx_queues);
2675                        goto done;
2676                }
2677                rxqueue = dev->_rx + index;
2678        } else
2679                rxqueue = dev->_rx;
2680
2681        map = rcu_dereference(rxqueue->rps_map);
2682        if (map) {
2683                if (map->len == 1 &&
2684                    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2685                        tcpu = map->cpus[0];
2686                        if (cpu_online(tcpu))
2687                                cpu = tcpu;
2688                        goto done;
2689                }
2690        } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2691                goto done;
2692        }
2693
2694        skb_reset_network_header(skb);
2695        if (!skb_get_rxhash(skb))
2696                goto done;
2697
2698        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699        sock_flow_table = rcu_dereference(rps_sock_flow_table);
2700        if (flow_table && sock_flow_table) {
2701                u16 next_cpu;
2702                struct rps_dev_flow *rflow;
2703
2704                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2705                tcpu = rflow->cpu;
2706
2707                next_cpu = sock_flow_table->ents[skb->rxhash &
2708                    sock_flow_table->mask];
2709
2710                /*
2711                 * If the desired CPU (where last recvmsg was done) is
2712                 * different from current CPU (one in the rx-queue flow
2713                 * table entry), switch if one of the following holds:
2714                 *   - Current CPU is unset (equal to RPS_NO_CPU).
2715                 *   - Current CPU is offline.
2716                 *   - The current CPU's queue tail has advanced beyond the
2717                 *     last packet that was enqueued using this table entry.
2718                 *     This guarantees that all previous packets for the flow
2719                 *     have been dequeued, thus preserving in order delivery.
2720                 */
2721                if (unlikely(tcpu != next_cpu) &&
2722                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2723                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2724                      rflow->last_qtail)) >= 0))
2725                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2726
2727                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2728                        *rflowp = rflow;
2729                        cpu = tcpu;
2730                        goto done;
2731                }
2732        }
2733
2734        if (map) {
2735                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2736
2737                if (cpu_online(tcpu)) {
2738                        cpu = tcpu;
2739                        goto done;
2740                }
2741        }
2742
2743done:
2744        return cpu;
2745}
2746
2747#ifdef CONFIG_RFS_ACCEL
2748
2749/**
2750 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2751 * @dev: Device on which the filter was set
2752 * @rxq_index: RX queue index
2753 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2754 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2755 *
2756 * Drivers that implement ndo_rx_flow_steer() should periodically call
2757 * this function for each installed filter and remove the filters for
2758 * which it returns %true.
2759 */
2760bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2761                         u32 flow_id, u16 filter_id)
2762{
2763        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2764        struct rps_dev_flow_table *flow_table;
2765        struct rps_dev_flow *rflow;
2766        bool expire = true;
2767        int cpu;
2768
2769        rcu_read_lock();
2770        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2771        if (flow_table && flow_id <= flow_table->mask) {
2772                rflow = &flow_table->flows[flow_id];
2773                cpu = ACCESS_ONCE(rflow->cpu);
2774                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2775                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2776                           rflow->last_qtail) <
2777                     (int)(10 * flow_table->mask)))
2778                        expire = false;
2779        }
2780        rcu_read_unlock();
2781        return expire;
2782}
2783EXPORT_SYMBOL(rps_may_expire_flow);
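
/*
 * Illustrative sketch, not part of this file: a driver with an accelerated
 * RFS filter table might scan it from a periodic work item and tear down
 * filters the stack no longer cares about.  The filter table layout and the
 * example_hw_del_filter() helper are assumptions.
 *
 *	static void example_expire_filters(struct example_priv *priv)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < priv->n_filters; i++) {
 *			struct example_filter *f = &priv->filters[i];
 *
 *			if (f->in_use &&
 *			    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *						f->flow_id, i)) {
 *				example_hw_del_filter(priv, i);
 *				f->in_use = false;
 *			}
 *		}
 *	}
 */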
2784
2785#endif /* CONFIG_RFS_ACCEL */
2786
2787/* Called from hardirq (IPI) context */
2788static void rps_trigger_softirq(void *data)
2789{
2790        struct softnet_data *sd = data;
2791
2792        ____napi_schedule(sd, &sd->backlog);
2793        sd->received_rps++;
2794}
2795
2796#endif /* CONFIG_RPS */
2797
2798/*
2799 * Check if this softnet_data structure belongs to another CPU.
2800 * If yes, queue it to our IPI list and return 1.
2801 * If no, return 0.
2802 */
2803static int rps_ipi_queued(struct softnet_data *sd)
2804{
2805#ifdef CONFIG_RPS
2806        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2807
2808        if (sd != mysd) {
2809                sd->rps_ipi_next = mysd->rps_ipi_list;
2810                mysd->rps_ipi_list = sd;
2811
2812                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2813                return 1;
2814        }
2815#endif /* CONFIG_RPS */
2816        return 0;
2817}
2818
2819/*
2820 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2821 * queue (may be a remote CPU queue).
2822 */
2823static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2824                              unsigned int *qtail)
2825{
2826        struct softnet_data *sd;
2827        unsigned long flags;
2828
2829        sd = &per_cpu(softnet_data, cpu);
2830
2831        local_irq_save(flags);
2832
2833        rps_lock(sd);
2834        if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2835                if (skb_queue_len(&sd->input_pkt_queue)) {
2836enqueue:
2837                        __skb_queue_tail(&sd->input_pkt_queue, skb);
2838                        input_queue_tail_incr_save(sd, qtail);
2839                        rps_unlock(sd);
2840                        local_irq_restore(flags);
2841                        return NET_RX_SUCCESS;
2842                }
2843
2844                /* Schedule NAPI for the backlog device.
2845                 * We can use a non-atomic operation since we own the queue lock.
2846                 */
2847                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2848                        if (!rps_ipi_queued(sd))
2849                                ____napi_schedule(sd, &sd->backlog);
2850                }
2851                goto enqueue;
2852        }
2853
2854        sd->dropped++;
2855        rps_unlock(sd);
2856
2857        local_irq_restore(flags);
2858
2859        atomic_long_inc(&skb->dev->rx_dropped);
2860        kfree_skb(skb);
2861        return NET_RX_DROP;
2862}
2863
2864/**
2865 *      netif_rx        -       post buffer to the network code
2866 *      @skb: buffer to post
2867 *
2868 *      This function receives a packet from a device driver and queues it for
2869 *      the upper (protocol) levels to process.  It always succeeds. The buffer
2870 *      may be dropped during processing for congestion control or by the
2871 *      protocol layers.
2872 *
2873 *      return values:
2874 *      NET_RX_SUCCESS  (no congestion)
2875 *      NET_RX_DROP     (packet was dropped)
2876 *
2877 */
2878
2879int netif_rx(struct sk_buff *skb)
2880{
2881        int ret;
2882
2883        /* if netpoll wants it, pretend we never saw it */
2884        if (netpoll_rx(skb))
2885                return NET_RX_DROP;
2886
2887        if (netdev_tstamp_prequeue)
2888                net_timestamp_check(skb);
2889
2890        trace_netif_rx(skb);
2891#ifdef CONFIG_RPS
2892        {
2893                struct rps_dev_flow voidflow, *rflow = &voidflow;
2894                int cpu;
2895
2896                preempt_disable();
2897                rcu_read_lock();
2898
2899                cpu = get_rps_cpu(skb->dev, skb, &rflow);
2900                if (cpu < 0)
2901                        cpu = smp_processor_id();
2902
2903                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2904
2905                rcu_read_unlock();
2906                preempt_enable();
2907        }
2908#else
2909        {
2910                unsigned int qtail;
2911                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2912                put_cpu();
2913        }
2914#endif
2915        return ret;
2916}
2917EXPORT_SYMBOL(netif_rx);
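
/*
 * Illustrative sketch, not part of this file: the classic non-NAPI receive
 * path in a driver's interrupt handler.  example_build_rx_skb() stands in
 * for the device-specific code that allocates an skb and copies the frame
 * into it.
 *
 *	skb = example_build_rx_skb(priv);
 *	if (!skb)
 *		return;				// out of memory, frame dropped
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);				// queue for softirq processing
 */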
2918
2919int netif_rx_ni(struct sk_buff *skb)
2920{
2921        int err;
2922
2923        preempt_disable();
2924        err = netif_rx(skb);
2925        if (local_softirq_pending())
2926                do_softirq();
2927        preempt_enable();
2928
2929        return err;
2930}
2931EXPORT_SYMBOL(netif_rx_ni);
2932
2933static void net_tx_action(struct softirq_action *h)
2934{
2935        struct softnet_data *sd = &__get_cpu_var(softnet_data);
2936
2937        if (sd->completion_queue) {
2938                struct sk_buff *clist;
2939
2940                local_irq_disable();
2941                clist = sd->completion_queue;
2942                sd->completion_queue = NULL;
2943                local_irq_enable();
2944
2945                while (clist) {
2946                        struct sk_buff *skb = clist;
2947                        clist = clist->next;
2948
2949                        WARN_ON(atomic_read(&skb->users));
2950                        trace_kfree_skb(skb, net_tx_action);
2951                        __kfree_skb(skb);
2952                }
2953        }
2954
2955        if (sd->output_queue) {
2956                struct Qdisc *head;
2957
2958                local_irq_disable();
2959                head = sd->output_queue;
2960                sd->output_queue = NULL;
2961                sd->output_queue_tailp = &sd->output_queue;
2962                local_irq_enable();
2963
2964                while (head) {
2965                        struct Qdisc *q = head;
2966                        spinlock_t *root_lock;
2967
2968                        head = head->next_sched;
2969
2970                        root_lock = qdisc_lock(q);
2971                        if (spin_trylock(root_lock)) {
2972                                smp_mb__before_clear_bit();
2973                                clear_bit(__QDISC_STATE_SCHED,
2974                                          &q->state);
2975                                qdisc_run(q);
2976                                spin_unlock(root_lock);
2977                        } else {
2978                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
2979                                              &q->state)) {
2980                                        __netif_reschedule(q);
2981                                } else {
2982                                        smp_mb__before_clear_bit();
2983                                        clear_bit(__QDISC_STATE_SCHED,
2984                                                  &q->state);
2985                                }
2986                        }
2987                }
2988        }
2989}
2990
2991#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2992    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2993/* This hook is defined here for ATM LANE */
2994int (*br_fdb_test_addr_hook)(struct net_device *dev,
2995                             unsigned char *addr) __read_mostly;
2996EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2997#endif
2998
2999#ifdef CONFIG_NET_CLS_ACT
3000/* TODO: Maybe we should just force sch_ingress to be compiled in
3001 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3002 * instructions (a compare and 2 stores) right now if we don't have it
3003 * enabled but do have CONFIG_NET_CLS_ACT.
3004 * NOTE: This doesn't stop any functionality; if you don't have
3005 * the ingress scheduler, you just can't add policies on ingress.
3006 *
3007 */
3008static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3009{
3010        struct net_device *dev = skb->dev;
3011        u32 ttl = G_TC_RTTL(skb->tc_verd);
3012        int result = TC_ACT_OK;
3013        struct Qdisc *q;
3014
3015        if (unlikely(MAX_RED_LOOP < ttl++)) {
3016                if (net_ratelimit())
3017                        pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3018                               skb->skb_iif, dev->ifindex);
3019                return TC_ACT_SHOT;
3020        }
3021
3022        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3023        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3024
3025        q = rxq->qdisc;
3026        if (q != &noop_qdisc) {
3027                spin_lock(qdisc_lock(q));
3028                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3029                        result = qdisc_enqueue_root(skb, q);
3030                spin_unlock(qdisc_lock(q));
3031        }
3032
3033        return result;
3034}
3035
3036static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3037                                         struct packet_type **pt_prev,
3038                                         int *ret, struct net_device *orig_dev)
3039{
3040        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3041
3042        if (!rxq || rxq->qdisc == &noop_qdisc)
3043                goto out;
3044
3045        if (*pt_prev) {
3046                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3047                *pt_prev = NULL;
3048        }
3049
3050        switch (ing_filter(skb, rxq)) {
3051        case TC_ACT_SHOT:
3052        case TC_ACT_STOLEN:
3053                kfree_skb(skb);
3054                return NULL;
3055        }
3056
3057out:
3058        skb->tc_verd = 0;
3059        return skb;
3060}
3061#endif
3062
3063/**
3064 *      netdev_rx_handler_register - register receive handler
3065 *      @dev: device to register a handler for
3066 *      @rx_handler: receive handler to register
3067 *      @rx_handler_data: data pointer that is used by rx handler
3068 *
3069 *      Register a receive handler for a device. This handler will then be
3070 *      called from __netif_receive_skb. A negative errno code is returned
3071 *      on a failure.
3072 *
3073 *      The caller must hold the rtnl_mutex.
3074 *
3075 *      For a general description of rx_handler, see enum rx_handler_result.
3076 */
3077int netdev_rx_handler_register(struct net_device *dev,
3078                               rx_handler_func_t *rx_handler,
3079                               void *rx_handler_data)
3080{
3081        ASSERT_RTNL();
3082
3083        if (dev->rx_handler)
3084                return -EBUSY;
3085
3086        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3087        rcu_assign_pointer(dev->rx_handler, rx_handler);
3088
3089        return 0;
3090}
3091EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3092
3093/**
3094 *      netdev_rx_handler_unregister - unregister receive handler
3095 *      @dev: device to unregister a handler from
3096 *
3097 *      Unregister a receive handler from a device.
3098 *
3099 *      The caller must hold the rtnl_mutex.
3100 */
3101void netdev_rx_handler_unregister(struct net_device *dev)
3102{
3103
3104        ASSERT_RTNL();
3105        rcu_assign_pointer(dev->rx_handler, NULL);
3106        rcu_assign_pointer(dev->rx_handler_data, NULL);
3107}
3108EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
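
/*
 * Illustrative sketch, not part of the kernel: how a hypothetical module
 * might install and remove an rx_handler with the two helpers above.
 * Every "example_" name is an assumption made up for illustration.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Arbitrary policy for the sketch: let multicast through untouched. */
	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return RX_HANDLER_PASS;

	/* Taking ownership: the core will not touch this skb again. */
	kfree_skb(skb);
	return RX_HANDLER_CONSUMED;
}

static int example_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_rx_handler, priv);
	rtnl_unlock();
	return err;	/* -EBUSY if another handler is already installed */
}

static void example_detach(struct net_device *dev)
{
	rtnl_lock();
	netdev_rx_handler_unregister(dev);
	rtnl_unlock();
	synchronize_net();	/* let in-flight readers finish before freeing priv */
}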
3109
3110static int __netif_receive_skb(struct sk_buff *skb)
3111{
3112        struct packet_type *ptype, *pt_prev;
3113        rx_handler_func_t *rx_handler;
3114        struct net_device *orig_dev;
3115        struct net_device *null_or_dev;
3116        bool deliver_exact = false;
3117        int ret = NET_RX_DROP;
3118        __be16 type;
3119
3120        if (!netdev_tstamp_prequeue)
3121                net_timestamp_check(skb);
3122
3123        trace_netif_receive_skb(skb);
3124
3125        /* if we've gotten here through NAPI, check netpoll */
3126        if (netpoll_receive_skb(skb))
3127                return NET_RX_DROP;
3128
3129        if (!skb->skb_iif)
3130                skb->skb_iif = skb->dev->ifindex;
3131        orig_dev = skb->dev;
3132
3133        skb_reset_network_header(skb);
3134        skb_reset_transport_header(skb);
3135        skb_reset_mac_len(skb);
3136
3137        pt_prev = NULL;
3138
3139        rcu_read_lock();
3140
3141another_round:
3142
3143        __this_cpu_inc(softnet_data.processed);
3144
3145        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3146                skb = vlan_untag(skb);
3147                if (unlikely(!skb))
3148                        goto out;
3149        }
3150
3151#ifdef CONFIG_NET_CLS_ACT
3152        if (skb->tc_verd & TC_NCLS) {
3153                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3154                goto ncls;
3155        }
3156#endif
3157
3158        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3159                if (!ptype->dev || ptype->dev == skb->dev) {
3160                        if (pt_prev)
3161                                ret = deliver_skb(skb, pt_prev, orig_dev);
3162                        pt_prev = ptype;
3163                }
3164        }
3165
3166#ifdef CONFIG_NET_CLS_ACT
3167        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3168        if (!skb)
3169                goto out;
3170ncls:
3171#endif
3172
3173        rx_handler = rcu_dereference(skb->dev->rx_handler);
3174        if (rx_handler) {
3175                if (pt_prev) {
3176                        ret = deliver_skb(skb, pt_prev, orig_dev);
3177                        pt_prev = NULL;
3178                }
3179                switch (rx_handler(&skb)) {
3180                case RX_HANDLER_CONSUMED:
3181                        goto out;
3182                case RX_HANDLER_ANOTHER:
3183                        goto another_round;
3184                case RX_HANDLER_EXACT:
3185                        deliver_exact = true;
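                            /* fall through */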
3186                case RX_HANDLER_PASS:
3187                        break;
3188                default:
3189                        BUG();
3190                }
3191        }
3192
3193        if (vlan_tx_tag_present(skb)) {
3194                if (pt_prev) {
3195                        ret = deliver_skb(skb, pt_prev, orig_dev);
3196                        pt_prev = NULL;
3197                }
3198                if (vlan_do_receive(&skb)) {
3199                        ret = __netif_receive_skb(skb);
3200                        goto out;
3201                } else if (unlikely(!skb))
3202                        goto out;
3203        }
3204
3205        /* deliver only exact match when indicated */
3206        null_or_dev = deliver_exact ? skb->dev : NULL;
3207
3208        type = skb->protocol;
3209        list_for_each_entry_rcu(ptype,
3210                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3211                if (ptype->type == type &&
3212                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3213                     ptype->dev == orig_dev)) {
3214                        if (pt_prev)
3215                                ret = deliver_skb(skb, pt_prev, orig_dev);
3216                        pt_prev = ptype;
3217                }
3218        }
3219
3220        if (pt_prev) {
3221                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3222        } else {
3223                atomic_long_inc(&skb->dev->rx_dropped);
3224                kfree_skb(skb);
3225                /* Jamal, now you will not be able to escape explaining
3226                 * to me how you were going to use this. :-)
3227                 */
3228                ret = NET_RX_DROP;
3229        }
3230
3231out:
3232        rcu_read_unlock();
3233        return ret;
3234}
3235
3236/**
3237 *      netif_receive_skb - process receive buffer from network
3238 *      @skb: buffer to process
3239 *
3240 *      netif_receive_skb() is the main receive data processing function.
3241 *      It always succeeds. The buffer may be dropped during processing
3242 *      for congestion control or by the protocol layers.
3243 *
3244 *      This function may only be called from softirq context and interrupts
3245 *      should be enabled.
3246 *
3247 *      Return values (usually ignored):
3248 *      NET_RX_SUCCESS: no congestion
3249 *      NET_RX_DROP: packet was dropped
3250 */
3251int netif_receive_skb(struct sk_buff *skb)
3252{
3253        if (netdev_tstamp_prequeue)
3254                net_timestamp_check(skb);
3255
3256        if (skb_defer_rx_timestamp(skb))
3257                return NET_RX_SUCCESS;
3258
3259#ifdef CONFIG_RPS
3260        {
3261                struct rps_dev_flow voidflow, *rflow = &voidflow;
3262                int cpu, ret;
3263
3264                rcu_read_lock();
3265
3266                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267
3268                if (cpu >= 0) {
3269                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3270                        rcu_read_unlock();
3271                } else {
3272                        rcu_read_unlock();
3273                        ret = __netif_receive_skb(skb);
3274                }
3275
3276                return ret;
3277        }
3278#else
3279        return __netif_receive_skb(skb);
3280#endif
3281}
3282EXPORT_SYMBOL(netif_receive_skb);
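
/*
 * Illustrative sketch, not part of the kernel: the minimal driver-side
 * hand-off into netif_receive_skb() from softirq context (e.g. a NAPI
 * poll routine), per the constraints documented above.
 */
static void example_deliver(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */
	netif_receive_skb(skb);		/* return value is usually ignored */
}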
3283
3284/* Network device is going away, flush any packets still pending.
3285 * Called with irqs disabled.
3286 */
3287static void flush_backlog(void *arg)
3288{
3289        struct net_device *dev = arg;
3290        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3291        struct sk_buff *skb, *tmp;
3292
3293        rps_lock(sd);
3294        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3295                if (skb->dev == dev) {
3296                        __skb_unlink(skb, &sd->input_pkt_queue);
3297                        kfree_skb(skb);
3298                        input_queue_head_incr(sd);
3299                }
3300        }
3301        rps_unlock(sd);
3302
3303        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3304                if (skb->dev == dev) {
3305                        __skb_unlink(skb, &sd->process_queue);
3306                        kfree_skb(skb);
3307                        input_queue_head_incr(sd);
3308                }
3309        }
3310}
3311
3312static int napi_gro_complete(struct sk_buff *skb)
3313{
3314        struct packet_type *ptype;
3315        __be16 type = skb->protocol;
3316        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3317        int err = -ENOENT;
3318
3319        if (NAPI_GRO_CB(skb)->count == 1) {
3320                skb_shinfo(skb)->gso_size = 0;
3321                goto out;
3322        }
3323
3324        rcu_read_lock();
3325        list_for_each_entry_rcu(ptype, head, list) {
3326                if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3327                        continue;
3328
3329                err = ptype->gro_complete(skb);
3330                break;
3331        }
3332        rcu_read_unlock();
3333
3334        if (err) {
3335                WARN_ON(&ptype->list == head);
3336                kfree_skb(skb);
3337                return NET_RX_SUCCESS;
3338        }
3339
3340out:
3341        return netif_receive_skb(skb);
3342}
3343
3344inline void napi_gro_flush(struct napi_struct *napi)
3345{
3346        struct sk_buff *skb, *next;
3347
3348        for (skb = napi->gro_list; skb; skb = next) {
3349                next = skb->next;
3350                skb->next = NULL;
3351                napi_gro_complete(skb);
3352        }
3353
3354        napi->gro_count = 0;
3355        napi->gro_list = NULL;
3356}
3357EXPORT_SYMBOL(napi_gro_flush);
3358
3359enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3360{
3361        struct sk_buff **pp = NULL;
3362        struct packet_type *ptype;
3363        __be16 type = skb->protocol;
3364        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365        int same_flow;
3366        int mac_len;
3367        enum gro_result ret;
3368
3369        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3370                goto normal;
3371
3372        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3373                goto normal;
3374
3375        rcu_read_lock();
3376        list_for_each_entry_rcu(ptype, head, list) {
3377                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3378                        continue;
3379
3380                skb_set_network_header(skb, skb_gro_offset(skb));
3381                mac_len = skb->network_header - skb->mac_header;
3382                skb->mac_len = mac_len;
3383                NAPI_GRO_CB(skb)->same_flow = 0;
3384                NAPI_GRO_CB(skb)->flush = 0;
3385                NAPI_GRO_CB(skb)->free = 0;
3386
3387                pp = ptype->gro_receive(&napi->gro_list, skb);
3388                break;
3389        }
3390        rcu_read_unlock();
3391
3392        if (&ptype->list == head)
3393                goto normal;
3394
3395        same_flow = NAPI_GRO_CB(skb)->same_flow;
3396        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3397
3398        if (pp) {
3399                struct sk_buff *nskb = *pp;
3400
3401                *pp = nskb->next;
3402                nskb->next = NULL;
3403                napi_gro_complete(nskb);
3404                napi->gro_count--;
3405        }
3406
3407        if (same_flow)
3408                goto ok;
3409
3410        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3411                goto normal;
3412
3413        napi->gro_count++;
3414        NAPI_GRO_CB(skb)->count = 1;
3415        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3416        skb->next = napi->gro_list;
3417        napi->gro_list = skb;
3418        ret = GRO_HELD;
3419
3420pull:
3421        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3422                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3423
3424                BUG_ON(skb->end - skb->tail < grow);
3425
3426                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3427
3428                skb->tail += grow;
3429                skb->data_len -= grow;
3430
3431                skb_shinfo(skb)->frags[0].page_offset += grow;
3432                skb_shinfo(skb)->frags[0].size -= grow;
3433
3434                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3435                        put_page(skb_shinfo(skb)->frags[0].page);
3436                        memmove(skb_shinfo(skb)->frags,
3437                                skb_shinfo(skb)->frags + 1,
3438                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3439                }
3440        }
3441
3442ok:
3443        return ret;
3444
3445normal:
3446        ret = GRO_NORMAL;
3447        goto pull;
3448}
3449EXPORT_SYMBOL(dev_gro_receive);
3450
3451static inline gro_result_t
3452__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3453{
3454        struct sk_buff *p;
3455
3456        for (p = napi->gro_list; p; p = p->next) {
3457                unsigned long diffs;
3458
3459                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3460                diffs |= p->vlan_tci ^ skb->vlan_tci;
3461                diffs |= compare_ether_header(skb_mac_header(p),
3462                                              skb_gro_mac_header(skb));
3463                NAPI_GRO_CB(p)->same_flow = !diffs;
3464                NAPI_GRO_CB(p)->flush = 0;
3465        }
3466
3467        return dev_gro_receive(napi, skb);
3468}
3469
3470gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3471{
3472        switch (ret) {
3473        case GRO_NORMAL:
3474                if (netif_receive_skb(skb))
3475                        ret = GRO_DROP;
3476                break;
3477
3478        case GRO_DROP:
3479        case GRO_MERGED_FREE:
3480                kfree_skb(skb);
3481                break;
3482
3483        case GRO_HELD:
3484        case GRO_MERGED:
3485                break;
3486        }
3487
3488        return ret;
3489}
3490EXPORT_SYMBOL(napi_skb_finish);
3491
3492void skb_gro_reset_offset(struct sk_buff *skb)
3493{
3494        NAPI_GRO_CB(skb)->data_offset = 0;
3495        NAPI_GRO_CB(skb)->frag0 = NULL;
3496        NAPI_GRO_CB(skb)->frag0_len = 0;
3497
3498        if (skb->mac_header == skb->tail &&
3499            !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3500                NAPI_GRO_CB(skb)->frag0 =
3501                        page_address(skb_shinfo(skb)->frags[0].page) +
3502                        skb_shinfo(skb)->frags[0].page_offset;
3503                NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3504        }
3505}
3506EXPORT_SYMBOL(skb_gro_reset_offset);
3507
3508gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3509{
3510        skb_gro_reset_offset(skb);
3511
3512        return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3513}
3514EXPORT_SYMBOL(napi_gro_receive);
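
/*
 * Illustrative sketch, not part of the kernel: a driver handing a linear
 * rx buffer to GRO instead of calling netif_receive_skb() directly, so
 * dev_gro_receive() can merge consecutive segments of the same flow.
 */
static void example_gro_deliver(struct napi_struct *napi,
				struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	napi_gro_receive(napi, skb);	/* GRO_NORMAL falls back to netif_receive_skb() */
}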
3515
3516static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3517{
3518        __skb_pull(skb, skb_headlen(skb));
3519        skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3520        skb->vlan_tci = 0;
3521        skb->dev = napi->dev;
3522        skb->skb_iif = 0;
3523
3524        napi->skb = skb;
3525}
3526
3527struct sk_buff *napi_get_frags(struct napi_struct *napi)
3528{
3529        struct sk_buff *skb = napi->skb;
3530
3531        if (!skb) {
3532                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3533                if (skb)
3534                        napi->skb = skb;
3535        }
3536        return skb;
3537}
3538EXPORT_SYMBOL(napi_get_frags);
3539
3540gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3541                               gro_result_t ret)
3542{
3543        switch (ret) {
3544        case GRO_NORMAL:
3545        case GRO_HELD:
3546                skb->protocol = eth_type_trans(skb, skb->dev);
3547
3548                if (ret == GRO_HELD)
3549                        skb_gro_pull(skb, -ETH_HLEN);
3550                else if (netif_receive_skb(skb))
3551                        ret = GRO_DROP;
3552                break;
3553
3554        case GRO_DROP:
3555        case GRO_MERGED_FREE:
3556                napi_reuse_skb(napi, skb);
3557                break;
3558
3559        case GRO_MERGED:
3560                break;
3561        }
3562
3563        return ret;
3564}
3565EXPORT_SYMBOL(napi_frags_finish);
3566
3567struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3568{
3569        struct sk_buff *skb = napi->skb;
3570        struct ethhdr *eth;
3571        unsigned int hlen;
3572        unsigned int off;
3573
3574        napi->skb = NULL;
3575
3576        skb_reset_mac_header(skb);
3577        skb_gro_reset_offset(skb);
3578
3579        off = skb_gro_offset(skb);
3580        hlen = off + sizeof(*eth);
3581        eth = skb_gro_header_fast(skb, off);
3582        if (skb_gro_header_hard(skb, hlen)) {
3583                eth = skb_gro_header_slow(skb, hlen, off);
3584                if (unlikely(!eth)) {
3585                        napi_reuse_skb(napi, skb);
3586                        skb = NULL;
3587                        goto out;
3588                }
3589        }
3590
3591        skb_gro_pull(skb, sizeof(*eth));
3592
3593        /*
3594         * This works because the only protocols we care about don't require
3595         * special handling.  We'll fix it up properly at the end.
3596         */
3597        skb->protocol = eth->h_proto;
3598
3599out:
3600        return skb;
3601}
3602EXPORT_SYMBOL(napi_frags_skb);
3603
3604gro_result_t napi_gro_frags(struct napi_struct *napi)
3605{
3606        struct sk_buff *skb = napi_frags_skb(napi);
3607
3608        if (!skb)
3609                return GRO_DROP;
3610
3611        return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3612}
3613EXPORT_SYMBOL(napi_gro_frags);
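
/*
 * Illustrative sketch, not part of the kernel: how a page-based rx driver
 * uses napi_get_frags()/napi_gro_frags().  The page, offset and length
 * would come from a real rx descriptor; the truesize accounting is
 * simplified for the sketch.
 */
static int example_rx_page(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return -ENOMEM;

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	/* napi_frags_skb() pulls the Ethernet header out of frag0 for us */
	napi_gro_frags(napi);
	return 0;
}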
3614
3615/*
3616 * net_rps_action sends any pending IPIs for RPS.
3617 * Note: called with local irq disabled, but exits with local irq enabled.
3618 */
3619static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3620{
3621#ifdef CONFIG_RPS
3622        struct softnet_data *remsd = sd->rps_ipi_list;
3623
3624        if (remsd) {
3625                sd->rps_ipi_list = NULL;
3626
3627                local_irq_enable();
3628
3629                /* Send pending IPIs to kick RPS processing on remote CPUs. */
3630                while (remsd) {
3631                        struct softnet_data *next = remsd->rps_ipi_next;
3632
3633                        if (cpu_online(remsd->cpu))
3634                                __smp_call_function_single(remsd->cpu,
3635                                                           &remsd->csd, 0);
3636                        remsd = next;
3637                }
3638        } else
3639#endif
3640                local_irq_enable();
3641}
3642
3643static int process_backlog(struct napi_struct *napi, int quota)
3644{
3645        int work = 0;
3646        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3647
3648#ifdef CONFIG_RPS
3649        /* Check if we have pending IPIs; it's better to send them now
3650         * than to wait for net_rx_action() to finish.
3651         */
3652        if (sd->rps_ipi_list) {
3653                local_irq_disable();
3654                net_rps_action_and_irq_enable(sd);
3655        }
3656#endif
3657        napi->weight = weight_p;
3658        local_irq_disable();
3659        while (work < quota) {
3660                struct sk_buff *skb;
3661                unsigned int qlen;
3662
3663                while ((skb = __skb_dequeue(&sd->process_queue))) {
3664                        local_irq_enable();
3665                        __netif_receive_skb(skb);
3666                        local_irq_disable();
3667                        input_queue_head_incr(sd);
3668                        if (++work >= quota) {
3669                                local_irq_enable();
3670                                return work;
3671                        }
3672                }
3673
3674                rps_lock(sd);
3675                qlen = skb_queue_len(&sd->input_pkt_queue);
3676                if (qlen)
3677                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
3678                                                   &sd->process_queue);
3679
3680                if (qlen < quota - work) {
3681                        /*
3682                         * Inline a custom version of __napi_complete().
3683                         * Only the current cpu owns and manipulates this napi,
3684                         * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3685                         * so we can use a plain write instead of clear_bit()
3686                         * and we don't need an smp_mb() memory barrier.
3687                         */
3688                        list_del(&napi->poll_list);
3689                        napi->state = 0;
3690
3691                        quota = work + qlen;
3692                }
3693                rps_unlock(sd);
3694        }
3695        local_irq_enable();
3696
3697        return work;
3698}
3699
3700/**
3701 * __napi_schedule - schedule for receive
3702 * @n: entry to schedule
3703 *
3704 * The entry's receive function will be scheduled to run
3705 */
3706void __napi_schedule(struct napi_struct *n)
3707{
3708        unsigned long flags;
3709
3710        local_irq_save(flags);
3711        ____napi_schedule(&__get_cpu_var(softnet_data), n);
3712        local_irq_restore(flags);
3713}
3714EXPORT_SYMBOL(__napi_schedule);
3715
3716void __napi_complete(struct napi_struct *n)
3717{
3718        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3719        BUG_ON(n->gro_list);
3720
3721        list_del(&n->poll_list);
3722        smp_mb__before_clear_bit();
3723        clear_bit(NAPI_STATE_SCHED, &n->state);
3724}
3725EXPORT_SYMBOL(__napi_complete);
3726
3727void napi_complete(struct napi_struct *n)
3728{
3729        unsigned long flags;
3730
3731        /*
3732         * Don't let napi dequeue from the cpu poll list
3733         * just in case it's running on a different cpu.
3734         */
3735        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3736                return;
3737
3738        napi_gro_flush(n);
3739        local_irq_save(flags);
3740        __napi_complete(n);
3741        local_irq_restore(flags);
3742}
3743EXPORT_SYMBOL(napi_complete);
3744
3745void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3746                    int (*poll)(struct napi_struct *, int), int weight)
3747{
3748        INIT_LIST_HEAD(&napi->poll_list);
3749        napi->gro_count = 0;
3750        napi->gro_list = NULL;
3751        napi->skb = NULL;
3752        napi->poll = poll;
3753        napi->weight = weight;
3754        list_add(&napi->dev_list, &dev->napi_list);
3755        napi->dev = dev;
3756#ifdef CONFIG_NETPOLL
3757        spin_lock_init(&napi->poll_lock);
3758        napi->poll_owner = -1;
3759#endif
3760        set_bit(NAPI_STATE_SCHED, &napi->state);
3761}
3762EXPORT_SYMBOL(netif_napi_add);
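
/*
 * Illustrative sketch, not part of the kernel: the standard NAPI pattern
 * built on netif_napi_add(), napi_complete() and __napi_schedule() above.
 * struct example_priv, the rx-cleanup stub and the irq mask/unmask notes
 * are hypothetical driver pieces.
 */
struct example_priv {
	struct napi_struct	napi;
	struct net_device	*dev;
};

/* Stand-in for real descriptor-ring cleanup; returns packets processed. */
static int example_clean_rx(struct example_priv *priv, int budget)
{
	return 0;
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work = example_clean_rx(priv, budget);

	if (work < budget) {
		napi_complete(napi);
		/* re-enable device rx interrupts here */
	}
	return work;	/* must never exceed budget */
}

static irqreturn_t example_isr(int irq, void *data)
{
	struct example_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		/* mask device rx interrupts here */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}

static void example_napi_setup(struct example_priv *priv)
{
	netif_napi_add(priv->dev, &priv->napi, example_poll, 64);
	napi_enable(&priv->napi);	/* netif_napi_add leaves NAPI_STATE_SCHED set */
}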
3763
3764void netif_napi_del(struct napi_struct *napi)
3765{
3766        struct sk_buff *skb, *next;
3767
3768        list_del_init(&napi->dev_list);
3769        napi_free_frags(napi);
3770
3771        for (skb = napi->gro_list; skb; skb = next) {
3772                next = skb->next;
3773                skb->next = NULL;
3774                kfree_skb(skb);
3775        }
3776
3777        napi->gro_list = NULL;
3778        napi->gro_count = 0;
3779}
3780EXPORT_SYMBOL(netif_napi_del);
3781
3782static void net_rx_action(struct softirq_action *h)
3783{
3784        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3785        unsigned long time_limit = jiffies + 2;
3786        int budget = netdev_budget;
3787        void *have;
3788
3789        local_irq_disable();
3790
3791        while (!list_empty(&sd->poll_list)) {
3792                struct napi_struct *n;
3793                int work, weight;
3794
3795                /* If the softirq window is exhausted then punt.
3796                 * Allow this to run for 2 jiffies, which allows
3797                 * an average latency of 1.5/HZ.
3798                 */
3799                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3800                        goto softnet_break;
3801
3802                local_irq_enable();
3803
3804                /* Even though interrupts have been re-enabled, this
3805                 * access is safe because interrupts can only add new
3806                 * entries to the tail of this list, and only ->poll()
3807                 * calls can remove this head entry from the list.
3808                 */
3809                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3810
3811                have = netpoll_poll_lock(n);
3812
3813                weight = n->weight;
3814
3815                /* This NAPI_STATE_SCHED test is for avoiding a race
3816                 * with netpoll's poll_napi().  Only the entity which
3817                 * obtains the lock and sees NAPI_STATE_SCHED set will
3818                 * actually make the ->poll() call.  Therefore we avoid
3819                 * accidentally calling ->poll() when NAPI is not scheduled.
3820                 */
3821                work = 0;
3822                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3823                        work = n->poll(n, weight);
3824                        trace_napi_poll(n);
3825                }
3826
3827                WARN_ON_ONCE(work > weight);
3828
3829                budget -= work;
3830
3831                local_irq_disable();
3832
3833                /* Drivers must not modify the NAPI state if they
3834                 * consume the entire weight.  In such cases this code
3835                 * still "owns" the NAPI instance and therefore can
3836                 * move the instance around on the list at-will.
3837                 */
3838                if (unlikely(work == weight)) {
3839                        if (unlikely(napi_disable_pending(n))) {
3840                                local_irq_enable();
3841                                napi_complete(n);
3842                                local_irq_disable();
3843                        } else
3844                                list_move_tail(&n->poll_list, &sd->poll_list);
3845                }
3846
3847                netpoll_poll_unlock(have);
3848        }
3849out:
3850        net_rps_action_and_irq_enable(sd);
3851
3852#ifdef CONFIG_NET_DMA
3853        /*
3854         * There may not be any more sk_buffs coming right now, so push
3855         * any pending DMA copies to hardware
3856         */
3857        dma_issue_pending_all();
3858#endif
3859
3860        return;
3861
3862softnet_break:
3863        sd->time_squeeze++;
3864        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3865        goto out;
3866}
3867
3868static gifconf_func_t *gifconf_list[NPROTO];
3869
3870/**
3871 *      register_gifconf        -       register a SIOCGIF handler
3872 *      @family: Address family
3873 *      @gifconf: Function handler
3874 *
3875 *      Register protocol dependent address dumping routines. The handler
3876 *      that is passed must not be freed or reused until it has been replaced
3877 *      by another handler.
3878 */
3879int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3880{
3881        if (family >= NPROTO)
3882                return -EINVAL;
3883        gifconf_list[family] = gifconf;
3884        return 0;
3885}
3886EXPORT_SYMBOL(register_gifconf);
3887
3888
3889/*
3890 *      Map an interface index to its name (SIOCGIFNAME)
3891 */
3892
3893/*
3894 *      We need this ioctl for efficient implementation of the
3895 *      if_indextoname() function required by the IPv6 API.  Without
3896 *      it, we would have to search all the interfaces to find a
3897 *      match.  --pb
3898 */
3899
3900static int dev_ifname(struct net *net, struct ifreq __user *arg)
3901{
3902        struct net_device *dev;
3903        struct ifreq ifr;
3904
3905        /*
3906         *      Fetch the caller's info block.
3907         */
3908
3909        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3910                return -EFAULT;
3911
3912        rcu_read_lock();
3913        dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3914        if (!dev) {
3915                rcu_read_unlock();
3916                return -ENODEV;
3917        }
3918
3919        strcpy(ifr.ifr_name, dev->name);
3920        rcu_read_unlock();
3921
3922        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3923                return -EFAULT;
3924        return 0;
3925}
3926
3927/*
3928 *      Perform a SIOCGIFCONF call. This structure will change
3929 *      size eventually, and there is nothing I can do about it.
3930 *      Thus we will need a 'compatibility mode'.
3931 */
3932
3933static int dev_ifconf(struct net *net, char __user *arg)
3934{
3935        struct ifconf ifc;
3936        struct net_device *dev;
3937        char __user *pos;
3938        int len;
3939        int total;
3940        int i;
3941
3942        /*
3943         *      Fetch the caller's info block.
3944         */
3945
3946        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3947                return -EFAULT;
3948
3949        pos = ifc.ifc_buf;
3950        len = ifc.ifc_len;
3951
3952        /*
3953         *      Loop over the interfaces, and write an info block for each.
3954         */
3955
3956        total = 0;
3957        for_each_netdev(net, dev) {
3958                for (i = 0; i < NPROTO; i++) {
3959                        if (gifconf_list[i]) {
3960                                int done;
3961                                if (!pos)
3962                                        done = gifconf_list[i](dev, NULL, 0);
3963                                else
3964                                        done = gifconf_list[i](dev, pos + total,
3965                                                               len - total);
3966                                if (done < 0)
3967                                        return -EFAULT;
3968                                total += done;
3969                        }
3970                }
3971        }
3972
3973        /*
3974         *      All done.  Write the updated control block back to the caller.
3975         */
3976        ifc.ifc_len = total;
3977
3978        /*
3979         *      Both BSD and Solaris return 0 here, so we do too.
3980         */
3981        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3982}
3983
3984#ifdef CONFIG_PROC_FS
3985/*
3986 *      This is invoked by the /proc filesystem handler to display a device
3987 *      in detail.
3988 */
3989void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3990        __acquires(RCU)
3991{
3992        struct net *net = seq_file_net(seq);
3993        loff_t off;
3994        struct net_device *dev;
3995
3996        rcu_read_lock();
3997        if (!*pos)
3998                return SEQ_START_TOKEN;
3999
4000        off = 1;
4001        for_each_netdev_rcu(net, dev)
4002                if (off++ == *pos)
4003                        return dev;
4004
4005        return NULL;
4006}
4007
4008void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4009{
4010        struct net_device *dev = v;
4011
4012        if (v == SEQ_START_TOKEN)
4013                dev = first_net_device_rcu(seq_file_net(seq));
4014        else
4015                dev = next_net_device_rcu(dev);
4016
4017        ++*pos;
4018        return dev;
4019}
4020
4021void dev_seq_stop(struct seq_file *seq, void *v)
4022        __releases(RCU)
4023{
4024        rcu_read_unlock();
4025}
4026
4027static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4028{
4029        struct rtnl_link_stats64 temp;
4030        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4031
4032        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4033                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4034                   dev->name, stats->rx_bytes, stats->rx_packets,
4035                   stats->rx_errors,
4036                   stats->rx_dropped + stats->rx_missed_errors,
4037                   stats->rx_fifo_errors,
4038                   stats->rx_length_errors + stats->rx_over_errors +
4039                    stats->rx_crc_errors + stats->rx_frame_errors,
4040                   stats->rx_compressed, stats->multicast,
4041                   stats->tx_bytes, stats->tx_packets,
4042                   stats->tx_errors, stats->tx_dropped,
4043                   stats->tx_fifo_errors, stats->collisions,
4044                   stats->tx_carrier_errors +
4045                    stats->tx_aborted_errors +
4046                    stats->tx_window_errors +
4047                    stats->tx_heartbeat_errors,
4048                   stats->tx_compressed);
4049}
4050
4051/*
4052 *      Called from the PROCfs module. This now uses the new arbitrarily sized
4053 *      /proc/net interface to create /proc/net/dev.
4054 */
4055static int dev_seq_show(struct seq_file *seq, void *v)
4056{
4057        if (v == SEQ_START_TOKEN)
4058                seq_puts(seq, "Inter-|   Receive                            "
4059                              "                    |  Transmit\n"
4060                              " face |bytes    packets errs drop fifo frame "
4061                              "compressed multicast|bytes    packets errs "
4062                              "drop fifo colls carrier compressed\n");
4063        else
4064                dev_seq_printf_stats(seq, v);
4065        return 0;
4066}
4067
4068static struct softnet_data *softnet_get_online(loff_t *pos)
4069{
4070        struct softnet_data *sd = NULL;
4071
4072        while (*pos < nr_cpu_ids)
4073                if (cpu_online(*pos)) {
4074                        sd = &per_cpu(softnet_data, *pos);
4075                        break;
4076                } else
4077                        ++*pos;
4078        return sd;
4079}
4080
4081static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4082{
4083        return softnet_get_online(pos);
4084}
4085
4086static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4087{
4088        ++*pos;
4089        return softnet_get_online(pos);
4090}
4091
4092static void softnet_seq_stop(struct seq_file *seq, void *v)
4093{
4094}
4095
4096static int softnet_seq_show(struct seq_file *seq, void *v)
4097{
4098        struct softnet_data *sd = v;
4099
4100        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4101                   sd->processed, sd->dropped, sd->time_squeeze, 0,
4102                   0, 0, 0, 0, /* was fastroute */
4103                   sd->cpu_collision, sd->received_rps);
4104        return 0;
4105}
4106
4107static const struct seq_operations dev_seq_ops = {
4108        .start = dev_seq_start,
4109        .next  = dev_seq_next,
4110        .stop  = dev_seq_stop,
4111        .show  = dev_seq_show,
4112};
4113
4114static int dev_seq_open(struct inode *inode, struct file *file)
4115{
4116        return seq_open_net(inode, file, &dev_seq_ops,
4117                            sizeof(struct seq_net_private));
4118}
4119
4120static const struct file_operations dev_seq_fops = {
4121        .owner   = THIS_MODULE,
4122        .open    = dev_seq_open,
4123        .read    = seq_read,
4124        .llseek  = seq_lseek,
4125        .release = seq_release_net,
4126};
4127
4128static const struct seq_operations softnet_seq_ops = {
4129        .start = softnet_seq_start,
4130        .next  = softnet_seq_next,
4131        .stop  = softnet_seq_stop,
4132        .show  = softnet_seq_show,
4133};
4134
4135static int softnet_seq_open(struct inode *inode, struct file *file)
4136{
4137        return seq_open(file, &softnet_seq_ops);
4138}
4139
4140static const struct file_operations softnet_seq_fops = {
4141        .owner   = THIS_MODULE,
4142        .open    = softnet_seq_open,
4143        .read    = seq_read,
4144        .llseek  = seq_lseek,
4145        .release = seq_release,
4146};
4147
4148static void *ptype_get_idx(loff_t pos)
4149{
4150        struct packet_type *pt = NULL;
4151        loff_t i = 0;
4152        int t;
4153
4154        list_for_each_entry_rcu(pt, &ptype_all, list) {
4155                if (i == pos)
4156                        return pt;
4157                ++i;
4158        }
4159
4160        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4161                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4162                        if (i == pos)
4163                                return pt;
4164                        ++i;
4165                }
4166        }
4167        return NULL;
4168}
4169
4170static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4171        __acquires(RCU)
4172{
4173        rcu_read_lock();
4174        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4175}
4176
4177static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4178{
4179        struct packet_type *pt;
4180        struct list_head *nxt;
4181        int hash;
4182
4183        ++*pos;
4184        if (v == SEQ_START_TOKEN)
4185                return ptype_get_idx(0);
4186
4187        pt = v;
4188        nxt = pt->list.next;
4189        if (pt->type == htons(ETH_P_ALL)) {
4190                if (nxt != &ptype_all)
4191                        goto found;
4192                hash = 0;
4193                nxt = ptype_base[0].next;
4194        } else
4195                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4196
4197        while (nxt == &ptype_base[hash]) {
4198                if (++hash >= PTYPE_HASH_SIZE)
4199                        return NULL;
4200                nxt = ptype_base[hash].next;
4201        }
4202found:
4203        return list_entry(nxt, struct packet_type, list);
4204}
4205
4206static void ptype_seq_stop(struct seq_file *seq, void *v)
4207        __releases(RCU)
4208{
4209        rcu_read_unlock();
4210}
4211
4212static int ptype_seq_show(struct seq_file *seq, void *v)
4213{
4214        struct packet_type *pt = v;
4215
4216        if (v == SEQ_START_TOKEN)
4217                seq_puts(seq, "Type Device      Function\n");
4218        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4219                if (pt->type == htons(ETH_P_ALL))
4220                        seq_puts(seq, "ALL ");
4221                else
4222                        seq_printf(seq, "%04x", ntohs(pt->type));
4223
4224                seq_printf(seq, " %-8s %pF\n",
4225                           pt->dev ? pt->dev->name : "", pt->func);
4226        }
4227
4228        return 0;
4229}
4230
4231static const struct seq_operations ptype_seq_ops = {
4232        .start = ptype_seq_start,
4233        .next  = ptype_seq_next,
4234        .stop  = ptype_seq_stop,
4235        .show  = ptype_seq_show,
4236};
4237
4238static int ptype_seq_open(struct inode *inode, struct file *file)
4239{
4240        return seq_open_net(inode, file, &ptype_seq_ops,
4241                        sizeof(struct seq_net_private));
4242}
4243
4244static const struct file_operations ptype_seq_fops = {
4245        .owner   = THIS_MODULE,
4246        .open    = ptype_seq_open,
4247        .read    = seq_read,
4248        .llseek  = seq_lseek,
4249        .release = seq_release_net,
4250};
4251
4252
4253static int __net_init dev_proc_net_init(struct net *net)
4254{
4255        int rc = -ENOMEM;
4256
4257        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4258                goto out;
4259        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4260                goto out_dev;
4261        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4262                goto out_softnet;
4263
4264        if (wext_proc_init(net))
4265                goto out_ptype;
4266        rc = 0;
4267out:
4268        return rc;
4269out_ptype:
4270        proc_net_remove(net, "ptype");
4271out_softnet:
4272        proc_net_remove(net, "softnet_stat");
4273out_dev:
4274        proc_net_remove(net, "dev");
4275        goto out;
4276}
4277
4278static void __net_exit dev_proc_net_exit(struct net *net)
4279{
4280        wext_proc_exit(net);
4281
4282        proc_net_remove(net, "ptype");
4283        proc_net_remove(net, "softnet_stat");
4284        proc_net_remove(net, "dev");
4285}
4286
4287static struct pernet_operations __net_initdata dev_proc_ops = {
4288        .init = dev_proc_net_init,
4289        .exit = dev_proc_net_exit,
4290};
4291
4292static int __init dev_proc_init(void)
4293{
4294        return register_pernet_subsys(&dev_proc_ops);
4295}
4296#else
4297#define dev_proc_init() 0
4298#endif  /* CONFIG_PROC_FS */
4299
4300
4301/**
4302 *      netdev_set_master       -       set up master pointer
4303 *      @slave: slave device
4304 *      @master: new master device
4305 *
4306 *      Changes the master device of the slave. Pass %NULL to break the
4307 *      bonding. The caller must hold the RTNL semaphore. On a failure
4308 *      a negative errno code is returned. On success the reference counts
4309 *      are adjusted and the function returns zero.
4310 */
4311int netdev_set_master(struct net_device *slave, struct net_device *master)
4312{
4313        struct net_device *old = slave->master;
4314
4315        ASSERT_RTNL();
4316
4317        if (master) {
4318                if (old)
4319                        return -EBUSY;
4320                dev_hold(master);
4321        }
4322
4323        slave->master = master;
4324
4325        if (old)
4326                dev_put(old);
4327        return 0;
4328}
4329EXPORT_SYMBOL(netdev_set_master);
4330
4331/**
4332 *      netdev_set_bond_master  -       set up bonding master/slave pair
4333 *      @slave: slave device
4334 *      @master: new master device
4335 *
4336 *      Changes the master device of the slave. Pass %NULL to break the
4337 *      bonding. The caller must hold the RTNL semaphore. On a failure
4338 *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4339 *      to the routing socket and the function returns zero.
4340 */
4341int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4342{
4343        int err;
4344
4345        ASSERT_RTNL();
4346
4347        err = netdev_set_master(slave, master);
4348        if (err)
4349                return err;
4350        if (master)
4351                slave->flags |= IFF_SLAVE;
4352        else
4353                slave->flags &= ~IFF_SLAVE;
4354
4355        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4356        return 0;
4357}
4358EXPORT_SYMBOL(netdev_set_bond_master);
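
/*
 * Illustrative sketch, not part of the kernel: how a bonding-style master
 * driver would enslave and release a device with the helper above.  Both
 * calls must run under RTNL, as netdev_set_master() asserts.
 */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	return netdev_set_bond_master(slave, master);
}

static void example_release_slave(struct net_device *slave)
{
	netdev_set_bond_master(slave, NULL);
}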
4359
4360static void dev_change_rx_flags(struct net_device *dev, int flags)
4361{
4362        const struct net_device_ops *ops = dev->netdev_ops;
4363
4364        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4365                ops->ndo_change_rx_flags(dev, flags);
4366}
4367
4368static int __dev_set_promiscuity(struct net_device *dev, int inc)
4369{
4370        unsigned short old_flags = dev->flags;
4371        uid_t uid;
4372        gid_t gid;
4373
4374        ASSERT_RTNL();
4375
4376        dev->flags |= IFF_PROMISC;
4377        dev->promiscuity += inc;
4378        if (dev->promiscuity == 0) {
4379                /*
4380                 * Avoid overflow.
4381                 * If inc causes an overflow, leave promisc untouched and return an error.
4382                 */
4383                if (inc < 0)
4384                        dev->flags &= ~IFF_PROMISC;
4385                else {
4386                        dev->promiscuity -= inc;
4387                        printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4388                                "setting promiscuity failed; the promiscuity feature "
4389                                "of the device might be broken.\n", dev->name);
4390                        return -EOVERFLOW;
4391                }
4392        }
4393        if (dev->flags != old_flags) {
4394                printk(KERN_INFO "device %s %s promiscuous mode\n",
4395                       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4396                                                               "left");
4397                if (audit_enabled) {
4398                        current_uid_gid(&uid, &gid);
4399                        audit_log(current->audit_context, GFP_ATOMIC,
4400                                AUDIT_ANOM_PROMISCUOUS,
4401                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4402                                dev->name, (dev->flags & IFF_PROMISC),
4403                                (old_flags & IFF_PROMISC),
4404                                audit_get_loginuid(current),
4405                                uid, gid,
4406                                audit_get_sessionid(current));
4407                }
4408
4409                dev_change_rx_flags(dev, IFF_PROMISC);
4410        }
4411        return 0;
4412}
4413
4414/**
4415 *      dev_set_promiscuity     - update promiscuity count on a device
4416 *      @dev: device
4417 *      @inc: modifier
4418 *
4419 *      Add or remove promiscuity from a device. While the count in the device
4420 *      remains above zero the interface remains promiscuous. Once it hits zero
4421 *      the device reverts back to normal filtering operation. A negative inc
4422 *      value is used to drop promiscuity on the device.
4423 *      Return 0 if successful or a negative errno code on error.
4424 */
4425int dev_set_promiscuity(struct net_device *dev, int inc)
4426{
4427        unsigned short old_flags = dev->flags;
4428        int err;
4429
4430        err = __dev_set_promiscuity(dev, inc);
4431        if (err < 0)
4432                return err;
4433        if (dev->flags != old_flags)
4434                dev_set_rx_mode(dev);
4435        return err;
4436}
4437EXPORT_SYMBOL(dev_set_promiscuity);
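
/*
 * Illustrative sketch, not part of the kernel: a hypothetical in-kernel
 * capture user bracketing its lifetime with the reference-counted helper
 * above.  Both calls need RTNL, as __dev_set_promiscuity() asserts.
 */
static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* -EOVERFLOW if the count would wrap */
	rtnl_unlock();
	return err;
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* device leaves promisc mode at zero */
	rtnl_unlock();
}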
4438
4439/**
4440 *      dev_set_allmulti        - update allmulti count on a device
4441 *      @dev: device
4442 *      @inc: modifier
4443 *
4444 *      Add or remove reception of all multicast frames to a device. While the
4445 *      count in the device remains above zero the interface remains listening
4446 *      to all multicast frames. Once it hits zero the device reverts back to normal
4447 *      filtering operation. A negative @inc value is used to drop the counter
4448 *      when releasing a resource needing all multicasts.
4449 *      Return 0 if successful or a negative errno code on error.
4450 */
4451
4452int dev_set_allmulti(struct net_device *dev, int inc)
4453{
4454        unsigned short old_flags = dev->flags;
4455
4456        ASSERT_RTNL();
4457
4458        dev->flags |= IFF_ALLMULTI;
4459        dev->allmulti += inc;
4460        if (dev->allmulti == 0) {
4461                /*
4462                 * Avoid overflow.
4463                 * If inc causes an overflow, leave allmulti untouched and return an error.
4464                 */
4465                if (inc < 0)
4466                        dev->flags &= ~IFF_ALLMULTI;
4467                else {
4468                        dev->allmulti -= inc;
4469                        printk(KERN_WARNING "%s: allmulti counter overflowed, "
4470                                "setting allmulti failed; the allmulti feature of "
4471                                "the device might be broken.\n", dev->name);
4472                        return -EOVERFLOW;
4473                }
4474        }
4475        if (dev->flags ^ old_flags) {
4476                dev_change_rx_flags(dev, IFF_ALLMULTI);
4477                dev_set_rx_mode(dev);
4478        }
4479        return 0;
4480}
4481EXPORT_SYMBOL(dev_set_allmulti);
4482
4483/*
4484 *      Upload unicast and multicast address lists to device and
4485 *      configure RX filtering. When the device doesn't support unicast
4486 *      filtering it is put in promiscuous mode while unicast addresses
4487 *      are present.
4488 */
4489void __dev_set_rx_mode(struct net_device *dev)
4490{
4491        const struct net_device_ops *ops = dev->netdev_ops;
4492
4493        /* dev_open will call this function so the list will stay sane. */
4494        if (!(dev->flags&IFF_UP))
4495                return;
4496
4497        if (!netif_device_present(dev))
4498                return;
4499
4500        if (ops->ndo_set_rx_mode)
4501                ops->ndo_set_rx_mode(dev);
4502        else {
4503                /* Unicast address changes may only happen under the rtnl,
4504                 * therefore calling __dev_set_promiscuity here is safe.
4505                 */
4506                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4507                        __dev_set_promiscuity(dev, 1);
4508                        dev->uc_promisc = true;
4509                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4510                        __dev_set_promiscuity(dev, -1);
4511                        dev->uc_promisc = false;
4512                }
4513
4514                if (ops->ndo_set_multicast_list)
4515                        ops->ndo_set_multicast_list(dev);
4516        }
4517}
4518
4519void dev_set_rx_mode(struct net_device *dev)
4520{
4521        netif_addr_lock_bh(dev);
4522        __dev_set_rx_mode(dev);
4523        netif_addr_unlock_bh(dev);
4524}
4525
4526/**
4527 *      dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4528 *      @dev: device
4529 *      @cmd: memory area for ethtool_ops::get_settings() result
4530 *
4531 *      The cmd arg is initialized properly (cleared and
4532 *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4533 *
4534 *      Return device's ethtool_ops::get_settings() result value or
4535 *      -EOPNOTSUPP when device doesn't expose
4536 *      ethtool_ops::get_settings() operation.
4537 */
4538int dev_ethtool_get_settings(struct net_device *dev,
4539                             struct ethtool_cmd *cmd)
4540{
4541        if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4542                return -EOPNOTSUPP;
4543
4544        memset(cmd, 0, sizeof(struct ethtool_cmd));
4545        cmd->cmd = ETHTOOL_GSET;
4546        return dev->ethtool_ops->get_settings(dev, cmd);
4547}
4548EXPORT_SYMBOL(dev_ethtool_get_settings);
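
/*
 * Illustrative sketch, not part of the kernel: reading the link speed
 * through the wrapper above.  ethtool_cmd_speed() is assumed to be
 * available in this kernel generation to decode the speed fields.
 */
static int example_get_speed(struct net_device *dev, u32 *speed)
{
	struct ethtool_cmd cmd;
	int err = dev_ethtool_get_settings(dev, &cmd);

	if (!err)
		*speed = ethtool_cmd_speed(&cmd);
	return err;	/* -EOPNOTSUPP if the driver has no get_settings */
}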
4549
4550/**
4551 *      dev_get_flags - get flags reported to userspace
4552 *      @dev: device
4553 *
4554 *      Get the combination of flag bits exported through APIs to userspace.
4555 */
4556unsigned dev_get_flags(const struct net_device *dev)
4557{
4558        unsigned flags;
4559
4560        flags = (dev->flags & ~(IFF_PROMISC |
4561                                IFF_ALLMULTI |
4562                                IFF_RUNNING |
4563                                IFF_LOWER_UP |
4564                                IFF_DORMANT)) |
4565                (dev->gflags & (IFF_PROMISC |
4566                                IFF_ALLMULTI));
4567
4568        if (netif_running(dev)) {
4569                if (netif_oper_up(dev))
4570                        flags |= IFF_RUNNING;
4571                if (netif_carrier_ok(dev))
4572                        flags |= IFF_LOWER_UP;
4573                if (netif_dormant(dev))
4574                        flags |= IFF_DORMANT;
4575        }
4576
4577        return flags;
4578}
4579EXPORT_SYMBOL(dev_get_flags);
4580
4581int __dev_change_flags(struct net_device *dev, unsigned int flags)
4582{
4583        int old_flags = dev->flags;
4584        int ret;
4585
4586        ASSERT_RTNL();
4587
4588        /*
4589         *      Set the flags on our device.
4590         */
4591
4592        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4593                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4594                               IFF_AUTOMEDIA)) |
4595                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596                                    IFF_ALLMULTI));
4597
4598        /*
4599         *      Load in the correct multicast list now the flags have changed.
4600         */
4601
4602        if ((old_flags ^ flags) & IFF_MULTICAST)
4603                dev_change_rx_flags(dev, IFF_MULTICAST);
4604
4605        dev_set_rx_mode(dev);
4606
4607        /*
4608         *      Have we downed the interface? We handle IFF_UP ourselves
4609         *      according to user attempts to set it, rather than blindly
4610         *      setting it.
4611         */
4612
4613        ret = 0;
4614        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4615                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4616
4617                if (!ret)
4618                        dev_set_rx_mode(dev);
4619        }
4620
4621        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4622                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4623
4624                dev->gflags ^= IFF_PROMISC;
4625                dev_set_promiscuity(dev, inc);
4626        }
4627
4628        /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4629           is important. Some (broken) drivers set IFF_PROMISC when
4630           IFF_ALLMULTI is requested, without asking us and without reporting it.
4631         */
4632        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4633                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4634
4635                dev->gflags ^= IFF_ALLMULTI;
4636                dev_set_allmulti(dev, inc);
4637        }
4638
4639        return ret;
4640}
4641
4642void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4643{
4644        unsigned int changes = dev->flags ^ old_flags;
4645
4646        if (changes & IFF_UP) {
4647                if (dev->flags & IFF_UP)
4648                        call_netdevice_notifiers(NETDEV_UP, dev);
4649                else
4650                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4651        }
4652
4653        if (dev->flags & IFF_UP &&
4654            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4655                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656}
4657
4658/**
4659 *      dev_change_flags - change device settings
4660 *      @dev: device
4661 *      @flags: device state flags
4662 *
4663 *      Change settings on device based state flags. The flags are
4664 *      in the userspace exported format.
4665 */
4666int dev_change_flags(struct net_device *dev, unsigned flags)
4667{
4668        int ret, changes;
4669        int old_flags = dev->flags;
4670
4671        ret = __dev_change_flags(dev, flags);
4672        if (ret < 0)
4673                return ret;
4674
4675        changes = old_flags ^ dev->flags;
4676        if (changes)
4677                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4678
4679        __dev_notify_flags(dev, old_flags);
4680        return ret;
4681}
4682EXPORT_SYMBOL(dev_change_flags);
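
/*
 * Illustrative sketch, not part of the kernel: bringing an interface up
 * the way SIOCSIFFLAGS does, by OR-ing IFF_UP into the userspace-visible
 * flags.  The caller must hold RTNL, as for dev_change_flags() itself.
 */
static int example_bring_up(struct net_device *dev)
{
	return dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
}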
4683
4684/**
4685 *      dev_set_mtu - Change maximum transfer unit
4686 *      @dev: device
4687 *      @new_mtu: new transfer unit
4688 *
4689 *      Change the maximum transfer size of the network device.
4690 */
4691int dev_set_mtu(struct net_device *dev, int new_mtu)
4692{
4693        const struct net_device_ops *ops = dev->netdev_ops;
4694        int err;
4695
4696        if (new_mtu == dev->mtu)
4697                return 0;
4698
4699        /*      MTU must be positive.    */
4700        if (new_mtu < 0)
4701                return -EINVAL;
4702
4703        if (!netif_device_present(dev))
4704                return -ENODEV;
4705
4706        err = 0;
4707        if (ops->ndo_change_mtu)
4708                err = ops->ndo_change_mtu(dev, new_mtu);
4709        else
4710                dev->mtu = new_mtu;
4711
4712        if (!err && dev->flags & IFF_UP)
4713                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4714        return err;
4715}
4716EXPORT_SYMBOL(dev_set_mtu);
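
/*
 * Illustrative sketch, not part of the kernel: a hypothetical encapsulating
 * driver reserving header room by lowering an underlying device's MTU
 * (RTNL held); errors from ndo_change_mtu are passed straight back.
 */
static int example_reserve_headroom(struct net_device *dev, int overhead)
{
	return dev_set_mtu(dev, dev->mtu - overhead);
}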
4717
4718/**
4719 *      dev_set_group - Change group this device belongs to
4720 *      @dev: device
4721 *      @new_group: group this device should belong to
4722 */
4723void dev_set_group(struct net_device *dev, int new_group)
4724{
4725        dev->group = new_group;
4726}
4727EXPORT_SYMBOL(dev_set_group);
4728
4729/**
4730 *      dev_set_mac_address - Change Media Access Control Address
4731 *      @dev: device
4732 *      @sa: new address
4733 *
4734 *      Change the hardware (MAC) address of the device
4735 */
4736int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4737{
4738        const struct net_device_ops *ops = dev->netdev_ops;
4739        int err;
4740
4741        if (!ops->ndo_set_mac_address)
4742                return -EOPNOTSUPP;
4743        if (sa->sa_family != dev->type)
4744                return -EINVAL;
4745        if (!netif_device_present(dev))
4746                return -ENODEV;
4747        err = ops->ndo_set_mac_address(dev, sa);
4748        if (!err)
4749                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4750        return err;
4751}
4752EXPORT_SYMBOL(dev_set_mac_address);
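
/*
 * Example (illustrative sketch): programming a new hardware address.  The
 * sockaddr must carry the device's address family (dev->type), and the
 * caller must hold RTNL.  The locally administered address used here is
 * arbitrary.
 */
static int example_set_mac(struct net_device *example_dev)
{
        static const unsigned char addr[ETH_ALEN] = {
                0x02, 0x00, 0x00, 0x00, 0x00, 0x01
        };
        struct sockaddr sa;
        int err;

        sa.sa_family = example_dev->type;
        memcpy(sa.sa_data, addr, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(example_dev, &sa);
        rtnl_unlock();
        return err;
}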
4753
4754/*
4755 *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4756 */
4757static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4758{
4759        int err;
4760        struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4761
4762        if (!dev)
4763                return -ENODEV;
4764
4765        switch (cmd) {
4766        case SIOCGIFFLAGS:      /* Get interface flags */
4767                ifr->ifr_flags = (short) dev_get_flags(dev);
4768                return 0;
4769
4770        case SIOCGIFMETRIC:     /* Get the metric on the interface
4771                                   (currently unused) */
4772                ifr->ifr_metric = 0;
4773                return 0;
4774
4775        case SIOCGIFMTU:        /* Get the MTU of a device */
4776                ifr->ifr_mtu = dev->mtu;
4777                return 0;
4778
4779        case SIOCGIFHWADDR:
4780                if (!dev->addr_len)
4781                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4782                else
4783                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4784                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4785                ifr->ifr_hwaddr.sa_family = dev->type;
4786                return 0;
4787
4788        case SIOCGIFSLAVE:
4789                err = -EINVAL;
4790                break;
4791
4792        case SIOCGIFMAP:
4793                ifr->ifr_map.mem_start = dev->mem_start;
4794                ifr->ifr_map.mem_end   = dev->mem_end;
4795                ifr->ifr_map.base_addr = dev->base_addr;
4796                ifr->ifr_map.irq       = dev->irq;
4797                ifr->ifr_map.dma       = dev->dma;
4798                ifr->ifr_map.port      = dev->if_port;
4799                return 0;
4800
4801        case SIOCGIFINDEX:
4802                ifr->ifr_ifindex = dev->ifindex;
4803                return 0;
4804
4805        case SIOCGIFTXQLEN:
4806                ifr->ifr_qlen = dev->tx_queue_len;
4807                return 0;
4808
4809        default:
4810                /* dev_ioctl() should ensure this case
4811                 * is never reached
4812                 */
4813                WARN_ON(1);
4814                err = -ENOTTY;
4815                break;
4816
4817        }
4818        return err;
4819}
4820
4821/*
4822 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4823 */
4824static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4825{
4826        int err;
4827        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4828        const struct net_device_ops *ops;
4829
4830        if (!dev)
4831                return -ENODEV;
4832
4833        ops = dev->netdev_ops;
4834
4835        switch (cmd) {
4836        case SIOCSIFFLAGS:      /* Set interface flags */
4837                return dev_change_flags(dev, ifr->ifr_flags);
4838
4839        case SIOCSIFMETRIC:     /* Set the metric on the interface
4840                                   (currently unused) */
4841                return -EOPNOTSUPP;
4842
4843        case SIOCSIFMTU:        /* Set the MTU of a device */
4844                return dev_set_mtu(dev, ifr->ifr_mtu);
4845
4846        case SIOCSIFHWADDR:
4847                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4848
4849        case SIOCSIFHWBROADCAST:
4850                if (ifr->ifr_hwaddr.sa_family != dev->type)
4851                        return -EINVAL;
4852                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4853                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4854                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855                return 0;
4856
4857        case SIOCSIFMAP:
4858                if (ops->ndo_set_config) {
4859                        if (!netif_device_present(dev))
4860                                return -ENODEV;
4861                        return ops->ndo_set_config(dev, &ifr->ifr_map);
4862                }
4863                return -EOPNOTSUPP;
4864
4865        case SIOCADDMULTI:
4866                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868                        return -EINVAL;
4869                if (!netif_device_present(dev))
4870                        return -ENODEV;
4871                return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873        case SIOCDELMULTI:
4874                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4875                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4876                        return -EINVAL;
4877                if (!netif_device_present(dev))
4878                        return -ENODEV;
4879                return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4880
4881        case SIOCSIFTXQLEN:
4882                if (ifr->ifr_qlen < 0)
4883                        return -EINVAL;
4884                dev->tx_queue_len = ifr->ifr_qlen;
4885                return 0;
4886
4887        case SIOCSIFNAME:
4888                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4889                return dev_change_name(dev, ifr->ifr_newname);
4890
4891        /*
4892         *      Unknown or private ioctl
4893         */
4894        default:
4895                if ((cmd >= SIOCDEVPRIVATE &&
4896                    cmd <= SIOCDEVPRIVATE + 15) ||
4897                    cmd == SIOCBONDENSLAVE ||
4898                    cmd == SIOCBONDRELEASE ||
4899                    cmd == SIOCBONDSETHWADDR ||
4900                    cmd == SIOCBONDSLAVEINFOQUERY ||
4901                    cmd == SIOCBONDINFOQUERY ||
4902                    cmd == SIOCBONDCHANGEACTIVE ||
4903                    cmd == SIOCGMIIPHY ||
4904                    cmd == SIOCGMIIREG ||
4905                    cmd == SIOCSMIIREG ||
4906                    cmd == SIOCBRADDIF ||
4907                    cmd == SIOCBRDELIF ||
4908                    cmd == SIOCSHWTSTAMP ||
4909                    cmd == SIOCWANDEV) {
4910                        err = -EOPNOTSUPP;
4911                        if (ops->ndo_do_ioctl) {
4912                                if (netif_device_present(dev))
4913                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
4914                                else
4915                                        err = -ENODEV;
4916                        }
4917                } else
4918                        err = -EINVAL;
4919
4920        }
4921        return err;
4922}
4923
4924/*
4925 *      This function handles all "interface"-type I/O control requests. The actual
4926 *      'doing' part of this is dev_ifsioc above.
4927 */
4928
4929/**
4930 *      dev_ioctl       -       network device ioctl
4931 *      @net: the applicable net namespace
4932 *      @cmd: command to issue
4933 *      @arg: pointer to a struct ifreq in user space
4934 *
4935 *      Issue ioctl functions to devices. This is normally called by the
4936 *      user space syscall interfaces but can sometimes be useful for
4937 *      other purposes. The return value is the positive return value of
4938 *      the syscall, or a negative errno code on error.
4939 */
4940
4941int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4942{
4943        struct ifreq ifr;
4944        int ret;
4945        char *colon;
4946
4947        /* One special case: SIOCGIFCONF takes an ifconf argument
4948           and requires a shared lock, because it sleeps while writing
4949           to user space.
4950         */
4951
4952        if (cmd == SIOCGIFCONF) {
4953                rtnl_lock();
4954                ret = dev_ifconf(net, (char __user *) arg);
4955                rtnl_unlock();
4956                return ret;
4957        }
4958        if (cmd == SIOCGIFNAME)
4959                return dev_ifname(net, (struct ifreq __user *)arg);
4960
4961        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4962                return -EFAULT;
4963
4964        ifr.ifr_name[IFNAMSIZ-1] = 0;
4965
4966        colon = strchr(ifr.ifr_name, ':');
4967        if (colon)
4968                *colon = 0;
4969
4970        /*
4971         *      See which interface the caller is talking about.
4972         */
4973
4974        switch (cmd) {
4975        /*
4976         *      These ioctl calls:
4977         *      - can be done by all.
4978         *      - atomic and do not require locking.
4979         *      - return a value
4980         */
4981        case SIOCGIFFLAGS:
4982        case SIOCGIFMETRIC:
4983        case SIOCGIFMTU:
4984        case SIOCGIFHWADDR:
4985        case SIOCGIFSLAVE:
4986        case SIOCGIFMAP:
4987        case SIOCGIFINDEX:
4988        case SIOCGIFTXQLEN:
4989                dev_load(net, ifr.ifr_name);
4990                rcu_read_lock();
4991                ret = dev_ifsioc_locked(net, &ifr, cmd);
4992                rcu_read_unlock();
4993                if (!ret) {
4994                        if (colon)
4995                                *colon = ':';
4996                        if (copy_to_user(arg, &ifr,
4997                                         sizeof(struct ifreq)))
4998                                ret = -EFAULT;
4999                }
5000                return ret;
5001
5002        case SIOCETHTOOL:
5003                dev_load(net, ifr.ifr_name);
5004                rtnl_lock();
5005                ret = dev_ethtool(net, &ifr);
5006                rtnl_unlock();
5007                if (!ret) {
5008                        if (colon)
5009                                *colon = ':';
5010                        if (copy_to_user(arg, &ifr,
5011                                         sizeof(struct ifreq)))
5012                                ret = -EFAULT;
5013                }
5014                return ret;
5015
5016        /*
5017         *      These ioctl calls:
5018         *      - require superuser power.
5019         *      - require strict serialization.
5020         *      - return a value
5021         */
5022        case SIOCGMIIPHY:
5023        case SIOCGMIIREG:
5024        case SIOCSIFNAME:
5025                if (!capable(CAP_NET_ADMIN))
5026                        return -EPERM;
5027                dev_load(net, ifr.ifr_name);
5028                rtnl_lock();
5029                ret = dev_ifsioc(net, &ifr, cmd);
5030                rtnl_unlock();
5031                if (!ret) {
5032                        if (colon)
5033                                *colon = ':';
5034                        if (copy_to_user(arg, &ifr,
5035                                         sizeof(struct ifreq)))
5036                                ret = -EFAULT;
5037                }
5038                return ret;
5039
5040        /*
5041         *      These ioctl calls:
5042         *      - require superuser power.
5043         *      - require strict serialization.
5044         *      - do not return a value
5045         */
5046        case SIOCSIFFLAGS:
5047        case SIOCSIFMETRIC:
5048        case SIOCSIFMTU:
5049        case SIOCSIFMAP:
5050        case SIOCSIFHWADDR:
5051        case SIOCSIFSLAVE:
5052        case SIOCADDMULTI:
5053        case SIOCDELMULTI:
5054        case SIOCSIFHWBROADCAST:
5055        case SIOCSIFTXQLEN:
5056        case SIOCSMIIREG:
5057        case SIOCBONDENSLAVE:
5058        case SIOCBONDRELEASE:
5059        case SIOCBONDSETHWADDR:
5060        case SIOCBONDCHANGEACTIVE:
5061        case SIOCBRADDIF:
5062        case SIOCBRDELIF:
5063        case SIOCSHWTSTAMP:
5064                if (!capable(CAP_NET_ADMIN))
5065                        return -EPERM;
5066                /* fall through */
5067        case SIOCBONDSLAVEINFOQUERY:
5068        case SIOCBONDINFOQUERY:
5069                dev_load(net, ifr.ifr_name);
5070                rtnl_lock();
5071                ret = dev_ifsioc(net, &ifr, cmd);
5072                rtnl_unlock();
5073                return ret;
5074
5075        case SIOCGIFMEM:
5076                /* Get the per device memory space. We can add this but
5077                 * currently do not support it */
5078        case SIOCSIFMEM:
5079                /* Set the per device memory buffer space.
5080                 * Not applicable in our case */
5081        case SIOCSIFLINK:
5082                return -ENOTTY;
5083
5084        /*
5085         *      Unknown or private ioctl.
5086         */
5087        default:
5088                if (cmd == SIOCWANDEV ||
5089                    (cmd >= SIOCDEVPRIVATE &&
5090                     cmd <= SIOCDEVPRIVATE + 15)) {
5091                        dev_load(net, ifr.ifr_name);
5092                        rtnl_lock();
5093                        ret = dev_ifsioc(net, &ifr, cmd);
5094                        rtnl_unlock();
5095                        if (!ret && copy_to_user(arg, &ifr,
5096                                                 sizeof(struct ifreq)))
5097                                ret = -EFAULT;
5098                        return ret;
5099                }
5100                /* Take care of Wireless Extensions */
5101                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5102                        return wext_handle_ioctl(net, &ifr, cmd, arg);
5103                return -ENOTTY;
5104        }
5105}
5106
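/*
 * Example (illustrative sketch): the user-space view of this entry point.
 * An SIOCGIFMTU request on any socket ends up in dev_ioctl() and then in
 * dev_ifsioc_locked() above.  Shown entirely as a comment because it is
 * user-space code; "eth0" is just a sample interface name.
 *
 *      #include <sys/ioctl.h>
 *      #include <sys/socket.h>
 *      #include <net/if.h>
 *      #include <string.h>
 *      #include <stdio.h>
 *      #include <unistd.h>
 *
 *      int main(void)
 *      {
 *              struct ifreq ifr;
 *              int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *              memset(&ifr, 0, sizeof(ifr));
 *              strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *              if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *                      printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *              close(fd);
 *              return 0;
 *      }
 */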
5107
5108/**
5109 *      dev_new_index   -       allocate an ifindex
5110 *      @net: the applicable net namespace
5111 *
5112 *      Returns a suitable unique value for a new device interface
5113 *      number.  The caller must hold the rtnl semaphore or the
5114 *      dev_base_lock to be sure it remains unique.
5115 */
5116static int dev_new_index(struct net *net)
5117{
5118        static int ifindex;
5119        for (;;) {
5120                if (++ifindex <= 0)
5121                        ifindex = 1;
5122                if (!__dev_get_by_index(net, ifindex))
5123                        return ifindex;
5124        }
5125}
5126
5127/* Delayed registration/unregisteration */
5128static LIST_HEAD(net_todo_list);
5129
5130static void net_set_todo(struct net_device *dev)
5131{
5132        list_add_tail(&dev->todo_list, &net_todo_list);
5133}
5134
5135static void rollback_registered_many(struct list_head *head)
5136{
5137        struct net_device *dev, *tmp;
5138
5139        BUG_ON(dev_boot_phase);
5140        ASSERT_RTNL();
5141
5142        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5143                /* Some devices call this without ever having been
5144                 * registered, as part of initialization unwind.
5145                 * Remove those devices and proceed with the rest.
5146                 */
5147                if (dev->reg_state == NETREG_UNINITIALIZED) {
5148                        pr_debug("unregister_netdevice: device %s/%p never "
5149                                 "was registered\n", dev->name, dev);
5150
5151                        WARN_ON(1);
5152                        list_del(&dev->unreg_list);
5153                        continue;
5154                }
5155                dev->dismantle = true;
5156                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5157        }
5158
5159        /* If device is running, close it first. */
5160        dev_close_many(head);
5161
5162        list_for_each_entry(dev, head, unreg_list) {
5163                /* And unlink it from device chain. */
5164                unlist_netdevice(dev);
5165
5166                dev->reg_state = NETREG_UNREGISTERING;
5167        }
5168
5169        synchronize_net();
5170
5171        list_for_each_entry(dev, head, unreg_list) {
5172                /* Shutdown queueing discipline. */
5173                dev_shutdown(dev);
5174
5175
5176                /* Notify protocols that we are about to destroy
5177                   this device. They should clean up all of their state.
5178                */
5179                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5180
5181                if (!dev->rtnl_link_ops ||
5182                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5183                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5184
5185                /*
5186                 *      Flush the unicast and multicast chains
5187                 */
5188                dev_uc_flush(dev);
5189                dev_mc_flush(dev);
5190
5191                if (dev->netdev_ops->ndo_uninit)
5192                        dev->netdev_ops->ndo_uninit(dev);
5193
5194                /* Notifier chain MUST detach us from master device. */
5195                WARN_ON(dev->master);
5196
5197                /* Remove entries from kobject tree */
5198                netdev_unregister_kobject(dev);
5199        }
5200
5201        /* Process any work delayed until the end of the batch */
5202        dev = list_first_entry(head, struct net_device, unreg_list);
5203        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204
5205        rcu_barrier();
5206
5207        list_for_each_entry(dev, head, unreg_list)
5208                dev_put(dev);
5209}
5210
5211static void rollback_registered(struct net_device *dev)
5212{
5213        LIST_HEAD(single);
5214
5215        list_add(&dev->unreg_list, &single);
5216        rollback_registered_many(&single);
5217        list_del(&single);
5218}
5219
5220static u32 netdev_fix_features(struct net_device *dev, u32 features)
5221{
5222        /* Fix illegal checksum combinations */
5223        if ((features & NETIF_F_HW_CSUM) &&
5224            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5225                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5226                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5227        }
5228
5229        if ((features & NETIF_F_NO_CSUM) &&
5230            (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5231                netdev_warn(dev, "mixed no checksumming and other settings.\n");
5232                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5233        }
5234
5235        /* Fix illegal SG+CSUM combinations. */
5236        if ((features & NETIF_F_SG) &&
5237            !(features & NETIF_F_ALL_CSUM)) {
5238                netdev_dbg(dev,
5239                        "Dropping NETIF_F_SG since no checksum feature.\n");
5240                features &= ~NETIF_F_SG;
5241        }
5242
5243        /* TSO requires that SG is present as well. */
5244        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5245                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5246                features &= ~NETIF_F_ALL_TSO;
5247        }
5248
5249        /* TSO ECN requires that TSO is present as well. */
5250        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5251                features &= ~NETIF_F_TSO_ECN;
5252
5253        /* Software GSO depends on SG. */
5254        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5255                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5256                features &= ~NETIF_F_GSO;
5257        }
5258
5259        /* UFO needs SG and checksumming */
5260        if (features & NETIF_F_UFO) {
5261                /* maybe split UFO into V4 and V6? */
5262                if (!((features & NETIF_F_GEN_CSUM) ||
5263                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5264                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5265                        netdev_dbg(dev,
5266                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5267                        features &= ~NETIF_F_UFO;
5268                }
5269
5270                if (!(features & NETIF_F_SG)) {
5271                        netdev_dbg(dev,
5272                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5273                        features &= ~NETIF_F_UFO;
5274                }
5275        }
5276
5277        return features;
5278}
5279
5280int __netdev_update_features(struct net_device *dev)
5281{
5282        u32 features;
5283        int err = 0;
5284
5285        ASSERT_RTNL();
5286
5287        features = netdev_get_wanted_features(dev);
5288
5289        if (dev->netdev_ops->ndo_fix_features)
5290                features = dev->netdev_ops->ndo_fix_features(dev, features);
5291
5292        /* driver might be less strict about feature dependencies */
5293        features = netdev_fix_features(dev, features);
5294
5295        if (dev->features == features)
5296                return 0;
5297
5298        netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5299                dev->features, features);
5300
5301        if (dev->netdev_ops->ndo_set_features)
5302                err = dev->netdev_ops->ndo_set_features(dev, features);
5303
5304        if (unlikely(err < 0)) {
5305                netdev_err(dev,
5306                        "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5307                        err, features, dev->features);
5308                return -1;
5309        }
5310
5311        if (!err)
5312                dev->features = features;
5313
5314        return 1;
5315}
5316
5317/**
5318 *      netdev_update_features - recalculate device features
5319 *      @dev: the device to check
5320 *
5321 *      Recalculate dev->features set and send notifications if it
5322 *      has changed. Should be called after driver- or hardware-dependent
5323 *      conditions that influence the features might have changed.
5324 */
5325void netdev_update_features(struct net_device *dev)
5326{
5327        if (__netdev_update_features(dev))
5328                netdev_features_change(dev);
5329}
5330EXPORT_SYMBOL(netdev_update_features);
5331
5332/**
5333 *      netdev_change_features - recalculate device features
5334 *      @dev: the device to check
5335 *
5336 *      Recalculate dev->features set and send notifications even
5337 *      if they have not changed. Should be called instead of
5338 *      netdev_update_features() if dev->vlan_features might also
5339 *      have changed, to allow the changes to be propagated to stacked
5340 *      VLAN devices.
5341 */
5342void netdev_change_features(struct net_device *dev)
5343{
5344        __netdev_update_features(dev);
5345        netdev_features_change(dev);
5346}
5347EXPORT_SYMBOL(netdev_change_features);
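
/*
 * Example (illustrative sketch): a driver whose offload capabilities
 * depend on link parameters might re-evaluate dev->features after
 * renegotiation.  Only the RTNL requirement and the call itself come from
 * the helpers above; the trigger is hypothetical.
 */
static void example_link_renegotiated(struct net_device *example_dev)
{
        rtnl_lock();
        /* re-runs ndo_fix_features()/netdev_fix_features() and notifies */
        netdev_update_features(example_dev);
        rtnl_unlock();
}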
5348
5349/**
5350 *      netif_stacked_transfer_operstate -      transfer operstate
5351 *      @rootdev: the root or lower level device to transfer state from
5352 *      @dev: the device to transfer operstate to
5353 *
5354 *      Transfer operational state from root to device. This is normally
5355 *      called when a stacking relationship exists between the root
5356 *      device and the device(a leaf device).
5357 */
5358void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5359                                        struct net_device *dev)
5360{
5361        if (rootdev->operstate == IF_OPER_DORMANT)
5362                netif_dormant_on(dev);
5363        else
5364                netif_dormant_off(dev);
5365
5366        if (netif_carrier_ok(rootdev)) {
5367                if (!netif_carrier_ok(dev))
5368                        netif_carrier_on(dev);
5369        } else {
5370                if (netif_carrier_ok(dev))
5371                        netif_carrier_off(dev);
5372        }
5373}
5374EXPORT_SYMBOL(netif_stacked_transfer_operstate);
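
/*
 * Example (illustrative sketch): a stacked driver (in the spirit of 802.1q
 * or macvlan) mirroring its lower device's state onto the upper device
 * when the lower device changes.  How the upper device is looked up is
 * driver specific; both pointers here are hypothetical.
 */
static void example_lower_state_changed(struct net_device *lower,
                                        struct net_device *upper)
{
        /* copy dormant and carrier state from lower to upper */
        netif_stacked_transfer_operstate(lower, upper);
}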
5375
5376#ifdef CONFIG_RPS
5377static int netif_alloc_rx_queues(struct net_device *dev)
5378{
5379        unsigned int i, count = dev->num_rx_queues;
5380        struct netdev_rx_queue *rx;
5381
5382        BUG_ON(count < 1);
5383
5384        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5385        if (!rx) {
5386                pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5387                return -ENOMEM;
5388        }
5389        dev->_rx = rx;
5390
5391        for (i = 0; i < count; i++)
5392                rx[i].dev = dev;
5393        return 0;
5394}
5395#endif
5396
5397static void netdev_init_one_queue(struct net_device *dev,
5398                                  struct netdev_queue *queue, void *_unused)
5399{
5400        /* Initialize queue lock */
5401        spin_lock_init(&queue->_xmit_lock);
5402        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5403        queue->xmit_lock_owner = -1;
5404        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5405        queue->dev = dev;
5406}
5407
5408static int netif_alloc_netdev_queues(struct net_device *dev)
5409{
5410        unsigned int count = dev->num_tx_queues;
5411        struct netdev_queue *tx;
5412
5413        BUG_ON(count < 1);
5414
5415        tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5416        if (!tx) {
5417                pr_err("netdev: Unable to allocate %u tx queues.\n",
5418                       count);
5419                return -ENOMEM;
5420        }
5421        dev->_tx = tx;
5422
5423        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5424        spin_lock_init(&dev->tx_global_lock);
5425
5426        return 0;
5427}
5428
5429/**
5430 *      register_netdevice      - register a network device
5431 *      @dev: device to register
5432 *
5433 *      Take a completed network device structure and add it to the kernel
5434 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5435 *      chain. 0 is returned on success. A negative errno code is returned
5436 *      on a failure to set up the device, or if the name is a duplicate.
5437 *
5438 *      Callers must hold the rtnl semaphore. You may want
5439 *      register_netdev() instead of this.
5440 *
5441 *      BUGS:
5442 *      The locking appears insufficient to guarantee two parallel registers
5443 *      will not get the same name.
5444 */
5445
5446int register_netdevice(struct net_device *dev)
5447{
5448        int ret;
5449        struct net *net = dev_net(dev);
5450
5451        BUG_ON(dev_boot_phase);
5452        ASSERT_RTNL();
5453
5454        might_sleep();
5455
5456        /* When net_devices are persistent, this will be fatal. */
5457        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5458        BUG_ON(!net);
5459
5460        spin_lock_init(&dev->addr_list_lock);
5461        netdev_set_addr_lockdep_class(dev);
5462
5463        dev->iflink = -1;
5464
5465        ret = dev_get_valid_name(dev, dev->name);
5466        if (ret < 0)
5467                goto out;
5468
5469        /* Init, if this function is available */
5470        if (dev->netdev_ops->ndo_init) {
5471                ret = dev->netdev_ops->ndo_init(dev);
5472                if (ret) {
5473                        if (ret > 0)
5474                                ret = -EIO;
5475                        goto out;
5476                }
5477        }
5478
5479        dev->ifindex = dev_new_index(net);
5480        if (dev->iflink == -1)
5481                dev->iflink = dev->ifindex;
5482
5483        /* Transfer changeable features to wanted_features and enable
5484         * software offloads (GSO and GRO).
5485         */
5486        dev->hw_features |= NETIF_F_SOFT_FEATURES;
5487        dev->features |= NETIF_F_SOFT_FEATURES;
5488        dev->wanted_features = dev->features & dev->hw_features;
5489
5490        /* Turn on no cache copy if HW is doing checksum */
5491        dev->hw_features |= NETIF_F_NOCACHE_COPY;
5492        if ((dev->features & NETIF_F_ALL_CSUM) &&
5493            !(dev->features & NETIF_F_NO_CSUM)) {
5494                dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5495                dev->features |= NETIF_F_NOCACHE_COPY;
5496        }
5497
5498        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5499         */
5500        dev->vlan_features |= NETIF_F_HIGHDMA;
5501
5502        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5503        ret = notifier_to_errno(ret);
5504        if (ret)
5505                goto err_uninit;
5506
5507        ret = netdev_register_kobject(dev);
5508        if (ret)
5509                goto err_uninit;
5510        dev->reg_state = NETREG_REGISTERED;
5511
5512        __netdev_update_features(dev);
5513
5514        /*
5515         *      Default initial state at registration is that the
5516         *      device is present.
5517         */
5518
5519        set_bit(__LINK_STATE_PRESENT, &dev->state);
5520
5521        dev_init_scheduler(dev);
5522        dev_hold(dev);
5523        list_netdevice(dev);
5524
5525        /* Notify protocols that a new device appeared. */
5526        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5527        ret = notifier_to_errno(ret);
5528        if (ret) {
5529                rollback_registered(dev);
5530                dev->reg_state = NETREG_UNREGISTERED;
5531        }
5532        /*
5533         *      Prevent userspace races by waiting until the network
5534         *      device is fully setup before sending notifications.
5535         */
5536        if (!dev->rtnl_link_ops ||
5537            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5538                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5539
5540out:
5541        return ret;
5542
5543err_uninit:
5544        if (dev->netdev_ops->ndo_uninit)
5545                dev->netdev_ops->ndo_uninit(dev);
5546        goto out;
5547}
5548EXPORT_SYMBOL(register_netdevice);
5549
5550/**
5551 *      init_dummy_netdev       - init a dummy network device for NAPI
5552 *      @dev: device to init
5553 *
5554 *      This takes a network device structure and initializes the minimum
5555 *      number of fields so it can be used to schedule NAPI polls without
5556 *      registering a full-blown interface. This is to be used by drivers
5557 *      that need to tie several hardware interfaces to a single NAPI
5558 *      poll scheduler due to HW limitations.
5559 */
5560int init_dummy_netdev(struct net_device *dev)
5561{
5562        /* Clear everything. Note we don't initialize spinlocks
5563         * as they aren't supposed to be taken by any of the
5564         * NAPI code and this dummy netdev is supposed to be
5565         * used only for NAPI polls
5566         */
5567        memset(dev, 0, sizeof(struct net_device));
5568
5569        /* make sure we BUG if trying to hit standard
5570         * register/unregister code path
5571         */
5572        dev->reg_state = NETREG_DUMMY;
5573
5574        /* NAPI wants this */
5575        INIT_LIST_HEAD(&dev->napi_list);
5576
5577        /* a dummy interface is started by default */
5578        set_bit(__LINK_STATE_PRESENT, &dev->state);
5579        set_bit(__LINK_STATE_START, &dev->state);
5580
5581        /* Note: we don't allocate pcpu_refcnt for dummy devices,
5582         * because users of this 'device' don't need to change
5583         * its refcount.
5584         */
5585
5586        return 0;
5587}
5588EXPORT_SYMBOL_GPL(init_dummy_netdev);
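
/*
 * Example (illustrative sketch): a driver with several hardware ports but
 * one interrupt can hang its NAPI context off a dummy netdev.  The
 * adapter structure and poll routine are hypothetical.
 */
struct example_adapter {
        struct net_device dummy;        /* never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = 0;

        /* ... process up to @budget packets, incrementing work_done ... */
        if (work_done < budget)
                napi_complete(napi);
        return work_done;
}

static void example_adapter_init(struct example_adapter *ad)
{
        init_dummy_netdev(&ad->dummy);
        netif_napi_add(&ad->dummy, &ad->napi, example_poll, 64);
        napi_enable(&ad->napi);
}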
5589
5590
5591/**
5592 *      register_netdev - register a network device
5593 *      @dev: device to register
5594 *
5595 *      Take a completed network device structure and add it to the kernel
5596 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5597 *      chain. 0 is returned on success. A negative errno code is returned
5598 *      on a failure to set up the device, or if the name is a duplicate.
5599 *
5600 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5601 *      and expands the device name if you passed a format string to
5602 *      alloc_netdev.
5603 */
5604int register_netdev(struct net_device *dev)
5605{
5606        int err;
5607
5608        rtnl_lock();
5609        err = register_netdevice(dev);
5610        rtnl_unlock();
5611        return err;
5612}
5613EXPORT_SYMBOL(register_netdev);
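
/*
 * Example (illustrative sketch): the usual driver-side sequence around
 * register_netdev().  The private structure, the "example%d" template and
 * the probe function are hypothetical; the error handling (free_netdev()
 * on failure) follows the rules documented above.
 */
struct example_priv {
        void __iomem *regs;
};

static int example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_netdev(sizeof(struct example_priv), "example%d",
                           ether_setup);
        if (!dev)
                return -ENOMEM;

        /* fill in dev->netdev_ops, dev->dev_addr, features, ... */

        err = register_netdev(dev);     /* takes the RTNL semaphore itself */
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}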
5614
5615int netdev_refcnt_read(const struct net_device *dev)
5616{
5617        int i, refcnt = 0;
5618
5619        for_each_possible_cpu(i)
5620                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5621        return refcnt;
5622}
5623EXPORT_SYMBOL(netdev_refcnt_read);
5624
5625/*
5626 * netdev_wait_allrefs - wait until all references are gone.
5627 *
5628 * This is called when unregistering network devices.
5629 *
5630 * Any protocol or device that holds a reference should register
5631 * for netdevice notification, and cleanup and put back the
5632 * reference if they receive an UNREGISTER event.
5633 * We can get stuck here if buggy protocols don't correctly
5634 * call dev_put.
5635 */
5636static void netdev_wait_allrefs(struct net_device *dev)
5637{
5638        unsigned long rebroadcast_time, warning_time;
5639        int refcnt;
5640
5641        linkwatch_forget_dev(dev);
5642
5643        rebroadcast_time = warning_time = jiffies;
5644        refcnt = netdev_refcnt_read(dev);
5645
5646        while (refcnt != 0) {
5647                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5648                        rtnl_lock();
5649
5650                        /* Rebroadcast unregister notification */
5651                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5652                        /* don't resend NETDEV_UNREGISTER_BATCH; _BATCH users
5653                         * should have already handled it the first time */
5654
5655                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5656                                     &dev->state)) {
5657                                /* We must not have linkwatch events
5658                                 * pending on unregister. If this
5659                                 * happens, we simply run the queue
5660                                 * unscheduled, resulting in a noop
5661                                 * for this device.
5662                                 */
5663                                linkwatch_run_queue();
5664                        }
5665
5666                        __rtnl_unlock();
5667
5668                        rebroadcast_time = jiffies;
5669                }
5670
5671                msleep(250);
5672
5673                refcnt = netdev_refcnt_read(dev);
5674
5675                if (time_after(jiffies, warning_time + 10 * HZ)) {
5676                        printk(KERN_EMERG "unregister_netdevice: "
5677                               "waiting for %s to become free. Usage "
5678                               "count = %d\n",
5679                               dev->name, refcnt);
5680                        warning_time = jiffies;
5681                }
5682        }
5683}
5684
5685/* The sequence is:
5686 *
5687 *      rtnl_lock();
5688 *      ...
5689 *      register_netdevice(x1);
5690 *      register_netdevice(x2);
5691 *      ...
5692 *      unregister_netdevice(y1);
5693 *      unregister_netdevice(y2);
5694 *      ...
5695 *      rtnl_unlock();
5696 *      free_netdev(y1);
5697 *      free_netdev(y2);
5698 *
5699 * We are invoked by rtnl_unlock().
5700 * This allows us to deal with problems:
5701 * 1) We can delete sysfs objects which invoke hotplug
5702 *    without deadlocking with linkwatch via keventd.
5703 * 2) Since we run with the RTNL semaphore not held, we can sleep
5704 *    safely in order to wait for the netdev refcnt to drop to zero.
5705 *
5706 * We must not return until all unregister events added during
5707 * the interval the lock was held have been completed.
5708 */
5709void netdev_run_todo(void)
5710{
5711        struct list_head list;
5712
5713        /* Snapshot list, allow later requests */
5714        list_replace_init(&net_todo_list, &list);
5715
5716        __rtnl_unlock();
5717
5718        while (!list_empty(&list)) {
5719                struct net_device *dev
5720                        = list_first_entry(&list, struct net_device, todo_list);
5721                list_del(&dev->todo_list);
5722
5723                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5724                        printk(KERN_ERR "network todo '%s' but state %d\n",
5725                               dev->name, dev->reg_state);
5726                        dump_stack();
5727                        continue;
5728                }
5729
5730                dev->reg_state = NETREG_UNREGISTERED;
5731
5732                on_each_cpu(flush_backlog, dev, 1);
5733
5734                netdev_wait_allrefs(dev);
5735
5736                /* paranoia */
5737                BUG_ON(netdev_refcnt_read(dev));
5738                WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5739                WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5740                WARN_ON(dev->dn_ptr);
5741
5742                if (dev->destructor)
5743                        dev->destructor(dev);
5744
5745                /* Free network device */
5746                kobject_put(&dev->dev.kobj);
5747        }
5748}
5749
5750/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5751 * fields in the same order, with only the type differing.
5752 */
5753static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5754                                    const struct net_device_stats *netdev_stats)
5755{
5756#if BITS_PER_LONG == 64
5757        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5758        memcpy(stats64, netdev_stats, sizeof(*stats64));
5759#else
5760        size_t i, n = sizeof(*stats64) / sizeof(u64);
5761        const unsigned long *src = (const unsigned long *)netdev_stats;
5762        u64 *dst = (u64 *)stats64;
5763
5764        BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5765                     sizeof(*stats64) / sizeof(u64));
5766        for (i = 0; i < n; i++)
5767                dst[i] = src[i];
5768#endif
5769}
5770
5771/**
5772 *      dev_get_stats   - get network device statistics
5773 *      @dev: device to get statistics from
5774 *      @storage: place to store stats
5775 *
5776 *      Get network statistics from device. Return @storage.
5777 *      The device driver may provide its own method by setting
5778 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5779 *      otherwise the internal statistics structure is used.
5780 */
5781struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5782                                        struct rtnl_link_stats64 *storage)
5783{
5784        const struct net_device_ops *ops = dev->netdev_ops;
5785
5786        if (ops->ndo_get_stats64) {
5787                memset(storage, 0, sizeof(*storage));
5788                ops->ndo_get_stats64(dev, storage);
5789        } else if (ops->ndo_get_stats) {
5790                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5791        } else {
5792                netdev_stats_to_stats64(storage, &dev->stats);
5793        }
5794        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5795        return storage;
5796}
5797EXPORT_SYMBOL(dev_get_stats);
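
/*
 * Example (illustrative sketch): snapshotting a device's counters.  The
 * storage lives on the caller's stack, so only a reference to the device
 * is needed; the reporting function is hypothetical.
 */
static void example_print_stats(struct net_device *example_dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(example_dev, &stats);
        netdev_info(example_dev, "rx %llu packets, tx %llu packets\n",
                    (unsigned long long)stats.rx_packets,
                    (unsigned long long)stats.tx_packets);
}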
5798
5799struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5800{
5801        struct netdev_queue *queue = dev_ingress_queue(dev);
5802
5803#ifdef CONFIG_NET_CLS_ACT
5804        if (queue)
5805                return queue;
5806        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5807        if (!queue)
5808                return NULL;
5809        netdev_init_one_queue(dev, queue, NULL);
5810        queue->qdisc = &noop_qdisc;
5811        queue->qdisc_sleeping = &noop_qdisc;
5812        rcu_assign_pointer(dev->ingress_queue, queue);
5813#endif
5814        return queue;
5815}
5816
5817/**
5818 *      alloc_netdev_mqs - allocate network device
5819 *      @sizeof_priv:   size of private data to allocate space for
5820 *      @name:          device name format string
5821 *      @setup:         callback to initialize device
5822 *      @txqs:          the number of TX subqueues to allocate
5823 *      @rxqs:          the number of RX subqueues to allocate
5824 *
5825 *      Allocates a struct net_device with private data area for driver use
5826 *      and performs basic initialization.  Also allocates subquue structs
5827 *      for each queue on the device.
5828 */
5829struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5830                void (*setup)(struct net_device *),
5831                unsigned int txqs, unsigned int rxqs)
5832{
5833        struct net_device *dev;
5834        size_t alloc_size;
5835        struct net_device *p;
5836
5837        BUG_ON(strlen(name) >= sizeof(dev->name));
5838
5839        if (txqs < 1) {
5840                pr_err("alloc_netdev: Unable to allocate device "
5841                       "with zero queues.\n");
5842                return NULL;
5843        }
5844
5845#ifdef CONFIG_RPS
5846        if (rxqs < 1) {
5847                pr_err("alloc_netdev: Unable to allocate device "
5848                       "with zero RX queues.\n");
5849                return NULL;
5850        }
5851#endif
5852
5853        alloc_size = sizeof(struct net_device);
5854        if (sizeof_priv) {
5855                /* ensure 32-byte alignment of private area */
5856                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5857                alloc_size += sizeof_priv;
5858        }
5859        /* ensure 32-byte alignment of whole construct */
5860        alloc_size += NETDEV_ALIGN - 1;
5861
5862        p = kzalloc(alloc_size, GFP_KERNEL);
5863        if (!p) {
5864                printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5865                return NULL;
5866        }
5867
5868        dev = PTR_ALIGN(p, NETDEV_ALIGN);
5869        dev->padded = (char *)dev - (char *)p;
5870
5871        dev->pcpu_refcnt = alloc_percpu(int);
5872        if (!dev->pcpu_refcnt)
5873                goto free_p;
5874
5875        if (dev_addr_init(dev))
5876                goto free_pcpu;
5877
5878        dev_mc_init(dev);
5879        dev_uc_init(dev);
5880
5881        dev_net_set(dev, &init_net);
5882
5883        dev->gso_max_size = GSO_MAX_SIZE;
5884
5885        INIT_LIST_HEAD(&dev->napi_list);
5886        INIT_LIST_HEAD(&dev->unreg_list);
5887        INIT_LIST_HEAD(&dev->link_watch_list);
5888        dev->priv_flags = IFF_XMIT_DST_RELEASE;
5889        setup(dev);
5890
5891        dev->num_tx_queues = txqs;
5892        dev->real_num_tx_queues = txqs;
5893        if (netif_alloc_netdev_queues(dev))
5894                goto free_all;
5895
5896#ifdef CONFIG_RPS
5897        dev->num_rx_queues = rxqs;
5898        dev->real_num_rx_queues = rxqs;
5899        if (netif_alloc_rx_queues(dev))
5900                goto free_all;
5901#endif
5902
5903        strcpy(dev->name, name);
5904        dev->group = INIT_NETDEV_GROUP;
5905        return dev;
5906
5907free_all:
5908        free_netdev(dev);
5909        return NULL;
5910
5911free_pcpu:
5912        free_percpu(dev->pcpu_refcnt);
5913        kfree(dev->_tx);
5914#ifdef CONFIG_RPS
5915        kfree(dev->_rx);
5916#endif
5917
5918free_p:
5919        kfree(p);
5920        return NULL;
5921}
5922EXPORT_SYMBOL(alloc_netdev_mqs);
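
/*
 * Example (illustrative sketch): allocating an Ethernet-style device with
 * four TX and four RX queues.  The private struct and "example%d" name
 * template are hypothetical; the alloc_netdev()/alloc_etherdev() helpers
 * are thin wrappers around this function.
 */
struct example_mq_priv {
        unsigned int dummy;
};

static struct net_device *example_alloc_mq(void)
{
        return alloc_netdev_mqs(sizeof(struct example_mq_priv), "example%d",
                                ether_setup, 4, 4);
}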
5923
5924/**
5925 *      free_netdev - free network device
5926 *      @dev: device
5927 *
5928 *      This function does the last stage of destroying an allocated device
5929 *      interface. The reference to the device object is released.
5930 *      If this is the last reference then it will be freed.
5931 */
5932void free_netdev(struct net_device *dev)
5933{
5934        struct napi_struct *p, *n;
5935
5936        release_net(dev_net(dev));
5937
5938        kfree(dev->_tx);
5939#ifdef CONFIG_RPS
5940        kfree(dev->_rx);
5941#endif
5942
5943        kfree(rcu_dereference_raw(dev->ingress_queue));
5944
5945        /* Flush device addresses */
5946        dev_addr_flush(dev);
5947
5948        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5949                netif_napi_del(p);
5950
5951        free_percpu(dev->pcpu_refcnt);
5952        dev->pcpu_refcnt = NULL;
5953
5954        /*  Compatibility with error handling in drivers */
5955        if (dev->reg_state == NETREG_UNINITIALIZED) {
5956                kfree((char *)dev - dev->padded);
5957                return;
5958        }
5959
5960        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5961        dev->reg_state = NETREG_RELEASED;
5962
5963        /* will free via device release */
5964        put_device(&dev->dev);
5965}
5966EXPORT_SYMBOL(free_netdev);
5967
5968/**
5969 *      synchronize_net -  Synchronize with packet receive processing
5970 *
5971 *      Wait for packets currently being received to be done.
5972 *      Does not block later packets from starting.
5973 */
5974void synchronize_net(void)
5975{
5976        might_sleep();
5977        if (rtnl_is_locked())
5978                synchronize_rcu_expedited();
5979        else
5980                synchronize_rcu();
5981}
5982EXPORT_SYMBOL(synchronize_net);
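
/*
 * Example (illustrative sketch): the publish/retract pattern that
 * synchronize_net() supports -- clear an RCU-protected pointer, wait for
 * in-flight receive paths to finish, then free the old object.  The hook
 * structure and pointer are hypothetical.
 */
struct example_hook;
static struct example_hook *example_hook_ptr;

static void example_retract_hook(void)
{
        struct example_hook *old = example_hook_ptr;

        rcu_assign_pointer(example_hook_ptr, NULL);
        synchronize_net();      /* no packet path can still see @old */
        kfree(old);
}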
5983
5984/**
5985 *      unregister_netdevice_queue - remove device from the kernel
5986 *      @dev: device
5987 *      @head: list
5988 *
5989 *      This function shuts down a device interface and removes it
5990 *      from the kernel tables.
5991 *      If @head is not NULL, the device is queued to be unregistered later.
5992 *
5993 *      Callers must hold the rtnl semaphore.  You may want
5994 *      unregister_netdev() instead of this.
5995 */
5996
5997void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5998{
5999        ASSERT_RTNL();
6000
6001        if (head) {
6002                list_move_tail(&dev->unreg_list, head);
6003        } else {
6004                rollback_registered(dev);
6005                /* Finish processing unregister after unlock */
6006                net_set_todo(dev);
6007        }
6008}
6009EXPORT_SYMBOL(unregister_netdevice_queue);
6010
6011/**
6012 *      unregister_netdevice_many - unregister many devices
6013 *      @head: list of devices
6014 */
6015void unregister_netdevice_many(struct list_head *head)
6016{
6017        struct net_device *dev;
6018
6019        if (!list_empty(head)) {
6020                rollback_registered_many(head);
6021                list_for_each_entry(dev, head, unreg_list)
6022                        net_set_todo(dev);
6023        }
6024}
6025EXPORT_SYMBOL(unregister_netdevice_many);
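
/*
 * Example (illustrative sketch): batching several unregistrations so that
 * the expensive synchronization in rollback_registered_many() is paid only
 * once.  The caller holds RTNL; the device array is hypothetical.
 */
static void example_destroy_all(struct net_device *devs[], int n)
{
        LIST_HEAD(unreg_list);
        int i;

        for (i = 0; i < n; i++)
                unregister_netdevice_queue(devs[i], &unreg_list);
        unregister_netdevice_many(&unreg_list);
}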
6026
6027/**
6028 *      unregister_netdev - remove device from the kernel
6029 *      @dev: device
6030 *
6031 *      This function shuts down a device interface and removes it
6032 *      from the kernel tables.
6033 *
6034 *      This is just a wrapper for unregister_netdevice that takes
6035 *      the rtnl semaphore.  In general you want to use this and not
6036 *      unregister_netdevice.
6037 */
6038void unregister_netdev(struct net_device *dev)
6039{
6040        rtnl_lock();
6041        unregister_netdevice(dev);
6042        rtnl_unlock();
6043}
6044EXPORT_SYMBOL(unregister_netdev);
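
/*
 * Example (illustrative sketch): the usual module-exit counterpart of the
 * register_netdev() sequence shown earlier.  The device pointer is
 * hypothetical.
 */
static void example_remove(struct net_device *example_dev)
{
        unregister_netdev(example_dev); /* takes the RTNL semaphore itself */
        free_netdev(example_dev);       /* drop the allocation reference */
}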
6045
6046/**
6047 *      dev_change_net_namespace - move device to a different network namespace
6048 *      @dev: device
6049 *      @net: network namespace
6050 *      @pat: If not NULL name pattern to try if the current device name
6051 *            is already taken in the destination network namespace.
6052 *
6053 *      This function shuts down a device interface and moves it
6054 *      to a new network namespace. On success 0 is returned, on
6055 *      a failure a negative errno code is returned.
6056 *
6057 *      Callers must hold the rtnl semaphore.
6058 */
6059
6060int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6061{
6062        int err;
6063
6064        ASSERT_RTNL();
6065
6066        /* Don't allow namespace local devices to be moved. */
6067        err = -EINVAL;
6068        if (dev->features & NETIF_F_NETNS_LOCAL)
6069                goto out;
6070
6071        /* Ensure the device has been registered */
6072        err = -EINVAL;
6073        if (dev->reg_state != NETREG_REGISTERED)
6074                goto out;
6075
6076        /* Get out if there is nothing to do */
6077        err = 0;
6078        if (net_eq(dev_net(dev), net))
6079                goto out;
6080
6081        /* Pick the destination device name, and ensure
6082         * we can use it in the destination network namespace.
6083         */
6084        err = -EEXIST;
6085        if (__dev_get_by_name(net, dev->name)) {
6086                /* We get here if we can't use the current device name */
6087                if (!pat)
6088                        goto out;
6089                if (dev_get_valid_name(dev, pat) < 0)
6090                        goto out;
6091        }
6092
6093        /*
6094         * And now a mini version of register_netdevice/unregister_netdevice.
6095         */
6096
6097        /* If device is running close it first. */
6098        dev_close(dev);
6099
6100        /* And unlink it from device chain */
6101        err = -ENODEV;
6102        unlist_netdevice(dev);
6103
6104        synchronize_net();
6105
6106        /* Shutdown queueing discipline. */
6107        dev_shutdown(dev);
6108
6109        /* Notify protocols that we are about to destroy
6110           this device. They should clean up all of their state.
6111
6112           Note that dev->reg_state stays at NETREG_REGISTERED.
6113           This is intentional: this way 8021q and macvlan know
6114           the device is just moving and can keep their slaves up.
6115        */
6116        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6117        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6118
6119        /*
6120         *      Flush the unicast and multicast chains
6121         */
6122        dev_uc_flush(dev);
6123        dev_mc_flush(dev);
6124
6125        /* Actually switch the network namespace */
6126        dev_net_set(dev, net);
6127
6128        /* If there is an ifindex conflict assign a new one */
6129        if (__dev_get_by_index(net, dev->ifindex)) {
6130                int iflink = (dev->iflink == dev->ifindex);
6131                dev->ifindex = dev_new_index(net);
6132                if (iflink)
6133                        dev->iflink = dev->ifindex;
6134        }
6135
6136        /* Fixup kobjects */
6137        err = device_rename(&dev->dev, dev->name);
6138        WARN_ON(err);
6139
6140        /* Add the device back in the hashes */
6141        list_netdevice(dev);
6142
6143        /* Notify protocols that a new device appeared. */
6144        call_netdevice_notifiers(NETDEV_REGISTER, dev);
6145
6146        /*
6147         *      Prevent userspace races by waiting until the network
6148         *      device is fully setup before sending notifications.
6149         */
6150        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6151
6152        synchronize_net();
6153        err = 0;
6154out:
6155        return err;
6156}
6157EXPORT_SYMBOL_GPL(dev_change_net_namespace);
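
/*
 * Example (illustrative sketch): moving a device into another namespace,
 * falling back to a "dev%d" style name if the current name is already
 * taken there.  The caller must hold RTNL; "target_net" is hypothetical.
 */
static int example_move_dev(struct net_device *example_dev,
                            struct net *target_net)
{
        ASSERT_RTNL();
        return dev_change_net_namespace(example_dev, target_net, "dev%d");
}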
6158
6159static int dev_cpu_callback(struct notifier_block *nfb,
6160                            unsigned long action,
6161                            void *ocpu)
6162{
6163        struct sk_buff **list_skb;
6164        struct sk_buff *skb;
6165        unsigned int cpu, oldcpu = (unsigned long)ocpu;
6166        struct softnet_data *sd, *oldsd;
6167
6168        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6169                return NOTIFY_OK;
6170
6171        local_irq_disable();
6172        cpu = smp_processor_id();
6173        sd = &per_cpu(softnet_data, cpu);
6174        oldsd = &per_cpu(softnet_data, oldcpu);
6175
6176        /* Find end of our completion_queue. */
6177        list_skb = &sd->completion_queue;
6178        while (*list_skb)
6179                list_skb = &(*list_skb)->next;
6180        /* Append completion queue from offline CPU. */
6181        *list_skb = oldsd->completion_queue;
6182        oldsd->completion_queue = NULL;
6183
6184        /* Append output queue from offline CPU. */
6185        if (oldsd->output_queue) {
6186                *sd->output_queue_tailp = oldsd->output_queue;
6187                sd->output_queue_tailp = oldsd->output_queue_tailp;
6188                oldsd->output_queue = NULL;
6189                oldsd->output_queue_tailp = &oldsd->output_queue;
6190        }
6191        /* Append NAPI poll list from offline CPU. */
6192        if (!list_empty(&oldsd->poll_list)) {
6193                list_splice_init(&oldsd->poll_list, &sd->poll_list);
6194                raise_softirq_irqoff(NET_RX_SOFTIRQ);
6195        }
6196
6197        raise_softirq_irqoff(NET_TX_SOFTIRQ);
6198        local_irq_enable();
6199
6200        /* Process offline CPU's input_pkt_queue */
6201        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6202                netif_rx(skb);
6203                input_queue_head_incr(oldsd);
6204        }
6205        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6206                netif_rx(skb);
6207                input_queue_head_incr(oldsd);
6208        }
6209
6210        return NOTIFY_OK;
6211}
6212
6213
6214/**
6215 *      netdev_increment_features - increment feature set by one
6216 *      @all: current feature set
6217 *      @one: new feature set
6218 *      @mask: mask feature set
6219 *
6220 *      Computes a new feature set after adding a device with feature set
6221 *      @one to the master device with current feature set @all.  Will not
6222 *      enable anything that is off in @mask. Returns the new feature set.
6223 */
6224u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6225{
6226        if (mask & NETIF_F_GEN_CSUM)
6227                mask |= NETIF_F_ALL_CSUM;
6228        mask |= NETIF_F_VLAN_CHALLENGED;
6229
6230        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6231        all &= one | ~NETIF_F_ALL_FOR_ALL;
6232
6233        /* If device needs checksumming, downgrade to it. */
6234        if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6235                all &= ~NETIF_F_NO_CSUM;
6236
6237        /* If one device supports hw checksumming, set for all. */
6238        if (all & NETIF_F_GEN_CSUM)
6239                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6240
6241        return all;
6242}
6243EXPORT_SYMBOL(netdev_increment_features);
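
/*
 * Example (illustrative sketch): how an aggregating driver, in the spirit
 * of bonding, might fold its slaves' feature sets into its own.  The slave
 * array and mask are hypothetical.
 */
static u32 example_compute_features(u32 master_features,
                                    struct net_device *slaves[], int n,
                                    u32 mask)
{
        u32 all = master_features;
        int i;

        for (i = 0; i < n; i++)
                all = netdev_increment_features(all, slaves[i]->features,
                                                mask);
        return all;
}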
6244
6245static struct hlist_head *netdev_create_hash(void)
6246{
6247        int i;
6248        struct hlist_head *hash;
6249
6250        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6251        if (hash != NULL)
6252                for (i = 0; i < NETDEV_HASHENTRIES; i++)
6253                        INIT_HLIST_HEAD(&hash[i]);
6254
6255        return hash;
6256}
6257
6258/* Initialize per network namespace state */
6259static int __net_init netdev_init(struct net *net)
6260{
6261        INIT_LIST_HEAD(&net->dev_base_head);
6262
6263        net->dev_name_head = netdev_create_hash();
6264        if (net->dev_name_head == NULL)
6265                goto err_name;
6266
6267        net->dev_index_head = netdev_create_hash();
6268        if (net->dev_index_head == NULL)
6269                goto err_idx;
6270
6271        return 0;
6272
6273err_idx:
6274        kfree(net->dev_name_head);
6275err_name:
6276        return -ENOMEM;
6277}
6278
6279/**
6280 *      netdev_drivername - network driver for the device
6281 *      @dev: network device
6282 *
6283 *      Determine network driver for device.
6284 */
6285const char *netdev_drivername(const struct net_device *dev)
6286{
6287        const struct device_driver *driver;
6288        const struct device *parent;
6289        const char *empty = "";
6290
6291        parent = dev->dev.parent;
6292        if (!parent)
6293                return empty;
6294
6295        driver = parent->driver;
6296        if (driver && driver->name)
6297                return driver->name;
6298        return empty;
6299}
6300
6301static int __netdev_printk(const char *level, const struct net_device *dev,
6302                           struct va_format *vaf)
6303{
6304        int r;
6305
6306        if (dev && dev->dev.parent)
6307                r = dev_printk(level, dev->dev.parent, "%s: %pV",
6308                               netdev_name(dev), vaf);
6309        else if (dev)
6310                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6311        else
6312                r = printk("%s(NULL net_device): %pV", level, vaf);
6313
6314        return r;
6315}
6316
6317int netdev_printk(const char *level, const struct net_device *dev,
6318                  const char *format, ...)
6319{
6320        struct va_format vaf;
6321        va_list args;
6322        int r;
6323
6324        va_start(args, format);
6325
6326        vaf.fmt = format;
6327        vaf.va = &args;
6328
6329        r = __netdev_printk(level, dev, &vaf);
6330        va_end(args);
6331
6332        return r;
6333}
6334EXPORT_SYMBOL(netdev_printk);
6335
6336#define define_netdev_printk_level(func, level)                 \
6337int func(const struct net_device *dev, const char *fmt, ...)    \
6338{                                                               \
6339        int r;                                                  \
6340        struct va_format vaf;                                   \
6341        va_list args;                                           \
6342                                                                \
6343        va_start(args, fmt);                                    \
6344                                                                \
6345        vaf.fmt = fmt;                                          \
6346        vaf.va = &args;                                         \
6347                                                                \
6348        r = __netdev_printk(level, dev, &vaf);                  \
6349        va_end(args);                                           \
6350                                                                \
6351        return r;                                               \
6352}                                                               \
6353EXPORT_SYMBOL(func);
6354
6355define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6356define_netdev_printk_level(netdev_alert, KERN_ALERT);
6357define_netdev_printk_level(netdev_crit, KERN_CRIT);
6358define_netdev_printk_level(netdev_err, KERN_ERR);
6359define_netdev_printk_level(netdev_warn, KERN_WARNING);
6360define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6361define_netdev_printk_level(netdev_info, KERN_INFO);
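/*
 * Illustrative sketch, not part of dev.c: typical driver-side use of the
 * level helpers generated above.  As __netdev_printk() shows, the message is
 * prefixed with the driver, bus id and interface name when the device has a
 * parent, so drivers need not build that prefix themselves.  "example_open"
 * and the error value are hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_open(struct net_device *dev)
{
	int err = -EIO;		/* pretend some hardware setup failed */

	if (err) {
		netdev_err(dev, "failed to bring up hardware: %d\n", err);
		return err;
	}

	netdev_info(dev, "link is ready\n");
	return 0;
}
#endif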
6362
6363static void __net_exit netdev_exit(struct net *net)
6364{
6365        kfree(net->dev_name_head);
6366        kfree(net->dev_index_head);
6367}
6368
6369static struct pernet_operations __net_initdata netdev_net_ops = {
6370        .init = netdev_init,
6371        .exit = netdev_exit,
6372};
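/*
 * Illustrative sketch, not part of dev.c: any other subsystem that keeps
 * per-namespace state follows the same pernet_operations pattern as
 * netdev_net_ops above.  "example_net_init", "example_net_exit" and
 * "example_net_ops" are hypothetical names.
 */
#if 0	/* example only, never compiled */
static int __net_init example_net_init(struct net *net)
{
	/* allocate and attach per-namespace state to "net" here */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* tear down whatever example_net_init() allocated */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

/* registered from an initcall with register_pernet_subsys(&example_net_ops) */
#endif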
6373
6374static void __net_exit default_device_exit(struct net *net)
6375{
6376        struct net_device *dev, *aux;
6377        /*
6378         * Push all migratable network devices back to the
6379         * initial network namespace
6380         */
6381        rtnl_lock();
6382        for_each_netdev_safe(net, dev, aux) {
6383                int err;
6384                char fb_name[IFNAMSIZ];
6385
6386                /* Ignore unmovable devices (e.g. loopback) */
6387                if (dev->features & NETIF_F_NETNS_LOCAL)
6388                        continue;
6389
6390                /* Leave virtual devices for the generic cleanup */
6391                if (dev->rtnl_link_ops)
6392                        continue;
6393
6394                /* Push remaining network devices to init_net */
6395                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6396                err = dev_change_net_namespace(dev, &init_net, fb_name);
6397                if (err) {
6398                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6399                                __func__, dev->name, err);
6400                        BUG();
6401                }
6402        }
6403        rtnl_unlock();
6404}
6405
6406static void __net_exit default_device_exit_batch(struct list_head *net_list)
6407{
6408        /* At exit, all network devices must be removed from the network
6409         * namespace.  Do this in the reverse order of registration.
6410         * Do this across as many network namespaces as possible to
6411         * improve batching efficiency.
6412         */
6413        struct net_device *dev;
6414        struct net *net;
6415        LIST_HEAD(dev_kill_list);
6416
6417        rtnl_lock();
6418        list_for_each_entry(net, net_list, exit_list) {
6419                for_each_netdev_reverse(net, dev) {
6420                        if (dev->rtnl_link_ops)
6421                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6422                        else
6423                                unregister_netdevice_queue(dev, &dev_kill_list);
6424                }
6425        }
6426        unregister_netdevice_many(&dev_kill_list);
6427        list_del(&dev_kill_list);
6428        rtnl_unlock();
6429}
6430
6431static struct pernet_operations __net_initdata default_device_ops = {
6432        .exit = default_device_exit,
6433        .exit_batch = default_device_exit_batch,
6434};
6435
6436/*
6437 *      Initialize the DEV module.  This sets up the per-CPU packet receive
6438 *      queues, registers the per-namespace device handlers and opens the
6439 *      networking softirqs, leaving the core ready for devices to register.
6440 *
6441 */
6442
6443/*
6444 *       This is called single-threaded during boot, so there is
6445 *       no need to take the rtnl semaphore.
6446 */
6447static int __init net_dev_init(void)
6448{
6449        int i, rc = -ENOMEM;
6450
6451        BUG_ON(!dev_boot_phase);
6452
6453        if (dev_proc_init())
6454                goto out;
6455
6456        if (netdev_kobject_init())
6457                goto out;
6458
6459        INIT_LIST_HEAD(&ptype_all);
6460        for (i = 0; i < PTYPE_HASH_SIZE; i++)
6461                INIT_LIST_HEAD(&ptype_base[i]);
6462
6463        if (register_pernet_subsys(&netdev_net_ops))
6464                goto out;
6465
6466        /*
6467         *      Initialise the packet receive queues.
6468         */
6469
6470        for_each_possible_cpu(i) {
6471                struct softnet_data *sd = &per_cpu(softnet_data, i);
6472
6473                memset(sd, 0, sizeof(*sd));
6474                skb_queue_head_init(&sd->input_pkt_queue);
6475                skb_queue_head_init(&sd->process_queue);
6476                sd->completion_queue = NULL;
6477                INIT_LIST_HEAD(&sd->poll_list);
6478                sd->output_queue = NULL;
6479                sd->output_queue_tailp = &sd->output_queue;
6480#ifdef CONFIG_RPS
6481                sd->csd.func = rps_trigger_softirq;
6482                sd->csd.info = sd;
6483                sd->csd.flags = 0;
6484                sd->cpu = i;
6485#endif
6486
6487                sd->backlog.poll = process_backlog;
6488                sd->backlog.weight = weight_p;
6489                sd->backlog.gro_list = NULL;
6490                sd->backlog.gro_count = 0;
6491        }
6492
6493        dev_boot_phase = 0;
6494
6495        /* The loopback device is special: if any other network device
6496         * is present in a network namespace, the loopback device must
6497         * be present as well. Since we now dynamically allocate and
6498         * free the loopback device, ensure this invariant is
6499         * maintained by keeping the loopback device as the first
6500         * device on the list of network devices, so that it is the
6501         * first device that appears and the last network device
6502         * that disappears.
6503         */
6504        if (register_pernet_device(&loopback_net_ops))
6505                goto out;
6506
6507        if (register_pernet_device(&default_device_ops))
6508                goto out;
6509
6510        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6511        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6512
6513        hotcpu_notifier(dev_cpu_callback, 0);
6514        dst_init();
6515        dev_mcast_init();
6516        rc = 0;
6517out:
6518        return rc;
6519}
6520
6521subsys_initcall(net_dev_init);
6522
6523static int __init initialize_hashrnd(void)
6524{
6525        get_random_bytes(&hashrnd, sizeof(hashrnd));
6526        return 0;
6527}
6528
6529late_initcall_sync(initialize_hashrnd);
6530
6531