linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136#include <linux/if_tunnel.h>
 137#include <linux/if_pppox.h>
 138#include <linux/ppp_defs.h>
 139#include <linux/net_tstamp.h>
 140
 141#include "net-sysfs.h"
 142
 143/* Instead of increasing this, you should create a hash table. */
 144#define MAX_GRO_SKBS 8
 145
 146/* This should be increased if a protocol with a bigger head is added. */
 147#define GRO_MAX_HEAD (MAX_HEADER + 128)
 148
 149/*
 150 *      The list of packet types we will receive (as opposed to discard)
 151 *      and the routines to invoke.
 152 *
 153 *      Why 16?  Because with 16 the only overlap we get on a hash of the
 154 *      low nibble of the protocol value is RARP/SNAP/X.25.
 155 *
 156 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 157 *             sure which should go first, but I bet it won't make much
 158 *             difference if we are running VLANs.  The good news is that
 159 *             this protocol won't be in the list unless compiled in, so
 160 *             the average user (w/out VLANs) will not be adversely affected.
 161 *             --BLG
 162 *
 163 *              0800    IP
 164 *              8100    802.1Q VLAN
 165 *              0001    802.3
 166 *              0002    AX.25
 167 *              0004    802.2
 168 *              8035    RARP
 169 *              0005    SNAP
 170 *              0805    X.25
 171 *              0806    ARP
 172 *              8137    IPX
 173 *              0009    Localtalk
 174 *              86DD    IPv6
 175 */
 176
 177#define PTYPE_HASH_SIZE (16)
 178#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 179
 180static DEFINE_SPINLOCK(ptype_lock);
 181static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 182static struct list_head ptype_all __read_mostly;        /* Taps */
 183
 184/*
 185 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186 * semaphore.
 187 *
 188 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189 *
 190 * Writers must hold the rtnl semaphore while they loop through the
 191 * dev_base_head list, and hold dev_base_lock for writing when they do the
 192 * actual updates.  This allows pure readers to access the list even
 193 * while a writer is preparing to update it.
 194 *
 195 * To put it another way, dev_base_lock is held for writing only to
 196 * protect against pure readers; the rtnl semaphore provides the
 197 * protection against other writers.
 198 *
 199 * See, for example usages, register_netdevice() and
 200 * unregister_netdevice(), which must be called with the rtnl
 201 * semaphore held.
 202 */
 203DEFINE_RWLOCK(dev_base_lock);
 204EXPORT_SYMBOL(dev_base_lock);
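/*
 * Illustrative sketch (not part of this file): given the rules above, a
 * pure reader can walk the device list either under dev_base_lock or
 * under rcu_read_lock(); "net" and do_something() are placeholders.
 *
 *      struct net_device *dev;
 *
 *      read_lock(&dev_base_lock);
 *      for_each_netdev(net, dev)
 *              do_something(dev);
 *      read_unlock(&dev_base_lock);
 *
 * or, as an RCU reader:
 *
 *      rcu_read_lock();
 *      for_each_netdev_rcu(net, dev)
 *              do_something(dev);
 *      rcu_read_unlock();
 */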
 205
 206static inline void dev_base_seq_inc(struct net *net)
 207{
 208        while (++net->dev_base_seq == 0);
 209}
 210
 211static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212{
 213        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 215}
 216
 217static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 218{
 219        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 220}
 221
 222static inline void rps_lock(struct softnet_data *sd)
 223{
 224#ifdef CONFIG_RPS
 225        spin_lock(&sd->input_pkt_queue.lock);
 226#endif
 227}
 228
 229static inline void rps_unlock(struct softnet_data *sd)
 230{
 231#ifdef CONFIG_RPS
 232        spin_unlock(&sd->input_pkt_queue.lock);
 233#endif
 234}
 235
 236/* Device list insertion */
 237static int list_netdevice(struct net_device *dev)
 238{
 239        struct net *net = dev_net(dev);
 240
 241        ASSERT_RTNL();
 242
 243        write_lock_bh(&dev_base_lock);
 244        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 245        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 246        hlist_add_head_rcu(&dev->index_hlist,
 247                           dev_index_hash(net, dev->ifindex));
 248        write_unlock_bh(&dev_base_lock);
 249
 250        dev_base_seq_inc(net);
 251
 252        return 0;
 253}
 254
 255/* Device list removal
 256 * caller must respect a RCU grace period before freeing/reusing dev
 257 */
 258static void unlist_netdevice(struct net_device *dev)
 259{
 260        ASSERT_RTNL();
 261
 262        /* Unlink dev from the device chain */
 263        write_lock_bh(&dev_base_lock);
 264        list_del_rcu(&dev->dev_list);
 265        hlist_del_rcu(&dev->name_hlist);
 266        hlist_del_rcu(&dev->index_hlist);
 267        write_unlock_bh(&dev_base_lock);
 268
 269        dev_base_seq_inc(dev_net(dev));
 270}
 271
 272/*
 273 *      Our notifier list
 274 */
 275
 276static RAW_NOTIFIER_HEAD(netdev_chain);
 277
 278/*
 279 *      Device drivers call our routines to queue packets here. We empty the
 280 *      queue in the local softnet handler.
 281 */
 282
 283DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 284EXPORT_PER_CPU_SYMBOL(softnet_data);
 285
 286#ifdef CONFIG_LOCKDEP
 287/*
 288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 289 * according to dev->type
 290 */
 291static const unsigned short netdev_lock_type[] =
 292        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 293         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 294         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 295         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 296         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 297         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 298         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 299         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 300         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 301         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 302         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 303         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 304         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 305         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 306         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 307         ARPHRD_VOID, ARPHRD_NONE};
 308
 309static const char *const netdev_lock_name[] =
 310        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 323         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 324         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 325         "_xmit_VOID", "_xmit_NONE"};
 326
 327static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331{
 332        int i;
 333
 334        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335                if (netdev_lock_type[i] == dev_type)
 336                        return i;
 337        /* the last key is used by default */
 338        return ARRAY_SIZE(netdev_lock_type) - 1;
 339}
 340
 341static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                 unsigned short dev_type)
 343{
 344        int i;
 345
 346        i = netdev_lock_pos(dev_type);
 347        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348                                   netdev_lock_name[i]);
 349}
 350
 351static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352{
 353        int i;
 354
 355        i = netdev_lock_pos(dev->type);
 356        lockdep_set_class_and_name(&dev->addr_list_lock,
 357                                   &netdev_addr_lock_key[i],
 358                                   netdev_lock_name[i]);
 359}
 360#else
 361static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362                                                 unsigned short dev_type)
 363{
 364}
 365static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366{
 367}
 368#endif
 369
 370/*******************************************************************************
 371
 372                Protocol management and registration routines
 373
 374*******************************************************************************/
 375
 376/*
 377 *      Add a protocol ID to the list. Now that the input handler is
 378 *      smarter we can dispense with all the messy stuff that used to be
 379 *      here.
 380 *
 381 *      BEWARE!!! Protocol handlers that mangle input packets
 382 *      MUST BE last in the hash buckets, and the walk over protocol
 383 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 384 *      This is true today; do not change it.
 385 *      Explanation: if a handler that mangles packets were first on the
 386 *      list, it could not tell that the packet is cloned and should be
 387 *      copied-on-write, so it would change it in place and subsequent
 388 *      readers would get a broken packet.
 389 *                                                      --ANK (980803)
 390 */
 391
 392static inline struct list_head *ptype_head(const struct packet_type *pt)
 393{
 394        if (pt->type == htons(ETH_P_ALL))
 395                return &ptype_all;
 396        else
 397                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398}
 399
 400/**
 401 *      dev_add_pack - add packet handler
 402 *      @pt: packet type declaration
 403 *
 404 *      Add a protocol handler to the networking stack. The passed &packet_type
 405 *      is linked into kernel lists and may not be freed until it has been
 406 *      removed from the kernel lists.
 407 *
 408 *      This call does not sleep, therefore it cannot
 409 *      guarantee that all CPUs in the middle of receiving packets
 410 *      will see the new packet type (until the next received packet).
 411 */
 412
 413void dev_add_pack(struct packet_type *pt)
 414{
 415        struct list_head *head = ptype_head(pt);
 416
 417        spin_lock(&ptype_lock);
 418        list_add_rcu(&pt->list, head);
 419        spin_unlock(&ptype_lock);
 420}
 421EXPORT_SYMBOL(dev_add_pack);
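/*
 * Illustrative sketch (not part of this file): a protocol module typically
 * registers its &packet_type roughly as below; my_rcv and my_packet_type
 * are hypothetical names.
 *
 *      static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *                        struct packet_type *pt, struct net_device *orig_dev)
 *      {
 *              // process or consume skb
 *              kfree_skb(skb);
 *              return NET_RX_SUCCESS;
 *      }
 *
 *      static struct packet_type my_packet_type __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .func = my_rcv,
 *      };
 *
 *      dev_add_pack(&my_packet_type);
 *      ...
 *      dev_remove_pack(&my_packet_type);       // may sleep, see below
 */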
 422
 423/**
 424 *      __dev_remove_pack        - remove packet handler
 425 *      @pt: packet type declaration
 426 *
 427 *      Remove a protocol handler that was previously added to the kernel
 428 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429 *      from the kernel lists and can be freed or reused once this function
 430 *      returns.
 431 *
 432 *      The packet type might still be in use by receivers
 433 *      and must not be freed until after all the CPUs have gone
 434 *      through a quiescent state.
 435 */
 436void __dev_remove_pack(struct packet_type *pt)
 437{
 438        struct list_head *head = ptype_head(pt);
 439        struct packet_type *pt1;
 440
 441        spin_lock(&ptype_lock);
 442
 443        list_for_each_entry(pt1, head, list) {
 444                if (pt == pt1) {
 445                        list_del_rcu(&pt->list);
 446                        goto out;
 447                }
 448        }
 449
 450        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 451out:
 452        spin_unlock(&ptype_lock);
 453}
 454EXPORT_SYMBOL(__dev_remove_pack);
 455
 456/**
 457 *      dev_remove_pack  - remove packet handler
 458 *      @pt: packet type declaration
 459 *
 460 *      Remove a protocol handler that was previously added to the kernel
 461 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462 *      from the kernel lists and can be freed or reused once this function
 463 *      returns.
 464 *
 465 *      This call sleeps to guarantee that no CPU is looking at the packet
 466 *      type after return.
 467 */
 468void dev_remove_pack(struct packet_type *pt)
 469{
 470        __dev_remove_pack(pt);
 471
 472        synchronize_net();
 473}
 474EXPORT_SYMBOL(dev_remove_pack);
 475
 476/******************************************************************************
 477
 478                      Device Boot-time Settings Routines
 479
 480*******************************************************************************/
 481
 482/* Boot time configuration table */
 483static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 484
 485/**
 486 *      netdev_boot_setup_add   - add new setup entry
 487 *      @name: name of the device
 488 *      @map: configured settings for the device
 489 *
 490 *      Adds a new setup entry to the dev_boot_setup list.  The function
 491 *      returns 0 on error and 1 on success.  This is a generic routine
 492 *      for all netdevices.
 493 */
 494static int netdev_boot_setup_add(char *name, struct ifmap *map)
 495{
 496        struct netdev_boot_setup *s;
 497        int i;
 498
 499        s = dev_boot_setup;
 500        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 501                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 502                        memset(s[i].name, 0, sizeof(s[i].name));
 503                        strlcpy(s[i].name, name, IFNAMSIZ);
 504                        memcpy(&s[i].map, map, sizeof(s[i].map));
 505                        break;
 506                }
 507        }
 508
 509        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 510}
 511
 512/**
 513 *      netdev_boot_setup_check - check boot time settings
 514 *      @dev: the netdevice
 515 *
 516 *      Check boot time settings for the device.
 517 *      Any settings found are applied to the device so that they can
 518 *      be used later during device probing.
 519 *      Returns 0 if no settings were found, 1 if they were.
 520 */
 521int netdev_boot_setup_check(struct net_device *dev)
 522{
 523        struct netdev_boot_setup *s = dev_boot_setup;
 524        int i;
 525
 526        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 527                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 528                    !strcmp(dev->name, s[i].name)) {
 529                        dev->irq        = s[i].map.irq;
 530                        dev->base_addr  = s[i].map.base_addr;
 531                        dev->mem_start  = s[i].map.mem_start;
 532                        dev->mem_end    = s[i].map.mem_end;
 533                        return 1;
 534                }
 535        }
 536        return 0;
 537}
 538EXPORT_SYMBOL(netdev_boot_setup_check);
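/*
 * Illustrative sketch (not part of this file): a legacy ISA-style driver
 * would consult the boot-time table from its probe routine, roughly like
 * this (my_probe is a hypothetical name):
 *
 *      static int my_probe(struct net_device *dev)
 *      {
 *              if (netdev_boot_setup_check(dev)) {
 *                      // dev->irq, dev->base_addr, dev->mem_start and
 *                      // dev->mem_end now carry the "netdev=" values
 *              }
 *              ...
 *      }
 */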
 539
 540
 541/**
 542 *      netdev_boot_base        - get address from boot time settings
 543 *      @prefix: prefix for network device
 544 *      @unit: id for network device
 545 *
 546 *      Check boot time settings for the base address of the device.
 547 *      Any settings found are applied to the device so that they can
 548 *      be used later during device probing.
 549 *      Returns 0 if no settings were found.
 550 */
 551unsigned long netdev_boot_base(const char *prefix, int unit)
 552{
 553        const struct netdev_boot_setup *s = dev_boot_setup;
 554        char name[IFNAMSIZ];
 555        int i;
 556
 557        sprintf(name, "%s%d", prefix, unit);
 558
 559        /*
 560         * If the device is already registered then return a base of 1
 561         * to indicate that this interface should not be probed
 562         */
 563        if (__dev_get_by_name(&init_net, name))
 564                return 1;
 565
 566        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 567                if (!strcmp(name, s[i].name))
 568                        return s[i].map.base_addr;
 569        return 0;
 570}
 571
 572/*
 573 * Saves the settings configured at boot time for any netdevice.
 574 */
 575int __init netdev_boot_setup(char *str)
 576{
 577        int ints[5];
 578        struct ifmap map;
 579
 580        str = get_options(str, ARRAY_SIZE(ints), ints);
 581        if (!str || !*str)
 582                return 0;
 583
 584        /* Save settings */
 585        memset(&map, 0, sizeof(map));
 586        if (ints[0] > 0)
 587                map.irq = ints[1];
 588        if (ints[0] > 1)
 589                map.base_addr = ints[2];
 590        if (ints[0] > 2)
 591                map.mem_start = ints[3];
 592        if (ints[0] > 3)
 593                map.mem_end = ints[4];
 594
 595        /* Add new entry to the list */
 596        return netdev_boot_setup_add(str, &map);
 597}
 598
 599__setup("netdev=", netdev_boot_setup);
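/*
 * Illustrative example of the command-line format parsed above: up to four
 * integers followed by the interface name, e.g.
 *
 *      netdev=9,0x300,0,0,eth1
 *
 * records irq 9 and base address 0x300 for "eth1", to be picked up later
 * via netdev_boot_setup_check() or netdev_boot_base().
 */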
 600
 601/*******************************************************************************
 602
 603                            Device Interface Subroutines
 604
 605*******************************************************************************/
 606
 607/**
 608 *      __dev_get_by_name       - find a device by its name
 609 *      @net: the applicable net namespace
 610 *      @name: name to find
 611 *
 612 *      Find an interface by name. Must be called under the RTNL semaphore
 613 *      or @dev_base_lock. If the name is found a pointer to the device
 614 *      is returned. If the name is not found then %NULL is returned. The
 615 *      reference counters are not incremented so the caller must be
 616 *      careful with locks.
 617 */
 618
 619struct net_device *__dev_get_by_name(struct net *net, const char *name)
 620{
 621        struct hlist_node *p;
 622        struct net_device *dev;
 623        struct hlist_head *head = dev_name_hash(net, name);
 624
 625        hlist_for_each_entry(dev, p, head, name_hlist)
 626                if (!strncmp(dev->name, name, IFNAMSIZ))
 627                        return dev;
 628
 629        return NULL;
 630}
 631EXPORT_SYMBOL(__dev_get_by_name);
 632
 633/**
 634 *      dev_get_by_name_rcu     - find a device by its name
 635 *      @net: the applicable net namespace
 636 *      @name: name to find
 637 *
 638 *      Find an interface by name.
 639 *      If the name is found a pointer to the device is returned.
 640 *      If the name is not found then %NULL is returned.
 641 *      The reference counters are not incremented so the caller must be
 642 *      careful with locks. The caller must hold RCU lock.
 643 */
 644
 645struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 646{
 647        struct hlist_node *p;
 648        struct net_device *dev;
 649        struct hlist_head *head = dev_name_hash(net, name);
 650
 651        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 652                if (!strncmp(dev->name, name, IFNAMSIZ))
 653                        return dev;
 654
 655        return NULL;
 656}
 657EXPORT_SYMBOL(dev_get_by_name_rcu);
 658
 659/**
 660 *      dev_get_by_name         - find a device by its name
 661 *      @net: the applicable net namespace
 662 *      @name: name to find
 663 *
 664 *      Find an interface by name. This can be called from any
 665 *      context and does its own locking. The returned handle has
 666 *      the usage count incremented and the caller must use dev_put() to
 667 *      release it when it is no longer needed. %NULL is returned if no
 668 *      matching device is found.
 669 */
 670
 671struct net_device *dev_get_by_name(struct net *net, const char *name)
 672{
 673        struct net_device *dev;
 674
 675        rcu_read_lock();
 676        dev = dev_get_by_name_rcu(net, name);
 677        if (dev)
 678                dev_hold(dev);
 679        rcu_read_unlock();
 680        return dev;
 681}
 682EXPORT_SYMBOL(dev_get_by_name);
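/*
 * Illustrative sketch (not part of this file) of the two lookup styles
 * documented above; "eth0" is only an example name.
 *
 *      struct net_device *dev;
 *
 *      // refcounted lookup, usable from any context
 *      dev = dev_get_by_name(net, "eth0");
 *      if (dev) {
 *              ...
 *              dev_put(dev);
 *      }
 *
 *      // RCU lookup: no reference taken, pointer valid only inside the
 *      // read-side critical section
 *      rcu_read_lock();
 *      dev = dev_get_by_name_rcu(net, "eth0");
 *      if (dev)
 *              ...
 *      rcu_read_unlock();
 */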
 683
 684/**
 685 *      __dev_get_by_index - find a device by its ifindex
 686 *      @net: the applicable net namespace
 687 *      @ifindex: index of device
 688 *
 689 *      Search for an interface by index. Returns %NULL if the device
 690 *      is not found or a pointer to the device. The device has not
 691 *      had its reference counter increased so the caller must be careful
 692 *      about locking. The caller must hold either the RTNL semaphore
 693 *      or @dev_base_lock.
 694 */
 695
 696struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 697{
 698        struct hlist_node *p;
 699        struct net_device *dev;
 700        struct hlist_head *head = dev_index_hash(net, ifindex);
 701
 702        hlist_for_each_entry(dev, p, head, index_hlist)
 703                if (dev->ifindex == ifindex)
 704                        return dev;
 705
 706        return NULL;
 707}
 708EXPORT_SYMBOL(__dev_get_by_index);
 709
 710/**
 711 *      dev_get_by_index_rcu - find a device by its ifindex
 712 *      @net: the applicable net namespace
 713 *      @ifindex: index of device
 714 *
 715 *      Search for an interface by index. Returns %NULL if the device
 716 *      is not found or a pointer to the device. The device has not
 717 *      had its reference counter increased so the caller must be careful
 718 *      about locking. The caller must hold RCU lock.
 719 */
 720
 721struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 722{
 723        struct hlist_node *p;
 724        struct net_device *dev;
 725        struct hlist_head *head = dev_index_hash(net, ifindex);
 726
 727        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 728                if (dev->ifindex == ifindex)
 729                        return dev;
 730
 731        return NULL;
 732}
 733EXPORT_SYMBOL(dev_get_by_index_rcu);
 734
 735
 736/**
 737 *      dev_get_by_index - find a device by its ifindex
 738 *      @net: the applicable net namespace
 739 *      @ifindex: index of device
 740 *
 741 *      Search for an interface by index. Returns NULL if the device
 742 *      is not found or a pointer to the device. The device returned has
 743 *      had a reference added and the pointer is safe until the user calls
 744 *      dev_put to indicate they have finished with it.
 745 */
 746
 747struct net_device *dev_get_by_index(struct net *net, int ifindex)
 748{
 749        struct net_device *dev;
 750
 751        rcu_read_lock();
 752        dev = dev_get_by_index_rcu(net, ifindex);
 753        if (dev)
 754                dev_hold(dev);
 755        rcu_read_unlock();
 756        return dev;
 757}
 758EXPORT_SYMBOL(dev_get_by_index);
 759
 760/**
 761 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 762 *      @net: the applicable net namespace
 763 *      @type: media type of device
 764 *      @ha: hardware address
 765 *
 766 *      Search for an interface by MAC address. Returns NULL if the device
 767 *      is not found or a pointer to the device.
 768 *      The caller must hold RCU or RTNL.
 769 *      The returned device has not had its ref count increased
 770 *      and the caller must therefore be careful about locking
 771 *
 772 */
 773
 774struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 775                                       const char *ha)
 776{
 777        struct net_device *dev;
 778
 779        for_each_netdev_rcu(net, dev)
 780                if (dev->type == type &&
 781                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 782                        return dev;
 783
 784        return NULL;
 785}
 786EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 787
 788struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 789{
 790        struct net_device *dev;
 791
 792        ASSERT_RTNL();
 793        for_each_netdev(net, dev)
 794                if (dev->type == type)
 795                        return dev;
 796
 797        return NULL;
 798}
 799EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 800
 801struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 802{
 803        struct net_device *dev, *ret = NULL;
 804
 805        rcu_read_lock();
 806        for_each_netdev_rcu(net, dev)
 807                if (dev->type == type) {
 808                        dev_hold(dev);
 809                        ret = dev;
 810                        break;
 811                }
 812        rcu_read_unlock();
 813        return ret;
 814}
 815EXPORT_SYMBOL(dev_getfirstbyhwtype);
 816
 817/**
 818 *      dev_get_by_flags_rcu - find any device with given flags
 819 *      @net: the applicable net namespace
 820 *      @if_flags: IFF_* values
 821 *      @mask: bitmask of bits in if_flags to check
 822 *
 823 *      Search for any interface with the given flags. Returns NULL if a device
 824 *      is not found or a pointer to the device. Must be called inside
 825 *      rcu_read_lock(), and result refcount is unchanged.
 826 */
 827
 828struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 829                                    unsigned short mask)
 830{
 831        struct net_device *dev, *ret;
 832
 833        ret = NULL;
 834        for_each_netdev_rcu(net, dev) {
 835                if (((dev->flags ^ if_flags) & mask) == 0) {
 836                        ret = dev;
 837                        break;
 838                }
 839        }
 840        return ret;
 841}
 842EXPORT_SYMBOL(dev_get_by_flags_rcu);
 843
 844/**
 845 *      dev_valid_name - check if name is okay for network device
 846 *      @name: name string
 847 *
 848 *      Network device names need to be valid file names
 849 *      to allow sysfs to work.  We also disallow any kind of
 850 *      whitespace.
 851 */
 852int dev_valid_name(const char *name)
 853{
 854        if (*name == '\0')
 855                return 0;
 856        if (strlen(name) >= IFNAMSIZ)
 857                return 0;
 858        if (!strcmp(name, ".") || !strcmp(name, ".."))
 859                return 0;
 860
 861        while (*name) {
 862                if (*name == '/' || isspace(*name))
 863                        return 0;
 864                name++;
 865        }
 866        return 1;
 867}
 868EXPORT_SYMBOL(dev_valid_name);
 869
 870/**
 871 *      __dev_alloc_name - allocate a name for a device
 872 *      @net: network namespace to allocate the device name in
 873 *      @name: name format string
 874 *      @buf:  scratch buffer and result name string
 875 *
 876 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 877 *      id. It scans the list of devices to build up a free map, then chooses
 878 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 879 *      while allocating the name and adding the device in order to avoid
 880 *      duplicates.
 881 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 882 *      Returns the number of the unit assigned or a negative errno code.
 883 */
 884
 885static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 886{
 887        int i = 0;
 888        const char *p;
 889        const int max_netdevices = 8*PAGE_SIZE;
 890        unsigned long *inuse;
 891        struct net_device *d;
 892
 893        p = strnchr(name, IFNAMSIZ-1, '%');
 894        if (p) {
 895                /*
 896                 * Verify the string as this thing may have come from
 897                 * the user.  There must be exactly one "%d" and no other "%"
 898                 * characters.
 899                 */
 900                if (p[1] != 'd' || strchr(p + 2, '%'))
 901                        return -EINVAL;
 902
 903                /* Use one page as a bit array of possible slots */
 904                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 905                if (!inuse)
 906                        return -ENOMEM;
 907
 908                for_each_netdev(net, d) {
 909                        if (!sscanf(d->name, name, &i))
 910                                continue;
 911                        if (i < 0 || i >= max_netdevices)
 912                                continue;
 913
 914                        /*  avoid cases where sscanf is not exact inverse of printf */
 915                        snprintf(buf, IFNAMSIZ, name, i);
 916                        if (!strncmp(buf, d->name, IFNAMSIZ))
 917                                set_bit(i, inuse);
 918                }
 919
 920                i = find_first_zero_bit(inuse, max_netdevices);
 921                free_page((unsigned long) inuse);
 922        }
 923
 924        if (buf != name)
 925                snprintf(buf, IFNAMSIZ, name, i);
 926        if (!__dev_get_by_name(net, buf))
 927                return i;
 928
 929        /* It is possible to run out of possible slots
 930         * when the name is long and there isn't enough space left
 931         * for the digits, or if all bits are used.
 932         */
 933        return -ENFILE;
 934}
 935
 936/**
 937 *      dev_alloc_name - allocate a name for a device
 938 *      @dev: device
 939 *      @name: name format string
 940 *
 941 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 942 *      id. It scans the list of devices to build up a free map, then chooses
 943 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 944 *      while allocating the name and adding the device in order to avoid
 945 *      duplicates.
 946 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 947 *      Returns the number of the unit assigned or a negative errno code.
 948 */
 949
 950int dev_alloc_name(struct net_device *dev, const char *name)
 951{
 952        char buf[IFNAMSIZ];
 953        struct net *net;
 954        int ret;
 955
 956        BUG_ON(!dev_net(dev));
 957        net = dev_net(dev);
 958        ret = __dev_alloc_name(net, name, buf);
 959        if (ret >= 0)
 960                strlcpy(dev->name, buf, IFNAMSIZ);
 961        return ret;
 962}
 963EXPORT_SYMBOL(dev_alloc_name);
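/*
 * Illustrative sketch (not part of this file): a driver that wants an
 * automatically numbered name passes a format string, e.g.
 *
 *      err = dev_alloc_name(dev, "tap%d");
 *      if (err < 0)
 *              return err;
 *      // dev->name is now e.g. "tap0" and err holds the unit number
 */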
 964
 965static int dev_get_valid_name(struct net_device *dev, const char *name)
 966{
 967        struct net *net;
 968
 969        BUG_ON(!dev_net(dev));
 970        net = dev_net(dev);
 971
 972        if (!dev_valid_name(name))
 973                return -EINVAL;
 974
 975        if (strchr(name, '%'))
 976                return dev_alloc_name(dev, name);
 977        else if (__dev_get_by_name(net, name))
 978                return -EEXIST;
 979        else if (dev->name != name)
 980                strlcpy(dev->name, name, IFNAMSIZ);
 981
 982        return 0;
 983}
 984
 985/**
 986 *      dev_change_name - change name of a device
 987 *      @dev: device
 988 *      @newname: name (or format string) must be at least IFNAMSIZ
 989 *
 990 *      Change the name of a device; a format string such as "eth%d"
 991 *      can be passed for wildcarding.
 992 */
 993int dev_change_name(struct net_device *dev, const char *newname)
 994{
 995        char oldname[IFNAMSIZ];
 996        int err = 0;
 997        int ret;
 998        struct net *net;
 999
1000        ASSERT_RTNL();
1001        BUG_ON(!dev_net(dev));
1002
1003        net = dev_net(dev);
1004        if (dev->flags & IFF_UP)
1005                return -EBUSY;
1006
1007        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1008                return 0;
1009
1010        memcpy(oldname, dev->name, IFNAMSIZ);
1011
1012        err = dev_get_valid_name(dev, newname);
1013        if (err < 0)
1014                return err;
1015
1016rollback:
1017        ret = device_rename(&dev->dev, dev->name);
1018        if (ret) {
1019                memcpy(dev->name, oldname, IFNAMSIZ);
1020                return ret;
1021        }
1022
1023        write_lock_bh(&dev_base_lock);
1024        hlist_del_rcu(&dev->name_hlist);
1025        write_unlock_bh(&dev_base_lock);
1026
1027        synchronize_rcu();
1028
1029        write_lock_bh(&dev_base_lock);
1030        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1031        write_unlock_bh(&dev_base_lock);
1032
1033        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1034        ret = notifier_to_errno(ret);
1035
1036        if (ret) {
1037                /* err >= 0 after dev_alloc_name() or stores the first errno */
1038                if (err >= 0) {
1039                        err = ret;
1040                        memcpy(dev->name, oldname, IFNAMSIZ);
1041                        goto rollback;
1042                } else {
1043                        printk(KERN_ERR
1044                               "%s: name change rollback failed: %d.\n",
1045                               dev->name, ret);
1046                }
1047        }
1048
1049        return err;
1050}
1051
1052/**
1053 *      dev_set_alias - change ifalias of a device
1054 *      @dev: device
1055 *      @alias: name up to IFALIASZ
1056 *      @len: limit of bytes to copy from @alias
1057 *
1058 *      Set ifalias for a device.
1059 */
1060int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061{
1062        ASSERT_RTNL();
1063
1064        if (len >= IFALIASZ)
1065                return -EINVAL;
1066
1067        if (!len) {
1068                if (dev->ifalias) {
1069                        kfree(dev->ifalias);
1070                        dev->ifalias = NULL;
1071                }
1072                return 0;
1073        }
1074
1075        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076        if (!dev->ifalias)
1077                return -ENOMEM;
1078
1079        strlcpy(dev->ifalias, alias, len+1);
1080        return len;
1081}
1082
1083
1084/**
1085 *      netdev_features_change - device changes features
1086 *      @dev: device to cause notification
1087 *
1088 *      Called to indicate a device has changed features.
1089 */
1090void netdev_features_change(struct net_device *dev)
1091{
1092        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1093}
1094EXPORT_SYMBOL(netdev_features_change);
1095
1096/**
1097 *      netdev_state_change - device changes state
1098 *      @dev: device to cause notification
1099 *
1100 *      Called to indicate a device has changed state. This function calls
1101 *      the notifier chains for netdev_chain and sends a NEWLINK message
1102 *      to the routing socket.
1103 */
1104void netdev_state_change(struct net_device *dev)
1105{
1106        if (dev->flags & IFF_UP) {
1107                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109        }
1110}
1111EXPORT_SYMBOL(netdev_state_change);
1112
1113int netdev_bonding_change(struct net_device *dev, unsigned long event)
1114{
1115        return call_netdevice_notifiers(event, dev);
1116}
1117EXPORT_SYMBOL(netdev_bonding_change);
1118
1119/**
1120 *      dev_load        - load a network module
1121 *      @net: the applicable net namespace
1122 *      @name: name of interface
1123 *
1124 *      If a network interface is not present and the process has suitable
1125 *      privileges, this function loads the module. If module loading is not
1126 *      available in this kernel then it becomes a nop.
1127 */
1128
1129void dev_load(struct net *net, const char *name)
1130{
1131        struct net_device *dev;
1132        int no_module;
1133
1134        rcu_read_lock();
1135        dev = dev_get_by_name_rcu(net, name);
1136        rcu_read_unlock();
1137
1138        no_module = !dev;
1139        if (no_module && capable(CAP_NET_ADMIN))
1140                no_module = request_module("netdev-%s", name);
1141        if (no_module && capable(CAP_SYS_MODULE)) {
1142                if (!request_module("%s", name))
1143                        pr_err("Loading kernel module for a network device "
1144"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145"instead\n", name);
1146        }
1147}
1148EXPORT_SYMBOL(dev_load);
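/*
 * Illustrative note (an assumption about driver conventions, not verified
 * here): to be loadable through the CAP_NET_ADMIN path above, a module is
 * expected to declare a matching alias, e.g.
 *
 *      MODULE_ALIAS("netdev-gre0");
 *
 * so that request_module("netdev-%s", name) can resolve it.
 */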
1149
1150static int __dev_open(struct net_device *dev)
1151{
1152        const struct net_device_ops *ops = dev->netdev_ops;
1153        int ret;
1154
1155        ASSERT_RTNL();
1156
1157        if (!netif_device_present(dev))
1158                return -ENODEV;
1159
1160        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161        ret = notifier_to_errno(ret);
1162        if (ret)
1163                return ret;
1164
1165        set_bit(__LINK_STATE_START, &dev->state);
1166
1167        if (ops->ndo_validate_addr)
1168                ret = ops->ndo_validate_addr(dev);
1169
1170        if (!ret && ops->ndo_open)
1171                ret = ops->ndo_open(dev);
1172
1173        if (ret)
1174                clear_bit(__LINK_STATE_START, &dev->state);
1175        else {
1176                dev->flags |= IFF_UP;
1177                net_dmaengine_get();
1178                dev_set_rx_mode(dev);
1179                dev_activate(dev);
1180        }
1181
1182        return ret;
1183}
1184
1185/**
1186 *      dev_open        - prepare an interface for use.
1187 *      @dev:   device to open
1188 *
1189 *      Takes a device from down to up state. The device's private open
1190 *      function is invoked and then the multicast lists are loaded. Finally
1191 *      the device is moved into the up state and a %NETDEV_UP message is
1192 *      sent to the netdev notifier chain.
1193 *
1194 *      Calling this function on an active interface is a nop. On a failure
1195 *      a negative errno code is returned.
1196 */
1197int dev_open(struct net_device *dev)
1198{
1199        int ret;
1200
1201        if (dev->flags & IFF_UP)
1202                return 0;
1203
1204        ret = __dev_open(dev);
1205        if (ret < 0)
1206                return ret;
1207
1208        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209        call_netdevice_notifiers(NETDEV_UP, dev);
1210
1211        return ret;
1212}
1213EXPORT_SYMBOL(dev_open);
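/*
 * Illustrative sketch (not part of this file): dev_open() and dev_close()
 * must be called with the RTNL held (see ASSERT_RTNL() in __dev_open()):
 *
 *      rtnl_lock();
 *      err = dev_open(dev);
 *      rtnl_unlock();
 */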
1214
1215static int __dev_close_many(struct list_head *head)
1216{
1217        struct net_device *dev;
1218
1219        ASSERT_RTNL();
1220        might_sleep();
1221
1222        list_for_each_entry(dev, head, unreg_list) {
1223                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1224
1225                clear_bit(__LINK_STATE_START, &dev->state);
1226
1227                /* Synchronize to scheduled poll. We cannot touch the poll list,
1228                 * it can even be on a different cpu. So just clear netif_running().
1229                 *
1230                 * dev->stop() will invoke napi_disable() on all of its
1231                 * napi_struct instances on this device.
1232                 */
1233                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1234        }
1235
1236        dev_deactivate_many(head);
1237
1238        list_for_each_entry(dev, head, unreg_list) {
1239                const struct net_device_ops *ops = dev->netdev_ops;
1240
1241                /*
1242                 *      Call the device specific close. This cannot fail
1243                 *      and is only done while the device is UP.
1244                 *
1245                 *      We allow it to be called even after a DETACH hot-plug
1246                 *      event.
1247                 */
1248                if (ops->ndo_stop)
1249                        ops->ndo_stop(dev);
1250
1251                dev->flags &= ~IFF_UP;
1252                net_dmaengine_put();
1253        }
1254
1255        return 0;
1256}
1257
1258static int __dev_close(struct net_device *dev)
1259{
1260        int retval;
1261        LIST_HEAD(single);
1262
1263        list_add(&dev->unreg_list, &single);
1264        retval = __dev_close_many(&single);
1265        list_del(&single);
1266        return retval;
1267}
1268
1269static int dev_close_many(struct list_head *head)
1270{
1271        struct net_device *dev, *tmp;
1272        LIST_HEAD(tmp_list);
1273
1274        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275                if (!(dev->flags & IFF_UP))
1276                        list_move(&dev->unreg_list, &tmp_list);
1277
1278        __dev_close_many(head);
1279
1280        list_for_each_entry(dev, head, unreg_list) {
1281                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282                call_netdevice_notifiers(NETDEV_DOWN, dev);
1283        }
1284
1285        /* rollback_registered_many needs the complete original list */
1286        list_splice(&tmp_list, head);
1287        return 0;
1288}
1289
1290/**
1291 *      dev_close - shutdown an interface.
1292 *      @dev: device to shutdown
1293 *
1294 *      This function moves an active device into down state. A
1295 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297 *      chain.
1298 */
1299int dev_close(struct net_device *dev)
1300{
1301        if (dev->flags & IFF_UP) {
1302                LIST_HEAD(single);
1303
1304                list_add(&dev->unreg_list, &single);
1305                dev_close_many(&single);
1306                list_del(&single);
1307        }
1308        return 0;
1309}
1310EXPORT_SYMBOL(dev_close);
1311
1312
1313/**
1314 *      dev_disable_lro - disable Large Receive Offload on a device
1315 *      @dev: device
1316 *
1317 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1318 *      called under RTNL.  This is needed if received packets may be
1319 *      forwarded to another interface.
1320 */
1321void dev_disable_lro(struct net_device *dev)
1322{
1323        u32 flags;
1324
1325        /*
1326         * If we're trying to disable LRO on a vlan device,
1327         * use the underlying physical device instead.
1328         */
1329        if (is_vlan_dev(dev))
1330                dev = vlan_dev_real_dev(dev);
1331
1332        if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333                flags = dev->ethtool_ops->get_flags(dev);
1334        else
1335                flags = ethtool_op_get_flags(dev);
1336
1337        if (!(flags & ETH_FLAG_LRO))
1338                return;
1339
1340        __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341        if (unlikely(dev->features & NETIF_F_LRO))
1342                netdev_WARN(dev, "failed to disable LRO!\n");
1343}
1344EXPORT_SYMBOL(dev_disable_lro);
1345
1346
1347static int dev_boot_phase = 1;
1348
1349/**
1350 *      register_netdevice_notifier - register a network notifier block
1351 *      @nb: notifier
1352 *
1353 *      Register a notifier to be called when network device events occur.
1354 *      The notifier passed is linked into the kernel structures and must
1355 *      not be reused until it has been unregistered. A negative errno code
1356 *      is returned on a failure.
1357 *
1358 *      When registered, all registration and up events are replayed
1359 *      to the new notifier so that it gets a race-free view of the
1360 *      network device list.
1361 */
1362
1363int register_netdevice_notifier(struct notifier_block *nb)
1364{
1365        struct net_device *dev;
1366        struct net_device *last;
1367        struct net *net;
1368        int err;
1369
1370        rtnl_lock();
1371        err = raw_notifier_chain_register(&netdev_chain, nb);
1372        if (err)
1373                goto unlock;
1374        if (dev_boot_phase)
1375                goto unlock;
1376        for_each_net(net) {
1377                for_each_netdev(net, dev) {
1378                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379                        err = notifier_to_errno(err);
1380                        if (err)
1381                                goto rollback;
1382
1383                        if (!(dev->flags & IFF_UP))
1384                                continue;
1385
1386                        nb->notifier_call(nb, NETDEV_UP, dev);
1387                }
1388        }
1389
1390unlock:
1391        rtnl_unlock();
1392        return err;
1393
1394rollback:
1395        last = dev;
1396        for_each_net(net) {
1397                for_each_netdev(net, dev) {
1398                        if (dev == last)
1399                                goto outroll;
1400
1401                        if (dev->flags & IFF_UP) {
1402                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403                                nb->notifier_call(nb, NETDEV_DOWN, dev);
1404                        }
1405                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406                        nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407                }
1408        }
1409
1410outroll:
1411        raw_notifier_chain_unregister(&netdev_chain, nb);
1412        goto unlock;
1413}
1414EXPORT_SYMBOL(register_netdevice_notifier);
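/*
 * Illustrative sketch (not part of this file): a typical caller registers a
 * notifier_block whose callback switches on the event; my_netdev_event and
 * my_netdev_notifier are hypothetical names.  In this kernel the callback's
 * last argument is the struct net_device pointer itself.
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = ptr;
 *
 *              switch (event) {
 *              case NETDEV_UP:
 *                      // interface came up
 *                      break;
 *              case NETDEV_DOWN:
 *                      // interface went down
 *                      break;
 *              }
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_netdev_notifier = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&my_netdev_notifier);
 */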
1415
1416/**
1417 *      unregister_netdevice_notifier - unregister a network notifier block
1418 *      @nb: notifier
1419 *
1420 *      Unregister a notifier previously registered by
1421 *      register_netdevice_notifier(). The notifier is unlinked from the
1422 *      kernel structures and may then be reused. A negative errno code
1423 *      is returned on a failure.
1424 */
1425
1426int unregister_netdevice_notifier(struct notifier_block *nb)
1427{
1428        int err;
1429
1430        rtnl_lock();
1431        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1432        rtnl_unlock();
1433        return err;
1434}
1435EXPORT_SYMBOL(unregister_netdevice_notifier);
1436
1437/**
1438 *      call_netdevice_notifiers - call all network notifier blocks
1439 *      @val: value passed unmodified to notifier function
1440 *      @dev: net_device pointer passed unmodified to notifier function
1441 *
1442 *      Call all network notifier blocks.  Parameters and return value
1443 *      are as for raw_notifier_call_chain().
1444 */
1445
1446int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1447{
1448        ASSERT_RTNL();
1449        return raw_notifier_call_chain(&netdev_chain, val, dev);
1450}
1451EXPORT_SYMBOL(call_netdevice_notifiers);
1452
1453/* When > 0 there are consumers of rx skb time stamps */
1454static atomic_t netstamp_needed = ATOMIC_INIT(0);
1455
1456void net_enable_timestamp(void)
1457{
1458        atomic_inc(&netstamp_needed);
1459}
1460EXPORT_SYMBOL(net_enable_timestamp);
1461
1462void net_disable_timestamp(void)
1463{
1464        atomic_dec(&netstamp_needed);
1465}
1466EXPORT_SYMBOL(net_disable_timestamp);
1467
1468static inline void net_timestamp_set(struct sk_buff *skb)
1469{
1470        if (atomic_read(&netstamp_needed))
1471                __net_timestamp(skb);
1472        else
1473                skb->tstamp.tv64 = 0;
1474}
1475
1476static inline void net_timestamp_check(struct sk_buff *skb)
1477{
1478        if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1479                __net_timestamp(skb);
1480}
1481
1482static int net_hwtstamp_validate(struct ifreq *ifr)
1483{
1484        struct hwtstamp_config cfg;
1485        enum hwtstamp_tx_types tx_type;
1486        enum hwtstamp_rx_filters rx_filter;
1487        int tx_type_valid = 0;
1488        int rx_filter_valid = 0;
1489
1490        if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1491                return -EFAULT;
1492
1493        if (cfg.flags) /* reserved for future extensions */
1494                return -EINVAL;
1495
1496        tx_type = cfg.tx_type;
1497        rx_filter = cfg.rx_filter;
1498
1499        switch (tx_type) {
1500        case HWTSTAMP_TX_OFF:
1501        case HWTSTAMP_TX_ON:
1502        case HWTSTAMP_TX_ONESTEP_SYNC:
1503                tx_type_valid = 1;
1504                break;
1505        }
1506
1507        switch (rx_filter) {
1508        case HWTSTAMP_FILTER_NONE:
1509        case HWTSTAMP_FILTER_ALL:
1510        case HWTSTAMP_FILTER_SOME:
1511        case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1512        case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1513        case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1514        case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1515        case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1516        case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1517        case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1518        case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1519        case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1520        case HWTSTAMP_FILTER_PTP_V2_EVENT:
1521        case HWTSTAMP_FILTER_PTP_V2_SYNC:
1522        case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1523                rx_filter_valid = 1;
1524                break;
1525        }
1526
1527        if (!tx_type_valid || !rx_filter_valid)
1528                return -ERANGE;
1529
1530        return 0;
1531}
1532
1533static inline bool is_skb_forwardable(struct net_device *dev,
1534                                      struct sk_buff *skb)
1535{
1536        unsigned int len;
1537
1538        if (!(dev->flags & IFF_UP))
1539                return false;
1540
1541        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1542        if (skb->len <= len)
1543                return true;
1544
1545        /* if TSO is enabled, we don't care about the length as the packet
1546         * could be forwarded without being segmented first
1547         */
1548        if (skb_is_gso(skb))
1549                return true;
1550
1551        return false;
1552}
1553
1554/**
1555 * dev_forward_skb - loopback an skb to another netif
1556 *
1557 * @dev: destination network device
1558 * @skb: buffer to forward
1559 *
1560 * return values:
1561 *      NET_RX_SUCCESS  (no congestion)
1562 *      NET_RX_DROP     (packet was dropped, but freed)
1563 *
1564 * dev_forward_skb can be used for injecting an skb from the
1565 * start_xmit function of one device into the receive queue
1566 * of another device.
1567 *
1568 * The receiving device may be in another namespace, so
1569 * we have to clear all information in the skb that could
1570 * impact namespace isolation.
1571 */
1572int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1573{
1574        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1575                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1576                        atomic_long_inc(&dev->rx_dropped);
1577                        kfree_skb(skb);
1578                        return NET_RX_DROP;
1579                }
1580        }
1581
1582        skb_orphan(skb);
1583        nf_reset(skb);
1584
1585        if (unlikely(!is_skb_forwardable(dev, skb))) {
1586                atomic_long_inc(&dev->rx_dropped);
1587                kfree_skb(skb);
1588                return NET_RX_DROP;
1589        }
1590        skb_set_dev(skb, dev);
1591        skb->tstamp.tv64 = 0;
1592        skb->pkt_type = PACKET_HOST;
1593        skb->protocol = eth_type_trans(skb, dev);
1594        return netif_rx(skb);
1595}
1596EXPORT_SYMBOL_GPL(dev_forward_skb);
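
/*
 * Illustrative sketch (not compiled, not part of dev.c): a veth-like
 * virtual driver could call dev_forward_skb() from its ndo_start_xmit()
 * to inject a frame into its peer's receive path.  The my_pair_priv
 * structure and its peer pointer are assumptions for the example only.
 */
#if 0
struct my_pair_priv {
        struct net_device *peer;        /* other end of the virtual pair */
};

static netdev_tx_t my_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct my_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() orphans the skb, clears namespace-sensitive
         * state and ends in netif_rx(); it consumes the skb either way.
         */
        if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS)
                dev->stats.tx_packets++;
        else
                dev->stats.tx_dropped++;

        return NETDEV_TX_OK;
}
#endif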
1597
1598static inline int deliver_skb(struct sk_buff *skb,
1599                              struct packet_type *pt_prev,
1600                              struct net_device *orig_dev)
1601{
1602        atomic_inc(&skb->users);
1603        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1604}
1605
1606/*
1607 *      Support routine. Sends outgoing frames to any network
1608 *      taps currently in use.
1609 */
1610
1611static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1612{
1613        struct packet_type *ptype;
1614        struct sk_buff *skb2 = NULL;
1615        struct packet_type *pt_prev = NULL;
1616
1617        rcu_read_lock();
1618        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1619                /* Never send packets back to the socket
1620                 * they originated from - MvS (miquels@drinkel.ow.org)
1621                 */
1622                if ((ptype->dev == dev || !ptype->dev) &&
1623                    (ptype->af_packet_priv == NULL ||
1624                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1625                        if (pt_prev) {
1626                                deliver_skb(skb2, pt_prev, skb->dev);
1627                                pt_prev = ptype;
1628                                continue;
1629                        }
1630
1631                        skb2 = skb_clone(skb, GFP_ATOMIC);
1632                        if (!skb2)
1633                                break;
1634
1635                        net_timestamp_set(skb2);
1636
1637                        /* skb->network_header should be correctly
1638                           set by the sender, so the check below is
1639                           just protection against buggy protocols.
1640                         */
1641                        skb_reset_mac_header(skb2);
1642
1643                        if (skb_network_header(skb2) < skb2->data ||
1644                            skb2->network_header > skb2->tail) {
1645                                if (net_ratelimit())
1646                                        printk(KERN_CRIT "protocol %04x is "
1647                                               "buggy, dev %s\n",
1648                                               ntohs(skb2->protocol),
1649                                               dev->name);
1650                                skb_reset_network_header(skb2);
1651                        }
1652
1653                        skb2->transport_header = skb2->network_header;
1654                        skb2->pkt_type = PACKET_OUTGOING;
1655                        pt_prev = ptype;
1656                }
1657        }
1658        if (pt_prev)
1659                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1660        rcu_read_unlock();
1661}
1662
1663/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1664 * @dev: Network device
1665 * @txq: number of queues available
1666 *
1667 * If real_num_tx_queues is changed, the tc mappings may no longer be
1668 * valid. To resolve this, verify that each tc mapping remains valid
1669 * and, if it does not, zero the mapping. Once no priorities map to an
1670 * offset/count pair, that pair is no longer used. In the worst case,
1671 * if TC0 is invalid, nothing can be done, so priority mappings are
1672 * disabled entirely. It is expected that drivers will fix the mapping,
1673 * if they can, before calling netif_set_real_num_tx_queues.
1674 */
1675static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1676{
1677        int i;
1678        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1679
1680        /* If TC0 is invalidated disable TC mapping */
1681        if (tc->offset + tc->count > txq) {
1682                pr_warning("Number of in use tx queues changed, "
1683                           "invalidating tc mappings. Priority "
1684                           "traffic classification disabled!\n");
1685                dev->num_tc = 0;
1686                return;
1687        }
1688
1689        /* Invalidated prio to tc mappings set to TC0 */
1690        for (i = 1; i < TC_BITMASK + 1; i++) {
1691                int q = netdev_get_prio_tc_map(dev, i);
1692
1693                tc = &dev->tc_to_txq[q];
1694                if (tc->offset + tc->count > txq) {
1695                        pr_warning("Number of in use tx queues "
1696                                   "changed. Priority %i to tc "
1697                                   "mapping %i is no longer valid; "
1698                                   "setting map to 0\n",
1699                                   i, q);
1700                        netdev_set_prio_tc_map(dev, i, 0);
1701                }
1702        }
1703}
1704
1705/*
1706 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1707 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1708 */
1709int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1710{
1711        int rc;
1712
1713        if (txq < 1 || txq > dev->num_tx_queues)
1714                return -EINVAL;
1715
1716        if (dev->reg_state == NETREG_REGISTERED ||
1717            dev->reg_state == NETREG_UNREGISTERING) {
1718                ASSERT_RTNL();
1719
1720                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1721                                                  txq);
1722                if (rc)
1723                        return rc;
1724
1725                if (dev->num_tc)
1726                        netif_setup_tc(dev, txq);
1727
1728                if (txq < dev->real_num_tx_queues)
1729                        qdisc_reset_all_tx_gt(dev, txq);
1730        }
1731
1732        dev->real_num_tx_queues = txq;
1733        return 0;
1734}
1735EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1736
1737#ifdef CONFIG_RPS
1738/**
1739 *      netif_set_real_num_rx_queues - set actual number of RX queues used
1740 *      @dev: Network device
1741 *      @rxq: Actual number of RX queues
1742 *
1743 *      This must be called either with the rtnl_lock held or before
1744 *      registration of the net device.  Returns 0 on success, or a
1745 *      negative error code.  If called before registration, it always
1746 *      succeeds.
1747 */
1748int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1749{
1750        int rc;
1751
1752        if (rxq < 1 || rxq > dev->num_rx_queues)
1753                return -EINVAL;
1754
1755        if (dev->reg_state == NETREG_REGISTERED) {
1756                ASSERT_RTNL();
1757
1758                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1759                                                  rxq);
1760                if (rc)
1761                        return rc;
1762        }
1763
1764        dev->real_num_rx_queues = rxq;
1765        return 0;
1766}
1767EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1768#endif
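
/*
 * Illustrative sketch (not compiled, not part of dev.c): a multiqueue
 * driver usually allocates the maximum number of queues at alloc time and
 * later trims the count actually used with the two helpers above.
 * my_count_usable_queues() is an assumption standing in for whatever
 * limits the driver (e.g. available MSI-X vectors).
 */
#if 0
static int my_setup_queues(struct net_device *dev)
{
        unsigned int n = my_count_usable_queues(dev);
        int err;

        /* Once the device is registered this must run under rtnl_lock(). */
        err = netif_set_real_num_tx_queues(dev, n);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, n);
}
#endif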
1769
1770static inline void __netif_reschedule(struct Qdisc *q)
1771{
1772        struct softnet_data *sd;
1773        unsigned long flags;
1774
1775        local_irq_save(flags);
1776        sd = &__get_cpu_var(softnet_data);
1777        q->next_sched = NULL;
1778        *sd->output_queue_tailp = q;
1779        sd->output_queue_tailp = &q->next_sched;
1780        raise_softirq_irqoff(NET_TX_SOFTIRQ);
1781        local_irq_restore(flags);
1782}
1783
1784void __netif_schedule(struct Qdisc *q)
1785{
1786        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1787                __netif_reschedule(q);
1788}
1789EXPORT_SYMBOL(__netif_schedule);
1790
1791void dev_kfree_skb_irq(struct sk_buff *skb)
1792{
1793        if (atomic_dec_and_test(&skb->users)) {
1794                struct softnet_data *sd;
1795                unsigned long flags;
1796
1797                local_irq_save(flags);
1798                sd = &__get_cpu_var(softnet_data);
1799                skb->next = sd->completion_queue;
1800                sd->completion_queue = skb;
1801                raise_softirq_irqoff(NET_TX_SOFTIRQ);
1802                local_irq_restore(flags);
1803        }
1804}
1805EXPORT_SYMBOL(dev_kfree_skb_irq);
1806
1807void dev_kfree_skb_any(struct sk_buff *skb)
1808{
1809        if (in_irq() || irqs_disabled())
1810                dev_kfree_skb_irq(skb);
1811        else
1812                dev_kfree_skb(skb);
1813}
1814EXPORT_SYMBOL(dev_kfree_skb_any);
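
/*
 * Illustrative sketch (not compiled, not part of dev.c): dev_kfree_skb_any()
 * suits paths that may run in either hardirq or process context, such as a
 * driver's TX-completion handler.  The my_tx_ring structure and its fixed
 * size are assumptions for the example only.
 */
#if 0
struct my_tx_ring {
        struct sk_buff  *queued[64];
        unsigned int    next_to_clean;
};

static void my_clean_tx_irq(struct my_tx_ring *ring, unsigned int done)
{
        while (done--) {
                struct sk_buff *skb = ring->queued[ring->next_to_clean];

                ring->queued[ring->next_to_clean] = NULL;
                ring->next_to_clean = (ring->next_to_clean + 1) % 64;

                /* Safe from hardirq context: the skb goes on the per-CPU
                 * completion queue and is freed later in net_tx_action().
                 */
                dev_kfree_skb_any(skb);
        }
}
#endif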
1815
1816
1817/**
1818 * netif_device_detach - mark device as removed
1819 * @dev: network device
1820 *
1821 * Mark device as removed from the system and therefore no longer available.
1822 */
1823void netif_device_detach(struct net_device *dev)
1824{
1825        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1826            netif_running(dev)) {
1827                netif_tx_stop_all_queues(dev);
1828        }
1829}
1830EXPORT_SYMBOL(netif_device_detach);
1831
1832/**
1833 * netif_device_attach - mark device as attached
1834 * @dev: network device
1835 *
1836 * Mark device as attached to the system and restart if needed.
1837 */
1838void netif_device_attach(struct net_device *dev)
1839{
1840        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1841            netif_running(dev)) {
1842                netif_tx_wake_all_queues(dev);
1843                __netdev_watchdog_up(dev);
1844        }
1845}
1846EXPORT_SYMBOL(netif_device_attach);
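
/*
 * Illustrative sketch (not compiled, not part of dev.c): the usual
 * detach/attach pairing in a PCI driver's suspend and resume callbacks.
 * my_hw_stop() and my_hw_start() are assumptions standing in for
 * driver-specific hardware code.
 */
#if 0
static int my_suspend(struct pci_dev *pdev, pm_message_t state)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        netif_device_detach(dev);       /* stops all TX queues if running */
        my_hw_stop(dev);
        return 0;
}

static int my_resume(struct pci_dev *pdev)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        my_hw_start(dev);
        netif_device_attach(dev);       /* wakes queues and watchdog if running */
        return 0;
}
#endif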
1847
1848/**
1849 * skb_set_dev - assign a new device to a buffer
1850 * @skb: buffer for the new device
1851 * @dev: network device
1852 *
1853 * If an skb is owned by a device already, we have to reset
1854 * all data private to the namespace a device belongs to
1855 * before assigning it a new device.
1856 */
1857#ifdef CONFIG_NET_NS
1858void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1859{
1860        skb_dst_drop(skb);
1861        if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1862                secpath_reset(skb);
1863                nf_reset(skb);
1864                skb_init_secmark(skb);
1865                skb->mark = 0;
1866                skb->priority = 0;
1867                skb->nf_trace = 0;
1868                skb->ipvs_property = 0;
1869#ifdef CONFIG_NET_SCHED
1870                skb->tc_index = 0;
1871#endif
1872        }
1873        skb->dev = dev;
1874}
1875EXPORT_SYMBOL(skb_set_dev);
1876#endif /* CONFIG_NET_NS */
1877
1878/*
1879 * Invalidate hardware checksum when packet is to be mangled, and
1880 * complete checksum manually on outgoing path.
1881 */
1882int skb_checksum_help(struct sk_buff *skb)
1883{
1884        __wsum csum;
1885        int ret = 0, offset;
1886
1887        if (skb->ip_summed == CHECKSUM_COMPLETE)
1888                goto out_set_summed;
1889
1890        if (unlikely(skb_shinfo(skb)->gso_size)) {
1891                /* Let GSO fix up the checksum. */
1892                goto out_set_summed;
1893        }
1894
1895        offset = skb_checksum_start_offset(skb);
1896        BUG_ON(offset >= skb_headlen(skb));
1897        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1898
1899        offset += skb->csum_offset;
1900        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1901
1902        if (skb_cloned(skb) &&
1903            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1904                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1905                if (ret)
1906                        goto out;
1907        }
1908
1909        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1910out_set_summed:
1911        skb->ip_summed = CHECKSUM_NONE;
1912out:
1913        return ret;
1914}
1915EXPORT_SYMBOL(skb_checksum_help);
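
/*
 * Illustrative sketch (not compiled, not part of dev.c): a driver whose
 * hardware can only checksum some protocols might fall back to
 * skb_checksum_help() for everything else before queueing the frame.
 * my_hw_can_csum() and my_queue_to_hw() are assumptions for the example.
 */
#if 0
static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
                /* Resolve the checksum in software; on failure drop the skb. */
                if (skb_checksum_help(skb)) {
                        dev_kfree_skb_any(skb);
                        return NETDEV_TX_OK;
                }
        }

        return my_queue_to_hw(skb, dev);
}
#endif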
1916
1917/**
1918 *      skb_gso_segment - Perform segmentation on skb.
1919 *      @skb: buffer to segment
1920 *      @features: features for the output path (see dev->features)
1921 *
1922 *      This function segments the given skb and returns a list of segments.
1923 *
1924 *      It may return NULL if the skb requires no segmentation.  This is
1925 *      only possible when GSO is used for verifying header integrity.
1926 */
1927struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1928{
1929        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1930        struct packet_type *ptype;
1931        __be16 type = skb->protocol;
1932        int vlan_depth = ETH_HLEN;
1933        int err;
1934
1935        while (type == htons(ETH_P_8021Q)) {
1936                struct vlan_hdr *vh;
1937
1938                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1939                        return ERR_PTR(-EINVAL);
1940
1941                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1942                type = vh->h_vlan_encapsulated_proto;
1943                vlan_depth += VLAN_HLEN;
1944        }
1945
1946        skb_reset_mac_header(skb);
1947        skb->mac_len = skb->network_header - skb->mac_header;
1948        __skb_pull(skb, skb->mac_len);
1949
1950        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1951                struct net_device *dev = skb->dev;
1952                struct ethtool_drvinfo info = {};
1953
1954                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1955                        dev->ethtool_ops->get_drvinfo(dev, &info);
1956
1957                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1958                     info.driver, dev ? dev->features : 0L,
1959                     skb->sk ? skb->sk->sk_route_caps : 0L,
1960                     skb->len, skb->data_len, skb->ip_summed);
1961
1962                if (skb_header_cloned(skb) &&
1963                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1964                        return ERR_PTR(err);
1965        }
1966
1967        rcu_read_lock();
1968        list_for_each_entry_rcu(ptype,
1969                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1970                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1971                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1972                                err = ptype->gso_send_check(skb);
1973                                segs = ERR_PTR(err);
1974                                if (err || skb_gso_ok(skb, features))
1975                                        break;
1976                                __skb_push(skb, (skb->data -
1977                                                 skb_network_header(skb)));
1978                        }
1979                        segs = ptype->gso_segment(skb, features);
1980                        break;
1981                }
1982        }
1983        rcu_read_unlock();
1984
1985        __skb_push(skb, skb->data - skb_mac_header(skb));
1986
1987        return segs;
1988}
1989EXPORT_SYMBOL(skb_gso_segment);
1990
1991/* Take action when hardware reception checksum errors are detected. */
1992#ifdef CONFIG_BUG
1993void netdev_rx_csum_fault(struct net_device *dev)
1994{
1995        if (net_ratelimit()) {
1996                printk(KERN_ERR "%s: hw csum failure.\n",
1997                        dev ? dev->name : "<unknown>");
1998                dump_stack();
1999        }
2000}
2001EXPORT_SYMBOL(netdev_rx_csum_fault);
2002#endif
2003
2004/* Actually, we should eliminate this check as soon as we know that:
2005 * 1. An IOMMU is present and can map all of the memory.
2006 * 2. No high memory really exists on this machine.
2007 */
2008
2009static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2010{
2011#ifdef CONFIG_HIGHMEM
2012        int i;
2013        if (!(dev->features & NETIF_F_HIGHDMA)) {
2014                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2015                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2016                        if (PageHighMem(skb_frag_page(frag)))
2017                                return 1;
2018                }
2019        }
2020
2021        if (PCI_DMA_BUS_IS_PHYS) {
2022                struct device *pdev = dev->dev.parent;
2023
2024                if (!pdev)
2025                        return 0;
2026                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2027                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2028                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2029                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2030                                return 1;
2031                }
2032        }
2033#endif
2034        return 0;
2035}
2036
2037struct dev_gso_cb {
2038        void (*destructor)(struct sk_buff *skb);
2039};
2040
2041#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2042
2043static void dev_gso_skb_destructor(struct sk_buff *skb)
2044{
2045        struct dev_gso_cb *cb;
2046
2047        do {
2048                struct sk_buff *nskb = skb->next;
2049
2050                skb->next = nskb->next;
2051                nskb->next = NULL;
2052                kfree_skb(nskb);
2053        } while (skb->next);
2054
2055        cb = DEV_GSO_CB(skb);
2056        if (cb->destructor)
2057                cb->destructor(skb);
2058}
2059
2060/**
2061 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2062 *      @skb: buffer to segment
2063 *      @features: device features as applicable to this skb
2064 *
2065 *      This function segments the given skb and stores the list of segments
2066 *      in skb->next.
2067 */
2068static int dev_gso_segment(struct sk_buff *skb, int features)
2069{
2070        struct sk_buff *segs;
2071
2072        segs = skb_gso_segment(skb, features);
2073
2074        /* Verifying header integrity only. */
2075        if (!segs)
2076                return 0;
2077
2078        if (IS_ERR(segs))
2079                return PTR_ERR(segs);
2080
2081        skb->next = segs;
2082        DEV_GSO_CB(skb)->destructor = skb->destructor;
2083        skb->destructor = dev_gso_skb_destructor;
2084
2085        return 0;
2086}
2087
2088/*
2089 * Try to orphan skb early, right before transmission by the device.
2090 * We cannot orphan the skb if a tx timestamp is requested or the sk
2091 * reference is needed at the driver level for other reasons, e.g. see net/can/raw.c
2092 */
2093static inline void skb_orphan_try(struct sk_buff *skb)
2094{
2095        struct sock *sk = skb->sk;
2096
2097        if (sk && !skb_shinfo(skb)->tx_flags) {
2098                /* skb_tx_hash() won't be able to get the sk,
2099                 * so we copy sk_hash into skb->rxhash.
2100                 */
2101                if (!skb->rxhash)
2102                        skb->rxhash = sk->sk_hash;
2103                skb_orphan(skb);
2104        }
2105}
2106
2107static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2108{
2109        return ((features & NETIF_F_GEN_CSUM) ||
2110                ((features & NETIF_F_V4_CSUM) &&
2111                 protocol == htons(ETH_P_IP)) ||
2112                ((features & NETIF_F_V6_CSUM) &&
2113                 protocol == htons(ETH_P_IPV6)) ||
2114                ((features & NETIF_F_FCOE_CRC) &&
2115                 protocol == htons(ETH_P_FCOE)));
2116}
2117
2118static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2119{
2120        if (!can_checksum_protocol(features, protocol)) {
2121                features &= ~NETIF_F_ALL_CSUM;
2122                features &= ~NETIF_F_SG;
2123        } else if (illegal_highdma(skb->dev, skb)) {
2124                features &= ~NETIF_F_SG;
2125        }
2126
2127        return features;
2128}
2129
2130u32 netif_skb_features(struct sk_buff *skb)
2131{
2132        __be16 protocol = skb->protocol;
2133        u32 features = skb->dev->features;
2134
2135        if (protocol == htons(ETH_P_8021Q)) {
2136                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2137                protocol = veh->h_vlan_encapsulated_proto;
2138        } else if (!vlan_tx_tag_present(skb)) {
2139                return harmonize_features(skb, protocol, features);
2140        }
2141
2142        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2143
2144        if (protocol != htons(ETH_P_8021Q)) {
2145                return harmonize_features(skb, protocol, features);
2146        } else {
2147                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2148                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2149                return harmonize_features(skb, protocol, features);
2150        }
2151}
2152EXPORT_SYMBOL(netif_skb_features);
2153
2154/*
2155 * Returns true if either:
2156 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2157 *      2. skb is fragmented and the device does not support SG, or if
2158 *         at least one of fragments is in highmem and device does not
2159 *         support DMA from it.
2160 */
2161static inline int skb_needs_linearize(struct sk_buff *skb,
2162                                      int features)
2163{
2164        return skb_is_nonlinear(skb) &&
2165                        ((skb_has_frag_list(skb) &&
2166                                !(features & NETIF_F_FRAGLIST)) ||
2167                        (skb_shinfo(skb)->nr_frags &&
2168                                !(features & NETIF_F_SG)));
2169}
2170
2171int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2172                        struct netdev_queue *txq)
2173{
2174        const struct net_device_ops *ops = dev->netdev_ops;
2175        int rc = NETDEV_TX_OK;
2176        unsigned int skb_len;
2177
2178        if (likely(!skb->next)) {
2179                u32 features;
2180
2181                /*
2182                 * If the device doesn't need skb->dst, release it right now
2183                 * while it's hot in this CPU's cache.
2184                 */
2185                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2186                        skb_dst_drop(skb);
2187
2188                if (!list_empty(&ptype_all))
2189                        dev_queue_xmit_nit(skb, dev);
2190
2191                skb_orphan_try(skb);
2192
2193                features = netif_skb_features(skb);
2194
2195                if (vlan_tx_tag_present(skb) &&
2196                    !(features & NETIF_F_HW_VLAN_TX)) {
2197                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2198                        if (unlikely(!skb))
2199                                goto out;
2200
2201                        skb->vlan_tci = 0;
2202                }
2203
2204                if (netif_needs_gso(skb, features)) {
2205                        if (unlikely(dev_gso_segment(skb, features)))
2206                                goto out_kfree_skb;
2207                        if (skb->next)
2208                                goto gso;
2209                } else {
2210                        if (skb_needs_linearize(skb, features) &&
2211                            __skb_linearize(skb))
2212                                goto out_kfree_skb;
2213
2214                        /* If packet is not checksummed and device does not
2215                         * support checksumming for this protocol, complete
2216                         * checksumming here.
2217                         */
2218                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2219                                skb_set_transport_header(skb,
2220                                        skb_checksum_start_offset(skb));
2221                                if (!(features & NETIF_F_ALL_CSUM) &&
2222                                     skb_checksum_help(skb))
2223                                        goto out_kfree_skb;
2224                        }
2225                }
2226
2227                skb_len = skb->len;
2228                rc = ops->ndo_start_xmit(skb, dev);
2229                trace_net_dev_xmit(skb, rc, dev, skb_len);
2230                if (rc == NETDEV_TX_OK)
2231                        txq_trans_update(txq);
2232                return rc;
2233        }
2234
2235gso:
2236        do {
2237                struct sk_buff *nskb = skb->next;
2238
2239                skb->next = nskb->next;
2240                nskb->next = NULL;
2241
2242                /*
2243                 * If the device doesn't need nskb->dst, release it right now
2244                 * while it's hot in this CPU's cache.
2245                 */
2246                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2247                        skb_dst_drop(nskb);
2248
2249                skb_len = nskb->len;
2250                rc = ops->ndo_start_xmit(nskb, dev);
2251                trace_net_dev_xmit(nskb, rc, dev, skb_len);
2252                if (unlikely(rc != NETDEV_TX_OK)) {
2253                        if (rc & ~NETDEV_TX_MASK)
2254                                goto out_kfree_gso_skb;
2255                        nskb->next = skb->next;
2256                        skb->next = nskb;
2257                        return rc;
2258                }
2259                txq_trans_update(txq);
2260                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2261                        return NETDEV_TX_BUSY;
2262        } while (skb->next);
2263
2264out_kfree_gso_skb:
2265        if (likely(skb->next == NULL))
2266                skb->destructor = DEV_GSO_CB(skb)->destructor;
2267out_kfree_skb:
2268        kfree_skb(skb);
2269out:
2270        return rc;
2271}
2272
2273static u32 hashrnd __read_mostly;
2274
2275/*
2276 * Returns a Tx hash based on the given packet descriptor and the number of
2277 * Tx queues to be used as a distribution range.
2278 */
2279u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2280                  unsigned int num_tx_queues)
2281{
2282        u32 hash;
2283        u16 qoffset = 0;
2284        u16 qcount = num_tx_queues;
2285
2286        if (skb_rx_queue_recorded(skb)) {
2287                hash = skb_get_rx_queue(skb);
2288                while (unlikely(hash >= num_tx_queues))
2289                        hash -= num_tx_queues;
2290                return hash;
2291        }
2292
2293        if (dev->num_tc) {
2294                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2295                qoffset = dev->tc_to_txq[tc].offset;
2296                qcount = dev->tc_to_txq[tc].count;
2297        }
2298
2299        if (skb->sk && skb->sk->sk_hash)
2300                hash = skb->sk->sk_hash;
2301        else
2302                hash = (__force u16) skb->protocol ^ skb->rxhash;
2303        hash = jhash_1word(hash, hashrnd);
2304
2305        return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2306}
2307EXPORT_SYMBOL(__skb_tx_hash);
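
/*
 * Illustrative sketch (not compiled, not part of dev.c): a driver that only
 * wants to special-case one traffic type can implement ndo_select_queue()
 * and fall back to skb_tx_hash(), which wraps __skb_tx_hash() over
 * dev->real_num_tx_queues.  my_is_mgmt_frame() and the use of queue 0 as a
 * management queue are assumptions for the example only.
 */
#if 0
static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb)
{
        if (my_is_mgmt_frame(skb))
                return 0;                       /* dedicated management queue */

        return skb_tx_hash(dev, skb);           /* default hash-based spreading */
}
#endif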
2308
2309static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2310{
2311        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2312                if (net_ratelimit()) {
2313                        pr_warning("%s selects TX queue %d, but "
2314                                "real number of TX queues is %d\n",
2315                                dev->name, queue_index, dev->real_num_tx_queues);
2316                }
2317                return 0;
2318        }
2319        return queue_index;
2320}
2321
2322static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2323{
2324#ifdef CONFIG_XPS
2325        struct xps_dev_maps *dev_maps;
2326        struct xps_map *map;
2327        int queue_index = -1;
2328
2329        rcu_read_lock();
2330        dev_maps = rcu_dereference(dev->xps_maps);
2331        if (dev_maps) {
2332                map = rcu_dereference(
2333                    dev_maps->cpu_map[raw_smp_processor_id()]);
2334                if (map) {
2335                        if (map->len == 1)
2336                                queue_index = map->queues[0];
2337                        else {
2338                                u32 hash;
2339                                if (skb->sk && skb->sk->sk_hash)
2340                                        hash = skb->sk->sk_hash;
2341                                else
2342                                        hash = (__force u16) skb->protocol ^
2343                                            skb->rxhash;
2344                                hash = jhash_1word(hash, hashrnd);
2345                                queue_index = map->queues[
2346                                    ((u64)hash * map->len) >> 32];
2347                        }
2348                        if (unlikely(queue_index >= dev->real_num_tx_queues))
2349                                queue_index = -1;
2350                }
2351        }
2352        rcu_read_unlock();
2353
2354        return queue_index;
2355#else
2356        return -1;
2357#endif
2358}
2359
2360static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2361                                        struct sk_buff *skb)
2362{
2363        int queue_index;
2364        const struct net_device_ops *ops = dev->netdev_ops;
2365
2366        if (dev->real_num_tx_queues == 1)
2367                queue_index = 0;
2368        else if (ops->ndo_select_queue) {
2369                queue_index = ops->ndo_select_queue(dev, skb);
2370                queue_index = dev_cap_txqueue(dev, queue_index);
2371        } else {
2372                struct sock *sk = skb->sk;
2373                queue_index = sk_tx_queue_get(sk);
2374
2375                if (queue_index < 0 || skb->ooo_okay ||
2376                    queue_index >= dev->real_num_tx_queues) {
2377                        int old_index = queue_index;
2378
2379                        queue_index = get_xps_queue(dev, skb);
2380                        if (queue_index < 0)
2381                                queue_index = skb_tx_hash(dev, skb);
2382
2383                        if (queue_index != old_index && sk) {
2384                                struct dst_entry *dst =
2385                                    rcu_dereference_check(sk->sk_dst_cache, 1);
2386
2387                                if (dst && skb_dst(skb) == dst)
2388                                        sk_tx_queue_set(sk, queue_index);
2389                        }
2390                }
2391        }
2392
2393        skb_set_queue_mapping(skb, queue_index);
2394        return netdev_get_tx_queue(dev, queue_index);
2395}
2396
2397static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2398                                 struct net_device *dev,
2399                                 struct netdev_queue *txq)
2400{
2401        spinlock_t *root_lock = qdisc_lock(q);
2402        bool contended;
2403        int rc;
2404
2405        qdisc_skb_cb(skb)->pkt_len = skb->len;
2406        qdisc_calculate_pkt_len(skb, q);
2407        /*
2408         * Heuristic to force contended enqueues to serialize on a
2409         * separate lock before trying to get qdisc main lock.
2410         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2411         * and dequeue packets faster.
2412         */
2413        contended = qdisc_is_running(q);
2414        if (unlikely(contended))
2415                spin_lock(&q->busylock);
2416
2417        spin_lock(root_lock);
2418        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2419                kfree_skb(skb);
2420                rc = NET_XMIT_DROP;
2421        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2422                   qdisc_run_begin(q)) {
2423                /*
2424                 * This is a work-conserving queue; there are no old skbs
2425                 * waiting to be sent out; and the qdisc is not running -
2426                 * xmit the skb directly.
2427                 */
2428                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2429                        skb_dst_force(skb);
2430
2431                qdisc_bstats_update(q, skb);
2432
2433                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2434                        if (unlikely(contended)) {
2435                                spin_unlock(&q->busylock);
2436                                contended = false;
2437                        }
2438                        __qdisc_run(q);
2439                } else
2440                        qdisc_run_end(q);
2441
2442                rc = NET_XMIT_SUCCESS;
2443        } else {
2444                skb_dst_force(skb);
2445                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2446                if (qdisc_run_begin(q)) {
2447                        if (unlikely(contended)) {
2448                                spin_unlock(&q->busylock);
2449                                contended = false;
2450                        }
2451                        __qdisc_run(q);
2452                }
2453        }
2454        spin_unlock(root_lock);
2455        if (unlikely(contended))
2456                spin_unlock(&q->busylock);
2457        return rc;
2458}
2459
2460static DEFINE_PER_CPU(int, xmit_recursion);
2461#define RECURSION_LIMIT 10
2462
2463/**
2464 *      dev_queue_xmit - transmit a buffer
2465 *      @skb: buffer to transmit
2466 *
2467 *      Queue a buffer for transmission to a network device. The caller must
2468 *      have set the device and priority and built the buffer before calling
2469 *      this function. The function can be called from an interrupt.
2470 *
2471 *      A negative errno code is returned on a failure. A success does not
2472 *      guarantee the frame will be transmitted as it may be dropped due
2473 *      to congestion or traffic shaping.
2474 *
2475 * -----------------------------------------------------------------------------------
2476 *      I notice this method can also return errors from the queue disciplines,
2477 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2478 *      be positive.
2479 *
2480 *      Regardless of the return value, the skb is consumed, so it is currently
2481 *      difficult to retry a send to this method.  (You can bump the ref count
2482 *      before sending to hold a reference for retry if you are careful.)
2483 *
2484 *      When calling this method, interrupts MUST be enabled.  This is because
2485 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2486 *          --BLG
2487 */
2488int dev_queue_xmit(struct sk_buff *skb)
2489{
2490        struct net_device *dev = skb->dev;
2491        struct netdev_queue *txq;
2492        struct Qdisc *q;
2493        int rc = -ENOMEM;
2494
2495        /* Disable soft irqs for various locks below. Also
2496         * stops preemption for RCU.
2497         */
2498        rcu_read_lock_bh();
2499
2500        txq = dev_pick_tx(dev, skb);
2501        q = rcu_dereference_bh(txq->qdisc);
2502
2503#ifdef CONFIG_NET_CLS_ACT
2504        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2505#endif
2506        trace_net_dev_queue(skb);
2507        if (q->enqueue) {
2508                rc = __dev_xmit_skb(skb, q, dev, txq);
2509                goto out;
2510        }
2511
2512        /* The device has no queue. Common case for software devices:
2513           loopback, all sorts of tunnels...

2515           Really, it is unlikely that netif_tx_lock protection is necessary
2516           here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2517           counters.)
2518           However, it is possible that they rely on the protection
2519           we provide here.

2521           Check this and take the lock. It is not prone to deadlocks.
2522           Or take the noqueue qdisc path, which is even simpler 8)
2523         */
2524        if (dev->flags & IFF_UP) {
2525                int cpu = smp_processor_id(); /* ok because BHs are off */
2526
2527                if (txq->xmit_lock_owner != cpu) {
2528
2529                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2530                                goto recursion_alert;
2531
2532                        HARD_TX_LOCK(dev, txq, cpu);
2533
2534                        if (!netif_tx_queue_stopped(txq)) {
2535                                __this_cpu_inc(xmit_recursion);
2536                                rc = dev_hard_start_xmit(skb, dev, txq);
2537                                __this_cpu_dec(xmit_recursion);
2538                                if (dev_xmit_complete(rc)) {
2539                                        HARD_TX_UNLOCK(dev, txq);
2540                                        goto out;
2541                                }
2542                        }
2543                        HARD_TX_UNLOCK(dev, txq);
2544                        if (net_ratelimit())
2545                                printk(KERN_CRIT "Virtual device %s asks to "
2546                                       "queue packet!\n", dev->name);
2547                } else {
2548                        /* Recursion is detected! It is possible,
2549                         * unfortunately
2550                         */
2551recursion_alert:
2552                        if (net_ratelimit())
2553                                printk(KERN_CRIT "Dead loop on virtual device "
2554                                       "%s, fix it urgently!\n", dev->name);
2555                }
2556        }
2557
2558        rc = -ENETDOWN;
2559        rcu_read_unlock_bh();
2560
2561        kfree_skb(skb);
2562        return rc;
2563out:
2564        rcu_read_unlock_bh();
2565        return rc;
2566}
2567EXPORT_SYMBOL(dev_queue_xmit);
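
/*
 * Illustrative sketch (not compiled, not part of dev.c): transmitting a
 * locally built frame from kernel code.  The 64-byte zero payload, the
 * broadcast destination and the ETH_P_802_EX1 ethertype are arbitrary
 * choices for the example; real senders normally go through the protocol
 * stack instead.
 */
#if 0
static int my_send_test_frame(struct net_device *dev)
{
        struct sk_buff *skb;
        unsigned int len = 64;

        skb = netdev_alloc_skb(dev, LL_RESERVED_SPACE(dev) + len);
        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memset(skb_put(skb, len), 0, len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_802_EX1);

        if (dev_hard_header(skb, dev, ETH_P_802_EX1, dev->broadcast,
                            dev->dev_addr, skb->len) < 0) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /* Consumes the skb; may also return positive NET_XMIT_* codes. */
        return dev_queue_xmit(skb);
}
#endif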
2568
2569
2570/*=======================================================================
2571                        Receiver routines
2572  =======================================================================*/
2573
2574int netdev_max_backlog __read_mostly = 1000;
2575int netdev_tstamp_prequeue __read_mostly = 1;
2576int netdev_budget __read_mostly = 300;
2577int weight_p __read_mostly = 64;            /* old backlog weight */
2578
2579/* Called with irq disabled */
2580static inline void ____napi_schedule(struct softnet_data *sd,
2581                                     struct napi_struct *napi)
2582{
2583        list_add_tail(&napi->poll_list, &sd->poll_list);
2584        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2585}
2586
2587/*
2588 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2589 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2590 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2591 * if hash is a canonical 4-tuple hash over transport ports.
2592 */
2593void __skb_get_rxhash(struct sk_buff *skb)
2594{
2595        int nhoff, hash = 0, poff;
2596        const struct ipv6hdr *ip6;
2597        const struct iphdr *ip;
2598        const struct vlan_hdr *vlan;
2599        u8 ip_proto;
2600        u32 addr1, addr2;
2601        u16 proto;
2602        union {
2603                u32 v32;
2604                u16 v16[2];
2605        } ports;
2606
2607        nhoff = skb_network_offset(skb);
2608        proto = skb->protocol;
2609
2610again:
2611        switch (proto) {
2612        case __constant_htons(ETH_P_IP):
2613ip:
2614                if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2615                        goto done;
2616
2617                ip = (const struct iphdr *) (skb->data + nhoff);
2618                if (ip_is_fragment(ip))
2619                        ip_proto = 0;
2620                else
2621                        ip_proto = ip->protocol;
2622                addr1 = (__force u32) ip->saddr;
2623                addr2 = (__force u32) ip->daddr;
2624                nhoff += ip->ihl * 4;
2625                break;
2626        case __constant_htons(ETH_P_IPV6):
2627ipv6:
2628                if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2629                        goto done;
2630
2631                ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2632                ip_proto = ip6->nexthdr;
2633                addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2634                addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2635                nhoff += 40;
2636                break;
2637        case __constant_htons(ETH_P_8021Q):
2638                if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2639                        goto done;
2640                vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2641                proto = vlan->h_vlan_encapsulated_proto;
2642                nhoff += sizeof(*vlan);
2643                goto again;
2644        case __constant_htons(ETH_P_PPP_SES):
2645                if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2646                        goto done;
2647                proto = *((__be16 *) (skb->data + nhoff +
2648                                      sizeof(struct pppoe_hdr)));
2649                nhoff += PPPOE_SES_HLEN;
2650                switch (proto) {
2651                case __constant_htons(PPP_IP):
2652                        goto ip;
2653                case __constant_htons(PPP_IPV6):
2654                        goto ipv6;
2655                default:
2656                        goto done;
2657                }
2658        default:
2659                goto done;
2660        }
2661
2662        switch (ip_proto) {
2663        case IPPROTO_GRE:
2664                if (pskb_may_pull(skb, nhoff + 16)) {
2665                        u8 *h = skb->data + nhoff;
2666                        __be16 flags = *(__be16 *)h;
2667
2668                        /*
2669                         * Only look inside GRE if version zero and no
2670                         * routing
2671                         */
2672                        if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2673                                proto = *(__be16 *)(h + 2);
2674                                nhoff += 4;
2675                                if (flags & GRE_CSUM)
2676                                        nhoff += 4;
2677                                if (flags & GRE_KEY)
2678                                        nhoff += 4;
2679                                if (flags & GRE_SEQ)
2680                                        nhoff += 4;
2681                                goto again;
2682                        }
2683                }
2684                break;
2685        case IPPROTO_IPIP:
2686                goto again;
2687        default:
2688                break;
2689        }
2690
2691        ports.v32 = 0;
2692        poff = proto_ports_offset(ip_proto);
2693        if (poff >= 0) {
2694                nhoff += poff;
2695                if (pskb_may_pull(skb, nhoff + 4)) {
2696                        ports.v32 = * (__force u32 *) (skb->data + nhoff);
2697                        if (ports.v16[1] < ports.v16[0])
2698                                swap(ports.v16[0], ports.v16[1]);
2699                        skb->l4_rxhash = 1;
2700                }
2701        }
2702
2703        /* get a consistent hash (same value on both flow directions) */
2704        if (addr2 < addr1)
2705                swap(addr1, addr2);
2706
2707        hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2708        if (!hash)
2709                hash = 1;
2710
2711done:
2712        skb->rxhash = hash;
2713}
2714EXPORT_SYMBOL(__skb_get_rxhash);
2715
2716#ifdef CONFIG_RPS
2717
2718/* One global table that all flow-based protocols share. */
2719struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2720EXPORT_SYMBOL(rps_sock_flow_table);
2721
2722static struct rps_dev_flow *
2723set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2724            struct rps_dev_flow *rflow, u16 next_cpu)
2725{
2726        if (next_cpu != RPS_NO_CPU) {
2727#ifdef CONFIG_RFS_ACCEL
2728                struct netdev_rx_queue *rxqueue;
2729                struct rps_dev_flow_table *flow_table;
2730                struct rps_dev_flow *old_rflow;
2731                u32 flow_id;
2732                u16 rxq_index;
2733                int rc;
2734
2735                /* Should we steer this flow to a different hardware queue? */
2736                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2737                    !(dev->features & NETIF_F_NTUPLE))
2738                        goto out;
2739                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2740                if (rxq_index == skb_get_rx_queue(skb))
2741                        goto out;
2742
2743                rxqueue = dev->_rx + rxq_index;
2744                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2745                if (!flow_table)
2746                        goto out;
2747                flow_id = skb->rxhash & flow_table->mask;
2748                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2749                                                        rxq_index, flow_id);
2750                if (rc < 0)
2751                        goto out;
2752                old_rflow = rflow;
2753                rflow = &flow_table->flows[flow_id];
2754                rflow->filter = rc;
2755                if (old_rflow->filter == rflow->filter)
2756                        old_rflow->filter = RPS_NO_FILTER;
2757        out:
2758#endif
2759                rflow->last_qtail =
2760                        per_cpu(softnet_data, next_cpu).input_queue_head;
2761        }
2762
2763        rflow->cpu = next_cpu;
2764        return rflow;
2765}
2766
2767/*
2768 * get_rps_cpu is called from netif_receive_skb and returns the target
2769 * CPU from the RPS map of the receiving queue for a given skb.
2770 * rcu_read_lock must be held on entry.
2771 */
2772static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2773                       struct rps_dev_flow **rflowp)
2774{
2775        struct netdev_rx_queue *rxqueue;
2776        struct rps_map *map;
2777        struct rps_dev_flow_table *flow_table;
2778        struct rps_sock_flow_table *sock_flow_table;
2779        int cpu = -1;
2780        u16 tcpu;
2781
2782        if (skb_rx_queue_recorded(skb)) {
2783                u16 index = skb_get_rx_queue(skb);
2784                if (unlikely(index >= dev->real_num_rx_queues)) {
2785                        WARN_ONCE(dev->real_num_rx_queues > 1,
2786                                  "%s received packet on queue %u, but number "
2787                                  "of RX queues is %u\n",
2788                                  dev->name, index, dev->real_num_rx_queues);
2789                        goto done;
2790                }
2791                rxqueue = dev->_rx + index;
2792        } else
2793                rxqueue = dev->_rx;
2794
2795        map = rcu_dereference(rxqueue->rps_map);
2796        if (map) {
2797                if (map->len == 1 &&
2798                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2799                        tcpu = map->cpus[0];
2800                        if (cpu_online(tcpu))
2801                                cpu = tcpu;
2802                        goto done;
2803                }
2804        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2805                goto done;
2806        }
2807
2808        skb_reset_network_header(skb);
2809        if (!skb_get_rxhash(skb))
2810                goto done;
2811
2812        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2813        sock_flow_table = rcu_dereference(rps_sock_flow_table);
2814        if (flow_table && sock_flow_table) {
2815                u16 next_cpu;
2816                struct rps_dev_flow *rflow;
2817
2818                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2819                tcpu = rflow->cpu;
2820
2821                next_cpu = sock_flow_table->ents[skb->rxhash &
2822                    sock_flow_table->mask];
2823
2824                /*
2825                 * If the desired CPU (where last recvmsg was done) is
2826                 * different from current CPU (one in the rx-queue flow
2827                 * table entry), switch if one of the following holds:
2828                 *   - Current CPU is unset (equal to RPS_NO_CPU).
2829                 *   - Current CPU is offline.
2830                 *   - The current CPU's queue tail has advanced beyond the
2831                 *     last packet that was enqueued using this table entry.
2832                 *     This guarantees that all previous packets for the flow
2833                 *     have been dequeued, thus preserving in order delivery.
2834                 */
2835                if (unlikely(tcpu != next_cpu) &&
2836                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2837                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2838                      rflow->last_qtail)) >= 0))
2839                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2840
2841                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2842                        *rflowp = rflow;
2843                        cpu = tcpu;
2844                        goto done;
2845                }
2846        }
2847
2848        if (map) {
2849                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2850
2851                if (cpu_online(tcpu)) {
2852                        cpu = tcpu;
2853                        goto done;
2854                }
2855        }
2856
2857done:
2858        return cpu;
2859}
2860
2861#ifdef CONFIG_RFS_ACCEL
2862
2863/**
2864 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2865 * @dev: Device on which the filter was set
2866 * @rxq_index: RX queue index
2867 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2868 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2869 *
2870 * Drivers that implement ndo_rx_flow_steer() should periodically call
2871 * this function for each installed filter and remove the filters for
2872 * which it returns %true.
2873 */
2874bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2875                         u32 flow_id, u16 filter_id)
2876{
2877        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2878        struct rps_dev_flow_table *flow_table;
2879        struct rps_dev_flow *rflow;
2880        bool expire = true;
2881        int cpu;
2882
2883        rcu_read_lock();
2884        flow_table = rcu_dereference(rxqueue->rps_flow_table);
2885        if (flow_table && flow_id <= flow_table->mask) {
2886                rflow = &flow_table->flows[flow_id];
2887                cpu = ACCESS_ONCE(rflow->cpu);
2888                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2889                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2890                           rflow->last_qtail) <
2891                     (int)(10 * flow_table->mask)))
2892                        expire = false;
2893        }
2894        rcu_read_unlock();
2895        return expire;
2896}
2897EXPORT_SYMBOL(rps_may_expire_flow);
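
/*
 * Illustrative sketch (not compiled, not part of dev.c): a driver that
 * implements ndo_rx_flow_steer() is expected to scan its installed filters
 * periodically and drop the ones the stack no longer cares about.  The
 * my_filter table, its fields and my_hw_remove_filter() are assumptions
 * for the example only.
 */
#if 0
struct my_filter {
        bool    in_use;
        u16     rxq_index;
        u32     flow_id;
        u16     filter_id;
};

static void my_expire_rfs_filters(struct net_device *dev,
                                  struct my_filter *filters, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                struct my_filter *f = &filters[i];

                if (!f->in_use)
                        continue;

                if (rps_may_expire_flow(dev, f->rxq_index,
                                        f->flow_id, f->filter_id)) {
                        my_hw_remove_filter(dev, f);
                        f->in_use = false;
                }
        }
}
#endif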
2898
2899#endif /* CONFIG_RFS_ACCEL */
2900
2901/* Called from hardirq (IPI) context */
2902static void rps_trigger_softirq(void *data)
2903{
2904        struct softnet_data *sd = data;
2905
2906        ____napi_schedule(sd, &sd->backlog);
2907        sd->received_rps++;
2908}
2909
2910#endif /* CONFIG_RPS */
2911
2912/*
2913 * Check whether this softnet_data structure belongs to another CPU.
2914 * If so, queue it on our IPI list and return 1.
2915 * Otherwise, return 0.
2916 */
2917static int rps_ipi_queued(struct softnet_data *sd)
2918{
2919#ifdef CONFIG_RPS
2920        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2921
2922        if (sd != mysd) {
2923                sd->rps_ipi_next = mysd->rps_ipi_list;
2924                mysd->rps_ipi_list = sd;
2925
2926                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2927                return 1;
2928        }
2929#endif /* CONFIG_RPS */
2930        return 0;
2931}
2932
2933/*
2934 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2935 * queue (may be a remote CPU queue).
2936 */
2937static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2938                              unsigned int *qtail)
2939{
2940        struct softnet_data *sd;
2941        unsigned long flags;
2942
2943        sd = &per_cpu(softnet_data, cpu);
2944
2945        local_irq_save(flags);
2946
2947        rps_lock(sd);
2948        if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2949                if (skb_queue_len(&sd->input_pkt_queue)) {
2950enqueue:
2951                        __skb_queue_tail(&sd->input_pkt_queue, skb);
2952                        input_queue_tail_incr_save(sd, qtail);
2953                        rps_unlock(sd);
2954                        local_irq_restore(flags);
2955                        return NET_RX_SUCCESS;
2956                }
2957
2958                /* Schedule NAPI for the backlog device.
2959                 * We can use a non-atomic operation since we own the queue lock.
2960                 */
2961                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2962                        if (!rps_ipi_queued(sd))
2963                                ____napi_schedule(sd, &sd->backlog);
2964                }
2965                goto enqueue;
2966        }
2967
2968        sd->dropped++;
2969        rps_unlock(sd);
2970
2971        local_irq_restore(flags);
2972
2973        atomic_long_inc(&skb->dev->rx_dropped);
2974        kfree_skb(skb);
2975        return NET_RX_DROP;
2976}
2977
2978/**
2979 *      netif_rx        -       post buffer to the network code
2980 *      @skb: buffer to post
2981 *
2982 *      This function receives a packet from a device driver and queues it for
2983 *      the upper (protocol) levels to process.  It always succeeds. The buffer
2984 *      may be dropped during processing for congestion control or by the
2985 *      protocol layers.
2986 *
2987 *      return values:
2988 *      NET_RX_SUCCESS  (no congestion)
2989 *      NET_RX_DROP     (packet was dropped)
2990 *
2991 */
2992
2993int netif_rx(struct sk_buff *skb)
2994{
2995        int ret;
2996
2997        /* if netpoll wants it, pretend we never saw it */
2998        if (netpoll_rx(skb))
2999                return NET_RX_DROP;
3000
3001        if (netdev_tstamp_prequeue)
3002                net_timestamp_check(skb);
3003
3004        trace_netif_rx(skb);
3005#ifdef CONFIG_RPS
3006        {
3007                struct rps_dev_flow voidflow, *rflow = &voidflow;
3008                int cpu;
3009
3010                preempt_disable();
3011                rcu_read_lock();
3012
3013                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3014                if (cpu < 0)
3015                        cpu = smp_processor_id();
3016
3017                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3018
3019                rcu_read_unlock();
3020                preempt_enable();
3021        }
3022#else
3023        {
3024                unsigned int qtail;
3025                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3026                put_cpu();
3027        }
3028#endif
3029        return ret;
3030}
3031EXPORT_SYMBOL(netif_rx);
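
/*
 * Illustrative sketch (not compiled, not part of dev.c): the classic
 * non-NAPI receive path.  A driver copies the frame into a fresh skb, lets
 * eth_type_trans() set skb->protocol and pkt_type, and hands it to
 * netif_rx() for the per-CPU backlog.  The direct memcpy from a device
 * buffer and the simple stats handling are assumptions for the example.
 */
#if 0
static void my_rx_interrupt(struct net_device *dev, const u8 *frame,
                            unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), frame, len);
        skb->protocol = eth_type_trans(skb, dev);

        /* Queues to the backlog (possibly another CPU's via RPS); always
         * "succeeds" from the driver's point of view.
         */
        netif_rx(skb);

        dev->stats.rx_packets++;
        dev->stats.rx_bytes += len;
}
#endif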
3032
3033int netif_rx_ni(struct sk_buff *skb)
3034{
3035        int err;
3036
3037        preempt_disable();
3038        err = netif_rx(skb);
3039        if (local_softirq_pending())
3040                do_softirq();
3041        preempt_enable();
3042
3043        return err;
3044}
3045EXPORT_SYMBOL(netif_rx_ni);
3046
3047static void net_tx_action(struct softirq_action *h)
3048{
3049        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3050
3051        if (sd->completion_queue) {
3052                struct sk_buff *clist;
3053
3054                local_irq_disable();
3055                clist = sd->completion_queue;
3056                sd->completion_queue = NULL;
3057                local_irq_enable();
3058
3059                while (clist) {
3060                        struct sk_buff *skb = clist;
3061                        clist = clist->next;
3062
3063                        WARN_ON(atomic_read(&skb->users));
3064                        trace_kfree_skb(skb, net_tx_action);
3065                        __kfree_skb(skb);
3066                }
3067        }
3068
3069        if (sd->output_queue) {
3070                struct Qdisc *head;
3071
3072                local_irq_disable();
3073                head = sd->output_queue;
3074                sd->output_queue = NULL;
3075                sd->output_queue_tailp = &sd->output_queue;
3076                local_irq_enable();
3077
3078                while (head) {
3079                        struct Qdisc *q = head;
3080                        spinlock_t *root_lock;
3081
3082                        head = head->next_sched;
3083
3084                        root_lock = qdisc_lock(q);
3085                        if (spin_trylock(root_lock)) {
3086                                smp_mb__before_clear_bit();
3087                                clear_bit(__QDISC_STATE_SCHED,
3088                                          &q->state);
3089                                qdisc_run(q);
3090                                spin_unlock(root_lock);
3091                        } else {
3092                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3093                                              &q->state)) {
3094                                        __netif_reschedule(q);
3095                                } else {
3096                                        smp_mb__before_clear_bit();
3097                                        clear_bit(__QDISC_STATE_SCHED,
3098                                                  &q->state);
3099                                }
3100                        }
3101                }
3102        }
3103}
3104
3105#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3106    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3107/* This hook is defined here for ATM LANE */
3108int (*br_fdb_test_addr_hook)(struct net_device *dev,
3109                             unsigned char *addr) __read_mostly;
3110EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3111#endif
3112
3113#ifdef CONFIG_NET_CLS_ACT
3114/* TODO: Maybe we should just force sch_ingress to be compiled in
3115 * whenever CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3116 * instructions (a compare and two extra stores) when CONFIG_NET_CLS_ACT
3117 * is enabled but sch_ingress is not.
3118 * NOTE: This doesn't remove any functionality; without the ingress
3119 * scheduler you simply can't add policies on ingress.
3120 *
3121 */
3122static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3123{
3124        struct net_device *dev = skb->dev;
3125        u32 ttl = G_TC_RTTL(skb->tc_verd);
3126        int result = TC_ACT_OK;
3127        struct Qdisc *q;
3128
3129        if (unlikely(MAX_RED_LOOP < ttl++)) {
3130                if (net_ratelimit())
3131                        pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3132                                   skb->skb_iif, dev->ifindex);
3133                return TC_ACT_SHOT;
3134        }
3135
3136        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3137        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3138
3139        q = rxq->qdisc;
3140        if (q != &noop_qdisc) {
3141                spin_lock(qdisc_lock(q));
3142                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3143                        result = qdisc_enqueue_root(skb, q);
3144                spin_unlock(qdisc_lock(q));
3145        }
3146
3147        return result;
3148}
3149
3150static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3151                                         struct packet_type **pt_prev,
3152                                         int *ret, struct net_device *orig_dev)
3153{
3154        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3155
3156        if (!rxq || rxq->qdisc == &noop_qdisc)
3157                goto out;
3158
3159        if (*pt_prev) {
3160                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3161                *pt_prev = NULL;
3162        }
3163
3164        switch (ing_filter(skb, rxq)) {
3165        case TC_ACT_SHOT:
3166        case TC_ACT_STOLEN:
3167                kfree_skb(skb);
3168                return NULL;
3169        }
3170
3171out:
3172        skb->tc_verd = 0;
3173        return skb;
3174}
3175#endif
3176
3177/**
3178 *      netdev_rx_handler_register - register receive handler
3179 *      @dev: device to register a handler for
3180 *      @rx_handler: receive handler to register
3181 *      @rx_handler_data: data pointer that is used by rx handler
3182 *
3183 *      Register a receive handler for a device. This handler will then be
3184 *      called from __netif_receive_skb. A negative errno code is returned
3185 *      on a failure.
3186 *
3187 *      The caller must hold the rtnl_mutex.
3188 *
3189 *      For a general description of rx_handler, see enum rx_handler_result.
3190 */
3191int netdev_rx_handler_register(struct net_device *dev,
3192                               rx_handler_func_t *rx_handler,
3193                               void *rx_handler_data)
3194{
3195        ASSERT_RTNL();
3196
3197        if (dev->rx_handler)
3198                return -EBUSY;
3199
3200        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3201        rcu_assign_pointer(dev->rx_handler, rx_handler);
3202
3203        return 0;
3204}
3205EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
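
/*
 * Example (illustrative sketch only): how a bridge- or macvlan-like upper
 * device might attach to a lower device.  struct foo_port and
 * foo_handle_frame() are hypothetical; the RTNL requirement and the -EBUSY
 * semantics are the ones documented above.
 */
#if 0
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
        /* inspect, modify or consume *pskb here ... */
        return RX_HANDLER_PASS;         /* let normal delivery continue */
}

static int foo_attach_port(struct net_device *lower, struct foo_port *port)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(lower, foo_handle_frame, port);
        rtnl_unlock();
        return err;
}
#endif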
3206
3207/**
3208 *      netdev_rx_handler_unregister - unregister receive handler
3209 *      @dev: device to unregister a handler from
3210 *
3211 *      Unregister a receive handler from a device.
3212 *
3213 *      The caller must hold the rtnl_mutex.
3214 */
3215void netdev_rx_handler_unregister(struct net_device *dev)
3216{
3217
3218        ASSERT_RTNL();
3219        RCU_INIT_POINTER(dev->rx_handler, NULL);
3220        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3221}
3222EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3223
3224static int __netif_receive_skb(struct sk_buff *skb)
3225{
3226        struct packet_type *ptype, *pt_prev;
3227        rx_handler_func_t *rx_handler;
3228        struct net_device *orig_dev;
3229        struct net_device *null_or_dev;
3230        bool deliver_exact = false;
3231        int ret = NET_RX_DROP;
3232        __be16 type;
3233
3234        if (!netdev_tstamp_prequeue)
3235                net_timestamp_check(skb);
3236
3237        trace_netif_receive_skb(skb);
3238
3239        /* if we've gotten here through NAPI, check netpoll */
3240        if (netpoll_receive_skb(skb))
3241                return NET_RX_DROP;
3242
3243        if (!skb->skb_iif)
3244                skb->skb_iif = skb->dev->ifindex;
3245        orig_dev = skb->dev;
3246
3247        skb_reset_network_header(skb);
3248        skb_reset_transport_header(skb);
3249        skb_reset_mac_len(skb);
3250
3251        pt_prev = NULL;
3252
3253        rcu_read_lock();
3254
3255another_round:
3256
3257        __this_cpu_inc(softnet_data.processed);
3258
3259        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3260                skb = vlan_untag(skb);
3261                if (unlikely(!skb))
3262                        goto out;
3263        }
3264
3265#ifdef CONFIG_NET_CLS_ACT
3266        if (skb->tc_verd & TC_NCLS) {
3267                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3268                goto ncls;
3269        }
3270#endif
3271
3272        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3273                if (!ptype->dev || ptype->dev == skb->dev) {
3274                        if (pt_prev)
3275                                ret = deliver_skb(skb, pt_prev, orig_dev);
3276                        pt_prev = ptype;
3277                }
3278        }
3279
3280#ifdef CONFIG_NET_CLS_ACT
3281        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3282        if (!skb)
3283                goto out;
3284ncls:
3285#endif
3286
3287        rx_handler = rcu_dereference(skb->dev->rx_handler);
3288        if (vlan_tx_tag_present(skb)) {
3289                if (pt_prev) {
3290                        ret = deliver_skb(skb, pt_prev, orig_dev);
3291                        pt_prev = NULL;
3292                }
3293                if (vlan_do_receive(&skb, !rx_handler))
3294                        goto another_round;
3295                else if (unlikely(!skb))
3296                        goto out;
3297        }
3298
3299        if (rx_handler) {
3300                if (pt_prev) {
3301                        ret = deliver_skb(skb, pt_prev, orig_dev);
3302                        pt_prev = NULL;
3303                }
3304                switch (rx_handler(&skb)) {
3305                case RX_HANDLER_CONSUMED:
3306                        goto out;
3307                case RX_HANDLER_ANOTHER:
3308                        goto another_round;
3309                case RX_HANDLER_EXACT:
3310                        deliver_exact = true;
3311                case RX_HANDLER_PASS:
3312                        break;
3313                default:
3314                        BUG();
3315                }
3316        }
3317
3318        /* deliver only exact match when indicated */
3319        null_or_dev = deliver_exact ? skb->dev : NULL;
3320
3321        type = skb->protocol;
3322        list_for_each_entry_rcu(ptype,
3323                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3324                if (ptype->type == type &&
3325                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3326                     ptype->dev == orig_dev)) {
3327                        if (pt_prev)
3328                                ret = deliver_skb(skb, pt_prev, orig_dev);
3329                        pt_prev = ptype;
3330                }
3331        }
3332
3333        if (pt_prev) {
3334                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3335        } else {
3336                atomic_long_inc(&skb->dev->rx_dropped);
3337                kfree_skb(skb);
3338                /* Jamal, now you will not be able to escape explaining
3339                 * to me how you were going to use this. :-)
3340                 */
3341                ret = NET_RX_DROP;
3342        }
3343
3344out:
3345        rcu_read_unlock();
3346        return ret;
3347}
3348
3349/**
3350 *      netif_receive_skb - process receive buffer from network
3351 *      @skb: buffer to process
3352 *
3353 *      netif_receive_skb() is the main receive data processing function.
3354 *      It always succeeds. The buffer may be dropped during processing
3355 *      for congestion control or by the protocol layers.
3356 *
3357 *      This function may only be called from softirq context and interrupts
3358 *      should be enabled.
3359 *
3360 *      Return values (usually ignored):
3361 *      NET_RX_SUCCESS: no congestion
3362 *      NET_RX_DROP: packet was dropped
3363 */
3364int netif_receive_skb(struct sk_buff *skb)
3365{
3366        if (netdev_tstamp_prequeue)
3367                net_timestamp_check(skb);
3368
3369        if (skb_defer_rx_timestamp(skb))
3370                return NET_RX_SUCCESS;
3371
3372#ifdef CONFIG_RPS
3373        {
3374                struct rps_dev_flow voidflow, *rflow = &voidflow;
3375                int cpu, ret;
3376
3377                rcu_read_lock();
3378
3379                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3380
3381                if (cpu >= 0) {
3382                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3383                        rcu_read_unlock();
3384                } else {
3385                        rcu_read_unlock();
3386                        ret = __netif_receive_skb(skb);
3387                }
3388
3389                return ret;
3390        }
3391#else
3392        return __netif_receive_skb(skb);
3393#endif
3394}
3395EXPORT_SYMBOL(netif_receive_skb);
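
/*
 * Example (illustrative sketch only): a NAPI driver calls
 * netif_receive_skb() from its ->poll() callback, i.e. in softirq context
 * with interrupts enabled, as required above.  foo_next_rx_skb() is a
 * hypothetical helper that pulls a completed frame off the RX ring.
 */
#if 0
static int foo_poll(struct napi_struct *napi, int budget)
{
        struct net_device *dev = napi->dev;
        struct sk_buff *skb;
        int work = 0;

        while (work < budget && (skb = foo_next_rx_skb(dev)) != NULL) {
                skb->protocol = eth_type_trans(skb, dev);
                netif_receive_skb(skb);
                work++;
        }
        if (work < budget)
                napi_complete(napi);    /* ring drained: re-enable RX irq too */
        return work;
}
#endif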
3396
3397/* Network device is going away; flush any packets still pending.
3398 * Called with irqs disabled.
3399 */
3400static void flush_backlog(void *arg)
3401{
3402        struct net_device *dev = arg;
3403        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3404        struct sk_buff *skb, *tmp;
3405
3406        rps_lock(sd);
3407        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3408                if (skb->dev == dev) {
3409                        __skb_unlink(skb, &sd->input_pkt_queue);
3410                        kfree_skb(skb);
3411                        input_queue_head_incr(sd);
3412                }
3413        }
3414        rps_unlock(sd);
3415
3416        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3417                if (skb->dev == dev) {
3418                        __skb_unlink(skb, &sd->process_queue);
3419                        kfree_skb(skb);
3420                        input_queue_head_incr(sd);
3421                }
3422        }
3423}
3424
3425static int napi_gro_complete(struct sk_buff *skb)
3426{
3427        struct packet_type *ptype;
3428        __be16 type = skb->protocol;
3429        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3430        int err = -ENOENT;
3431
3432        if (NAPI_GRO_CB(skb)->count == 1) {
3433                skb_shinfo(skb)->gso_size = 0;
3434                goto out;
3435        }
3436
3437        rcu_read_lock();
3438        list_for_each_entry_rcu(ptype, head, list) {
3439                if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3440                        continue;
3441
3442                err = ptype->gro_complete(skb);
3443                break;
3444        }
3445        rcu_read_unlock();
3446
3447        if (err) {
3448                WARN_ON(&ptype->list == head);
3449                kfree_skb(skb);
3450                return NET_RX_SUCCESS;
3451        }
3452
3453out:
3454        return netif_receive_skb(skb);
3455}
3456
3457inline void napi_gro_flush(struct napi_struct *napi)
3458{
3459        struct sk_buff *skb, *next;
3460
3461        for (skb = napi->gro_list; skb; skb = next) {
3462                next = skb->next;
3463                skb->next = NULL;
3464                napi_gro_complete(skb);
3465        }
3466
3467        napi->gro_count = 0;
3468        napi->gro_list = NULL;
3469}
3470EXPORT_SYMBOL(napi_gro_flush);
3471
3472enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3473{
3474        struct sk_buff **pp = NULL;
3475        struct packet_type *ptype;
3476        __be16 type = skb->protocol;
3477        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3478        int same_flow;
3479        int mac_len;
3480        enum gro_result ret;
3481
3482        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3483                goto normal;
3484
3485        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3486                goto normal;
3487
3488        rcu_read_lock();
3489        list_for_each_entry_rcu(ptype, head, list) {
3490                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3491                        continue;
3492
3493                skb_set_network_header(skb, skb_gro_offset(skb));
3494                mac_len = skb->network_header - skb->mac_header;
3495                skb->mac_len = mac_len;
3496                NAPI_GRO_CB(skb)->same_flow = 0;
3497                NAPI_GRO_CB(skb)->flush = 0;
3498                NAPI_GRO_CB(skb)->free = 0;
3499
3500                pp = ptype->gro_receive(&napi->gro_list, skb);
3501                break;
3502        }
3503        rcu_read_unlock();
3504
3505        if (&ptype->list == head)
3506                goto normal;
3507
3508        same_flow = NAPI_GRO_CB(skb)->same_flow;
3509        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3510
3511        if (pp) {
3512                struct sk_buff *nskb = *pp;
3513
3514                *pp = nskb->next;
3515                nskb->next = NULL;
3516                napi_gro_complete(nskb);
3517                napi->gro_count--;
3518        }
3519
3520        if (same_flow)
3521                goto ok;
3522
3523        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3524                goto normal;
3525
3526        napi->gro_count++;
3527        NAPI_GRO_CB(skb)->count = 1;
3528        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3529        skb->next = napi->gro_list;
3530        napi->gro_list = skb;
3531        ret = GRO_HELD;
3532
3533pull:
3534        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3535                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3536
3537                BUG_ON(skb->end - skb->tail < grow);
3538
3539                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3540
3541                skb->tail += grow;
3542                skb->data_len -= grow;
3543
3544                skb_shinfo(skb)->frags[0].page_offset += grow;
3545                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3546
3547                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3548                        skb_frag_unref(skb, 0);
3549                        memmove(skb_shinfo(skb)->frags,
3550                                skb_shinfo(skb)->frags + 1,
3551                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3552                }
3553        }
3554
3555ok:
3556        return ret;
3557
3558normal:
3559        ret = GRO_NORMAL;
3560        goto pull;
3561}
3562EXPORT_SYMBOL(dev_gro_receive);
3563
3564static inline gro_result_t
3565__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3566{
3567        struct sk_buff *p;
3568
3569        for (p = napi->gro_list; p; p = p->next) {
3570                unsigned long diffs;
3571
3572                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3573                diffs |= p->vlan_tci ^ skb->vlan_tci;
3574                diffs |= compare_ether_header(skb_mac_header(p),
3575                                              skb_gro_mac_header(skb));
3576                NAPI_GRO_CB(p)->same_flow = !diffs;
3577                NAPI_GRO_CB(p)->flush = 0;
3578        }
3579
3580        return dev_gro_receive(napi, skb);
3581}
3582
3583gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3584{
3585        switch (ret) {
3586        case GRO_NORMAL:
3587                if (netif_receive_skb(skb))
3588                        ret = GRO_DROP;
3589                break;
3590
3591        case GRO_DROP:
3592        case GRO_MERGED_FREE:
3593                kfree_skb(skb);
3594                break;
3595
3596        case GRO_HELD:
3597        case GRO_MERGED:
3598                break;
3599        }
3600
3601        return ret;
3602}
3603EXPORT_SYMBOL(napi_skb_finish);
3604
3605void skb_gro_reset_offset(struct sk_buff *skb)
3606{
3607        NAPI_GRO_CB(skb)->data_offset = 0;
3608        NAPI_GRO_CB(skb)->frag0 = NULL;
3609        NAPI_GRO_CB(skb)->frag0_len = 0;
3610
3611        if (skb->mac_header == skb->tail &&
3612            !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3613                NAPI_GRO_CB(skb)->frag0 =
3614                        skb_frag_address(&skb_shinfo(skb)->frags[0]);
3615                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3616        }
3617}
3618EXPORT_SYMBOL(skb_gro_reset_offset);
3619
3620gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3621{
3622        skb_gro_reset_offset(skb);
3623
3624        return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3625}
3626EXPORT_SYMBOL(napi_gro_receive);
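
/*
 * Example (illustrative sketch only): a driver opts into GRO simply by
 * substituting napi_gro_receive() for netif_receive_skb() in its ->poll()
 * loop.  netdev and priv->napi stand for the driver's device and NAPI
 * context; most drivers ignore the returned gro_result_t.
 */
#if 0
        skb->protocol = eth_type_trans(skb, netdev);
        napi_gro_receive(&priv->napi, skb);     /* instead of netif_receive_skb() */
#endif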
3627
3628static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3629{
3630        __skb_pull(skb, skb_headlen(skb));
3631        skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3632        skb->vlan_tci = 0;
3633        skb->dev = napi->dev;
3634        skb->skb_iif = 0;
3635
3636        napi->skb = skb;
3637}
3638
3639struct sk_buff *napi_get_frags(struct napi_struct *napi)
3640{
3641        struct sk_buff *skb = napi->skb;
3642
3643        if (!skb) {
3644                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3645                if (skb)
3646                        napi->skb = skb;
3647        }
3648        return skb;
3649}
3650EXPORT_SYMBOL(napi_get_frags);
3651
3652gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3653                               gro_result_t ret)
3654{
3655        switch (ret) {
3656        case GRO_NORMAL:
3657        case GRO_HELD:
3658                skb->protocol = eth_type_trans(skb, skb->dev);
3659
3660                if (ret == GRO_HELD)
3661                        skb_gro_pull(skb, -ETH_HLEN);
3662                else if (netif_receive_skb(skb))
3663                        ret = GRO_DROP;
3664                break;
3665
3666        case GRO_DROP:
3667        case GRO_MERGED_FREE:
3668                napi_reuse_skb(napi, skb);
3669                break;
3670
3671        case GRO_MERGED:
3672                break;
3673        }
3674
3675        return ret;
3676}
3677EXPORT_SYMBOL(napi_frags_finish);
3678
3679struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3680{
3681        struct sk_buff *skb = napi->skb;
3682        struct ethhdr *eth;
3683        unsigned int hlen;
3684        unsigned int off;
3685
3686        napi->skb = NULL;
3687
3688        skb_reset_mac_header(skb);
3689        skb_gro_reset_offset(skb);
3690
3691        off = skb_gro_offset(skb);
3692        hlen = off + sizeof(*eth);
3693        eth = skb_gro_header_fast(skb, off);
3694        if (skb_gro_header_hard(skb, hlen)) {
3695                eth = skb_gro_header_slow(skb, hlen, off);
3696                if (unlikely(!eth)) {
3697                        napi_reuse_skb(napi, skb);
3698                        skb = NULL;
3699                        goto out;
3700                }
3701        }
3702
3703        skb_gro_pull(skb, sizeof(*eth));
3704
3705        /*
3706         * This works because the only protocols we care about don't require
3707         * special handling.  We'll fix it up properly at the end.
3708         */
3709        skb->protocol = eth->h_proto;
3710
3711out:
3712        return skb;
3713}
3714EXPORT_SYMBOL(napi_frags_skb);
3715
3716gro_result_t napi_gro_frags(struct napi_struct *napi)
3717{
3718        struct sk_buff *skb = napi_frags_skb(napi);
3719
3720        if (!skb)
3721                return GRO_DROP;
3722
3723        return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3724}
3725EXPORT_SYMBOL(napi_gro_frags);
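
/*
 * Example (illustrative sketch only): the frag-based entry points above are
 * meant for drivers that DMA directly into pages.  The page/offset/len
 * values would come from the device's RX descriptor; the truesize update
 * here is approximate and driver specific.
 */
#if 0
static void foo_rx_page(struct napi_struct *napi, struct page *page,
                        unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb) {
                put_page(page);                 /* no skb available: drop */
                return;
        }

        skb_fill_page_desc(skb, 0, page, offset, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += len;

        napi_gro_frags(napi);   /* pulls the Ethernet header itself */
}
#endif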
3726
3727/*
3728 * net_rps_action sends any pending IPIs for RPS.
3729 * Note: called with local irq disabled, but exits with local irq enabled.
3730 */
3731static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3732{
3733#ifdef CONFIG_RPS
3734        struct softnet_data *remsd = sd->rps_ipi_list;
3735
3736        if (remsd) {
3737                sd->rps_ipi_list = NULL;
3738
3739                local_irq_enable();
3740
3741                        /* Send pending IPIs to kick RPS processing on remote cpus. */
3742                while (remsd) {
3743                        struct softnet_data *next = remsd->rps_ipi_next;
3744
3745                        if (cpu_online(remsd->cpu))
3746                                __smp_call_function_single(remsd->cpu,
3747                                                           &remsd->csd, 0);
3748                        remsd = next;
3749                }
3750        } else
3751#endif
3752                local_irq_enable();
3753}
3754
3755static int process_backlog(struct napi_struct *napi, int quota)
3756{
3757        int work = 0;
3758        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3759
3760#ifdef CONFIG_RPS
3761        /* Check if we have pending IPIs; it is better to send them now
3762         * rather than waiting for net_rx_action() to end.
3763         */
3764        if (sd->rps_ipi_list) {
3765                local_irq_disable();
3766                net_rps_action_and_irq_enable(sd);
3767        }
3768#endif
3769        napi->weight = weight_p;
3770        local_irq_disable();
3771        while (work < quota) {
3772                struct sk_buff *skb;
3773                unsigned int qlen;
3774
3775                while ((skb = __skb_dequeue(&sd->process_queue))) {
3776                        local_irq_enable();
3777                        __netif_receive_skb(skb);
3778                        local_irq_disable();
3779                        input_queue_head_incr(sd);
3780                        if (++work >= quota) {
3781                                local_irq_enable();
3782                                return work;
3783                        }
3784                }
3785
3786                rps_lock(sd);
3787                qlen = skb_queue_len(&sd->input_pkt_queue);
3788                if (qlen)
3789                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
3790                                                   &sd->process_queue);
3791
3792                if (qlen < quota - work) {
3793                        /*
3794                         * Inline a custom version of __napi_complete().
3795                         * Only the current cpu owns and manipulates this napi,
3796                         * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3797                         * so we can use a plain write instead of clear_bit()
3798                         * and we don't need an smp_mb() memory barrier.
3799                         */
3800                        list_del(&napi->poll_list);
3801                        napi->state = 0;
3802
3803                        quota = work + qlen;
3804                }
3805                rps_unlock(sd);
3806        }
3807        local_irq_enable();
3808
3809        return work;
3810}
3811
3812/**
3813 * __napi_schedule - schedule for receive
3814 * @n: entry to schedule
3815 *
3816 * The entry's receive function will be scheduled to run
3817 */
3818void __napi_schedule(struct napi_struct *n)
3819{
3820        unsigned long flags;
3821
3822        local_irq_save(flags);
3823        ____napi_schedule(&__get_cpu_var(softnet_data), n);
3824        local_irq_restore(flags);
3825}
3826EXPORT_SYMBOL(__napi_schedule);
3827
3828void __napi_complete(struct napi_struct *n)
3829{
3830        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3831        BUG_ON(n->gro_list);
3832
3833        list_del(&n->poll_list);
3834        smp_mb__before_clear_bit();
3835        clear_bit(NAPI_STATE_SCHED, &n->state);
3836}
3837EXPORT_SYMBOL(__napi_complete);
3838
3839void napi_complete(struct napi_struct *n)
3840{
3841        unsigned long flags;
3842
3843        /*
3844         * Don't let napi dequeue from the cpu poll list
3845         * just in case it's running on a different cpu.
3846         */
3847        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3848                return;
3849
3850        napi_gro_flush(n);
3851        local_irq_save(flags);
3852        __napi_complete(n);
3853        local_irq_restore(flags);
3854}
3855EXPORT_SYMBOL(napi_complete);
3856
3857void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3858                    int (*poll)(struct napi_struct *, int), int weight)
3859{
3860        INIT_LIST_HEAD(&napi->poll_list);
3861        napi->gro_count = 0;
3862        napi->gro_list = NULL;
3863        napi->skb = NULL;
3864        napi->poll = poll;
3865        napi->weight = weight;
3866        list_add(&napi->dev_list, &dev->napi_list);
3867        napi->dev = dev;
3868#ifdef CONFIG_NETPOLL
3869        spin_lock_init(&napi->poll_lock);
3870        napi->poll_owner = -1;
3871#endif
3872        set_bit(NAPI_STATE_SCHED, &napi->state);
3873}
3874EXPORT_SYMBOL(netif_napi_add);
3875
3876void netif_napi_del(struct napi_struct *napi)
3877{
3878        struct sk_buff *skb, *next;
3879
3880        list_del_init(&napi->dev_list);
3881        napi_free_frags(napi);
3882
3883        for (skb = napi->gro_list; skb; skb = next) {
3884                next = skb->next;
3885                skb->next = NULL;
3886                kfree_skb(skb);
3887        }
3888
3889        napi->gro_list = NULL;
3890        napi->gro_count = 0;
3891}
3892EXPORT_SYMBOL(netif_napi_del);
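
/*
 * Example (illustrative sketch only): typical driver lifecycle around the
 * two helpers above.  priv, netdev and foo_poll() are hypothetical; a
 * weight of 64 is merely the conventional choice, not a requirement.
 */
#if 0
        /* in probe: */
        netif_napi_add(netdev, &priv->napi, foo_poll, 64);
        /* in ndo_open: */
        napi_enable(&priv->napi);
        /* in ndo_stop: */
        napi_disable(&priv->napi);
        /* in remove: */
        netif_napi_del(&priv->napi);
#endif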
3893
3894static void net_rx_action(struct softirq_action *h)
3895{
3896        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3897        unsigned long time_limit = jiffies + 2;
3898        int budget = netdev_budget;
3899        void *have;
3900
3901        local_irq_disable();
3902
3903        while (!list_empty(&sd->poll_list)) {
3904                struct napi_struct *n;
3905                int work, weight;
3906
3907                /* If the softirq window is exhausted then punt.
3908                 * Allow this to run for 2 jiffies, which gives
3909                 * an average latency of 1.5/HZ.
3910                 */
3911                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3912                        goto softnet_break;
3913
3914                local_irq_enable();
3915
3916                /* Even though interrupts have been re-enabled, this
3917                 * access is safe because interrupts can only add new
3918                 * entries to the tail of this list, and only ->poll()
3919                 * calls can remove this head entry from the list.
3920                 */
3921                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3922
3923                have = netpoll_poll_lock(n);
3924
3925                weight = n->weight;
3926
3927                /* This NAPI_STATE_SCHED test is for avoiding a race
3928                 * with netpoll's poll_napi().  Only the entity which
3929                 * obtains the lock and sees NAPI_STATE_SCHED set will
3930                 * actually make the ->poll() call.  Therefore we avoid
3931                 * accidentally calling ->poll() when NAPI is not scheduled.
3932                 */
3933                work = 0;
3934                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3935                        work = n->poll(n, weight);
3936                        trace_napi_poll(n);
3937                }
3938
3939                WARN_ON_ONCE(work > weight);
3940
3941                budget -= work;
3942
3943                local_irq_disable();
3944
3945                /* Drivers must not modify the NAPI state if they
3946                 * consume the entire weight.  In such cases this code
3947                 * still "owns" the NAPI instance and therefore can
3948                 * move the instance around on the list at-will.
3949                 */
3950                if (unlikely(work == weight)) {
3951                        if (unlikely(napi_disable_pending(n))) {
3952                                local_irq_enable();
3953                                napi_complete(n);
3954                                local_irq_disable();
3955                        } else
3956                                list_move_tail(&n->poll_list, &sd->poll_list);
3957                }
3958
3959                netpoll_poll_unlock(have);
3960        }
3961out:
3962        net_rps_action_and_irq_enable(sd);
3963
3964#ifdef CONFIG_NET_DMA
3965        /*
3966         * There may not be any more sk_buffs coming right now, so push
3967         * any pending DMA copies to hardware
3968         */
3969        dma_issue_pending_all();
3970#endif
3971
3972        return;
3973
3974softnet_break:
3975        sd->time_squeeze++;
3976        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3977        goto out;
3978}
3979
3980static gifconf_func_t *gifconf_list[NPROTO];
3981
3982/**
3983 *      register_gifconf        -       register a SIOCGIF handler
3984 *      @family: Address family
3985 *      @gifconf: Function handler
3986 *
3987 *      Register protocol dependent address dumping routines. The handler
3988 *      that is passed must not be freed or reused until it has been replaced
3989 *      by another handler.
3990 */
3991int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3992{
3993        if (family >= NPROTO)
3994                return -EINVAL;
3995        gifconf_list[family] = gifconf;
3996        return 0;
3997}
3998EXPORT_SYMBOL(register_gifconf);
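
/*
 * Example (illustrative sketch only): an address family registers its
 * handler once at init time.  The handler signature is the one used by
 * dev_ifconf() below: when the buffer pointer is NULL it must only report
 * how much space it would need.  AF_FOO and foo_gifconf() are hypothetical.
 */
#if 0
static int foo_gifconf(struct net_device *dev, char __user *buf, int len)
{
        /* copy one struct ifreq per FOO address on @dev into @buf, or
         * return the space required when @buf is NULL */
        return 0;
}

static int __init foo_af_init(void)
{
        return register_gifconf(AF_FOO, foo_gifconf);
}
#endif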
3999
4000
4001/*
4002 *      Map an interface index to its name (SIOCGIFNAME)
4003 */
4004
4005/*
4006 *      We need this ioctl for efficient implementation of the
4007 *      if_indextoname() function required by the IPv6 API.  Without
4008 *      it, we would have to search all the interfaces to find a
4009 *      match.  --pb
4010 */
4011
4012static int dev_ifname(struct net *net, struct ifreq __user *arg)
4013{
4014        struct net_device *dev;
4015        struct ifreq ifr;
4016
4017        /*
4018         *      Fetch the caller's info block.
4019         */
4020
4021        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4022                return -EFAULT;
4023
4024        rcu_read_lock();
4025        dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4026        if (!dev) {
4027                rcu_read_unlock();
4028                return -ENODEV;
4029        }
4030
4031        strcpy(ifr.ifr_name, dev->name);
4032        rcu_read_unlock();
4033
4034        if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4035                return -EFAULT;
4036        return 0;
4037}
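
/*
 * Example (illustrative sketch only, user space): what the SIOCGIFNAME
 * handler above serves; this is essentially how if_indextoname() can be
 * implemented.  Error handling is omitted for brevity.
 */
#if 0
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_ifindex = 2;                    /* index to resolve */
        if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
                printf("ifindex 2 is %s\n", ifr.ifr_name);
        close(fd);
#endif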
4038
4039/*
4040 *      Perform a SIOCGIFCONF call. This structure will change
4041 *      size eventually, and there is nothing I can do about it.
4042 *      Thus we will need a 'compatibility mode'.
4043 */
4044
4045static int dev_ifconf(struct net *net, char __user *arg)
4046{
4047        struct ifconf ifc;
4048        struct net_device *dev;
4049        char __user *pos;
4050        int len;
4051        int total;
4052        int i;
4053
4054        /*
4055         *      Fetch the caller's info block.
4056         */
4057
4058        if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4059                return -EFAULT;
4060
4061        pos = ifc.ifc_buf;
4062        len = ifc.ifc_len;
4063
4064        /*
4065         *      Loop over the interfaces, and write an info block for each.
4066         */
4067
4068        total = 0;
4069        for_each_netdev(net, dev) {
4070                for (i = 0; i < NPROTO; i++) {
4071                        if (gifconf_list[i]) {
4072                                int done;
4073                                if (!pos)
4074                                        done = gifconf_list[i](dev, NULL, 0);
4075                                else
4076                                        done = gifconf_list[i](dev, pos + total,
4077                                                               len - total);
4078                                if (done < 0)
4079                                        return -EFAULT;
4080                                total += done;
4081                        }
4082                }
4083        }
4084
4085        /*
4086         *      All done.  Write the updated control block back to the caller.
4087         */
4088        ifc.ifc_len = total;
4089
4090        /*
4091         *      Both BSD and Solaris return 0 here, so we do too.
4092         */
4093        return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4094}
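
/*
 * Example (illustrative sketch only, user space): the usual two-pass
 * SIOCGIFCONF idiom.  The first call passes ifc_buf == NULL so the kernel
 * (see the NULL case handled above) only reports the required length; the
 * second call fills a buffer of that size.
 */
#if 0
        struct ifconf ifc = { .ifc_buf = NULL };
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        ioctl(fd, SIOCGIFCONF, &ifc);           /* sizing pass */
        ifc.ifc_buf = malloc(ifc.ifc_len);
        ioctl(fd, SIOCGIFCONF, &ifc);           /* fill pass */
        /* ifc.ifc_len / sizeof(struct ifreq) entries are now valid */
        free(ifc.ifc_buf);
        close(fd);
#endif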
4095
4096#ifdef CONFIG_PROC_FS
4097
4098#define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4099
4100struct dev_iter_state {
4101        struct seq_net_private p;
4102        unsigned int pos; /* (bucket << BUCKET_SPACE) + offset */
4103};
4104
4105#define get_bucket(x) ((x) >> BUCKET_SPACE)
4106#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4107#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4108
4109static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4110{
4111        struct dev_iter_state *state = seq->private;
4112        struct net *net = seq_file_net(seq);
4113        struct net_device *dev;
4114        struct hlist_node *p;
4115        struct hlist_head *h;
4116        unsigned int count, bucket, offset;
4117
4118        bucket = get_bucket(state->pos);
4119        offset = get_offset(state->pos);
4120        h = &net->dev_name_head[bucket];
4121        count = 0;
4122        hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4123                if (count++ == offset) {
4124                        state->pos = set_bucket_offset(bucket, count);
4125                        return dev;
4126                }
4127        }
4128
4129        return NULL;
4130}
4131
4132static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4133{
4134        struct dev_iter_state *state = seq->private;
4135        struct net_device *dev;
4136        unsigned int bucket;
4137
4138        bucket = get_bucket(state->pos);
4139        do {
4140                dev = dev_from_same_bucket(seq);
4141                if (dev)
4142                        return dev;
4143
4144                bucket++;
4145                state->pos = set_bucket_offset(bucket, 0);
4146        } while (bucket < NETDEV_HASHENTRIES);
4147
4148        return NULL;
4149}
4150
4151/*
4152 *      This is invoked by the /proc filesystem handler to display a device
4153 *      in detail.
4154 */
4155void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4156        __acquires(RCU)
4157{
4158        struct dev_iter_state *state = seq->private;
4159
4160        rcu_read_lock();
4161        if (!*pos)
4162                return SEQ_START_TOKEN;
4163
4164        /* check for end of the hash */
4165        if (state->pos == 0 && *pos > 1)
4166                return NULL;
4167
4168        return dev_from_new_bucket(seq);
4169}
4170
4171void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4172{
4173        struct net_device *dev;
4174
4175        ++*pos;
4176
4177        if (v == SEQ_START_TOKEN)
4178                return dev_from_new_bucket(seq);
4179
4180        dev = dev_from_same_bucket(seq);
4181        if (dev)
4182                return dev;
4183
4184        return dev_from_new_bucket(seq);
4185}
4186
4187void dev_seq_stop(struct seq_file *seq, void *v)
4188        __releases(RCU)
4189{
4190        rcu_read_unlock();
4191}
4192
4193static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4194{
4195        struct rtnl_link_stats64 temp;
4196        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4197
4198        seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4199                   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4200                   dev->name, stats->rx_bytes, stats->rx_packets,
4201                   stats->rx_errors,
4202                   stats->rx_dropped + stats->rx_missed_errors,
4203                   stats->rx_fifo_errors,
4204                   stats->rx_length_errors + stats->rx_over_errors +
4205                    stats->rx_crc_errors + stats->rx_frame_errors,
4206                   stats->rx_compressed, stats->multicast,
4207                   stats->tx_bytes, stats->tx_packets,
4208                   stats->tx_errors, stats->tx_dropped,
4209                   stats->tx_fifo_errors, stats->collisions,
4210                   stats->tx_carrier_errors +
4211                    stats->tx_aborted_errors +
4212                    stats->tx_window_errors +
4213                    stats->tx_heartbeat_errors,
4214                   stats->tx_compressed);
4215}
4216
4217/*
4218 *      Called from the PROCfs module. This now uses the new arbitrary sized
4219 *      /proc/net interface to create /proc/net/dev
4220 */
4221static int dev_seq_show(struct seq_file *seq, void *v)
4222{
4223        if (v == SEQ_START_TOKEN)
4224                seq_puts(seq, "Inter-|   Receive                            "
4225                              "                    |  Transmit\n"
4226                              " face |bytes    packets errs drop fifo frame "
4227                              "compressed multicast|bytes    packets errs "
4228                              "drop fifo colls carrier compressed\n");
4229        else
4230                dev_seq_printf_stats(seq, v);
4231        return 0;
4232}
4233
4234static struct softnet_data *softnet_get_online(loff_t *pos)
4235{
4236        struct softnet_data *sd = NULL;
4237
4238        while (*pos < nr_cpu_ids)
4239                if (cpu_online(*pos)) {
4240                        sd = &per_cpu(softnet_data, *pos);
4241                        break;
4242                } else
4243                        ++*pos;
4244        return sd;
4245}
4246
4247static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4248{
4249        return softnet_get_online(pos);
4250}
4251
4252static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4253{
4254        ++*pos;
4255        return softnet_get_online(pos);
4256}
4257
4258static void softnet_seq_stop(struct seq_file *seq, void *v)
4259{
4260}
4261
4262static int softnet_seq_show(struct seq_file *seq, void *v)
4263{
4264        struct softnet_data *sd = v;
4265
4266        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4267                   sd->processed, sd->dropped, sd->time_squeeze, 0,
4268                   0, 0, 0, 0, /* was fastroute */
4269                   sd->cpu_collision, sd->received_rps);
4270        return 0;
4271}
4272
4273static const struct seq_operations dev_seq_ops = {
4274        .start = dev_seq_start,
4275        .next  = dev_seq_next,
4276        .stop  = dev_seq_stop,
4277        .show  = dev_seq_show,
4278};
4279
4280static int dev_seq_open(struct inode *inode, struct file *file)
4281{
4282        return seq_open_net(inode, file, &dev_seq_ops,
4283                            sizeof(struct dev_iter_state));
4284}
4285
4286int dev_seq_open_ops(struct inode *inode, struct file *file,
4287                     const struct seq_operations *ops)
4288{
4289        return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4290}
4291
4292static const struct file_operations dev_seq_fops = {
4293        .owner   = THIS_MODULE,
4294        .open    = dev_seq_open,
4295        .read    = seq_read,
4296        .llseek  = seq_lseek,
4297        .release = seq_release_net,
4298};
4299
4300static const struct seq_operations softnet_seq_ops = {
4301        .start = softnet_seq_start,
4302        .next  = softnet_seq_next,
4303        .stop  = softnet_seq_stop,
4304        .show  = softnet_seq_show,
4305};
4306
4307static int softnet_seq_open(struct inode *inode, struct file *file)
4308{
4309        return seq_open(file, &softnet_seq_ops);
4310}
4311
4312static const struct file_operations softnet_seq_fops = {
4313        .owner   = THIS_MODULE,
4314        .open    = softnet_seq_open,
4315        .read    = seq_read,
4316        .llseek  = seq_lseek,
4317        .release = seq_release,
4318};
4319
4320static void *ptype_get_idx(loff_t pos)
4321{
4322        struct packet_type *pt = NULL;
4323        loff_t i = 0;
4324        int t;
4325
4326        list_for_each_entry_rcu(pt, &ptype_all, list) {
4327                if (i == pos)
4328                        return pt;
4329                ++i;
4330        }
4331
4332        for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4333                list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4334                        if (i == pos)
4335                                return pt;
4336                        ++i;
4337                }
4338        }
4339        return NULL;
4340}
4341
4342static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4343        __acquires(RCU)
4344{
4345        rcu_read_lock();
4346        return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4347}
4348
4349static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4350{
4351        struct packet_type *pt;
4352        struct list_head *nxt;
4353        int hash;
4354
4355        ++*pos;
4356        if (v == SEQ_START_TOKEN)
4357                return ptype_get_idx(0);
4358
4359        pt = v;
4360        nxt = pt->list.next;
4361        if (pt->type == htons(ETH_P_ALL)) {
4362                if (nxt != &ptype_all)
4363                        goto found;
4364                hash = 0;
4365                nxt = ptype_base[0].next;
4366        } else
4367                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4368
4369        while (nxt == &ptype_base[hash]) {
4370                if (++hash >= PTYPE_HASH_SIZE)
4371                        return NULL;
4372                nxt = ptype_base[hash].next;
4373        }
4374found:
4375        return list_entry(nxt, struct packet_type, list);
4376}
4377
4378static void ptype_seq_stop(struct seq_file *seq, void *v)
4379        __releases(RCU)
4380{
4381        rcu_read_unlock();
4382}
4383
4384static int ptype_seq_show(struct seq_file *seq, void *v)
4385{
4386        struct packet_type *pt = v;
4387
4388        if (v == SEQ_START_TOKEN)
4389                seq_puts(seq, "Type Device      Function\n");
4390        else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4391                if (pt->type == htons(ETH_P_ALL))
4392                        seq_puts(seq, "ALL ");
4393                else
4394                        seq_printf(seq, "%04x", ntohs(pt->type));
4395
4396                seq_printf(seq, " %-8s %pF\n",
4397                           pt->dev ? pt->dev->name : "", pt->func);
4398        }
4399
4400        return 0;
4401}
4402
4403static const struct seq_operations ptype_seq_ops = {
4404        .start = ptype_seq_start,
4405        .next  = ptype_seq_next,
4406        .stop  = ptype_seq_stop,
4407        .show  = ptype_seq_show,
4408};
4409
4410static int ptype_seq_open(struct inode *inode, struct file *file)
4411{
4412        return seq_open_net(inode, file, &ptype_seq_ops,
4413                        sizeof(struct seq_net_private));
4414}
4415
4416static const struct file_operations ptype_seq_fops = {
4417        .owner   = THIS_MODULE,
4418        .open    = ptype_seq_open,
4419        .read    = seq_read,
4420        .llseek  = seq_lseek,
4421        .release = seq_release_net,
4422};
4423
4424
4425static int __net_init dev_proc_net_init(struct net *net)
4426{
4427        int rc = -ENOMEM;
4428
4429        if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4430                goto out;
4431        if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4432                goto out_dev;
4433        if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4434                goto out_softnet;
4435
4436        if (wext_proc_init(net))
4437                goto out_ptype;
4438        rc = 0;
4439out:
4440        return rc;
4441out_ptype:
4442        proc_net_remove(net, "ptype");
4443out_softnet:
4444        proc_net_remove(net, "softnet_stat");
4445out_dev:
4446        proc_net_remove(net, "dev");
4447        goto out;
4448}
4449
4450static void __net_exit dev_proc_net_exit(struct net *net)
4451{
4452        wext_proc_exit(net);
4453
4454        proc_net_remove(net, "ptype");
4455        proc_net_remove(net, "softnet_stat");
4456        proc_net_remove(net, "dev");
4457}
4458
4459static struct pernet_operations __net_initdata dev_proc_ops = {
4460        .init = dev_proc_net_init,
4461        .exit = dev_proc_net_exit,
4462};
4463
4464static int __init dev_proc_init(void)
4465{
4466        return register_pernet_subsys(&dev_proc_ops);
4467}
4468#else
4469#define dev_proc_init() 0
4470#endif  /* CONFIG_PROC_FS */
4471
4472
4473/**
4474 *      netdev_set_master       -       set up master pointer
4475 *      @slave: slave device
4476 *      @master: new master device
4477 *
4478 *      Changes the master device of the slave. Pass %NULL to break the
4479 *      bonding. The caller must hold the RTNL semaphore. On a failure
4480 *      a negative errno code is returned. On success the reference counts
4481 *      are adjusted and the function returns zero.
4482 */
4483int netdev_set_master(struct net_device *slave, struct net_device *master)
4484{
4485        struct net_device *old = slave->master;
4486
4487        ASSERT_RTNL();
4488
4489        if (master) {
4490                if (old)
4491                        return -EBUSY;
4492                dev_hold(master);
4493        }
4494
4495        slave->master = master;
4496
4497        if (old)
4498                dev_put(old);
4499        return 0;
4500}
4501EXPORT_SYMBOL(netdev_set_master);
4502
4503/**
4504 *      netdev_set_bond_master  -       set up bonding master/slave pair
4505 *      @slave: slave device
4506 *      @master: new master device
4507 *
4508 *      Changes the master device of the slave. Pass %NULL to break the
4509 *      bonding. The caller must hold the RTNL semaphore. On a failure
4510 *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4511 *      to the routing socket and the function returns zero.
4512 */
4513int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4514{
4515        int err;
4516
4517        ASSERT_RTNL();
4518
4519        err = netdev_set_master(slave, master);
4520        if (err)
4521                return err;
4522        if (master)
4523                slave->flags |= IFF_SLAVE;
4524        else
4525                slave->flags &= ~IFF_SLAVE;
4526
4527        rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4528        return 0;
4529}
4530EXPORT_SYMBOL(netdev_set_bond_master);
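
/*
 * Example (illustrative sketch only): how a bonding-style driver would
 * enslave and later release a device with the helper above.  bond->dev and
 * slave_dev are hypothetical; both calls must run under RTNL.
 */
#if 0
        err = netdev_set_bond_master(slave_dev, bond->dev);     /* enslave */
        if (err)
                return err;
        /* ... later, when releasing the slave: */
        netdev_set_bond_master(slave_dev, NULL);
#endif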
4531
4532static void dev_change_rx_flags(struct net_device *dev, int flags)
4533{
4534        const struct net_device_ops *ops = dev->netdev_ops;
4535
4536        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4537                ops->ndo_change_rx_flags(dev, flags);
4538}
4539
4540static int __dev_set_promiscuity(struct net_device *dev, int inc)
4541{
4542        unsigned short old_flags = dev->flags;
4543        uid_t uid;
4544        gid_t gid;
4545
4546        ASSERT_RTNL();
4547
4548        dev->flags |= IFF_PROMISC;
4549        dev->promiscuity += inc;
4550        if (dev->promiscuity == 0) {
4551                /*
4552                 * Avoid overflow.
4553                 * If inc causes overflow, untouch promisc and return error.
4554                 */
4555                if (inc < 0)
4556                        dev->flags &= ~IFF_PROMISC;
4557                else {
4558                        dev->promiscuity -= inc;
4559                        printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4560                                "setting promiscuity failed; promiscuous mode "
4561                                "on this device may be broken.\n", dev->name);
4562                        return -EOVERFLOW;
4563                }
4564        }
4565        if (dev->flags != old_flags) {
4566                printk(KERN_INFO "device %s %s promiscuous mode\n",
4567                       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4568                                                               "left");
4569                if (audit_enabled) {
4570                        current_uid_gid(&uid, &gid);
4571                        audit_log(current->audit_context, GFP_ATOMIC,
4572                                AUDIT_ANOM_PROMISCUOUS,
4573                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4574                                dev->name, (dev->flags & IFF_PROMISC),
4575                                (old_flags & IFF_PROMISC),
4576                                audit_get_loginuid(current),
4577                                uid, gid,
4578                                audit_get_sessionid(current));
4579                }
4580
4581                dev_change_rx_flags(dev, IFF_PROMISC);
4582        }
4583        return 0;
4584}
4585
4586/**
4587 *      dev_set_promiscuity     - update promiscuity count on a device
4588 *      @dev: device
4589 *      @inc: modifier
4590 *
4591 *      Add or remove promiscuity from a device. While the count in the device
4592 *      remains above zero the interface remains promiscuous. Once it hits zero
4593 *      the device reverts back to normal filtering operation. A negative inc
4594 *      value is used to drop promiscuity on the device.
4595 *      Return 0 if successful or a negative errno code on error.
4596 */
4597int dev_set_promiscuity(struct net_device *dev, int inc)
4598{
4599        unsigned short old_flags = dev->flags;
4600        int err;
4601
4602        err = __dev_set_promiscuity(dev, inc);
4603        if (err < 0)
4604                return err;
4605        if (dev->flags != old_flags)
4606                dev_set_rx_mode(dev);
4607        return err;
4608}
4609EXPORT_SYMBOL(dev_set_promiscuity);
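
/*
 * Example (illustrative sketch only): a packet-capture style user of the
 * counter API above.  The increment and the matching decrement must pair
 * up, and both run under RTNL (__dev_set_promiscuity asserts it).
 */
#if 0
        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* start capturing */
        rtnl_unlock();

        /* ... when capture stops: */
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
#endif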
4610
4611/**
4612 *      dev_set_allmulti        - update allmulti count on a device
4613 *      @dev: device
4614 *      @inc: modifier
4615 *
4616 *      Add or remove reception of all multicast frames on a device. While the
4617 *      count in the device remains above zero the interface remains listening
4618 *      to all multicast frames. Once it hits zero the device reverts back to normal
4619 *      filtering operation. A negative @inc value is used to drop the counter
4620 *      when releasing a resource needing all multicasts.
4621 *      Return 0 if successful or a negative errno code on error.
4622 */
4623
4624int dev_set_allmulti(struct net_device *dev, int inc)
4625{
4626        unsigned short old_flags = dev->flags;
4627
4628        ASSERT_RTNL();
4629
4630        dev->flags |= IFF_ALLMULTI;
4631        dev->allmulti += inc;
4632        if (dev->allmulti == 0) {
4633                /*
4634                 * Avoid overflow.
4635                 * If inc causes overflow, untouch allmulti and return error.
4636                 */
4637                if (inc < 0)
4638                        dev->flags &= ~IFF_ALLMULTI;
4639                else {
4640                        dev->allmulti -= inc;
4641                        printk(KERN_WARNING "%s: allmulti counter overflowed, "
4642                                "setting allmulti failed; the allmulti feature "
4643                                "of this device may be broken.\n", dev->name);
4644                        return -EOVERFLOW;
4645                }
4646        }
4647        if (dev->flags ^ old_flags) {
4648                dev_change_rx_flags(dev, IFF_ALLMULTI);
4649                dev_set_rx_mode(dev);
4650        }
4651        return 0;
4652}
4653EXPORT_SYMBOL(dev_set_allmulti);
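
/*
 * Example (illustrative sketch only): a virtual device that carries
 * multicast (a tunnel, say) pins allmulti on its lower device for as long
 * as it needs it, then drops the reference with a negative increment.
 */
#if 0
        rtnl_lock();
        err = dev_set_allmulti(lower_dev, 1);   /* acquire */
        rtnl_unlock();

        /* ... on teardown: */
        rtnl_lock();
        dev_set_allmulti(lower_dev, -1);        /* release */
        rtnl_unlock();
#endif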
4654
4655/*
4656 *      Upload unicast and multicast address lists to device and
4657 *      configure RX filtering. When the device doesn't support unicast
4658 *      filtering it is put in promiscuous mode while unicast addresses
4659 *      are present.
4660 */
4661void __dev_set_rx_mode(struct net_device *dev)
4662{
4663        const struct net_device_ops *ops = dev->netdev_ops;
4664
4665        /* dev_open will call this function so the list will stay sane. */
4666        if (!(dev->flags&IFF_UP))
4667                return;
4668
4669        if (!netif_device_present(dev))
4670                return;
4671
4672        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4673                /* Unicast address changes may only happen under the rtnl,
4674                 * therefore calling __dev_set_promiscuity here is safe.
4675                 */
4676                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4677                        __dev_set_promiscuity(dev, 1);
4678                        dev->uc_promisc = true;
4679                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4680                        __dev_set_promiscuity(dev, -1);
4681                        dev->uc_promisc = false;
4682                }
4683        }
4684
4685        if (ops->ndo_set_rx_mode)
4686                ops->ndo_set_rx_mode(dev);
4687}
4688
4689void dev_set_rx_mode(struct net_device *dev)
4690{
4691        netif_addr_lock_bh(dev);
4692        __dev_set_rx_mode(dev);
4693        netif_addr_unlock_bh(dev);
4694}
4695
4696/**
4697 *      dev_get_flags - get flags reported to userspace
4698 *      @dev: device
4699 *
4700 *      Get the combination of flag bits exported through APIs to userspace.
4701 */
4702unsigned dev_get_flags(const struct net_device *dev)
4703{
4704        unsigned flags;
4705
4706        flags = (dev->flags & ~(IFF_PROMISC |
4707                                IFF_ALLMULTI |
4708                                IFF_RUNNING |
4709                                IFF_LOWER_UP |
4710                                IFF_DORMANT)) |
4711                (dev->gflags & (IFF_PROMISC |
4712                                IFF_ALLMULTI));
4713
4714        if (netif_running(dev)) {
4715                if (netif_oper_up(dev))
4716                        flags |= IFF_RUNNING;
4717                if (netif_carrier_ok(dev))
4718                        flags |= IFF_LOWER_UP;
4719                if (netif_dormant(dev))
4720                        flags |= IFF_DORMANT;
4721        }
4722
4723        return flags;
4724}
4725EXPORT_SYMBOL(dev_get_flags);
4726
4727int __dev_change_flags(struct net_device *dev, unsigned int flags)
4728{
4729        int old_flags = dev->flags;
4730        int ret;
4731
4732        ASSERT_RTNL();
4733
4734        /*
4735         *      Set the flags on our device.
4736         */
4737
4738        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4739                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4740                               IFF_AUTOMEDIA)) |
4741                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4742                                    IFF_ALLMULTI));
4743
4744        /*
4745         *      Load in the correct multicast list now the flags have changed.
4746         */
4747
4748        if ((old_flags ^ flags) & IFF_MULTICAST)
4749                dev_change_rx_flags(dev, IFF_MULTICAST);
4750
4751        dev_set_rx_mode(dev);
4752
4753        /*
4754         *      Have we downed the interface? We handle IFF_UP ourselves
4755         *      according to user attempts to set it, rather than blindly
4756         *      setting it.
4757         */
4758
4759        ret = 0;
4760        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4761                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4762
4763                if (!ret)
4764                        dev_set_rx_mode(dev);
4765        }
4766
4767        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4768                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4769
4770                dev->gflags ^= IFF_PROMISC;
4771                dev_set_promiscuity(dev, inc);
4772        }
4773
4774        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4775           is important. Some (broken) drivers set IFF_PROMISC when
4776           IFF_ALLMULTI is requested, without asking us and without reporting.
4777         */
4778        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4779                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4780
4781                dev->gflags ^= IFF_ALLMULTI;
4782                dev_set_allmulti(dev, inc);
4783        }
4784
4785        return ret;
4786}
4787
4788void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4789{
4790        unsigned int changes = dev->flags ^ old_flags;
4791
4792        if (changes & IFF_UP) {
4793                if (dev->flags & IFF_UP)
4794                        call_netdevice_notifiers(NETDEV_UP, dev);
4795                else
4796                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4797        }
4798
4799        if (dev->flags & IFF_UP &&
4800            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4801                call_netdevice_notifiers(NETDEV_CHANGE, dev);
4802}
4803
4804/**
4805 *      dev_change_flags - change device settings
4806 *      @dev: device
4807 *      @flags: device state flags
4808 *
4809 *      Change settings on a device based on the state flags. The flags are
4810 *      in the userspace exported format.
4811 */
4812int dev_change_flags(struct net_device *dev, unsigned flags)
4813{
4814        int ret, changes;
4815        int old_flags = dev->flags;
4816
4817        ret = __dev_change_flags(dev, flags);
4818        if (ret < 0)
4819                return ret;
4820
4821        changes = old_flags ^ dev->flags;
4822        if (changes)
4823                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4824
4825        __dev_notify_flags(dev, old_flags);
4826        return ret;
4827}
4828EXPORT_SYMBOL(dev_change_flags);
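
/*
 * Illustrative sketch (hypothetical, compiled out): dev_change_flags() takes
 * flags in the same format SIOCSIFFLAGS uses, so an in-kernel caller enabling
 * promiscuous mode with the RTNL lock held could look like the example_
 * helper below (an assumption, not existing code).
 */
#if 0
static int example_enable_promisc(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev_change_flags(dev, dev_get_flags(dev) | IFF_PROMISC);
}
#endif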
4829
4830/**
4831 *      dev_set_mtu - Change maximum transfer unit
4832 *      @dev: device
4833 *      @new_mtu: new transfer unit
4834 *
4835 *      Change the maximum transfer size of the network device.
4836 */
4837int dev_set_mtu(struct net_device *dev, int new_mtu)
4838{
4839        const struct net_device_ops *ops = dev->netdev_ops;
4840        int err;
4841
4842        if (new_mtu == dev->mtu)
4843                return 0;
4844
4845        /*      MTU must be positive.    */
4846        if (new_mtu < 0)
4847                return -EINVAL;
4848
4849        if (!netif_device_present(dev))
4850                return -ENODEV;
4851
4852        err = 0;
4853        if (ops->ndo_change_mtu)
4854                err = ops->ndo_change_mtu(dev, new_mtu);
4855        else
4856                dev->mtu = new_mtu;
4857
4858        if (!err && dev->flags & IFF_UP)
4859                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4860        return err;
4861}
4862EXPORT_SYMBOL(dev_set_mtu);
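
/*
 * Illustrative sketch (hypothetical, compiled out): dev_set_mtu() is normally
 * invoked with the RTNL lock held, as the SIOCSIFMTU handler below does.  The
 * example_ helper and the MTU value are assumptions.
 */
#if 0
static int example_shrink_mtu(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 1400);
        rtnl_unlock();
        return err;
}
#endif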
4863
4864/**
4865 *      dev_set_group - Change group this device belongs to
4866 *      @dev: device
4867 *      @new_group: group this device should belong to
4868 */
4869void dev_set_group(struct net_device *dev, int new_group)
4870{
4871        dev->group = new_group;
4872}
4873EXPORT_SYMBOL(dev_set_group);
4874
4875/**
4876 *      dev_set_mac_address - Change Media Access Control Address
4877 *      @dev: device
4878 *      @sa: new address
4879 *
4880 *      Change the hardware (MAC) address of the device
4881 */
4882int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4883{
4884        const struct net_device_ops *ops = dev->netdev_ops;
4885        int err;
4886
4887        if (!ops->ndo_set_mac_address)
4888                return -EOPNOTSUPP;
4889        if (sa->sa_family != dev->type)
4890                return -EINVAL;
4891        if (!netif_device_present(dev))
4892                return -ENODEV;
4893        err = ops->ndo_set_mac_address(dev, sa);
4894        if (!err)
4895                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4896        return err;
4897}
4898EXPORT_SYMBOL(dev_set_mac_address);
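
/*
 * Illustrative sketch (hypothetical, compiled out): dev_set_mac_address()
 * expects a struct sockaddr whose sa_family matches dev->type.  For an
 * Ethernet device a caller holding the RTNL lock could fill it in as shown;
 * the example_ helper is an assumption.
 */
#if 0
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;

        sa.sa_family = dev->type;               /* ARPHRD_ETHER for Ethernet */
        memcpy(sa.sa_data, mac, dev->addr_len);
        return dev_set_mac_address(dev, &sa);
}
#endif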
4899
4900/*
4901 *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4902 */
4903static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4904{
4905        int err;
4906        struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4907
4908        if (!dev)
4909                return -ENODEV;
4910
4911        switch (cmd) {
4912        case SIOCGIFFLAGS:      /* Get interface flags */
4913                ifr->ifr_flags = (short) dev_get_flags(dev);
4914                return 0;
4915
4916        case SIOCGIFMETRIC:     /* Get the metric on the interface
4917                                   (currently unused) */
4918                ifr->ifr_metric = 0;
4919                return 0;
4920
4921        case SIOCGIFMTU:        /* Get the MTU of a device */
4922                ifr->ifr_mtu = dev->mtu;
4923                return 0;
4924
4925        case SIOCGIFHWADDR:
4926                if (!dev->addr_len)
4927                        memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4928                else
4929                        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4930                               min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4931                ifr->ifr_hwaddr.sa_family = dev->type;
4932                return 0;
4933
4934        case SIOCGIFSLAVE:
4935                err = -EINVAL;
4936                break;
4937
4938        case SIOCGIFMAP:
4939                ifr->ifr_map.mem_start = dev->mem_start;
4940                ifr->ifr_map.mem_end   = dev->mem_end;
4941                ifr->ifr_map.base_addr = dev->base_addr;
4942                ifr->ifr_map.irq       = dev->irq;
4943                ifr->ifr_map.dma       = dev->dma;
4944                ifr->ifr_map.port      = dev->if_port;
4945                return 0;
4946
4947        case SIOCGIFINDEX:
4948                ifr->ifr_ifindex = dev->ifindex;
4949                return 0;
4950
4951        case SIOCGIFTXQLEN:
4952                ifr->ifr_qlen = dev->tx_queue_len;
4953                return 0;
4954
4955        default:
4956                /* dev_ioctl() should ensure this case
4957                 * is never reached
4958                 */
4959                WARN_ON(1);
4960                err = -ENOTTY;
4961                break;
4962
4963        }
4964        return err;
4965}
4966
4967/*
4968 *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4969 */
4970static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4971{
4972        int err;
4973        struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4974        const struct net_device_ops *ops;
4975
4976        if (!dev)
4977                return -ENODEV;
4978
4979        ops = dev->netdev_ops;
4980
4981        switch (cmd) {
4982        case SIOCSIFFLAGS:      /* Set interface flags */
4983                return dev_change_flags(dev, ifr->ifr_flags);
4984
4985        case SIOCSIFMETRIC:     /* Set the metric on the interface
4986                                   (currently unused) */
4987                return -EOPNOTSUPP;
4988
4989        case SIOCSIFMTU:        /* Set the MTU of a device */
4990                return dev_set_mtu(dev, ifr->ifr_mtu);
4991
4992        case SIOCSIFHWADDR:
4993                return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4994
4995        case SIOCSIFHWBROADCAST:
4996                if (ifr->ifr_hwaddr.sa_family != dev->type)
4997                        return -EINVAL;
4998                memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4999                       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5000                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5001                return 0;
5002
5003        case SIOCSIFMAP:
5004                if (ops->ndo_set_config) {
5005                        if (!netif_device_present(dev))
5006                                return -ENODEV;
5007                        return ops->ndo_set_config(dev, &ifr->ifr_map);
5008                }
5009                return -EOPNOTSUPP;
5010
5011        case SIOCADDMULTI:
5012                if (!ops->ndo_set_rx_mode ||
5013                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5014                        return -EINVAL;
5015                if (!netif_device_present(dev))
5016                        return -ENODEV;
5017                return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5018
5019        case SIOCDELMULTI:
5020                if (!ops->ndo_set_rx_mode ||
5021                    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5022                        return -EINVAL;
5023                if (!netif_device_present(dev))
5024                        return -ENODEV;
5025                return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5026
5027        case SIOCSIFTXQLEN:
5028                if (ifr->ifr_qlen < 0)
5029                        return -EINVAL;
5030                dev->tx_queue_len = ifr->ifr_qlen;
5031                return 0;
5032
5033        case SIOCSIFNAME:
5034                ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5035                return dev_change_name(dev, ifr->ifr_newname);
5036
5037        case SIOCSHWTSTAMP:
5038                err = net_hwtstamp_validate(ifr);
5039                if (err)
5040                        return err;
5041                /* fall through */
5042
5043        /*
5044         *      Unknown or private ioctl
5045         */
5046        default:
5047                if ((cmd >= SIOCDEVPRIVATE &&
5048                    cmd <= SIOCDEVPRIVATE + 15) ||
5049                    cmd == SIOCBONDENSLAVE ||
5050                    cmd == SIOCBONDRELEASE ||
5051                    cmd == SIOCBONDSETHWADDR ||
5052                    cmd == SIOCBONDSLAVEINFOQUERY ||
5053                    cmd == SIOCBONDINFOQUERY ||
5054                    cmd == SIOCBONDCHANGEACTIVE ||
5055                    cmd == SIOCGMIIPHY ||
5056                    cmd == SIOCGMIIREG ||
5057                    cmd == SIOCSMIIREG ||
5058                    cmd == SIOCBRADDIF ||
5059                    cmd == SIOCBRDELIF ||
5060                    cmd == SIOCSHWTSTAMP ||
5061                    cmd == SIOCWANDEV) {
5062                        err = -EOPNOTSUPP;
5063                        if (ops->ndo_do_ioctl) {
5064                                if (netif_device_present(dev))
5065                                        err = ops->ndo_do_ioctl(dev, ifr, cmd);
5066                                else
5067                                        err = -ENODEV;
5068                        }
5069                } else
5070                        err = -EINVAL;
5071
5072        }
5073        return err;
5074}
5075
5076/*
5077 *      This function handles all "interface"-type I/O control requests. The actual
5078 *      'doing' part of this is dev_ifsioc above.
5079 */
5080
5081/**
5082 *      dev_ioctl       -       network device ioctl
5083 *      @net: the applicable net namespace
5084 *      @cmd: command to issue
5085 *      @arg: pointer to a struct ifreq in user space
5086 *
5087 *      Issue ioctl functions to devices. This is normally called by the
5088 *      user space syscall interfaces but can sometimes be useful for
5089 *      other purposes. The return value is the return from the syscall if
5090 *      positive or a negative errno code on error.
5091 */
5092
5093int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5094{
5095        struct ifreq ifr;
5096        int ret;
5097        char *colon;
5098
5099        /* One special case: SIOCGIFCONF takes ifconf argument
5100           and requires shared lock, because it sleeps writing
5101           to user space.
5102         */
5103
5104        if (cmd == SIOCGIFCONF) {
5105                rtnl_lock();
5106                ret = dev_ifconf(net, (char __user *) arg);
5107                rtnl_unlock();
5108                return ret;
5109        }
5110        if (cmd == SIOCGIFNAME)
5111                return dev_ifname(net, (struct ifreq __user *)arg);
5112
5113        if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5114                return -EFAULT;
5115
5116        ifr.ifr_name[IFNAMSIZ-1] = 0;
5117
5118        colon = strchr(ifr.ifr_name, ':');
5119        if (colon)
5120                *colon = 0;
5121
5122        /*
5123         *      See which interface the caller is talking about.
5124         */
5125
5126        switch (cmd) {
5127        /*
5128         *      These ioctl calls:
5129         *      - can be done by all.
5130         *      - atomic and do not require locking.
5131         *      - return a value
5132         */
5133        case SIOCGIFFLAGS:
5134        case SIOCGIFMETRIC:
5135        case SIOCGIFMTU:
5136        case SIOCGIFHWADDR:
5137        case SIOCGIFSLAVE:
5138        case SIOCGIFMAP:
5139        case SIOCGIFINDEX:
5140        case SIOCGIFTXQLEN:
5141                dev_load(net, ifr.ifr_name);
5142                rcu_read_lock();
5143                ret = dev_ifsioc_locked(net, &ifr, cmd);
5144                rcu_read_unlock();
5145                if (!ret) {
5146                        if (colon)
5147                                *colon = ':';
5148                        if (copy_to_user(arg, &ifr,
5149                                         sizeof(struct ifreq)))
5150                                ret = -EFAULT;
5151                }
5152                return ret;
5153
5154        case SIOCETHTOOL:
5155                dev_load(net, ifr.ifr_name);
5156                rtnl_lock();
5157                ret = dev_ethtool(net, &ifr);
5158                rtnl_unlock();
5159                if (!ret) {
5160                        if (colon)
5161                                *colon = ':';
5162                        if (copy_to_user(arg, &ifr,
5163                                         sizeof(struct ifreq)))
5164                                ret = -EFAULT;
5165                }
5166                return ret;
5167
5168        /*
5169         *      These ioctl calls:
5170         *      - require superuser power.
5171         *      - require strict serialization.
5172         *      - return a value
5173         */
5174        case SIOCGMIIPHY:
5175        case SIOCGMIIREG:
5176        case SIOCSIFNAME:
5177                if (!capable(CAP_NET_ADMIN))
5178                        return -EPERM;
5179                dev_load(net, ifr.ifr_name);
5180                rtnl_lock();
5181                ret = dev_ifsioc(net, &ifr, cmd);
5182                rtnl_unlock();
5183                if (!ret) {
5184                        if (colon)
5185                                *colon = ':';
5186                        if (copy_to_user(arg, &ifr,
5187                                         sizeof(struct ifreq)))
5188                                ret = -EFAULT;
5189                }
5190                return ret;
5191
5192        /*
5193         *      These ioctl calls:
5194         *      - require superuser power.
5195         *      - require strict serialization.
5196         *      - do not return a value
5197         */
5198        case SIOCSIFFLAGS:
5199        case SIOCSIFMETRIC:
5200        case SIOCSIFMTU:
5201        case SIOCSIFMAP:
5202        case SIOCSIFHWADDR:
5203        case SIOCSIFSLAVE:
5204        case SIOCADDMULTI:
5205        case SIOCDELMULTI:
5206        case SIOCSIFHWBROADCAST:
5207        case SIOCSIFTXQLEN:
5208        case SIOCSMIIREG:
5209        case SIOCBONDENSLAVE:
5210        case SIOCBONDRELEASE:
5211        case SIOCBONDSETHWADDR:
5212        case SIOCBONDCHANGEACTIVE:
5213        case SIOCBRADDIF:
5214        case SIOCBRDELIF:
5215        case SIOCSHWTSTAMP:
5216                if (!capable(CAP_NET_ADMIN))
5217                        return -EPERM;
5218                /* fall through */
5219        case SIOCBONDSLAVEINFOQUERY:
5220        case SIOCBONDINFOQUERY:
5221                dev_load(net, ifr.ifr_name);
5222                rtnl_lock();
5223                ret = dev_ifsioc(net, &ifr, cmd);
5224                rtnl_unlock();
5225                return ret;
5226
5227        case SIOCGIFMEM:
5228                /* Get the per device memory space. We can add this but
5229                 * currently do not support it */
5230        case SIOCSIFMEM:
5231                /* Set the per device memory buffer space.
5232                 * Not applicable in our case */
5233        case SIOCSIFLINK:
5234                return -ENOTTY;
5235
5236        /*
5237         *      Unknown or private ioctl.
5238         */
5239        default:
5240                if (cmd == SIOCWANDEV ||
5241                    (cmd >= SIOCDEVPRIVATE &&
5242                     cmd <= SIOCDEVPRIVATE + 15)) {
5243                        dev_load(net, ifr.ifr_name);
5244                        rtnl_lock();
5245                        ret = dev_ifsioc(net, &ifr, cmd);
5246                        rtnl_unlock();
5247                        if (!ret && copy_to_user(arg, &ifr,
5248                                                 sizeof(struct ifreq)))
5249                                ret = -EFAULT;
5250                        return ret;
5251                }
5252                /* Take care of Wireless Extensions */
5253                if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5254                        return wext_handle_ioctl(net, &ifr, cmd, arg);
5255                return -ENOTTY;
5256        }
5257}
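
/*
 * Illustrative sketch (hypothetical, compiled out): the requests handled by
 * dev_ioctl() above arrive from user space roughly as in this user-space
 * fragment (headers and error handling omitted; "eth0" is an assumption).
 */
#if 0
static int example_query_mtu(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
        ioctl(fd, SIOCGIFMTU, &ifr);    /* ends up in dev_ifsioc_locked() */
        close(fd);
        return ifr.ifr_mtu;
}
#endif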
5258
5259
5260/**
5261 *      dev_new_index   -       allocate an ifindex
5262 *      @net: the applicable net namespace
5263 *
5264 *      Returns a suitable unique value for a new device interface
5265 *      number.  The caller must hold the rtnl semaphore or the
5266 *      dev_base_lock to be sure it remains unique.
5267 */
5268static int dev_new_index(struct net *net)
5269{
5270        static int ifindex;
5271        for (;;) {
5272                if (++ifindex <= 0)
5273                        ifindex = 1;
5274                if (!__dev_get_by_index(net, ifindex))
5275                        return ifindex;
5276        }
5277}
5278
5279/* Delayed registration/unregisteration */
5280static LIST_HEAD(net_todo_list);
5281
5282static void net_set_todo(struct net_device *dev)
5283{
5284        list_add_tail(&dev->todo_list, &net_todo_list);
5285}
5286
5287static void rollback_registered_many(struct list_head *head)
5288{
5289        struct net_device *dev, *tmp;
5290
5291        BUG_ON(dev_boot_phase);
5292        ASSERT_RTNL();
5293
5294        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5295                /* Some devices call without registering
5296                 * for initialization unwind. Remove those
5297                 * devices and proceed with the remaining.
5298                 */
5299                if (dev->reg_state == NETREG_UNINITIALIZED) {
5300                        pr_debug("unregister_netdevice: device %s/%p never "
5301                                 "was registered\n", dev->name, dev);
5302
5303                        WARN_ON(1);
5304                        list_del(&dev->unreg_list);
5305                        continue;
5306                }
5307                dev->dismantle = true;
5308                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5309        }
5310
5311        /* If device is running, close it first. */
5312        dev_close_many(head);
5313
5314        list_for_each_entry(dev, head, unreg_list) {
5315                /* And unlink it from device chain. */
5316                unlist_netdevice(dev);
5317
5318                dev->reg_state = NETREG_UNREGISTERING;
5319        }
5320
5321        synchronize_net();
5322
5323        list_for_each_entry(dev, head, unreg_list) {
5324                /* Shutdown queueing discipline. */
5325                dev_shutdown(dev);
5326
5327
5328                /* Notify protocols that we are about to destroy
5329                   this device. They should clean all the things.
5330                */
5331                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5332
5333                if (!dev->rtnl_link_ops ||
5334                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5335                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5336
5337                /*
5338                 *      Flush the unicast and multicast chains
5339                 */
5340                dev_uc_flush(dev);
5341                dev_mc_flush(dev);
5342
5343                if (dev->netdev_ops->ndo_uninit)
5344                        dev->netdev_ops->ndo_uninit(dev);
5345
5346                /* Notifier chain MUST detach us from master device. */
5347                WARN_ON(dev->master);
5348
5349                /* Remove entries from kobject tree */
5350                netdev_unregister_kobject(dev);
5351        }
5352
5353        /* Process any work delayed until the end of the batch */
5354        dev = list_first_entry(head, struct net_device, unreg_list);
5355        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5356
5357        synchronize_net();
5358
5359        list_for_each_entry(dev, head, unreg_list)
5360                dev_put(dev);
5361}
5362
5363static void rollback_registered(struct net_device *dev)
5364{
5365        LIST_HEAD(single);
5366
5367        list_add(&dev->unreg_list, &single);
5368        rollback_registered_many(&single);
5369        list_del(&single);
5370}
5371
5372static u32 netdev_fix_features(struct net_device *dev, u32 features)
5373{
5374        /* Fix illegal checksum combinations */
5375        if ((features & NETIF_F_HW_CSUM) &&
5376            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5377                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5378                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5379        }
5380
5381        if ((features & NETIF_F_NO_CSUM) &&
5382            (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5383                netdev_warn(dev, "mixed no checksumming and other settings.\n");
5384                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5385        }
5386
5387        /* Fix illegal SG+CSUM combinations. */
5388        if ((features & NETIF_F_SG) &&
5389            !(features & NETIF_F_ALL_CSUM)) {
5390                netdev_dbg(dev,
5391                        "Dropping NETIF_F_SG since no checksum feature.\n");
5392                features &= ~NETIF_F_SG;
5393        }
5394
5395        /* TSO requires that SG is present as well. */
5396        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5397                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5398                features &= ~NETIF_F_ALL_TSO;
5399        }
5400
5401        /* TSO ECN requires that TSO is present as well. */
5402        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5403                features &= ~NETIF_F_TSO_ECN;
5404
5405        /* Software GSO depends on SG. */
5406        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5407                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5408                features &= ~NETIF_F_GSO;
5409        }
5410
5411        /* UFO needs SG and checksumming */
5412        if (features & NETIF_F_UFO) {
5413                /* maybe split UFO into V4 and V6? */
5414                if (!((features & NETIF_F_GEN_CSUM) ||
5415                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5416                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5417                        netdev_dbg(dev,
5418                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5419                        features &= ~NETIF_F_UFO;
5420                }
5421
5422                if (!(features & NETIF_F_SG)) {
5423                        netdev_dbg(dev,
5424                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5425                        features &= ~NETIF_F_UFO;
5426                }
5427        }
5428
5429        return features;
5430}
5431
5432int __netdev_update_features(struct net_device *dev)
5433{
5434        u32 features;
5435        int err = 0;
5436
5437        ASSERT_RTNL();
5438
5439        features = netdev_get_wanted_features(dev);
5440
5441        if (dev->netdev_ops->ndo_fix_features)
5442                features = dev->netdev_ops->ndo_fix_features(dev, features);
5443
5444        /* driver might be less strict about feature dependencies */
5445        features = netdev_fix_features(dev, features);
5446
5447        if (dev->features == features)
5448                return 0;
5449
5450        netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5451                dev->features, features);
5452
5453        if (dev->netdev_ops->ndo_set_features)
5454                err = dev->netdev_ops->ndo_set_features(dev, features);
5455
5456        if (unlikely(err < 0)) {
5457                netdev_err(dev,
5458                        "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5459                        err, features, dev->features);
5460                return -1;
5461        }
5462
5463        if (!err)
5464                dev->features = features;
5465
5466        return 1;
5467}
5468
5469/**
5470 *      netdev_update_features - recalculate device features
5471 *      @dev: the device to check
5472 *
5473 *      Recalculate dev->features set and send notifications if it
5474 *      has changed. Should be called after driver or hardware dependent
5475 *      conditions might have changed that influence the features.
5476 */
5477void netdev_update_features(struct net_device *dev)
5478{
5479        if (__netdev_update_features(dev))
5480                netdev_features_change(dev);
5481}
5482EXPORT_SYMBOL(netdev_update_features);
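
/*
 * Illustrative sketch (hypothetical, compiled out): drivers hook into the
 * recalculation done by __netdev_update_features() through ndo_fix_features().
 * A driver whose hardware needs receive checksumming for LRO might express
 * that dependency as below; the example_ name is an assumption.
 */
#if 0
static u32 example_ndo_fix_features(struct net_device *dev, u32 features)
{
        /* LRO relies on the hardware verifying receive checksums */
        if (!(features & NETIF_F_RXCSUM))
                features &= ~NETIF_F_LRO;
        return features;
}
#endif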
5483
5484/**
5485 *      netdev_change_features - recalculate device features
5486 *      @dev: the device to check
5487 *
5488 *      Recalculate dev->features set and send notifications even
5489 *      if they have not changed. Should be called instead of
5490 *      netdev_update_features() if also dev->vlan_features might
5491 *      have changed to allow the changes to be propagated to stacked
5492 *      VLAN devices.
5493 */
5494void netdev_change_features(struct net_device *dev)
5495{
5496        __netdev_update_features(dev);
5497        netdev_features_change(dev);
5498}
5499EXPORT_SYMBOL(netdev_change_features);
5500
5501/**
5502 *      netif_stacked_transfer_operstate -      transfer operstate
5503 *      @rootdev: the root or lower level device to transfer state from
5504 *      @dev: the device to transfer operstate to
5505 *
5506 *      Transfer operational state from root to device. This is normally
5507 *      called when a stacking relationship exists between the root
5508 *      device and the device (a leaf device).
5509 */
5510void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5511                                        struct net_device *dev)
5512{
5513        if (rootdev->operstate == IF_OPER_DORMANT)
5514                netif_dormant_on(dev);
5515        else
5516                netif_dormant_off(dev);
5517
5518        if (netif_carrier_ok(rootdev)) {
5519                if (!netif_carrier_ok(dev))
5520                        netif_carrier_on(dev);
5521        } else {
5522                if (netif_carrier_ok(dev))
5523                        netif_carrier_off(dev);
5524        }
5525}
5526EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5527
5528#ifdef CONFIG_RPS
5529static int netif_alloc_rx_queues(struct net_device *dev)
5530{
5531        unsigned int i, count = dev->num_rx_queues;
5532        struct netdev_rx_queue *rx;
5533
5534        BUG_ON(count < 1);
5535
5536        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5537        if (!rx) {
5538                pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5539                return -ENOMEM;
5540        }
5541        dev->_rx = rx;
5542
5543        for (i = 0; i < count; i++)
5544                rx[i].dev = dev;
5545        return 0;
5546}
5547#endif
5548
5549static void netdev_init_one_queue(struct net_device *dev,
5550                                  struct netdev_queue *queue, void *_unused)
5551{
5552        /* Initialize queue lock */
5553        spin_lock_init(&queue->_xmit_lock);
5554        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5555        queue->xmit_lock_owner = -1;
5556        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5557        queue->dev = dev;
5558}
5559
5560static int netif_alloc_netdev_queues(struct net_device *dev)
5561{
5562        unsigned int count = dev->num_tx_queues;
5563        struct netdev_queue *tx;
5564
5565        BUG_ON(count < 1);
5566
5567        tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5568        if (!tx) {
5569                pr_err("netdev: Unable to allocate %u tx queues.\n",
5570                       count);
5571                return -ENOMEM;
5572        }
5573        dev->_tx = tx;
5574
5575        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5576        spin_lock_init(&dev->tx_global_lock);
5577
5578        return 0;
5579}
5580
5581/**
5582 *      register_netdevice      - register a network device
5583 *      @dev: device to register
5584 *
5585 *      Take a completed network device structure and add it to the kernel
5586 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5587 *      chain. 0 is returned on success. A negative errno code is returned
5588 *      on a failure to set up the device, or if the name is a duplicate.
5589 *
5590 *      Callers must hold the rtnl semaphore. You may want
5591 *      register_netdev() instead of this.
5592 *
5593 *      BUGS:
5594 *      The locking appears insufficient to guarantee two parallel registers
5595 *      will not get the same name.
5596 */
5597
5598int register_netdevice(struct net_device *dev)
5599{
5600        int ret;
5601        struct net *net = dev_net(dev);
5602
5603        BUG_ON(dev_boot_phase);
5604        ASSERT_RTNL();
5605
5606        might_sleep();
5607
5608        /* When net_device's are persistent, this will be fatal. */
5609        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5610        BUG_ON(!net);
5611
5612        spin_lock_init(&dev->addr_list_lock);
5613        netdev_set_addr_lockdep_class(dev);
5614
5615        dev->iflink = -1;
5616
5617        ret = dev_get_valid_name(dev, dev->name);
5618        if (ret < 0)
5619                goto out;
5620
5621        /* Init, if this function is available */
5622        if (dev->netdev_ops->ndo_init) {
5623                ret = dev->netdev_ops->ndo_init(dev);
5624                if (ret) {
5625                        if (ret > 0)
5626                                ret = -EIO;
5627                        goto out;
5628                }
5629        }
5630
5631        dev->ifindex = dev_new_index(net);
5632        if (dev->iflink == -1)
5633                dev->iflink = dev->ifindex;
5634
5635        /* Transfer changeable features to wanted_features and enable
5636         * software offloads (GSO and GRO).
5637         */
5638        dev->hw_features |= NETIF_F_SOFT_FEATURES;
5639        dev->features |= NETIF_F_SOFT_FEATURES;
5640        dev->wanted_features = dev->features & dev->hw_features;
5641
5642        /* Turn on no cache copy if HW is doing checksum */
5643        dev->hw_features |= NETIF_F_NOCACHE_COPY;
5644        if ((dev->features & NETIF_F_ALL_CSUM) &&
5645            !(dev->features & NETIF_F_NO_CSUM)) {
5646                dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5647                dev->features |= NETIF_F_NOCACHE_COPY;
5648        }
5649
5650        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5651         */
5652        dev->vlan_features |= NETIF_F_HIGHDMA;
5653
5654        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5655        ret = notifier_to_errno(ret);
5656        if (ret)
5657                goto err_uninit;
5658
5659        ret = netdev_register_kobject(dev);
5660        if (ret)
5661                goto err_uninit;
5662        dev->reg_state = NETREG_REGISTERED;
5663
5664        __netdev_update_features(dev);
5665
5666        /*
5667         *      Default initial state at registry is that the
5668         *      device is present.
5669         */
5670
5671        set_bit(__LINK_STATE_PRESENT, &dev->state);
5672
5673        dev_init_scheduler(dev);
5674        dev_hold(dev);
5675        list_netdevice(dev);
5676
5677        /* Notify protocols that a new device appeared. */
5678        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5679        ret = notifier_to_errno(ret);
5680        if (ret) {
5681                rollback_registered(dev);
5682                dev->reg_state = NETREG_UNREGISTERED;
5683        }
5684        /*
5685         *      Prevent userspace races by waiting until the network
5686         *      device is fully setup before sending notifications.
5687         */
5688        if (!dev->rtnl_link_ops ||
5689            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5690                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5691
5692out:
5693        return ret;
5694
5695err_uninit:
5696        if (dev->netdev_ops->ndo_uninit)
5697                dev->netdev_ops->ndo_uninit(dev);
5698        goto out;
5699}
5700EXPORT_SYMBOL(register_netdevice);
5701
5702/**
5703 *      init_dummy_netdev       - init a dummy network device for NAPI
5704 *      @dev: device to init
5705 *
5706 *      This takes a network device structure and initializes the minimum
5707 *      number of fields so it can be used to schedule NAPI polls without
5708 *      registering a full blown interface. This is to be used by drivers
5709 *      that need to tie several hardware interfaces to a single NAPI
5710 *      poll scheduler due to HW limitations.
5711 */
5712int init_dummy_netdev(struct net_device *dev)
5713{
5714        /* Clear everything. Note we don't initialize spinlocks
5715         * as they aren't supposed to be taken by any of the
5716         * NAPI code and this dummy netdev is supposed to be
5717         * only ever used for NAPI polls
5718         */
5719        memset(dev, 0, sizeof(struct net_device));
5720
5721        /* make sure we BUG if trying to hit standard
5722         * register/unregister code path
5723         */
5724        dev->reg_state = NETREG_DUMMY;
5725
5726        /* NAPI wants this */
5727        INIT_LIST_HEAD(&dev->napi_list);
5728
5729        /* a dummy interface is started by default */
5730        set_bit(__LINK_STATE_PRESENT, &dev->state);
5731        set_bit(__LINK_STATE_START, &dev->state);
5732
5733        /* Note: We don't allocate pcpu_refcnt for dummy devices,
5734         * because users of this 'device' don't need to change
5735         * its refcount.
5736         */
5737
5738        return 0;
5739}
5740EXPORT_SYMBOL_GPL(init_dummy_netdev);
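
/*
 * Illustrative sketch (hypothetical, compiled out): a driver with several
 * hardware ports but a single interrupt can hang its NAPI context off a dummy
 * netdev initialized with init_dummy_netdev().  The example_adapter structure
 * and example_poll routine are assumptions, not existing code.
 */
#if 0
struct example_adapter {
        struct net_device dummy_dev;    /* never registered */
        struct napi_struct napi;
};

static void example_adapter_setup_napi(struct example_adapter *ad)
{
        init_dummy_netdev(&ad->dummy_dev);
        netif_napi_add(&ad->dummy_dev, &ad->napi, example_poll, 64);
}
#endif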
5741
5742
5743/**
5744 *      register_netdev - register a network device
5745 *      @dev: device to register
5746 *
5747 *      Take a completed network device structure and add it to the kernel
5748 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5749 *      chain. 0 is returned on success. A negative errno code is returned
5750 *      on a failure to set up the device, or if the name is a duplicate.
5751 *
5752 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5753 *      and expands the device name if you passed a format string to
5754 *      alloc_netdev.
5755 */
5756int register_netdev(struct net_device *dev)
5757{
5758        int err;
5759
5760        rtnl_lock();
5761        err = register_netdevice(dev);
5762        rtnl_unlock();
5763        return err;
5764}
5765EXPORT_SYMBOL(register_netdev);
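
/*
 * Illustrative sketch (hypothetical, compiled out): the usual driver probe
 * sequence built on the helpers above: allocate, fill in the ops, register,
 * and call free_netdev() on failure.  Names prefixed example_ are assumptions.
 */
#if 0
static int example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        dev->netdev_ops = &example_netdev_ops;

        err = register_netdev(dev);     /* takes and releases the RTNL lock */
        if (err) {
                free_netdev(dev);
                return err;
        }
        return 0;
}
#endif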
5766
5767int netdev_refcnt_read(const struct net_device *dev)
5768{
5769        int i, refcnt = 0;
5770
5771        for_each_possible_cpu(i)
5772                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5773        return refcnt;
5774}
5775EXPORT_SYMBOL(netdev_refcnt_read);
5776
5777/*
5778 * netdev_wait_allrefs - wait until all references are gone.
5779 *
5780 * This is called when unregistering network devices.
5781 *
5782 * Any protocol or device that holds a reference should register
5783 * for netdevice notification, and cleanup and put back the
5784 * reference if they receive an UNREGISTER event.
5785 * We can get stuck here if buggy protocols don't correctly
5786 * call dev_put.
5787 */
5788static void netdev_wait_allrefs(struct net_device *dev)
5789{
5790        unsigned long rebroadcast_time, warning_time;
5791        int refcnt;
5792
5793        linkwatch_forget_dev(dev);
5794
5795        rebroadcast_time = warning_time = jiffies;
5796        refcnt = netdev_refcnt_read(dev);
5797
5798        while (refcnt != 0) {
5799                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5800                        rtnl_lock();
5801
5802                        /* Rebroadcast unregister notification */
5803                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5804                        /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5805                         * should have already handled it the first time */
5806
5807                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5808                                     &dev->state)) {
5809                                /* We must not have linkwatch events
5810                                 * pending on unregister. If this
5811                                 * happens, we simply run the queue
5812                                 * unscheduled, resulting in a noop
5813                                 * for this device.
5814                                 */
5815                                linkwatch_run_queue();
5816                        }
5817
5818                        __rtnl_unlock();
5819
5820                        rebroadcast_time = jiffies;
5821                }
5822
5823                msleep(250);
5824
5825                refcnt = netdev_refcnt_read(dev);
5826
5827                if (time_after(jiffies, warning_time + 10 * HZ)) {
5828                        printk(KERN_EMERG "unregister_netdevice: "
5829                               "waiting for %s to become free. Usage "
5830                               "count = %d\n",
5831                               dev->name, refcnt);
5832                        warning_time = jiffies;
5833                }
5834        }
5835}
5836
5837/* The sequence is:
5838 *
5839 *      rtnl_lock();
5840 *      ...
5841 *      register_netdevice(x1);
5842 *      register_netdevice(x2);
5843 *      ...
5844 *      unregister_netdevice(y1);
5845 *      unregister_netdevice(y2);
5846 *      ...
5847 *      rtnl_unlock();
5848 *      free_netdev(y1);
5849 *      free_netdev(y2);
5850 *
5851 * We are invoked by rtnl_unlock().
5852 * This allows us to deal with problems:
5853 * 1) We can delete sysfs objects which invoke hotplug
5854 *    without deadlocking with linkwatch via keventd.
5855 * 2) Since we run with the RTNL semaphore not held, we can sleep
5856 *    safely in order to wait for the netdev refcnt to drop to zero.
5857 *
5858 * We must not return until all unregister events added during
5859 * the interval the lock was held have been completed.
5860 */
5861void netdev_run_todo(void)
5862{
5863        struct list_head list;
5864
5865        /* Snapshot list, allow later requests */
5866        list_replace_init(&net_todo_list, &list);
5867
5868        __rtnl_unlock();
5869
5870        /* Wait for rcu callbacks to finish before attempting to drain
5871         * the device list.  This usually avoids a 250ms wait.
5872         */
5873        if (!list_empty(&list))
5874                rcu_barrier();
5875
5876        while (!list_empty(&list)) {
5877                struct net_device *dev
5878                        = list_first_entry(&list, struct net_device, todo_list);
5879                list_del(&dev->todo_list);
5880
5881                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5882                        printk(KERN_ERR "network todo '%s' but state %d\n",
5883                               dev->name, dev->reg_state);
5884                        dump_stack();
5885                        continue;
5886                }
5887
5888                dev->reg_state = NETREG_UNREGISTERED;
5889
5890                on_each_cpu(flush_backlog, dev, 1);
5891
5892                netdev_wait_allrefs(dev);
5893
5894                /* paranoia */
5895                BUG_ON(netdev_refcnt_read(dev));
5896                WARN_ON(rcu_access_pointer(dev->ip_ptr));
5897                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5898                WARN_ON(dev->dn_ptr);
5899
5900                if (dev->destructor)
5901                        dev->destructor(dev);
5902
5903                /* Free network device */
5904                kobject_put(&dev->dev.kobj);
5905        }
5906}
5907
5908/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5909 * fields in the same order, with only the type differing.
5910 */
5911static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5912                                    const struct net_device_stats *netdev_stats)
5913{
5914#if BITS_PER_LONG == 64
5915        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5916        memcpy(stats64, netdev_stats, sizeof(*stats64));
5917#else
5918        size_t i, n = sizeof(*stats64) / sizeof(u64);
5919        const unsigned long *src = (const unsigned long *)netdev_stats;
5920        u64 *dst = (u64 *)stats64;
5921
5922        BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5923                     sizeof(*stats64) / sizeof(u64));
5924        for (i = 0; i < n; i++)
5925                dst[i] = src[i];
5926#endif
5927}
5928
5929/**
5930 *      dev_get_stats   - get network device statistics
5931 *      @dev: device to get statistics from
5932 *      @storage: place to store stats
5933 *
5934 *      Get network statistics from device. Return @storage.
5935 *      The device driver may provide its own method by setting
5936 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5937 *      otherwise the internal statistics structure is used.
5938 */
5939struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5940                                        struct rtnl_link_stats64 *storage)
5941{
5942        const struct net_device_ops *ops = dev->netdev_ops;
5943
5944        if (ops->ndo_get_stats64) {
5945                memset(storage, 0, sizeof(*storage));
5946                ops->ndo_get_stats64(dev, storage);
5947        } else if (ops->ndo_get_stats) {
5948                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5949        } else {
5950                netdev_stats_to_stats64(storage, &dev->stats);
5951        }
5952        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5953        return storage;
5954}
5955EXPORT_SYMBOL(dev_get_stats);
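
/*
 * Illustrative sketch (hypothetical, compiled out): a driver that keeps
 * 64-bit counters supplies ndo_get_stats64(), which dev_get_stats() prefers
 * over ndo_get_stats and dev->stats.  The example_priv fields are
 * assumptions, not existing code.
 */
#if 0
static struct rtnl_link_stats64 *
example_ndo_get_stats64(struct net_device *dev,
                        struct rtnl_link_stats64 *stats)
{
        struct example_priv *priv = netdev_priv(dev);

        stats->rx_packets = priv->rx_packets;
        stats->rx_bytes   = priv->rx_bytes;
        stats->tx_packets = priv->tx_packets;
        stats->tx_bytes   = priv->tx_bytes;
        return stats;
}
#endif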
5956
5957struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5958{
5959        struct netdev_queue *queue = dev_ingress_queue(dev);
5960
5961#ifdef CONFIG_NET_CLS_ACT
5962        if (queue)
5963                return queue;
5964        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5965        if (!queue)
5966                return NULL;
5967        netdev_init_one_queue(dev, queue, NULL);
5968        queue->qdisc = &noop_qdisc;
5969        queue->qdisc_sleeping = &noop_qdisc;
5970        rcu_assign_pointer(dev->ingress_queue, queue);
5971#endif
5972        return queue;
5973}
5974
5975/**
5976 *      alloc_netdev_mqs - allocate network device
5977 *      @sizeof_priv:   size of private data to allocate space for
5978 *      @name:          device name format string
5979 *      @setup:         callback to initialize device
5980 *      @txqs:          the number of TX subqueues to allocate
5981 *      @rxqs:          the number of RX subqueues to allocate
5982 *
5983 *      Allocates a struct net_device with private data area for driver use
5984 *      and performs basic initialization.  Also allocates subqueue structs
5985 *      for each queue on the device.
5986 */
5987struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5988                void (*setup)(struct net_device *),
5989                unsigned int txqs, unsigned int rxqs)
5990{
5991        struct net_device *dev;
5992        size_t alloc_size;
5993        struct net_device *p;
5994
5995        BUG_ON(strlen(name) >= sizeof(dev->name));
5996
5997        if (txqs < 1) {
5998                pr_err("alloc_netdev: Unable to allocate device "
5999                       "with zero queues.\n");
6000                return NULL;
6001        }
6002
6003#ifdef CONFIG_RPS
6004        if (rxqs < 1) {
6005                pr_err("alloc_netdev: Unable to allocate device "
6006                       "with zero RX queues.\n");
6007                return NULL;
6008        }
6009#endif
6010
6011        alloc_size = sizeof(struct net_device);
6012        if (sizeof_priv) {
6013                /* ensure 32-byte alignment of private area */
6014                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6015                alloc_size += sizeof_priv;
6016        }
6017        /* ensure 32-byte alignment of whole construct */
6018        alloc_size += NETDEV_ALIGN - 1;
6019
6020        p = kzalloc(alloc_size, GFP_KERNEL);
6021        if (!p) {
6022                printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6023                return NULL;
6024        }
6025
6026        dev = PTR_ALIGN(p, NETDEV_ALIGN);
6027        dev->padded = (char *)dev - (char *)p;
6028
6029        dev->pcpu_refcnt = alloc_percpu(int);
6030        if (!dev->pcpu_refcnt)
6031                goto free_p;
6032
6033        if (dev_addr_init(dev))
6034                goto free_pcpu;
6035
6036        dev_mc_init(dev);
6037        dev_uc_init(dev);
6038
6039        dev_net_set(dev, &init_net);
6040
6041        dev->gso_max_size = GSO_MAX_SIZE;
6042
6043        INIT_LIST_HEAD(&dev->napi_list);
6044        INIT_LIST_HEAD(&dev->unreg_list);
6045        INIT_LIST_HEAD(&dev->link_watch_list);
6046        dev->priv_flags = IFF_XMIT_DST_RELEASE;
6047        setup(dev);
6048
6049        dev->num_tx_queues = txqs;
6050        dev->real_num_tx_queues = txqs;
6051        if (netif_alloc_netdev_queues(dev))
6052                goto free_all;
6053
6054#ifdef CONFIG_RPS
6055        dev->num_rx_queues = rxqs;
6056        dev->real_num_rx_queues = rxqs;
6057        if (netif_alloc_rx_queues(dev))
6058                goto free_all;
6059#endif
6060
6061        strcpy(dev->name, name);
6062        dev->group = INIT_NETDEV_GROUP;
6063        return dev;
6064
6065free_all:
6066        free_netdev(dev);
6067        return NULL;
6068
6069free_pcpu:
6070        free_percpu(dev->pcpu_refcnt);
6071        kfree(dev->_tx);
6072#ifdef CONFIG_RPS
6073        kfree(dev->_rx);
6074#endif
6075
6076free_p:
6077        kfree(p);
6078        return NULL;
6079}
6080EXPORT_SYMBOL(alloc_netdev_mqs);
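
/*
 * Illustrative sketch (hypothetical, compiled out): a multiqueue Ethernet
 * driver would typically pair alloc_netdev_mqs() with ether_setup and call
 * free_netdev() on its error path.  The queue counts, name format and
 * example_priv structure are assumptions.
 */
#if 0
static struct net_device *example_alloc(void)
{
        struct net_device *dev;

        dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
                               ether_setup, 8, 8);
        if (!dev)
                return NULL;

        /* fill in netdev_ops, features, ... before register_netdev() */
        return dev;
}
#endif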
6081
6082/**
6083 *      free_netdev - free network device
6084 *      @dev: device
6085 *
6086 *      This function does the last stage of destroying an allocated device
6087 *      interface. The reference to the device object is released.
6088 *      If this is the last reference then it will be freed.
6089 */
6090void free_netdev(struct net_device *dev)
6091{
6092        struct napi_struct *p, *n;
6093
6094        release_net(dev_net(dev));
6095
6096        kfree(dev->_tx);
6097#ifdef CONFIG_RPS
6098        kfree(dev->_rx);
6099#endif
6100
6101        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6102
6103        /* Flush device addresses */
6104        dev_addr_flush(dev);
6105
6106        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6107                netif_napi_del(p);
6108
6109        free_percpu(dev->pcpu_refcnt);
6110        dev->pcpu_refcnt = NULL;
6111
6112        /*  Compatibility with error handling in drivers */
6113        if (dev->reg_state == NETREG_UNINITIALIZED) {
6114                kfree((char *)dev - dev->padded);
6115                return;
6116        }
6117
6118        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6119        dev->reg_state = NETREG_RELEASED;
6120
6121        /* will free via device release */
6122        put_device(&dev->dev);
6123}
6124EXPORT_SYMBOL(free_netdev);
6125
6126/**
6127 *      synchronize_net -  Synchronize with packet receive processing
6128 *
6129 *      Wait for packets currently being received to be done.
6130 *      Does not block later packets from starting.
6131 */
6132void synchronize_net(void)
6133{
6134        might_sleep();
6135        if (rtnl_is_locked())
6136                synchronize_rcu_expedited();
6137        else
6138                synchronize_rcu();
6139}
6140EXPORT_SYMBOL(synchronize_net);
6141
6142/**
6143 *      unregister_netdevice_queue - remove device from the kernel
6144 *      @dev: device
6145 *      @head: list
6146 *
6147 *      This function shuts down a device interface and removes it
6148 *      from the kernel tables.
6149 *      If head is not NULL, the device is queued to be unregistered later.
6150 *
6151 *      Callers must hold the rtnl semaphore.  You may want
6152 *      unregister_netdev() instead of this.
6153 */
6154
6155void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6156{
6157        ASSERT_RTNL();
6158
6159        if (head) {
6160                list_move_tail(&dev->unreg_list, head);
6161        } else {
6162                rollback_registered(dev);
6163                /* Finish processing unregister after unlock */
6164                net_set_todo(dev);
6165        }
6166}
6167EXPORT_SYMBOL(unregister_netdevice_queue);
6168
6169/**
6170 *      unregister_netdevice_many - unregister many devices
6171 *      @head: list of devices
6172 */
6173void unregister_netdevice_many(struct list_head *head)
6174{
6175        struct net_device *dev;
6176
6177        if (!list_empty(head)) {
6178                rollback_registered_many(head);
6179                list_for_each_entry(dev, head, unreg_list)
6180                        net_set_todo(dev);
6181        }
6182}
6183EXPORT_SYMBOL(unregister_netdevice_many);
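
/*
 * Illustrative sketch (hypothetical, compiled out): batching teardown with
 * unregister_netdevice_many() amortizes the synchronize_net() calls made by
 * rollback_registered_many().  The example_ names below are assumptions for
 * a caller that, under the RTNL lock, collects all devices of one link type.
 */
#if 0
static void example_destroy_all(struct net *net)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(kill_list);

        ASSERT_RTNL();
        for_each_netdev_safe(net, dev, tmp)
                if (dev->rtnl_link_ops == &example_link_ops)
                        unregister_netdevice_queue(dev, &kill_list);
        unregister_netdevice_many(&kill_list);
}
#endif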
6184
6185/**
6186 *      unregister_netdev - remove device from the kernel
6187 *      @dev: device
6188 *
6189 *      This function shuts down a device interface and removes it
6190 *      from the kernel tables.
6191 *
6192 *      This is just a wrapper for unregister_netdevice that takes
6193 *      the rtnl semaphore.  In general you want to use this and not
6194 *      unregister_netdevice.
6195 */
6196void unregister_netdev(struct net_device *dev)
6197{
6198        rtnl_lock();
6199        unregister_netdevice(dev);
6200        rtnl_unlock();
6201}
6202EXPORT_SYMBOL(unregister_netdev);
6203
6204/**
6205 *      dev_change_net_namespace - move device to a different network namespace
6206 *      @dev: device
6207 *      @net: network namespace
6208 *      @pat: If not NULL name pattern to try if the current device name
6209 *            is already taken in the destination network namespace.
6210 *
6211 *      This function shuts down a device interface and moves it
6212 *      to a new network namespace. On success 0 is returned, on
6213 *      a failure a negative errno code is returned.
6214 *
6215 *      Callers must hold the rtnl semaphore.
6216 */
6217
6218int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6219{
6220        int err;
6221
6222        ASSERT_RTNL();
6223
6224        /* Don't allow namespace local devices to be moved. */
6225        err = -EINVAL;
6226        if (dev->features & NETIF_F_NETNS_LOCAL)
6227                goto out;
6228
6229        /* Ensure the device has been registered */
6230        err = -EINVAL;
6231        if (dev->reg_state != NETREG_REGISTERED)
6232                goto out;
6233
6234        /* Get out if there is nothing to do */
6235        err = 0;
6236        if (net_eq(dev_net(dev), net))
6237                goto out;
6238
6239        /* Pick the destination device name, and ensure
6240         * we can use it in the destination network namespace.
6241         */
6242        err = -EEXIST;
6243        if (__dev_get_by_name(net, dev->name)) {
6244                /* We get here if we can't use the current device name */
6245                if (!pat)
6246                        goto out;
6247                if (dev_get_valid_name(dev, pat) < 0)
6248                        goto out;
6249        }
6250
6251        /*
6252         * And now a mini version of register_netdevice and unregister_netdevice.
6253         */
6254
6255        /* If device is running close it first. */
6256        dev_close(dev);
6257
6258        /* And unlink it from device chain */
6259        err = -ENODEV;
6260        unlist_netdevice(dev);
6261
6262        synchronize_net();
6263
6264        /* Shutdown queueing discipline. */
6265        dev_shutdown(dev);
6266
6267        /* Notify protocols that we are about to destroy
6268           this device. They should clean all the things.
6269
6270           Note that dev->reg_state stays at NETREG_REGISTERED.
6271           This is wanted because this way 8021q and macvlan know
6272           the device is just moving and can keep their slaves up.
6273        */
6274        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6275        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6276        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6277
6278        /*
6279         *      Flush the unicast and multicast chains
6280         */
6281        dev_uc_flush(dev);
6282        dev_mc_flush(dev);
6283
6284        /* Actually switch the network namespace */
6285        dev_net_set(dev, net);
6286
6287        /* If there is an ifindex conflict assign a new one */
6288        if (__dev_get_by_index(net, dev->ifindex)) {
6289                int iflink = (dev->iflink == dev->ifindex);
6290                dev->ifindex = dev_new_index(net);
6291                if (iflink)
6292                        dev->iflink = dev->ifindex;
6293        }
6294
6295        /* Fixup kobjects */
6296        err = device_rename(&dev->dev, dev->name);
6297        WARN_ON(err);
6298
6299        /* Add the device back in the hashes */
6300        list_netdevice(dev);
6301
6302        /* Notify protocols that a new device appeared. */
6303        call_netdevice_notifiers(NETDEV_REGISTER, dev);
6304
6305        /*
6306         *      Prevent userspace races by waiting until the network
6307         *      device is fully setup before sending notifications.
6308         */
6309        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6310
6311        synchronize_net();
6312        err = 0;
6313out:
6314        return err;
6315}
6316EXPORT_SYMBOL_GPL(dev_change_net_namespace);
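
/*
 * Editorial example (not part of dev.c): a minimal sketch of how a caller
 * might use dev_change_net_namespace() above.  "move_dev_to_ns" and
 * "target_net" are hypothetical names; rtnl_lock()/rtnl_unlock() and
 * dev_change_net_namespace() are the real APIs, and the rtnl lock must be
 * held across the call as documented above.
 */
static int move_dev_to_ns(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	/* Fall back to a "dev%d" style name if the current name clashes. */
	err = dev_change_net_namespace(dev, target_net, "dev%d");
	rtnl_unlock();

	return err;
}
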
6317
6318static int dev_cpu_callback(struct notifier_block *nfb,
6319                            unsigned long action,
6320                            void *ocpu)
6321{
6322        struct sk_buff **list_skb;
6323        struct sk_buff *skb;
6324        unsigned int cpu, oldcpu = (unsigned long)ocpu;
6325        struct softnet_data *sd, *oldsd;
6326
6327        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6328                return NOTIFY_OK;
6329
6330        local_irq_disable();
6331        cpu = smp_processor_id();
6332        sd = &per_cpu(softnet_data, cpu);
6333        oldsd = &per_cpu(softnet_data, oldcpu);
6334
6335        /* Find end of our completion_queue. */
6336        list_skb = &sd->completion_queue;
6337        while (*list_skb)
6338                list_skb = &(*list_skb)->next;
6339        /* Append completion queue from offline CPU. */
6340        *list_skb = oldsd->completion_queue;
6341        oldsd->completion_queue = NULL;
6342
6343        /* Append output queue from offline CPU. */
6344        if (oldsd->output_queue) {
6345                *sd->output_queue_tailp = oldsd->output_queue;
6346                sd->output_queue_tailp = oldsd->output_queue_tailp;
6347                oldsd->output_queue = NULL;
6348                oldsd->output_queue_tailp = &oldsd->output_queue;
6349        }
6350        /* Append NAPI poll list from offline CPU. */
6351        if (!list_empty(&oldsd->poll_list)) {
6352                list_splice_init(&oldsd->poll_list, &sd->poll_list);
6353                raise_softirq_irqoff(NET_RX_SOFTIRQ);
6354        }
6355
6356        raise_softirq_irqoff(NET_TX_SOFTIRQ);
6357        local_irq_enable();
6358
6359        /* Process offline CPU's input_pkt_queue */
6360        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6361                netif_rx(skb);
6362                input_queue_head_incr(oldsd);
6363        }
6364        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6365                netif_rx(skb);
6366                input_queue_head_incr(oldsd);
6367        }
6368
6369        return NOTIFY_OK;
6370}
6371
6372
6373/**
6374 *      netdev_increment_features - increment feature set by one device
6375 *      @all: current feature set
6376 *      @one: new feature set
6377 *      @mask: mask feature set
6378 *
6379 *      Computes a new feature set after adding a device with feature set
6380 *      @one to the master device with current feature set @all.  Will not
6381 *      enable anything that is off in @mask. Returns the new feature set.
6382 */
6383u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6384{
6385        if (mask & NETIF_F_GEN_CSUM)
6386                mask |= NETIF_F_ALL_CSUM;
6387        mask |= NETIF_F_VLAN_CHALLENGED;
6388
6389        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6390        all &= one | ~NETIF_F_ALL_FOR_ALL;
6391
6392        /* If device needs checksumming, downgrade to it. */
6393        if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6394                all &= ~NETIF_F_NO_CSUM;
6395
6396        /* If one device supports hw checksumming, set for all. */
6397        if (all & NETIF_F_GEN_CSUM)
6398                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6399
6400        return all;
6401}
6402EXPORT_SYMBOL(netdev_increment_features);
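
/*
 * Editorial example (not part of dev.c): a sketch of how a master driver
 * (bonding/bridge style) might fold its slaves' features together with
 * netdev_increment_features().  "struct ex_slave", "ex_compute_master_features"
 * and the slave list are hypothetical, driver-private constructs; only the
 * helper above and list_for_each_entry() are real kernel APIs.
 */
struct ex_slave {
	struct net_device *dev;
	struct list_head list;
};

static u32 ex_compute_master_features(struct list_head *slaves, u32 mask)
{
	struct ex_slave *slave;
	u32 features = mask;	/* start from the full mask, then merge each slave */

	list_for_each_entry(slave, slaves, list)
		features = netdev_increment_features(features,
						     slave->dev->features,
						     mask);
	return features;
}
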
6403
6404static struct hlist_head *netdev_create_hash(void)
6405{
6406        int i;
6407        struct hlist_head *hash;
6408
6409        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6410        if (hash != NULL)
6411                for (i = 0; i < NETDEV_HASHENTRIES; i++)
6412                        INIT_HLIST_HEAD(&hash[i]);
6413
6414        return hash;
6415}
6416
6417/* Initialize per network namespace state */
6418static int __net_init netdev_init(struct net *net)
6419{
6420        INIT_LIST_HEAD(&net->dev_base_head);
6421
6422        net->dev_name_head = netdev_create_hash();
6423        if (net->dev_name_head == NULL)
6424                goto err_name;
6425
6426        net->dev_index_head = netdev_create_hash();
6427        if (net->dev_index_head == NULL)
6428                goto err_idx;
6429
6430        return 0;
6431
6432err_idx:
6433        kfree(net->dev_name_head);
6434err_name:
6435        return -ENOMEM;
6436}
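
/*
 * Editorial example (not part of dev.c): the per-namespace name and ifindex
 * hash tables initialised above back the standard lookup helpers.  A minimal
 * sketch using the real dev_get_by_name()/dev_put() API; "ex_lookup_demo"
 * and the "eth0" name are purely illustrative.
 */
static void ex_lookup_demo(struct net *net)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, "eth0");	/* takes a reference on success */
	if (dev) {
		/* ... use the device ... */
		dev_put(dev);			/* drop the reference */
	}
}
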
6437
6438/**
6439 *      netdev_drivername - network driver for the device
6440 *      @dev: network device
6441 *
6442 *      Determine the name of the network driver for the device.
6443 */
6444const char *netdev_drivername(const struct net_device *dev)
6445{
6446        const struct device_driver *driver;
6447        const struct device *parent;
6448        const char *empty = "";
6449
6450        parent = dev->dev.parent;
6451        if (!parent)
6452                return empty;
6453
6454        driver = parent->driver;
6455        if (driver && driver->name)
6456                return driver->name;
6457        return empty;
6458}
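
/*
 * Editorial example (not part of dev.c): netdev_drivername() is mainly useful
 * in diagnostic paths.  A hedged sketch; "ex_report_timeout" and the message
 * text are illustrative, while netdev_warn() and netdev_drivername() are real.
 */
static void ex_report_timeout(struct net_device *dev)
{
	netdev_warn(dev, "transmit timed out (driver %s)\n",
		    netdev_drivername(dev));
}
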
6459
6460int __netdev_printk(const char *level, const struct net_device *dev,
6461                           struct va_format *vaf)
6462{
6463        int r;
6464
6465        if (dev && dev->dev.parent)
6466                r = dev_printk(level, dev->dev.parent, "%s: %pV",
6467                               netdev_name(dev), vaf);
6468        else if (dev)
6469                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6470        else
6471                r = printk("%s(NULL net_device): %pV", level, vaf);
6472
6473        return r;
6474}
6475EXPORT_SYMBOL(__netdev_printk);
6476
6477int netdev_printk(const char *level, const struct net_device *dev,
6478                  const char *format, ...)
6479{
6480        struct va_format vaf;
6481        va_list args;
6482        int r;
6483
6484        va_start(args, format);
6485
6486        vaf.fmt = format;
6487        vaf.va = &args;
6488
6489        r = __netdev_printk(level, dev, &vaf);
6490        va_end(args);
6491
6492        return r;
6493}
6494EXPORT_SYMBOL(netdev_printk);
6495
6496#define define_netdev_printk_level(func, level)                 \
6497int func(const struct net_device *dev, const char *fmt, ...)    \
6498{                                                               \
6499        int r;                                                  \
6500        struct va_format vaf;                                   \
6501        va_list args;                                           \
6502                                                                \
6503        va_start(args, fmt);                                    \
6504                                                                \
6505        vaf.fmt = fmt;                                          \
6506        vaf.va = &args;                                         \
6507                                                                \
6508        r = __netdev_printk(level, dev, &vaf);                  \
6509        va_end(args);                                           \
6510                                                                \
6511        return r;                                               \
6512}                                                               \
6513EXPORT_SYMBOL(func);
6514
6515define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6516define_netdev_printk_level(netdev_alert, KERN_ALERT);
6517define_netdev_printk_level(netdev_crit, KERN_CRIT);
6518define_netdev_printk_level(netdev_err, KERN_ERR);
6519define_netdev_printk_level(netdev_warn, KERN_WARNING);
6520define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6521define_netdev_printk_level(netdev_info, KERN_INFO);
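
/*
 * Editorial example (not part of dev.c): the per-level wrappers generated
 * above are used like printk(), but prefix the message with the driver and
 * device name.  "ex_link_report" and its values are purely illustrative.
 */
static void ex_link_report(struct net_device *dev, unsigned int mbps)
{
	if (mbps)
		netdev_info(dev, "link up, %u Mbps\n", mbps);
	else
		netdev_err(dev, "link down\n");
}
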
6522
6523static void __net_exit netdev_exit(struct net *net)
6524{
6525        kfree(net->dev_name_head);
6526        kfree(net->dev_index_head);
6527}
6528
6529static struct pernet_operations __net_initdata netdev_net_ops = {
6530        .init = netdev_init,
6531        .exit = netdev_exit,
6532};
6533
6534static void __net_exit default_device_exit(struct net *net)
6535{
6536        struct net_device *dev, *aux;
6537        /*
6538         * Push all migratable network devices back to the
6539         * initial network namespace
6540         */
6541        rtnl_lock();
6542        for_each_netdev_safe(net, dev, aux) {
6543                int err;
6544                char fb_name[IFNAMSIZ];
6545
6546                /* Ignore unmovable devices (e.g. loopback) */
6547                if (dev->features & NETIF_F_NETNS_LOCAL)
6548                        continue;
6549
6550                /* Leave virtual devices for the generic cleanup */
6551                if (dev->rtnl_link_ops)
6552                        continue;
6553
6554                /* Push remaining network devices to init_net */
6555                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6556                err = dev_change_net_namespace(dev, &init_net, fb_name);
6557                if (err) {
6558                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6559                                __func__, dev->name, err);
6560                        BUG();
6561                }
6562        }
6563        rtnl_unlock();
6564}
6565
6566static void __net_exit default_device_exit_batch(struct list_head *net_list)
6567{
6568        /* At exit all network devices must be removed from a network
6569         * namespace.  Do this in the reverse order of registration.
6570         * Do this across as many network namespaces as possible to
6571         * improve batching efficiency.
6572         */
6573        struct net_device *dev;
6574        struct net *net;
6575        LIST_HEAD(dev_kill_list);
6576
6577        rtnl_lock();
6578        list_for_each_entry(net, net_list, exit_list) {
6579                for_each_netdev_reverse(net, dev) {
6580                        if (dev->rtnl_link_ops)
6581                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6582                        else
6583                                unregister_netdevice_queue(dev, &dev_kill_list);
6584                }
6585        }
6586        unregister_netdevice_many(&dev_kill_list);
6587        list_del(&dev_kill_list);
6588        rtnl_unlock();
6589}
6590
6591static struct pernet_operations __net_initdata default_device_ops = {
6592        .exit = default_device_exit,
6593        .exit_batch = default_device_exit_batch,
6594};
6595
6596/*
6597 *      Initialize the DEV module. At boot time this walks the device list and
6598 *      unhooks any devices that fail to initialise (normally hardware not
6599 *      present) and leaves us with a valid list of present and active devices.
6600 *
6601 */
6602
6603/*
6604 *       This is called single threaded during boot, so no need
6605 *       to take the rtnl semaphore.
6606 */
6607static int __init net_dev_init(void)
6608{
6609        int i, rc = -ENOMEM;
6610
6611        BUG_ON(!dev_boot_phase);
6612
6613        if (dev_proc_init())
6614                goto out;
6615
6616        if (netdev_kobject_init())
6617                goto out;
6618
6619        INIT_LIST_HEAD(&ptype_all);
6620        for (i = 0; i < PTYPE_HASH_SIZE; i++)
6621                INIT_LIST_HEAD(&ptype_base[i]);
6622
6623        if (register_pernet_subsys(&netdev_net_ops))
6624                goto out;
6625
6626        /*
6627         *      Initialise the packet receive queues.
6628         */
6629
6630        for_each_possible_cpu(i) {
6631                struct softnet_data *sd = &per_cpu(softnet_data, i);
6632
6633                memset(sd, 0, sizeof(*sd));
6634                skb_queue_head_init(&sd->input_pkt_queue);
6635                skb_queue_head_init(&sd->process_queue);
6636                sd->completion_queue = NULL;
6637                INIT_LIST_HEAD(&sd->poll_list);
6638                sd->output_queue = NULL;
6639                sd->output_queue_tailp = &sd->output_queue;
6640#ifdef CONFIG_RPS
6641                sd->csd.func = rps_trigger_softirq;
6642                sd->csd.info = sd;
6643                sd->csd.flags = 0;
6644                sd->cpu = i;
6645#endif
6646
6647                sd->backlog.poll = process_backlog;
6648                sd->backlog.weight = weight_p;
6649                sd->backlog.gro_list = NULL;
6650                sd->backlog.gro_count = 0;
6651        }
6652
6653        dev_boot_phase = 0;
6654
6655        /* The loopback device is special: if any other network device
6656         * is present in a network namespace, the loopback device must
6657         * be present too. Since we now dynamically allocate and free
6658         * the loopback device, maintain this invariant by keeping the
6659         * loopback device as the first device on the list of network
6660         * devices.  This ensures the loopback device is the first
6661         * device that appears and the last network device that
6662         * disappears.
6663         */
6664        if (register_pernet_device(&loopback_net_ops))
6665                goto out;
6666
6667        if (register_pernet_device(&default_device_ops))
6668                goto out;
6669
6670        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6671        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6672
6673        hotcpu_notifier(dev_cpu_callback, 0);
6674        dst_init();
6675        dev_mcast_init();
6676        rc = 0;
6677out:
6678        return rc;
6679}
6680
6681subsys_initcall(net_dev_init);
6682
6683static int __init initialize_hashrnd(void)
6684{
6685        get_random_bytes(&hashrnd, sizeof(hashrnd));
6686        return 0;
6687}
6688
6689late_initcall_sync(initialize_hashrnd);
6690
6691