linux/net/core/dev.c
<<
>>
Prefs
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <linux/ipv6.h>
 122#include <linux/in.h>
 123#include <linux/jhash.h>
 124#include <linux/random.h>
 125#include <trace/events/napi.h>
 126#include <trace/events/net.h>
 127#include <trace/events/skb.h>
 128#include <linux/pci.h>
 129#include <linux/inetdevice.h>
 130#include <linux/cpu_rmap.h>
 131#include <linux/static_key.h>
 132#include <linux/hashtable.h>
 133#include <linux/vmalloc.h>
 134
 135#include "net-sysfs.h"
 136
 137/* Instead of increasing this, you should create a hash table. */
 138#define MAX_GRO_SKBS 8
 139
 140/* This should be increased if a protocol with a bigger head is added. */
 141#define GRO_MAX_HEAD (MAX_HEADER + 128)
 142
 143static DEFINE_SPINLOCK(ptype_lock);
 144static DEFINE_SPINLOCK(offload_lock);
 145struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 146struct list_head ptype_all __read_mostly;       /* Taps */
 147static struct list_head offload_base __read_mostly;
 148
 149/*
 150 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 151 * semaphore.
 152 *
 153 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 154 *
 155 * Writers must hold the rtnl semaphore while they loop through the
 156 * dev_base_head list, and hold dev_base_lock for writing when they do the
 157 * actual updates.  This allows pure readers to access the list even
 158 * while a writer is preparing to update it.
 159 *
 160 * To put it another way, dev_base_lock is held for writing only to
 161 * protect against pure readers; the rtnl semaphore provides the
 162 * protection against other writers.
 163 *
 164 * See, for example usages, register_netdevice() and
 165 * unregister_netdevice(), which must be called with the rtnl
 166 * semaphore held.
 167 */
 168DEFINE_RWLOCK(dev_base_lock);
 169EXPORT_SYMBOL(dev_base_lock);
 170
 171/* protects napi_hash addition/deletion and napi_gen_id */
 172static DEFINE_SPINLOCK(napi_hash_lock);
 173
 174static unsigned int napi_gen_id;
 175static DEFINE_HASHTABLE(napi_hash, 8);
 176
 177seqcount_t devnet_rename_seq;
 178
 179static inline void dev_base_seq_inc(struct net *net)
 180{
 181        while (++net->dev_base_seq == 0);
 182}
 183
 184static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 185{
 186        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 187
 188        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 189}
 190
 191static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 192{
 193        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 194}
 195
 196static inline void rps_lock(struct softnet_data *sd)
 197{
 198#ifdef CONFIG_RPS
 199        spin_lock(&sd->input_pkt_queue.lock);
 200#endif
 201}
 202
 203static inline void rps_unlock(struct softnet_data *sd)
 204{
 205#ifdef CONFIG_RPS
 206        spin_unlock(&sd->input_pkt_queue.lock);
 207#endif
 208}
 209
 210/* Device list insertion */
 211static void list_netdevice(struct net_device *dev)
 212{
 213        struct net *net = dev_net(dev);
 214
 215        ASSERT_RTNL();
 216
 217        write_lock_bh(&dev_base_lock);
 218        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 219        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 220        hlist_add_head_rcu(&dev->index_hlist,
 221                           dev_index_hash(net, dev->ifindex));
 222        write_unlock_bh(&dev_base_lock);
 223
 224        dev_base_seq_inc(net);
 225}
 226
 227/* Device list removal
 228 * caller must respect a RCU grace period before freeing/reusing dev
 229 */
 230static void unlist_netdevice(struct net_device *dev)
 231{
 232        ASSERT_RTNL();
 233
 234        /* Unlink dev from the device chain */
 235        write_lock_bh(&dev_base_lock);
 236        list_del_rcu(&dev->dev_list);
 237        hlist_del_rcu(&dev->name_hlist);
 238        hlist_del_rcu(&dev->index_hlist);
 239        write_unlock_bh(&dev_base_lock);
 240
 241        dev_base_seq_inc(dev_net(dev));
 242}
 243
 244/*
 245 *      Our notifier list
 246 */
 247
 248static RAW_NOTIFIER_HEAD(netdev_chain);
 249
 250/*
 251 *      Device drivers call our routines to queue packets here. We empty the
 252 *      queue in the local softnet handler.
 253 */
 254
 255DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 256EXPORT_PER_CPU_SYMBOL(softnet_data);
 257
 258#ifdef CONFIG_LOCKDEP
 259/*
 260 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 261 * according to dev->type
 262 */
 263static const unsigned short netdev_lock_type[] =
 264        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 265         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 266         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 267         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 268         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 269         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 270         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 271         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 272         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 273         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 274         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 275         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 276         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 277         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 278         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 279
 280static const char *const netdev_lock_name[] =
 281        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 282         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 283         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 284         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 285         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 286         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 287         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 288         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 289         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 290         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 291         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 292         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 293         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 294         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 295         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 296
 297static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 299
 300static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 301{
 302        int i;
 303
 304        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 305                if (netdev_lock_type[i] == dev_type)
 306                        return i;
 307        /* the last key is used by default */
 308        return ARRAY_SIZE(netdev_lock_type) - 1;
 309}
 310
 311static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 312                                                 unsigned short dev_type)
 313{
 314        int i;
 315
 316        i = netdev_lock_pos(dev_type);
 317        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 318                                   netdev_lock_name[i]);
 319}
 320
 321static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 322{
 323        int i;
 324
 325        i = netdev_lock_pos(dev->type);
 326        lockdep_set_class_and_name(&dev->addr_list_lock,
 327                                   &netdev_addr_lock_key[i],
 328                                   netdev_lock_name[i]);
 329}
 330#else
 331static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 332                                                 unsigned short dev_type)
 333{
 334}
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337}
 338#endif
 339
 340/*******************************************************************************
 341
 342                Protocol management and registration routines
 343
 344*******************************************************************************/
 345
 346/*
 347 *      Add a protocol ID to the list. Now that the input handler is
 348 *      smarter we can dispense with all the messy stuff that used to be
 349 *      here.
 350 *
 351 *      BEWARE!!! Protocol handlers, mangling input packets,
 352 *      MUST BE last in hash buckets and checking protocol handlers
 353 *      MUST start from promiscuous ptype_all chain in net_bh.
 354 *      It is true now, do not change it.
 355 *      Explanation follows: if protocol handler, mangling packet, will
 356 *      be the first on list, it is not able to sense, that packet
 357 *      is cloned and should be copied-on-write, so that it will
 358 *      change it and subsequent readers will get broken packet.
 359 *                                                      --ANK (980803)
 360 */
 361
 362static inline struct list_head *ptype_head(const struct packet_type *pt)
 363{
 364        if (pt->type == htons(ETH_P_ALL))
 365                return &ptype_all;
 366        else
 367                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 368}
 369
 370/**
 371 *      dev_add_pack - add packet handler
 372 *      @pt: packet type declaration
 373 *
 374 *      Add a protocol handler to the networking stack. The passed &packet_type
 375 *      is linked into kernel lists and may not be freed until it has been
 376 *      removed from the kernel lists.
 377 *
 378 *      This call does not sleep therefore it can not
 379 *      guarantee all CPU's that are in middle of receiving packets
 380 *      will see the new packet type (until the next received packet).
 381 */
 382
 383void dev_add_pack(struct packet_type *pt)
 384{
 385        struct list_head *head = ptype_head(pt);
 386
 387        spin_lock(&ptype_lock);
 388        list_add_rcu(&pt->list, head);
 389        spin_unlock(&ptype_lock);
 390}
 391EXPORT_SYMBOL(dev_add_pack);
 392
 393/**
 394 *      __dev_remove_pack        - remove packet handler
 395 *      @pt: packet type declaration
 396 *
 397 *      Remove a protocol handler that was previously added to the kernel
 398 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 399 *      from the kernel lists and can be freed or reused once this function
 400 *      returns.
 401 *
 402 *      The packet type might still be in use by receivers
 403 *      and must not be freed until after all the CPU's have gone
 404 *      through a quiescent state.
 405 */
 406void __dev_remove_pack(struct packet_type *pt)
 407{
 408        struct list_head *head = ptype_head(pt);
 409        struct packet_type *pt1;
 410
 411        spin_lock(&ptype_lock);
 412
 413        list_for_each_entry(pt1, head, list) {
 414                if (pt == pt1) {
 415                        list_del_rcu(&pt->list);
 416                        goto out;
 417                }
 418        }
 419
 420        pr_warn("dev_remove_pack: %p not found\n", pt);
 421out:
 422        spin_unlock(&ptype_lock);
 423}
 424EXPORT_SYMBOL(__dev_remove_pack);
 425
 426/**
 427 *      dev_remove_pack  - remove packet handler
 428 *      @pt: packet type declaration
 429 *
 430 *      Remove a protocol handler that was previously added to the kernel
 431 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 432 *      from the kernel lists and can be freed or reused once this function
 433 *      returns.
 434 *
 435 *      This call sleeps to guarantee that no CPU is looking at the packet
 436 *      type after return.
 437 */
 438void dev_remove_pack(struct packet_type *pt)
 439{
 440        __dev_remove_pack(pt);
 441
 442        synchronize_net();
 443}
 444EXPORT_SYMBOL(dev_remove_pack);
 445
 446
 447/**
 448 *      dev_add_offload - register offload handlers
 449 *      @po: protocol offload declaration
 450 *
 451 *      Add protocol offload handlers to the networking stack. The passed
 452 *      &proto_offload is linked into kernel lists and may not be freed until
 453 *      it has been removed from the kernel lists.
 454 *
 455 *      This call does not sleep therefore it can not
 456 *      guarantee all CPU's that are in middle of receiving packets
 457 *      will see the new offload handlers (until the next received packet).
 458 */
 459void dev_add_offload(struct packet_offload *po)
 460{
 461        struct list_head *head = &offload_base;
 462
 463        spin_lock(&offload_lock);
 464        list_add_rcu(&po->list, head);
 465        spin_unlock(&offload_lock);
 466}
 467EXPORT_SYMBOL(dev_add_offload);
 468
 469/**
 470 *      __dev_remove_offload     - remove offload handler
 471 *      @po: packet offload declaration
 472 *
 473 *      Remove a protocol offload handler that was previously added to the
 474 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 475 *      is removed from the kernel lists and can be freed or reused once this
 476 *      function returns.
 477 *
 478 *      The packet type might still be in use by receivers
 479 *      and must not be freed until after all the CPU's have gone
 480 *      through a quiescent state.
 481 */
 482void __dev_remove_offload(struct packet_offload *po)
 483{
 484        struct list_head *head = &offload_base;
 485        struct packet_offload *po1;
 486
 487        spin_lock(&offload_lock);
 488
 489        list_for_each_entry(po1, head, list) {
 490                if (po == po1) {
 491                        list_del_rcu(&po->list);
 492                        goto out;
 493                }
 494        }
 495
 496        pr_warn("dev_remove_offload: %p not found\n", po);
 497out:
 498        spin_unlock(&offload_lock);
 499}
 500EXPORT_SYMBOL(__dev_remove_offload);
 501
 502/**
 503 *      dev_remove_offload       - remove packet offload handler
 504 *      @po: packet offload declaration
 505 *
 506 *      Remove a packet offload handler that was previously added to the kernel
 507 *      offload handlers by dev_add_offload(). The passed &offload_type is
 508 *      removed from the kernel lists and can be freed or reused once this
 509 *      function returns.
 510 *
 511 *      This call sleeps to guarantee that no CPU is looking at the packet
 512 *      type after return.
 513 */
 514void dev_remove_offload(struct packet_offload *po)
 515{
 516        __dev_remove_offload(po);
 517
 518        synchronize_net();
 519}
 520EXPORT_SYMBOL(dev_remove_offload);
 521
 522/******************************************************************************
 523
 524                      Device Boot-time Settings Routines
 525
 526*******************************************************************************/
 527
 528/* Boot time configuration table */
 529static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 530
 531/**
 532 *      netdev_boot_setup_add   - add new setup entry
 533 *      @name: name of the device
 534 *      @map: configured settings for the device
 535 *
 536 *      Adds new setup entry to the dev_boot_setup list.  The function
 537 *      returns 0 on error and 1 on success.  This is a generic routine to
 538 *      all netdevices.
 539 */
 540static int netdev_boot_setup_add(char *name, struct ifmap *map)
 541{
 542        struct netdev_boot_setup *s;
 543        int i;
 544
 545        s = dev_boot_setup;
 546        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 547                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 548                        memset(s[i].name, 0, sizeof(s[i].name));
 549                        strlcpy(s[i].name, name, IFNAMSIZ);
 550                        memcpy(&s[i].map, map, sizeof(s[i].map));
 551                        break;
 552                }
 553        }
 554
 555        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 556}
 557
 558/**
 559 *      netdev_boot_setup_check - check boot time settings
 560 *      @dev: the netdevice
 561 *
 562 *      Check boot time settings for the device.
 563 *      The found settings are set for the device to be used
 564 *      later in the device probing.
 565 *      Returns 0 if no settings found, 1 if they are.
 566 */
 567int netdev_boot_setup_check(struct net_device *dev)
 568{
 569        struct netdev_boot_setup *s = dev_boot_setup;
 570        int i;
 571
 572        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 573                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 574                    !strcmp(dev->name, s[i].name)) {
 575                        dev->irq        = s[i].map.irq;
 576                        dev->base_addr  = s[i].map.base_addr;
 577                        dev->mem_start  = s[i].map.mem_start;
 578                        dev->mem_end    = s[i].map.mem_end;
 579                        return 1;
 580                }
 581        }
 582        return 0;
 583}
 584EXPORT_SYMBOL(netdev_boot_setup_check);
 585
 586
 587/**
 588 *      netdev_boot_base        - get address from boot time settings
 589 *      @prefix: prefix for network device
 590 *      @unit: id for network device
 591 *
 592 *      Check boot time settings for the base address of device.
 593 *      The found settings are set for the device to be used
 594 *      later in the device probing.
 595 *      Returns 0 if no settings found.
 596 */
 597unsigned long netdev_boot_base(const char *prefix, int unit)
 598{
 599        const struct netdev_boot_setup *s = dev_boot_setup;
 600        char name[IFNAMSIZ];
 601        int i;
 602
 603        sprintf(name, "%s%d", prefix, unit);
 604
 605        /*
 606         * If device already registered then return base of 1
 607         * to indicate not to probe for this interface
 608         */
 609        if (__dev_get_by_name(&init_net, name))
 610                return 1;
 611
 612        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 613                if (!strcmp(name, s[i].name))
 614                        return s[i].map.base_addr;
 615        return 0;
 616}
 617
 618/*
 619 * Saves at boot time configured settings for any netdevice.
 620 */
 621int __init netdev_boot_setup(char *str)
 622{
 623        int ints[5];
 624        struct ifmap map;
 625
 626        str = get_options(str, ARRAY_SIZE(ints), ints);
 627        if (!str || !*str)
 628                return 0;
 629
 630        /* Save settings */
 631        memset(&map, 0, sizeof(map));
 632        if (ints[0] > 0)
 633                map.irq = ints[1];
 634        if (ints[0] > 1)
 635                map.base_addr = ints[2];
 636        if (ints[0] > 2)
 637                map.mem_start = ints[3];
 638        if (ints[0] > 3)
 639                map.mem_end = ints[4];
 640
 641        /* Add new entry to the list */
 642        return netdev_boot_setup_add(str, &map);
 643}
 644
 645__setup("netdev=", netdev_boot_setup);
 646
 647/*******************************************************************************
 648
 649                            Device Interface Subroutines
 650
 651*******************************************************************************/
 652
 653/**
 654 *      __dev_get_by_name       - find a device by its name
 655 *      @net: the applicable net namespace
 656 *      @name: name to find
 657 *
 658 *      Find an interface by name. Must be called under RTNL semaphore
 659 *      or @dev_base_lock. If the name is found a pointer to the device
 660 *      is returned. If the name is not found then %NULL is returned. The
 661 *      reference counters are not incremented so the caller must be
 662 *      careful with locks.
 663 */
 664
 665struct net_device *__dev_get_by_name(struct net *net, const char *name)
 666{
 667        struct net_device *dev;
 668        struct hlist_head *head = dev_name_hash(net, name);
 669
 670        hlist_for_each_entry(dev, head, name_hlist)
 671                if (!strncmp(dev->name, name, IFNAMSIZ))
 672                        return dev;
 673
 674        return NULL;
 675}
 676EXPORT_SYMBOL(__dev_get_by_name);
 677
 678/**
 679 *      dev_get_by_name_rcu     - find a device by its name
 680 *      @net: the applicable net namespace
 681 *      @name: name to find
 682 *
 683 *      Find an interface by name.
 684 *      If the name is found a pointer to the device is returned.
 685 *      If the name is not found then %NULL is returned.
 686 *      The reference counters are not incremented so the caller must be
 687 *      careful with locks. The caller must hold RCU lock.
 688 */
 689
 690struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 691{
 692        struct net_device *dev;
 693        struct hlist_head *head = dev_name_hash(net, name);
 694
 695        hlist_for_each_entry_rcu(dev, head, name_hlist)
 696                if (!strncmp(dev->name, name, IFNAMSIZ))
 697                        return dev;
 698
 699        return NULL;
 700}
 701EXPORT_SYMBOL(dev_get_by_name_rcu);
 702
 703/**
 704 *      dev_get_by_name         - find a device by its name
 705 *      @net: the applicable net namespace
 706 *      @name: name to find
 707 *
 708 *      Find an interface by name. This can be called from any
 709 *      context and does its own locking. The returned handle has
 710 *      the usage count incremented and the caller must use dev_put() to
 711 *      release it when it is no longer needed. %NULL is returned if no
 712 *      matching device is found.
 713 */
 714
 715struct net_device *dev_get_by_name(struct net *net, const char *name)
 716{
 717        struct net_device *dev;
 718
 719        rcu_read_lock();
 720        dev = dev_get_by_name_rcu(net, name);
 721        if (dev)
 722                dev_hold(dev);
 723        rcu_read_unlock();
 724        return dev;
 725}
 726EXPORT_SYMBOL(dev_get_by_name);
 727
 728/**
 729 *      __dev_get_by_index - find a device by its ifindex
 730 *      @net: the applicable net namespace
 731 *      @ifindex: index of device
 732 *
 733 *      Search for an interface by index. Returns %NULL if the device
 734 *      is not found or a pointer to the device. The device has not
 735 *      had its reference counter increased so the caller must be careful
 736 *      about locking. The caller must hold either the RTNL semaphore
 737 *      or @dev_base_lock.
 738 */
 739
 740struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 741{
 742        struct net_device *dev;
 743        struct hlist_head *head = dev_index_hash(net, ifindex);
 744
 745        hlist_for_each_entry(dev, head, index_hlist)
 746                if (dev->ifindex == ifindex)
 747                        return dev;
 748
 749        return NULL;
 750}
 751EXPORT_SYMBOL(__dev_get_by_index);
 752
 753/**
 754 *      dev_get_by_index_rcu - find a device by its ifindex
 755 *      @net: the applicable net namespace
 756 *      @ifindex: index of device
 757 *
 758 *      Search for an interface by index. Returns %NULL if the device
 759 *      is not found or a pointer to the device. The device has not
 760 *      had its reference counter increased so the caller must be careful
 761 *      about locking. The caller must hold RCU lock.
 762 */
 763
 764struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 765{
 766        struct net_device *dev;
 767        struct hlist_head *head = dev_index_hash(net, ifindex);
 768
 769        hlist_for_each_entry_rcu(dev, head, index_hlist)
 770                if (dev->ifindex == ifindex)
 771                        return dev;
 772
 773        return NULL;
 774}
 775EXPORT_SYMBOL(dev_get_by_index_rcu);
 776
 777
 778/**
 779 *      dev_get_by_index - find a device by its ifindex
 780 *      @net: the applicable net namespace
 781 *      @ifindex: index of device
 782 *
 783 *      Search for an interface by index. Returns NULL if the device
 784 *      is not found or a pointer to the device. The device returned has
 785 *      had a reference added and the pointer is safe until the user calls
 786 *      dev_put to indicate they have finished with it.
 787 */
 788
 789struct net_device *dev_get_by_index(struct net *net, int ifindex)
 790{
 791        struct net_device *dev;
 792
 793        rcu_read_lock();
 794        dev = dev_get_by_index_rcu(net, ifindex);
 795        if (dev)
 796                dev_hold(dev);
 797        rcu_read_unlock();
 798        return dev;
 799}
 800EXPORT_SYMBOL(dev_get_by_index);
 801
 802/**
 803 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 804 *      @net: network namespace
 805 *      @name: a pointer to the buffer where the name will be stored.
 806 *      @ifindex: the ifindex of the interface to get the name from.
 807 *
 808 *      The use of raw_seqcount_begin() and cond_resched() before
 809 *      retrying is required as we want to give the writers a chance
 810 *      to complete when CONFIG_PREEMPT is not set.
 811 */
 812int netdev_get_name(struct net *net, char *name, int ifindex)
 813{
 814        struct net_device *dev;
 815        unsigned int seq;
 816
 817retry:
 818        seq = raw_seqcount_begin(&devnet_rename_seq);
 819        rcu_read_lock();
 820        dev = dev_get_by_index_rcu(net, ifindex);
 821        if (!dev) {
 822                rcu_read_unlock();
 823                return -ENODEV;
 824        }
 825
 826        strcpy(name, dev->name);
 827        rcu_read_unlock();
 828        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 829                cond_resched();
 830                goto retry;
 831        }
 832
 833        return 0;
 834}
 835
 836/**
 837 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 838 *      @net: the applicable net namespace
 839 *      @type: media type of device
 840 *      @ha: hardware address
 841 *
 842 *      Search for an interface by MAC address. Returns NULL if the device
 843 *      is not found or a pointer to the device.
 844 *      The caller must hold RCU or RTNL.
 845 *      The returned device has not had its ref count increased
 846 *      and the caller must therefore be careful about locking
 847 *
 848 */
 849
 850struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 851                                       const char *ha)
 852{
 853        struct net_device *dev;
 854
 855        for_each_netdev_rcu(net, dev)
 856                if (dev->type == type &&
 857                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 858                        return dev;
 859
 860        return NULL;
 861}
 862EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 863
 864struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 865{
 866        struct net_device *dev;
 867
 868        ASSERT_RTNL();
 869        for_each_netdev(net, dev)
 870                if (dev->type == type)
 871                        return dev;
 872
 873        return NULL;
 874}
 875EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 876
 877struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 878{
 879        struct net_device *dev, *ret = NULL;
 880
 881        rcu_read_lock();
 882        for_each_netdev_rcu(net, dev)
 883                if (dev->type == type) {
 884                        dev_hold(dev);
 885                        ret = dev;
 886                        break;
 887                }
 888        rcu_read_unlock();
 889        return ret;
 890}
 891EXPORT_SYMBOL(dev_getfirstbyhwtype);
 892
 893/**
 894 *      dev_get_by_flags_rcu - find any device with given flags
 895 *      @net: the applicable net namespace
 896 *      @if_flags: IFF_* values
 897 *      @mask: bitmask of bits in if_flags to check
 898 *
 899 *      Search for any interface with the given flags. Returns NULL if a device
 900 *      is not found or a pointer to the device. Must be called inside
 901 *      rcu_read_lock(), and result refcount is unchanged.
 902 */
 903
 904struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 905                                    unsigned short mask)
 906{
 907        struct net_device *dev, *ret;
 908
 909        ret = NULL;
 910        for_each_netdev_rcu(net, dev) {
 911                if (((dev->flags ^ if_flags) & mask) == 0) {
 912                        ret = dev;
 913                        break;
 914                }
 915        }
 916        return ret;
 917}
 918EXPORT_SYMBOL(dev_get_by_flags_rcu);
 919
 920/**
 921 *      dev_valid_name - check if name is okay for network device
 922 *      @name: name string
 923 *
 924 *      Network device names need to be valid file names to
 925 *      to allow sysfs to work.  We also disallow any kind of
 926 *      whitespace.
 927 */
 928bool dev_valid_name(const char *name)
 929{
 930        if (*name == '\0')
 931                return false;
 932        if (strlen(name) >= IFNAMSIZ)
 933                return false;
 934        if (!strcmp(name, ".") || !strcmp(name, ".."))
 935                return false;
 936
 937        while (*name) {
 938                if (*name == '/' || isspace(*name))
 939                        return false;
 940                name++;
 941        }
 942        return true;
 943}
 944EXPORT_SYMBOL(dev_valid_name);
 945
 946/**
 947 *      __dev_alloc_name - allocate a name for a device
 948 *      @net: network namespace to allocate the device name in
 949 *      @name: name format string
 950 *      @buf:  scratch buffer and result name string
 951 *
 952 *      Passed a format string - eg "lt%d" it will try and find a suitable
 953 *      id. It scans list of devices to build up a free map, then chooses
 954 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 955 *      while allocating the name and adding the device in order to avoid
 956 *      duplicates.
 957 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 958 *      Returns the number of the unit assigned or a negative errno code.
 959 */
 960
 961static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 962{
 963        int i = 0;
 964        const char *p;
 965        const int max_netdevices = 8*PAGE_SIZE;
 966        unsigned long *inuse;
 967        struct net_device *d;
 968
 969        p = strnchr(name, IFNAMSIZ-1, '%');
 970        if (p) {
 971                /*
 972                 * Verify the string as this thing may have come from
 973                 * the user.  There must be either one "%d" and no other "%"
 974                 * characters.
 975                 */
 976                if (p[1] != 'd' || strchr(p + 2, '%'))
 977                        return -EINVAL;
 978
 979                /* Use one page as a bit array of possible slots */
 980                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 981                if (!inuse)
 982                        return -ENOMEM;
 983
 984                for_each_netdev(net, d) {
 985                        if (!sscanf(d->name, name, &i))
 986                                continue;
 987                        if (i < 0 || i >= max_netdevices)
 988                                continue;
 989
 990                        /*  avoid cases where sscanf is not exact inverse of printf */
 991                        snprintf(buf, IFNAMSIZ, name, i);
 992                        if (!strncmp(buf, d->name, IFNAMSIZ))
 993                                set_bit(i, inuse);
 994                }
 995
 996                i = find_first_zero_bit(inuse, max_netdevices);
 997                free_page((unsigned long) inuse);
 998        }
 999
1000        if (buf != name)
1001                snprintf(buf, IFNAMSIZ, name, i);
1002        if (!__dev_get_by_name(net, buf))
1003                return i;
1004
1005        /* It is possible to run out of possible slots
1006         * when the name is long and there isn't enough space left
1007         * for the digits, or if all bits are used.
1008         */
1009        return -ENFILE;
1010}
1011
1012/**
1013 *      dev_alloc_name - allocate a name for a device
1014 *      @dev: device
1015 *      @name: name format string
1016 *
1017 *      Passed a format string - eg "lt%d" it will try and find a suitable
1018 *      id. It scans list of devices to build up a free map, then chooses
1019 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1020 *      while allocating the name and adding the device in order to avoid
1021 *      duplicates.
1022 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023 *      Returns the number of the unit assigned or a negative errno code.
1024 */
1025
1026int dev_alloc_name(struct net_device *dev, const char *name)
1027{
1028        char buf[IFNAMSIZ];
1029        struct net *net;
1030        int ret;
1031
1032        BUG_ON(!dev_net(dev));
1033        net = dev_net(dev);
1034        ret = __dev_alloc_name(net, name, buf);
1035        if (ret >= 0)
1036                strlcpy(dev->name, buf, IFNAMSIZ);
1037        return ret;
1038}
1039EXPORT_SYMBOL(dev_alloc_name);
1040
1041static int dev_alloc_name_ns(struct net *net,
1042                             struct net_device *dev,
1043                             const char *name)
1044{
1045        char buf[IFNAMSIZ];
1046        int ret;
1047
1048        ret = __dev_alloc_name(net, name, buf);
1049        if (ret >= 0)
1050                strlcpy(dev->name, buf, IFNAMSIZ);
1051        return ret;
1052}
1053
1054static int dev_get_valid_name(struct net *net,
1055                              struct net_device *dev,
1056                              const char *name)
1057{
1058        BUG_ON(!net);
1059
1060        if (!dev_valid_name(name))
1061                return -EINVAL;
1062
1063        if (strchr(name, '%'))
1064                return dev_alloc_name_ns(net, dev, name);
1065        else if (__dev_get_by_name(net, name))
1066                return -EEXIST;
1067        else if (dev->name != name)
1068                strlcpy(dev->name, name, IFNAMSIZ);
1069
1070        return 0;
1071}
1072
1073/**
1074 *      dev_change_name - change name of a device
1075 *      @dev: device
1076 *      @newname: name (or format string) must be at least IFNAMSIZ
1077 *
1078 *      Change name of a device, can pass format strings "eth%d".
1079 *      for wildcarding.
1080 */
1081int dev_change_name(struct net_device *dev, const char *newname)
1082{
1083        char oldname[IFNAMSIZ];
1084        int err = 0;
1085        int ret;
1086        struct net *net;
1087
1088        ASSERT_RTNL();
1089        BUG_ON(!dev_net(dev));
1090
1091        net = dev_net(dev);
1092        if (dev->flags & IFF_UP)
1093                return -EBUSY;
1094
1095        write_seqcount_begin(&devnet_rename_seq);
1096
1097        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098                write_seqcount_end(&devnet_rename_seq);
1099                return 0;
1100        }
1101
1102        memcpy(oldname, dev->name, IFNAMSIZ);
1103
1104        err = dev_get_valid_name(net, dev, newname);
1105        if (err < 0) {
1106                write_seqcount_end(&devnet_rename_seq);
1107                return err;
1108        }
1109
1110rollback:
1111        ret = device_rename(&dev->dev, dev->name);
1112        if (ret) {
1113                memcpy(dev->name, oldname, IFNAMSIZ);
1114                write_seqcount_end(&devnet_rename_seq);
1115                return ret;
1116        }
1117
1118        write_seqcount_end(&devnet_rename_seq);
1119
1120        write_lock_bh(&dev_base_lock);
1121        hlist_del_rcu(&dev->name_hlist);
1122        write_unlock_bh(&dev_base_lock);
1123
1124        synchronize_rcu();
1125
1126        write_lock_bh(&dev_base_lock);
1127        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128        write_unlock_bh(&dev_base_lock);
1129
1130        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131        ret = notifier_to_errno(ret);
1132
1133        if (ret) {
1134                /* err >= 0 after dev_alloc_name() or stores the first errno */
1135                if (err >= 0) {
1136                        err = ret;
1137                        write_seqcount_begin(&devnet_rename_seq);
1138                        memcpy(dev->name, oldname, IFNAMSIZ);
1139                        goto rollback;
1140                } else {
1141                        pr_err("%s: name change rollback failed: %d\n",
1142                               dev->name, ret);
1143                }
1144        }
1145
1146        return err;
1147}
1148
1149/**
1150 *      dev_set_alias - change ifalias of a device
1151 *      @dev: device
1152 *      @alias: name up to IFALIASZ
1153 *      @len: limit of bytes to copy from info
1154 *
1155 *      Set ifalias for a device,
1156 */
1157int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1158{
1159        char *new_ifalias;
1160
1161        ASSERT_RTNL();
1162
1163        if (len >= IFALIASZ)
1164                return -EINVAL;
1165
1166        if (!len) {
1167                kfree(dev->ifalias);
1168                dev->ifalias = NULL;
1169                return 0;
1170        }
1171
1172        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173        if (!new_ifalias)
1174                return -ENOMEM;
1175        dev->ifalias = new_ifalias;
1176
1177        strlcpy(dev->ifalias, alias, len+1);
1178        return len;
1179}
1180
1181
1182/**
1183 *      netdev_features_change - device changes features
1184 *      @dev: device to cause notification
1185 *
1186 *      Called to indicate a device has changed features.
1187 */
1188void netdev_features_change(struct net_device *dev)
1189{
1190        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1191}
1192EXPORT_SYMBOL(netdev_features_change);
1193
1194/**
1195 *      netdev_state_change - device changes state
1196 *      @dev: device to cause notification
1197 *
1198 *      Called to indicate a device has changed state. This function calls
1199 *      the notifier chains for netdev_chain and sends a NEWLINK message
1200 *      to the routing socket.
1201 */
1202void netdev_state_change(struct net_device *dev)
1203{
1204        if (dev->flags & IFF_UP) {
1205                call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207        }
1208}
1209EXPORT_SYMBOL(netdev_state_change);
1210
1211/**
1212 *      netdev_notify_peers - notify network peers about existence of @dev
1213 *      @dev: network device
1214 *
1215 * Generate traffic such that interested network peers are aware of
1216 * @dev, such as by generating a gratuitous ARP. This may be used when
1217 * a device wants to inform the rest of the network about some sort of
1218 * reconfiguration such as a failover event or virtual machine
1219 * migration.
1220 */
1221void netdev_notify_peers(struct net_device *dev)
1222{
1223        rtnl_lock();
1224        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225        rtnl_unlock();
1226}
1227EXPORT_SYMBOL(netdev_notify_peers);
1228
1229static int __dev_open(struct net_device *dev)
1230{
1231        const struct net_device_ops *ops = dev->netdev_ops;
1232        int ret;
1233
1234        ASSERT_RTNL();
1235
1236        if (!netif_device_present(dev))
1237                return -ENODEV;
1238
1239        /* Block netpoll from trying to do any rx path servicing.
1240         * If we don't do this there is a chance ndo_poll_controller
1241         * or ndo_poll may be running while we open the device
1242         */
1243        netpoll_rx_disable(dev);
1244
1245        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246        ret = notifier_to_errno(ret);
1247        if (ret)
1248                return ret;
1249
1250        set_bit(__LINK_STATE_START, &dev->state);
1251
1252        if (ops->ndo_validate_addr)
1253                ret = ops->ndo_validate_addr(dev);
1254
1255        if (!ret && ops->ndo_open)
1256                ret = ops->ndo_open(dev);
1257
1258        netpoll_rx_enable(dev);
1259
1260        if (ret)
1261                clear_bit(__LINK_STATE_START, &dev->state);
1262        else {
1263                dev->flags |= IFF_UP;
1264                net_dmaengine_get();
1265                dev_set_rx_mode(dev);
1266                dev_activate(dev);
1267                add_device_randomness(dev->dev_addr, dev->addr_len);
1268        }
1269
1270        return ret;
1271}
1272
1273/**
1274 *      dev_open        - prepare an interface for use.
1275 *      @dev:   device to open
1276 *
1277 *      Takes a device from down to up state. The device's private open
1278 *      function is invoked and then the multicast lists are loaded. Finally
1279 *      the device is moved into the up state and a %NETDEV_UP message is
1280 *      sent to the netdev notifier chain.
1281 *
1282 *      Calling this function on an active interface is a nop. On a failure
1283 *      a negative errno code is returned.
1284 */
1285int dev_open(struct net_device *dev)
1286{
1287        int ret;
1288
1289        if (dev->flags & IFF_UP)
1290                return 0;
1291
1292        ret = __dev_open(dev);
1293        if (ret < 0)
1294                return ret;
1295
1296        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297        call_netdevice_notifiers(NETDEV_UP, dev);
1298
1299        return ret;
1300}
1301EXPORT_SYMBOL(dev_open);
1302
1303static int __dev_close_many(struct list_head *head)
1304{
1305        struct net_device *dev;
1306
1307        ASSERT_RTNL();
1308        might_sleep();
1309
1310        list_for_each_entry(dev, head, unreg_list) {
1311                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1312
1313                clear_bit(__LINK_STATE_START, &dev->state);
1314
1315                /* Synchronize to scheduled poll. We cannot touch poll list, it
1316                 * can be even on different cpu. So just clear netif_running().
1317                 *
1318                 * dev->stop() will invoke napi_disable() on all of it's
1319                 * napi_struct instances on this device.
1320                 */
1321                smp_mb__after_clear_bit(); /* Commit netif_running(). */
1322        }
1323
1324        dev_deactivate_many(head);
1325
1326        list_for_each_entry(dev, head, unreg_list) {
1327                const struct net_device_ops *ops = dev->netdev_ops;
1328
1329                /*
1330                 *      Call the device specific close. This cannot fail.
1331                 *      Only if device is UP
1332                 *
1333                 *      We allow it to be called even after a DETACH hot-plug
1334                 *      event.
1335                 */
1336                if (ops->ndo_stop)
1337                        ops->ndo_stop(dev);
1338
1339                dev->flags &= ~IFF_UP;
1340                net_dmaengine_put();
1341        }
1342
1343        return 0;
1344}
1345
1346static int __dev_close(struct net_device *dev)
1347{
1348        int retval;
1349        LIST_HEAD(single);
1350
1351        /* Temporarily disable netpoll until the interface is down */
1352        netpoll_rx_disable(dev);
1353
1354        list_add(&dev->unreg_list, &single);
1355        retval = __dev_close_many(&single);
1356        list_del(&single);
1357
1358        netpoll_rx_enable(dev);
1359        return retval;
1360}
1361
1362static int dev_close_many(struct list_head *head)
1363{
1364        struct net_device *dev, *tmp;
1365        LIST_HEAD(tmp_list);
1366
1367        list_for_each_entry_safe(dev, tmp, head, unreg_list)
1368                if (!(dev->flags & IFF_UP))
1369                        list_move(&dev->unreg_list, &tmp_list);
1370
1371        __dev_close_many(head);
1372
1373        list_for_each_entry(dev, head, unreg_list) {
1374                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1375                call_netdevice_notifiers(NETDEV_DOWN, dev);
1376        }
1377
1378        /* rollback_registered_many needs the complete original list */
1379        list_splice(&tmp_list, head);
1380        return 0;
1381}
1382
1383/**
1384 *      dev_close - shutdown an interface.
1385 *      @dev: device to shutdown
1386 *
1387 *      This function moves an active device into down state. A
1388 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390 *      chain.
1391 */
1392int dev_close(struct net_device *dev)
1393{
1394        if (dev->flags & IFF_UP) {
1395                LIST_HEAD(single);
1396
1397                /* Block netpoll rx while the interface is going down */
1398                netpoll_rx_disable(dev);
1399
1400                list_add(&dev->unreg_list, &single);
1401                dev_close_many(&single);
1402                list_del(&single);
1403
1404                netpoll_rx_enable(dev);
1405        }
1406        return 0;
1407}
1408EXPORT_SYMBOL(dev_close);
1409
1410
1411/**
1412 *      dev_disable_lro - disable Large Receive Offload on a device
1413 *      @dev: device
1414 *
1415 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1416 *      called under RTNL.  This is needed if received packets may be
1417 *      forwarded to another interface.
1418 */
1419void dev_disable_lro(struct net_device *dev)
1420{
1421        /*
1422         * If we're trying to disable lro on a vlan device
1423         * use the underlying physical device instead
1424         */
1425        if (is_vlan_dev(dev))
1426                dev = vlan_dev_real_dev(dev);
1427
1428        dev->wanted_features &= ~NETIF_F_LRO;
1429        netdev_update_features(dev);
1430
1431        if (unlikely(dev->features & NETIF_F_LRO))
1432                netdev_WARN(dev, "failed to disable LRO!\n");
1433}
1434EXPORT_SYMBOL(dev_disable_lro);
1435
1436static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1437                                   struct net_device *dev)
1438{
1439        struct netdev_notifier_info info;
1440
1441        netdev_notifier_info_init(&info, dev);
1442        return nb->notifier_call(nb, val, &info);
1443}
1444
1445static int dev_boot_phase = 1;
1446
1447/**
1448 *      register_netdevice_notifier - register a network notifier block
1449 *      @nb: notifier
1450 *
1451 *      Register a notifier to be called when network device events occur.
1452 *      The notifier passed is linked into the kernel structures and must
1453 *      not be reused until it has been unregistered. A negative errno code
1454 *      is returned on a failure.
1455 *
1456 *      When registered all registration and up events are replayed
1457 *      to the new notifier to allow device to have a race free
1458 *      view of the network device list.
1459 */
1460
1461int register_netdevice_notifier(struct notifier_block *nb)
1462{
1463        struct net_device *dev;
1464        struct net_device *last;
1465        struct net *net;
1466        int err;
1467
1468        rtnl_lock();
1469        err = raw_notifier_chain_register(&netdev_chain, nb);
1470        if (err)
1471                goto unlock;
1472        if (dev_boot_phase)
1473                goto unlock;
1474        for_each_net(net) {
1475                for_each_netdev(net, dev) {
1476                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1477                        err = notifier_to_errno(err);
1478                        if (err)
1479                                goto rollback;
1480
1481                        if (!(dev->flags & IFF_UP))
1482                                continue;
1483
1484                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1485                }
1486        }
1487
1488unlock:
1489        rtnl_unlock();
1490        return err;
1491
1492rollback:
1493        last = dev;
1494        for_each_net(net) {
1495                for_each_netdev(net, dev) {
1496                        if (dev == last)
1497                                goto outroll;
1498
1499                        if (dev->flags & IFF_UP) {
1500                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1501                                                        dev);
1502                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1503                        }
1504                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1505                }
1506        }
1507
1508outroll:
1509        raw_notifier_chain_unregister(&netdev_chain, nb);
1510        goto unlock;
1511}
1512EXPORT_SYMBOL(register_netdevice_notifier);
1513
1514/**
1515 *      unregister_netdevice_notifier - unregister a network notifier block
1516 *      @nb: notifier
1517 *
1518 *      Unregister a notifier previously registered by
1519 *      register_netdevice_notifier(). The notifier is unlinked into the
1520 *      kernel structures and may then be reused. A negative errno code
1521 *      is returned on a failure.
1522 *
1523 *      After unregistering unregister and down device events are synthesized
1524 *      for all devices on the device list to the removed notifier to remove
1525 *      the need for special case cleanup code.
1526 */
1527
1528int unregister_netdevice_notifier(struct notifier_block *nb)
1529{
1530        struct net_device *dev;
1531        struct net *net;
1532        int err;
1533
1534        rtnl_lock();
1535        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1536        if (err)
1537                goto unlock;
1538
1539        for_each_net(net) {
1540                for_each_netdev(net, dev) {
1541                        if (dev->flags & IFF_UP) {
1542                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1543                                                        dev);
1544                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1545                        }
1546                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1547                }
1548        }
1549unlock:
1550        rtnl_unlock();
1551        return err;
1552}
1553EXPORT_SYMBOL(unregister_netdevice_notifier);
1554
1555/**
1556 *      call_netdevice_notifiers_info - call all network notifier blocks
1557 *      @val: value passed unmodified to notifier function
1558 *      @dev: net_device pointer passed unmodified to notifier function
1559 *      @info: notifier information data
1560 *
1561 *      Call all network notifier blocks.  Parameters and return value
1562 *      are as for raw_notifier_call_chain().
1563 */
1564
1565int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1566                                  struct netdev_notifier_info *info)
1567{
1568        ASSERT_RTNL();
1569        netdev_notifier_info_init(info, dev);
1570        return raw_notifier_call_chain(&netdev_chain, val, info);
1571}
1572EXPORT_SYMBOL(call_netdevice_notifiers_info);
1573
1574/**
1575 *      call_netdevice_notifiers - call all network notifier blocks
1576 *      @val: value passed unmodified to notifier function
1577 *      @dev: net_device pointer passed unmodified to notifier function
1578 *
1579 *      Call all network notifier blocks.  Parameters and return value
1580 *      are as for raw_notifier_call_chain().
1581 */
1582
1583int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1584{
1585        struct netdev_notifier_info info;
1586
1587        return call_netdevice_notifiers_info(val, dev, &info);
1588}
1589EXPORT_SYMBOL(call_netdevice_notifiers);
1590
1591static struct static_key netstamp_needed __read_mostly;
1592#ifdef HAVE_JUMP_LABEL
1593/* We are not allowed to call static_key_slow_dec() from irq context
1594 * If net_disable_timestamp() is called from irq context, defer the
1595 * static_key_slow_dec() calls.
1596 */
1597static atomic_t netstamp_needed_deferred;
1598#endif
1599
1600void net_enable_timestamp(void)
1601{
1602#ifdef HAVE_JUMP_LABEL
1603        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1604
1605        if (deferred) {
1606                while (--deferred)
1607                        static_key_slow_dec(&netstamp_needed);
1608                return;
1609        }
1610#endif
1611        static_key_slow_inc(&netstamp_needed);
1612}
1613EXPORT_SYMBOL(net_enable_timestamp);
1614
1615void net_disable_timestamp(void)
1616{
1617#ifdef HAVE_JUMP_LABEL
1618        if (in_interrupt()) {
1619                atomic_inc(&netstamp_needed_deferred);
1620                return;
1621        }
1622#endif
1623        static_key_slow_dec(&netstamp_needed);
1624}
1625EXPORT_SYMBOL(net_disable_timestamp);
1626
1627static inline void net_timestamp_set(struct sk_buff *skb)
1628{
1629        skb->tstamp.tv64 = 0;
1630        if (static_key_false(&netstamp_needed))
1631                __net_timestamp(skb);
1632}
1633
1634#define net_timestamp_check(COND, SKB)                  \
1635        if (static_key_false(&netstamp_needed)) {               \
1636                if ((COND) && !(SKB)->tstamp.tv64)      \
1637                        __net_timestamp(SKB);           \
1638        }                                               \
1639
1640static inline bool is_skb_forwardable(struct net_device *dev,
1641                                      struct sk_buff *skb)
1642{
1643        unsigned int len;
1644
1645        if (!(dev->flags & IFF_UP))
1646                return false;
1647
1648        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1649        if (skb->len <= len)
1650                return true;
1651
1652        /* if TSO is enabled, we don't care about the length as the packet
1653         * could be forwarded without being segmented before
1654         */
1655        if (skb_is_gso(skb))
1656                return true;
1657
1658        return false;
1659}
1660
1661/**
1662 * dev_forward_skb - loopback an skb to another netif
1663 *
1664 * @dev: destination network device
1665 * @skb: buffer to forward
1666 *
1667 * return values:
1668 *      NET_RX_SUCCESS  (no congestion)
1669 *      NET_RX_DROP     (packet was dropped, but freed)
1670 *
1671 * dev_forward_skb can be used for injecting an skb from the
1672 * start_xmit function of one device into the receive queue
1673 * of another device.
1674 *
1675 * The receiving device may be in another namespace, so
1676 * we have to clear all information in the skb that could
1677 * impact namespace isolation.
1678 */
1679int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680{
1681        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                        atomic_long_inc(&dev->rx_dropped);
1684                        kfree_skb(skb);
1685                        return NET_RX_DROP;
1686                }
1687        }
1688
1689        if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                atomic_long_inc(&dev->rx_dropped);
1691                kfree_skb(skb);
1692                return NET_RX_DROP;
1693        }
1694        skb_scrub_packet(skb);
1695        skb->protocol = eth_type_trans(skb, dev);
1696
1697        /* eth_type_trans() can set pkt_type.
1698         * clear pkt_type _after_ calling eth_type_trans()
1699         */
1700        skb->pkt_type = PACKET_HOST;
1701
1702        return netif_rx(skb);
1703}
1704EXPORT_SYMBOL_GPL(dev_forward_skb);
1705
1706static inline int deliver_skb(struct sk_buff *skb,
1707                              struct packet_type *pt_prev,
1708                              struct net_device *orig_dev)
1709{
1710        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711                return -ENOMEM;
1712        atomic_inc(&skb->users);
1713        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714}
1715
1716static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717{
1718        if (!ptype->af_packet_priv || !skb->sk)
1719                return false;
1720
1721        if (ptype->id_match)
1722                return ptype->id_match(ptype, skb->sk);
1723        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724                return true;
1725
1726        return false;
1727}
1728
1729/*
1730 *      Support routine. Sends outgoing frames to any network
1731 *      taps currently in use.
1732 */
1733
1734static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735{
1736        struct packet_type *ptype;
1737        struct sk_buff *skb2 = NULL;
1738        struct packet_type *pt_prev = NULL;
1739
1740        rcu_read_lock();
1741        list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742                /* Never send packets back to the socket
1743                 * they originated from - MvS (miquels@drinkel.ow.org)
1744                 */
1745                if ((ptype->dev == dev || !ptype->dev) &&
1746                    (!skb_loop_sk(ptype, skb))) {
1747                        if (pt_prev) {
1748                                deliver_skb(skb2, pt_prev, skb->dev);
1749                                pt_prev = ptype;
1750                                continue;
1751                        }
1752
1753                        skb2 = skb_clone(skb, GFP_ATOMIC);
1754                        if (!skb2)
1755                                break;
1756
1757                        net_timestamp_set(skb2);
1758
1759                        /* skb->nh should be correctly
1760                           set by sender, so that the second statement is
1761                           just protection against buggy protocols.
1762                         */
1763                        skb_reset_mac_header(skb2);
1764
1765                        if (skb_network_header(skb2) < skb2->data ||
1766                            skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768                                                     ntohs(skb2->protocol),
1769                                                     dev->name);
1770                                skb_reset_network_header(skb2);
1771                        }
1772
1773                        skb2->transport_header = skb2->network_header;
1774                        skb2->pkt_type = PACKET_OUTGOING;
1775                        pt_prev = ptype;
1776                }
1777        }
1778        if (pt_prev)
1779                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780        rcu_read_unlock();
1781}
1782
1783/**
1784 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785 * @dev: Network device
1786 * @txq: number of queues available
1787 *
1788 * If real_num_tx_queues is changed the tc mappings may no longer be
1789 * valid. To resolve this verify the tc mapping remains valid and if
1790 * not NULL the mapping. With no priorities mapping to this
1791 * offset/count pair it will no longer be used. In the worst case TC0
1792 * is invalid nothing can be done so disable priority mappings. If is
1793 * expected that drivers will fix this mapping if they can before
1794 * calling netif_set_real_num_tx_queues.
1795 */
1796static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797{
1798        int i;
1799        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800
1801        /* If TC0 is invalidated disable TC mapping */
1802        if (tc->offset + tc->count > txq) {
1803                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804                dev->num_tc = 0;
1805                return;
1806        }
1807
1808        /* Invalidated prio to tc mappings set to TC0 */
1809        for (i = 1; i < TC_BITMASK + 1; i++) {
1810                int q = netdev_get_prio_tc_map(dev, i);
1811
1812                tc = &dev->tc_to_txq[q];
1813                if (tc->offset + tc->count > txq) {
1814                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815                                i, q);
1816                        netdev_set_prio_tc_map(dev, i, 0);
1817                }
1818        }
1819}
1820
1821#ifdef CONFIG_XPS
1822static DEFINE_MUTEX(xps_map_mutex);
1823#define xmap_dereference(P)             \
1824        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825
1826static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827                                        int cpu, u16 index)
1828{
1829        struct xps_map *map = NULL;
1830        int pos;
1831
1832        if (dev_maps)
1833                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834
1835        for (pos = 0; map && pos < map->len; pos++) {
1836                if (map->queues[pos] == index) {
1837                        if (map->len > 1) {
1838                                map->queues[pos] = map->queues[--map->len];
1839                        } else {
1840                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841                                kfree_rcu(map, rcu);
1842                                map = NULL;
1843                        }
1844                        break;
1845                }
1846        }
1847
1848        return map;
1849}
1850
1851static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852{
1853        struct xps_dev_maps *dev_maps;
1854        int cpu, i;
1855        bool active = false;
1856
1857        mutex_lock(&xps_map_mutex);
1858        dev_maps = xmap_dereference(dev->xps_maps);
1859
1860        if (!dev_maps)
1861                goto out_no_maps;
1862
1863        for_each_possible_cpu(cpu) {
1864                for (i = index; i < dev->num_tx_queues; i++) {
1865                        if (!remove_xps_queue(dev_maps, cpu, i))
1866                                break;
1867                }
1868                if (i == dev->num_tx_queues)
1869                        active = true;
1870        }
1871
1872        if (!active) {
1873                RCU_INIT_POINTER(dev->xps_maps, NULL);
1874                kfree_rcu(dev_maps, rcu);
1875        }
1876
1877        for (i = index; i < dev->num_tx_queues; i++)
1878                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879                                             NUMA_NO_NODE);
1880
1881out_no_maps:
1882        mutex_unlock(&xps_map_mutex);
1883}
1884
1885static struct xps_map *expand_xps_map(struct xps_map *map,
1886                                      int cpu, u16 index)
1887{
1888        struct xps_map *new_map;
1889        int alloc_len = XPS_MIN_MAP_ALLOC;
1890        int i, pos;
1891
1892        for (pos = 0; map && pos < map->len; pos++) {
1893                if (map->queues[pos] != index)
1894                        continue;
1895                return map;
1896        }
1897
1898        /* Need to add queue to this CPU's existing map */
1899        if (map) {
1900                if (pos < map->alloc_len)
1901                        return map;
1902
1903                alloc_len = map->alloc_len * 2;
1904        }
1905
1906        /* Need to allocate new map to store queue on this CPU's map */
1907        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908                               cpu_to_node(cpu));
1909        if (!new_map)
1910                return NULL;
1911
1912        for (i = 0; i < pos; i++)
1913                new_map->queues[i] = map->queues[i];
1914        new_map->alloc_len = alloc_len;
1915        new_map->len = pos;
1916
1917        return new_map;
1918}
1919
1920int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1921{
1922        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1923        struct xps_map *map, *new_map;
1924        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1925        int cpu, numa_node_id = -2;
1926        bool active = false;
1927
1928        mutex_lock(&xps_map_mutex);
1929
1930        dev_maps = xmap_dereference(dev->xps_maps);
1931
1932        /* allocate memory for queue storage */
1933        for_each_online_cpu(cpu) {
1934                if (!cpumask_test_cpu(cpu, mask))
1935                        continue;
1936
1937                if (!new_dev_maps)
1938                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1939                if (!new_dev_maps) {
1940                        mutex_unlock(&xps_map_mutex);
1941                        return -ENOMEM;
1942                }
1943
1944                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1945                                 NULL;
1946
1947                map = expand_xps_map(map, cpu, index);
1948                if (!map)
1949                        goto error;
1950
1951                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1952        }
1953
1954        if (!new_dev_maps)
1955                goto out_no_new_maps;
1956
1957        for_each_possible_cpu(cpu) {
1958                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1959                        /* add queue to CPU maps */
1960                        int pos = 0;
1961
1962                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1963                        while ((pos < map->len) && (map->queues[pos] != index))
1964                                pos++;
1965
1966                        if (pos == map->len)
1967                                map->queues[map->len++] = index;
1968#ifdef CONFIG_NUMA
1969                        if (numa_node_id == -2)
1970                                numa_node_id = cpu_to_node(cpu);
1971                        else if (numa_node_id != cpu_to_node(cpu))
1972                                numa_node_id = -1;
1973#endif
1974                } else if (dev_maps) {
1975                        /* fill in the new device map from the old device map */
1976                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
1977                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1978                }
1979
1980        }
1981
1982        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1983
1984        /* Cleanup old maps */
1985        if (dev_maps) {
1986                for_each_possible_cpu(cpu) {
1987                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1988                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
1989                        if (map && map != new_map)
1990                                kfree_rcu(map, rcu);
1991                }
1992
1993                kfree_rcu(dev_maps, rcu);
1994        }
1995
1996        dev_maps = new_dev_maps;
1997        active = true;
1998
1999out_no_new_maps:
2000        /* update Tx queue numa node */
2001        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2002                                     (numa_node_id >= 0) ? numa_node_id :
2003                                     NUMA_NO_NODE);
2004
2005        if (!dev_maps)
2006                goto out_no_maps;
2007
2008        /* removes queue from unused CPUs */
2009        for_each_possible_cpu(cpu) {
2010                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2011                        continue;
2012
2013                if (remove_xps_queue(dev_maps, cpu, index))
2014                        active = true;
2015        }
2016
2017        /* free map if not active */
2018        if (!active) {
2019                RCU_INIT_POINTER(dev->xps_maps, NULL);
2020                kfree_rcu(dev_maps, rcu);
2021        }
2022
2023out_no_maps:
2024        mutex_unlock(&xps_map_mutex);
2025
2026        return 0;
2027error:
2028        /* remove any maps that we added */
2029        for_each_possible_cpu(cpu) {
2030                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2031                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2032                                 NULL;
2033                if (new_map && new_map != map)
2034                        kfree(new_map);
2035        }
2036
2037        mutex_unlock(&xps_map_mutex);
2038
2039        kfree(new_dev_maps);
2040        return -ENOMEM;
2041}
2042EXPORT_SYMBOL(netif_set_xps_queue);
2043
2044#endif
2045/*
2046 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2047 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2048 */
2049int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2050{
2051        int rc;
2052
2053        if (txq < 1 || txq > dev->num_tx_queues)
2054                return -EINVAL;
2055
2056        if (dev->reg_state == NETREG_REGISTERED ||
2057            dev->reg_state == NETREG_UNREGISTERING) {
2058                ASSERT_RTNL();
2059
2060                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2061                                                  txq);
2062                if (rc)
2063                        return rc;
2064
2065                if (dev->num_tc)
2066                        netif_setup_tc(dev, txq);
2067
2068                if (txq < dev->real_num_tx_queues) {
2069                        qdisc_reset_all_tx_gt(dev, txq);
2070#ifdef CONFIG_XPS
2071                        netif_reset_xps_queues_gt(dev, txq);
2072#endif
2073                }
2074        }
2075
2076        dev->real_num_tx_queues = txq;
2077        return 0;
2078}
2079EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2080
2081#ifdef CONFIG_RPS
2082/**
2083 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2084 *      @dev: Network device
2085 *      @rxq: Actual number of RX queues
2086 *
2087 *      This must be called either with the rtnl_lock held or before
2088 *      registration of the net device.  Returns 0 on success, or a
2089 *      negative error code.  If called before registration, it always
2090 *      succeeds.
2091 */
2092int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2093{
2094        int rc;
2095
2096        if (rxq < 1 || rxq > dev->num_rx_queues)
2097                return -EINVAL;
2098
2099        if (dev->reg_state == NETREG_REGISTERED) {
2100                ASSERT_RTNL();
2101
2102                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2103                                                  rxq);
2104                if (rc)
2105                        return rc;
2106        }
2107
2108        dev->real_num_rx_queues = rxq;
2109        return 0;
2110}
2111EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2112#endif
2113
2114/**
2115 * netif_get_num_default_rss_queues - default number of RSS queues
2116 *
2117 * This routine should set an upper limit on the number of RSS queues
2118 * used by default by multiqueue devices.
2119 */
2120int netif_get_num_default_rss_queues(void)
2121{
2122        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2123}
2124EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2125
2126static inline void __netif_reschedule(struct Qdisc *q)
2127{
2128        struct softnet_data *sd;
2129        unsigned long flags;
2130
2131        local_irq_save(flags);
2132        sd = &__get_cpu_var(softnet_data);
2133        q->next_sched = NULL;
2134        *sd->output_queue_tailp = q;
2135        sd->output_queue_tailp = &q->next_sched;
2136        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2137        local_irq_restore(flags);
2138}
2139
2140void __netif_schedule(struct Qdisc *q)
2141{
2142        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2143                __netif_reschedule(q);
2144}
2145EXPORT_SYMBOL(__netif_schedule);
2146
2147void dev_kfree_skb_irq(struct sk_buff *skb)
2148{
2149        if (atomic_dec_and_test(&skb->users)) {
2150                struct softnet_data *sd;
2151                unsigned long flags;
2152
2153                local_irq_save(flags);
2154                sd = &__get_cpu_var(softnet_data);
2155                skb->next = sd->completion_queue;
2156                sd->completion_queue = skb;
2157                raise_softirq_irqoff(NET_TX_SOFTIRQ);
2158                local_irq_restore(flags);
2159        }
2160}
2161EXPORT_SYMBOL(dev_kfree_skb_irq);
2162
2163void dev_kfree_skb_any(struct sk_buff *skb)
2164{
2165        if (in_irq() || irqs_disabled())
2166                dev_kfree_skb_irq(skb);
2167        else
2168                dev_kfree_skb(skb);
2169}
2170EXPORT_SYMBOL(dev_kfree_skb_any);
2171
2172
2173/**
2174 * netif_device_detach - mark device as removed
2175 * @dev: network device
2176 *
2177 * Mark device as removed from system and therefore no longer available.
2178 */
2179void netif_device_detach(struct net_device *dev)
2180{
2181        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2182            netif_running(dev)) {
2183                netif_tx_stop_all_queues(dev);
2184        }
2185}
2186EXPORT_SYMBOL(netif_device_detach);
2187
2188/**
2189 * netif_device_attach - mark device as attached
2190 * @dev: network device
2191 *
2192 * Mark device as attached from system and restart if needed.
2193 */
2194void netif_device_attach(struct net_device *dev)
2195{
2196        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2197            netif_running(dev)) {
2198                netif_tx_wake_all_queues(dev);
2199                __netdev_watchdog_up(dev);
2200        }
2201}
2202EXPORT_SYMBOL(netif_device_attach);
2203
2204static void skb_warn_bad_offload(const struct sk_buff *skb)
2205{
2206        static const netdev_features_t null_features = 0;
2207        struct net_device *dev = skb->dev;
2208        const char *driver = "";
2209
2210        if (!net_ratelimit())
2211                return;
2212
2213        if (dev && dev->dev.parent)
2214                driver = dev_driver_string(dev->dev.parent);
2215
2216        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2217             "gso_type=%d ip_summed=%d\n",
2218             driver, dev ? &dev->features : &null_features,
2219             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2220             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2221             skb_shinfo(skb)->gso_type, skb->ip_summed);
2222}
2223
2224/*
2225 * Invalidate hardware checksum when packet is to be mangled, and
2226 * complete checksum manually on outgoing path.
2227 */
2228int skb_checksum_help(struct sk_buff *skb)
2229{
2230        __wsum csum;
2231        int ret = 0, offset;
2232
2233        if (skb->ip_summed == CHECKSUM_COMPLETE)
2234                goto out_set_summed;
2235
2236        if (unlikely(skb_shinfo(skb)->gso_size)) {
2237                skb_warn_bad_offload(skb);
2238                return -EINVAL;
2239        }
2240
2241        /* Before computing a checksum, we should make sure no frag could
2242         * be modified by an external entity : checksum could be wrong.
2243         */
2244        if (skb_has_shared_frag(skb)) {
2245                ret = __skb_linearize(skb);
2246                if (ret)
2247                        goto out;
2248        }
2249
2250        offset = skb_checksum_start_offset(skb);
2251        BUG_ON(offset >= skb_headlen(skb));
2252        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2253
2254        offset += skb->csum_offset;
2255        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2256
2257        if (skb_cloned(skb) &&
2258            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2259                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2260                if (ret)
2261                        goto out;
2262        }
2263
2264        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2265out_set_summed:
2266        skb->ip_summed = CHECKSUM_NONE;
2267out:
2268        return ret;
2269}
2270EXPORT_SYMBOL(skb_checksum_help);
2271
2272__be16 skb_network_protocol(struct sk_buff *skb)
2273{
2274        __be16 type = skb->protocol;
2275        int vlan_depth = ETH_HLEN;
2276
2277        /* Tunnel gso handlers can set protocol to ethernet. */
2278        if (type == htons(ETH_P_TEB)) {
2279                struct ethhdr *eth;
2280
2281                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2282                        return 0;
2283
2284                eth = (struct ethhdr *)skb_mac_header(skb);
2285                type = eth->h_proto;
2286        }
2287
2288        while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2289                struct vlan_hdr *vh;
2290
2291                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2292                        return 0;
2293
2294                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2295                type = vh->h_vlan_encapsulated_proto;
2296                vlan_depth += VLAN_HLEN;
2297        }
2298
2299        return type;
2300}
2301
2302/**
2303 *      skb_mac_gso_segment - mac layer segmentation handler.
2304 *      @skb: buffer to segment
2305 *      @features: features for the output path (see dev->features)
2306 */
2307struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2308                                    netdev_features_t features)
2309{
2310        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2311        struct packet_offload *ptype;
2312        __be16 type = skb_network_protocol(skb);
2313
2314        if (unlikely(!type))
2315                return ERR_PTR(-EINVAL);
2316
2317        __skb_pull(skb, skb->mac_len);
2318
2319        rcu_read_lock();
2320        list_for_each_entry_rcu(ptype, &offload_base, list) {
2321                if (ptype->type == type && ptype->callbacks.gso_segment) {
2322                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2323                                int err;
2324
2325                                err = ptype->callbacks.gso_send_check(skb);
2326                                segs = ERR_PTR(err);
2327                                if (err || skb_gso_ok(skb, features))
2328                                        break;
2329                                __skb_push(skb, (skb->data -
2330                                                 skb_network_header(skb)));
2331                        }
2332                        segs = ptype->callbacks.gso_segment(skb, features);
2333                        break;
2334                }
2335        }
2336        rcu_read_unlock();
2337
2338        __skb_push(skb, skb->data - skb_mac_header(skb));
2339
2340        return segs;
2341}
2342EXPORT_SYMBOL(skb_mac_gso_segment);
2343
2344
2345/* openvswitch calls this on rx path, so we need a different check.
2346 */
2347static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2348{
2349        if (tx_path)
2350                return skb->ip_summed != CHECKSUM_PARTIAL;
2351        else
2352                return skb->ip_summed == CHECKSUM_NONE;
2353}
2354
2355/**
2356 *      __skb_gso_segment - Perform segmentation on skb.
2357 *      @skb: buffer to segment
2358 *      @features: features for the output path (see dev->features)
2359 *      @tx_path: whether it is called in TX path
2360 *
2361 *      This function segments the given skb and returns a list of segments.
2362 *
2363 *      It may return NULL if the skb requires no segmentation.  This is
2364 *      only possible when GSO is used for verifying header integrity.
2365 */
2366struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2367                                  netdev_features_t features, bool tx_path)
2368{
2369        if (unlikely(skb_needs_check(skb, tx_path))) {
2370                int err;
2371
2372                skb_warn_bad_offload(skb);
2373
2374                if (skb_header_cloned(skb) &&
2375                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2376                        return ERR_PTR(err);
2377        }
2378
2379        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2380        skb_reset_mac_header(skb);
2381        skb_reset_mac_len(skb);
2382
2383        return skb_mac_gso_segment(skb, features);
2384}
2385EXPORT_SYMBOL(__skb_gso_segment);
2386
2387/* Take action when hardware reception checksum errors are detected. */
2388#ifdef CONFIG_BUG
2389void netdev_rx_csum_fault(struct net_device *dev)
2390{
2391        if (net_ratelimit()) {
2392                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2393                dump_stack();
2394        }
2395}
2396EXPORT_SYMBOL(netdev_rx_csum_fault);
2397#endif
2398
2399/* Actually, we should eliminate this check as soon as we know, that:
2400 * 1. IOMMU is present and allows to map all the memory.
2401 * 2. No high memory really exists on this machine.
2402 */
2403
2404static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2405{
2406#ifdef CONFIG_HIGHMEM
2407        int i;
2408        if (!(dev->features & NETIF_F_HIGHDMA)) {
2409                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2410                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2411                        if (PageHighMem(skb_frag_page(frag)))
2412                                return 1;
2413                }
2414        }
2415
2416        if (PCI_DMA_BUS_IS_PHYS) {
2417                struct device *pdev = dev->dev.parent;
2418
2419                if (!pdev)
2420                        return 0;
2421                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2422                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2423                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2424                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2425                                return 1;
2426                }
2427        }
2428#endif
2429        return 0;
2430}
2431
2432struct dev_gso_cb {
2433        void (*destructor)(struct sk_buff *skb);
2434};
2435
2436#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2437
2438static void dev_gso_skb_destructor(struct sk_buff *skb)
2439{
2440        struct dev_gso_cb *cb;
2441
2442        do {
2443                struct sk_buff *nskb = skb->next;
2444
2445                skb->next = nskb->next;
2446                nskb->next = NULL;
2447                kfree_skb(nskb);
2448        } while (skb->next);
2449
2450        cb = DEV_GSO_CB(skb);
2451        if (cb->destructor)
2452                cb->destructor(skb);
2453}
2454
2455/**
2456 *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2457 *      @skb: buffer to segment
2458 *      @features: device features as applicable to this skb
2459 *
2460 *      This function segments the given skb and stores the list of segments
2461 *      in skb->next.
2462 */
2463static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2464{
2465        struct sk_buff *segs;
2466
2467        segs = skb_gso_segment(skb, features);
2468
2469        /* Verifying header integrity only. */
2470        if (!segs)
2471                return 0;
2472
2473        if (IS_ERR(segs))
2474                return PTR_ERR(segs);
2475
2476        skb->next = segs;
2477        DEV_GSO_CB(skb)->destructor = skb->destructor;
2478        skb->destructor = dev_gso_skb_destructor;
2479
2480        return 0;
2481}
2482
2483static netdev_features_t harmonize_features(struct sk_buff *skb,
2484        netdev_features_t features)
2485{
2486        if (skb->ip_summed != CHECKSUM_NONE &&
2487            !can_checksum_protocol(features, skb_network_protocol(skb))) {
2488                features &= ~NETIF_F_ALL_CSUM;
2489        } else if (illegal_highdma(skb->dev, skb)) {
2490                features &= ~NETIF_F_SG;
2491        }
2492
2493        return features;
2494}
2495
2496netdev_features_t netif_skb_features(struct sk_buff *skb)
2497{
2498        __be16 protocol = skb->protocol;
2499        netdev_features_t features = skb->dev->features;
2500
2501        if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2502                features &= ~NETIF_F_GSO_MASK;
2503
2504        if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2505                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2506                protocol = veh->h_vlan_encapsulated_proto;
2507        } else if (!vlan_tx_tag_present(skb)) {
2508                return harmonize_features(skb, features);
2509        }
2510
2511        features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2512                                               NETIF_F_HW_VLAN_STAG_TX);
2513
2514        if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2515                features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2516                                NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2517                                NETIF_F_HW_VLAN_STAG_TX;
2518
2519        return harmonize_features(skb, features);
2520}
2521EXPORT_SYMBOL(netif_skb_features);
2522
2523/*
2524 * Returns true if either:
2525 *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2526 *      2. skb is fragmented and the device does not support SG.
2527 */
2528static inline int skb_needs_linearize(struct sk_buff *skb,
2529                                      netdev_features_t features)
2530{
2531        return skb_is_nonlinear(skb) &&
2532                        ((skb_has_frag_list(skb) &&
2533                                !(features & NETIF_F_FRAGLIST)) ||
2534                        (skb_shinfo(skb)->nr_frags &&
2535                                !(features & NETIF_F_SG)));
2536}
2537
2538int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2539                        struct netdev_queue *txq)
2540{
2541        const struct net_device_ops *ops = dev->netdev_ops;
2542        int rc = NETDEV_TX_OK;
2543        unsigned int skb_len;
2544
2545        if (likely(!skb->next)) {
2546                netdev_features_t features;
2547
2548                /*
2549                 * If device doesn't need skb->dst, release it right now while
2550                 * its hot in this cpu cache
2551                 */
2552                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2553                        skb_dst_drop(skb);
2554
2555                features = netif_skb_features(skb);
2556
2557                if (vlan_tx_tag_present(skb) &&
2558                    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2559                        skb = __vlan_put_tag(skb, skb->vlan_proto,
2560                                             vlan_tx_tag_get(skb));
2561                        if (unlikely(!skb))
2562                                goto out;
2563
2564                        skb->vlan_tci = 0;
2565                }
2566
2567                /* If encapsulation offload request, verify we are testing
2568                 * hardware encapsulation features instead of standard
2569                 * features for the netdev
2570                 */
2571                if (skb->encapsulation)
2572                        features &= dev->hw_enc_features;
2573
2574                if (netif_needs_gso(skb, features)) {
2575                        if (unlikely(dev_gso_segment(skb, features)))
2576                                goto out_kfree_skb;
2577                        if (skb->next)
2578                                goto gso;
2579                } else {
2580                        if (skb_needs_linearize(skb, features) &&
2581                            __skb_linearize(skb))
2582                                goto out_kfree_skb;
2583
2584                        /* If packet is not checksummed and device does not
2585                         * support checksumming for this protocol, complete
2586                         * checksumming here.
2587                         */
2588                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
2589                                if (skb->encapsulation)
2590                                        skb_set_inner_transport_header(skb,
2591                                                skb_checksum_start_offset(skb));
2592                                else
2593                                        skb_set_transport_header(skb,
2594                                                skb_checksum_start_offset(skb));
2595                                if (!(features & NETIF_F_ALL_CSUM) &&
2596                                     skb_checksum_help(skb))
2597                                        goto out_kfree_skb;
2598                        }
2599                }
2600
2601                if (!list_empty(&ptype_all))
2602                        dev_queue_xmit_nit(skb, dev);
2603
2604                skb_len = skb->len;
2605                rc = ops->ndo_start_xmit(skb, dev);
2606                trace_net_dev_xmit(skb, rc, dev, skb_len);
2607                if (rc == NETDEV_TX_OK)
2608                        txq_trans_update(txq);
2609                return rc;
2610        }
2611
2612gso:
2613        do {
2614                struct sk_buff *nskb = skb->next;
2615
2616                skb->next = nskb->next;
2617                nskb->next = NULL;
2618
2619                if (!list_empty(&ptype_all))
2620                        dev_queue_xmit_nit(nskb, dev);
2621
2622                skb_len = nskb->len;
2623                rc = ops->ndo_start_xmit(nskb, dev);
2624                trace_net_dev_xmit(nskb, rc, dev, skb_len);
2625                if (unlikely(rc != NETDEV_TX_OK)) {
2626                        if (rc & ~NETDEV_TX_MASK)
2627                                goto out_kfree_gso_skb;
2628                        nskb->next = skb->next;
2629                        skb->next = nskb;
2630                        return rc;
2631                }
2632                txq_trans_update(txq);
2633                if (unlikely(netif_xmit_stopped(txq) && skb->next))
2634                        return NETDEV_TX_BUSY;
2635        } while (skb->next);
2636
2637out_kfree_gso_skb:
2638        if (likely(skb->next == NULL)) {
2639                skb->destructor = DEV_GSO_CB(skb)->destructor;
2640                consume_skb(skb);
2641                return rc;
2642        }
2643out_kfree_skb:
2644        kfree_skb(skb);
2645out:
2646        return rc;
2647}
2648
2649static void qdisc_pkt_len_init(struct sk_buff *skb)
2650{
2651        const struct skb_shared_info *shinfo = skb_shinfo(skb);
2652
2653        qdisc_skb_cb(skb)->pkt_len = skb->len;
2654
2655        /* To get more precise estimation of bytes sent on wire,
2656         * we add to pkt_len the headers size of all segments
2657         */
2658        if (shinfo->gso_size)  {
2659                unsigned int hdr_len;
2660                u16 gso_segs = shinfo->gso_segs;
2661
2662                /* mac layer + network layer */
2663                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2664
2665                /* + transport layer */
2666                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2667                        hdr_len += tcp_hdrlen(skb);
2668                else
2669                        hdr_len += sizeof(struct udphdr);
2670
2671                if (shinfo->gso_type & SKB_GSO_DODGY)
2672                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2673                                                shinfo->gso_size);
2674
2675                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2676        }
2677}
2678
2679static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2680                                 struct net_device *dev,
2681                                 struct netdev_queue *txq)
2682{
2683        spinlock_t *root_lock = qdisc_lock(q);
2684        bool contended;
2685        int rc;
2686
2687        qdisc_pkt_len_init(skb);
2688        qdisc_calculate_pkt_len(skb, q);
2689        /*
2690         * Heuristic to force contended enqueues to serialize on a
2691         * separate lock before trying to get qdisc main lock.
2692         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2693         * and dequeue packets faster.
2694         */
2695        contended = qdisc_is_running(q);
2696        if (unlikely(contended))
2697                spin_lock(&q->busylock);
2698
2699        spin_lock(root_lock);
2700        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2701                kfree_skb(skb);
2702                rc = NET_XMIT_DROP;
2703        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2704                   qdisc_run_begin(q)) {
2705                /*
2706                 * This is a work-conserving queue; there are no old skbs
2707                 * waiting to be sent out; and the qdisc is not running -
2708                 * xmit the skb directly.
2709                 */
2710                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2711                        skb_dst_force(skb);
2712
2713                qdisc_bstats_update(q, skb);
2714
2715                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2716                        if (unlikely(contended)) {
2717                                spin_unlock(&q->busylock);
2718                                contended = false;
2719                        }
2720                        __qdisc_run(q);
2721                } else
2722                        qdisc_run_end(q);
2723
2724                rc = NET_XMIT_SUCCESS;
2725        } else {
2726                skb_dst_force(skb);
2727                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2728                if (qdisc_run_begin(q)) {
2729                        if (unlikely(contended)) {
2730                                spin_unlock(&q->busylock);
2731                                contended = false;
2732                        }
2733                        __qdisc_run(q);
2734                }
2735        }
2736        spin_unlock(root_lock);
2737        if (unlikely(contended))
2738                spin_unlock(&q->busylock);
2739        return rc;
2740}
2741
2742#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2743static void skb_update_prio(struct sk_buff *skb)
2744{
2745        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2746
2747        if (!skb->priority && skb->sk && map) {
2748                unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2749
2750                if (prioidx < map->priomap_len)
2751                        skb->priority = map->priomap[prioidx];
2752        }
2753}
2754#else
2755#define skb_update_prio(skb)
2756#endif
2757
2758static DEFINE_PER_CPU(int, xmit_recursion);
2759#define RECURSION_LIMIT 10
2760
2761/**
2762 *      dev_loopback_xmit - loop back @skb
2763 *      @skb: buffer to transmit
2764 */
2765int dev_loopback_xmit(struct sk_buff *skb)
2766{
2767        skb_reset_mac_header(skb);
2768        __skb_pull(skb, skb_network_offset(skb));
2769        skb->pkt_type = PACKET_LOOPBACK;
2770        skb->ip_summed = CHECKSUM_UNNECESSARY;
2771        WARN_ON(!skb_dst(skb));
2772        skb_dst_force(skb);
2773        netif_rx_ni(skb);
2774        return 0;
2775}
2776EXPORT_SYMBOL(dev_loopback_xmit);
2777
2778/**
2779 *      dev_queue_xmit - transmit a buffer
2780 *      @skb: buffer to transmit
2781 *
2782 *      Queue a buffer for transmission to a network device. The caller must
2783 *      have set the device and priority and built the buffer before calling
2784 *      this function. The function can be called from an interrupt.
2785 *
2786 *      A negative errno code is returned on a failure. A success does not
2787 *      guarantee the frame will be transmitted as it may be dropped due
2788 *      to congestion or traffic shaping.
2789 *
2790 * -----------------------------------------------------------------------------------
2791 *      I notice this method can also return errors from the queue disciplines,
2792 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2793 *      be positive.
2794 *
2795 *      Regardless of the return value, the skb is consumed, so it is currently
2796 *      difficult to retry a send to this method.  (You can bump the ref count
2797 *      before sending to hold a reference for retry if you are careful.)
2798 *
2799 *      When calling this method, interrupts MUST be enabled.  This is because
2800 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2801 *          --BLG
2802 */
2803int dev_queue_xmit(struct sk_buff *skb)
2804{
2805        struct net_device *dev = skb->dev;
2806        struct netdev_queue *txq;
2807        struct Qdisc *q;
2808        int rc = -ENOMEM;
2809
2810        skb_reset_mac_header(skb);
2811
2812        /* Disable soft irqs for various locks below. Also
2813         * stops preemption for RCU.
2814         */
2815        rcu_read_lock_bh();
2816
2817        skb_update_prio(skb);
2818
2819        txq = netdev_pick_tx(dev, skb);
2820        q = rcu_dereference_bh(txq->qdisc);
2821
2822#ifdef CONFIG_NET_CLS_ACT
2823        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2824#endif
2825        trace_net_dev_queue(skb);
2826        if (q->enqueue) {
2827                rc = __dev_xmit_skb(skb, q, dev, txq);
2828                goto out;
2829        }
2830
2831        /* The device has no queue. Common case for software devices:
2832           loopback, all the sorts of tunnels...
2833
2834           Really, it is unlikely that netif_tx_lock protection is necessary
2835           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2836           counters.)
2837           However, it is possible, that they rely on protection
2838           made by us here.
2839
2840           Check this and shot the lock. It is not prone from deadlocks.
2841           Either shot noqueue qdisc, it is even simpler 8)
2842         */
2843        if (dev->flags & IFF_UP) {
2844                int cpu = smp_processor_id(); /* ok because BHs are off */
2845
2846                if (txq->xmit_lock_owner != cpu) {
2847
2848                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2849                                goto recursion_alert;
2850
2851                        HARD_TX_LOCK(dev, txq, cpu);
2852
2853                        if (!netif_xmit_stopped(txq)) {
2854                                __this_cpu_inc(xmit_recursion);
2855                                rc = dev_hard_start_xmit(skb, dev, txq);
2856                                __this_cpu_dec(xmit_recursion);
2857                                if (dev_xmit_complete(rc)) {
2858                                        HARD_TX_UNLOCK(dev, txq);
2859                                        goto out;
2860                                }
2861                        }
2862                        HARD_TX_UNLOCK(dev, txq);
2863                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2864                                             dev->name);
2865                } else {
2866                        /* Recursion is detected! It is possible,
2867                         * unfortunately
2868                         */
2869recursion_alert:
2870                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2871                                             dev->name);
2872                }
2873        }
2874
2875        rc = -ENETDOWN;
2876        rcu_read_unlock_bh();
2877
2878        kfree_skb(skb);
2879        return rc;
2880out:
2881        rcu_read_unlock_bh();
2882        return rc;
2883}
2884EXPORT_SYMBOL(dev_queue_xmit);
2885
2886
2887/*=======================================================================
2888                        Receiver routines
2889  =======================================================================*/
2890
2891int netdev_max_backlog __read_mostly = 1000;
2892EXPORT_SYMBOL(netdev_max_backlog);
2893
2894int netdev_tstamp_prequeue __read_mostly = 1;
2895int netdev_budget __read_mostly = 300;
2896int weight_p __read_mostly = 64;            /* old backlog weight */
2897
2898/* Called with irq disabled */
2899static inline void ____napi_schedule(struct softnet_data *sd,
2900                                     struct napi_struct *napi)
2901{
2902        list_add_tail(&napi->poll_list, &sd->poll_list);
2903        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2904}
2905
2906#ifdef CONFIG_RPS
2907
2908/* One global table that all flow-based protocols share. */
2909struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2910EXPORT_SYMBOL(rps_sock_flow_table);
2911
2912struct static_key rps_needed __read_mostly;
2913
2914static struct rps_dev_flow *
2915set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2916            struct rps_dev_flow *rflow, u16 next_cpu)
2917{
2918        if (next_cpu != RPS_NO_CPU) {
2919#ifdef CONFIG_RFS_ACCEL
2920                struct netdev_rx_queue *rxqueue;
2921                struct rps_dev_flow_table *flow_table;
2922                struct rps_dev_flow *old_rflow;
2923                u32 flow_id;
2924                u16 rxq_index;
2925                int rc;
2926
2927                /* Should we steer this flow to a different hardware queue? */
2928                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2929                    !(dev->features & NETIF_F_NTUPLE))
2930                        goto out;
2931                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2932                if (rxq_index == skb_get_rx_queue(skb))
2933                        goto out;
2934
2935                rxqueue = dev->_rx + rxq_index;
2936                flow_table = rcu_dereference(rxqueue->rps_flow_table);
2937                if (!flow_table)
2938                        goto out;
2939                flow_id = skb->rxhash & flow_table->mask;
2940                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2941                                                        rxq_index, flow_id);
2942                if (rc < 0)
2943                        goto out;
2944                old_rflow = rflow;
2945                rflow = &flow_table->flows[flow_id];
2946                rflow->filter = rc;
2947                if (old_rflow->filter == rflow->filter)
2948                        old_rflow->filter = RPS_NO_FILTER;
2949        out:
2950#endif
2951                rflow->last_qtail =
2952                        per_cpu(softnet_data, next_cpu).input_queue_head;
2953        }
2954
2955        rflow->cpu = next_cpu;
2956        return rflow;
2957}
2958
2959/*
2960 * get_rps_cpu is called from netif_receive_skb and returns the target
2961 * CPU from the RPS map of the receiving queue for a given skb.
2962 * rcu_read_lock must be held on entry.
2963 */
2964static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2965                       struct rps_dev_flow **rflowp)
2966{
2967        struct netdev_rx_queue *rxqueue;
2968        struct rps_map *map;
2969        struct rps_dev_flow_table *flow_table;
2970        struct rps_sock_flow_table *sock_flow_table;
2971        int cpu = -1;
2972        u16 tcpu;
2973
2974        if (skb_rx_queue_recorded(skb)) {
2975                u16 index = skb_get_rx_queue(skb);
2976                if (unlikely(index >= dev->real_num_rx_queues)) {
2977                        WARN_ONCE(dev->real_num_rx_queues > 1,
2978                                  "%s received packet on queue %u, but number "
2979                                  "of RX queues is %u\n",
2980                                  dev->name, index, dev->real_num_rx_queues);
2981                        goto done;
2982                }
2983                rxqueue = dev->_rx + index;
2984        } else
2985                rxqueue = dev->_rx;
2986
2987        map = rcu_dereference(rxqueue->rps_map);
2988        if (map) {
2989                if (map->len == 1 &&
2990                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2991                        tcpu = map->cpus[0];
2992                        if (cpu_online(tcpu))
2993                                cpu = tcpu;
2994                        goto done;
2995                }
2996        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2997                goto done;
2998        }
2999
3000        skb_reset_network_header(skb);
3001        if (!skb_get_rxhash(skb))
3002                goto done;
3003
3004        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3005        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3006        if (flow_table && sock_flow_table) {
3007                u16 next_cpu;
3008                struct rps_dev_flow *rflow;
3009
3010                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3011                tcpu = rflow->cpu;
3012
3013                next_cpu = sock_flow_table->ents[skb->rxhash &
3014                    sock_flow_table->mask];
3015
3016                /*
3017                 * If the desired CPU (where last recvmsg was done) is
3018                 * different from current CPU (one in the rx-queue flow
3019                 * table entry), switch if one of the following holds:
3020                 *   - Current CPU is unset (equal to RPS_NO_CPU).
3021                 *   - Current CPU is offline.
3022                 *   - The current CPU's queue tail has advanced beyond the
3023                 *     last packet that was enqueued using this table entry.
3024                 *     This guarantees that all previous packets for the flow
3025                 *     have been dequeued, thus preserving in order delivery.
3026                 */
3027                if (unlikely(tcpu != next_cpu) &&
3028                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3029                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3030                      rflow->last_qtail)) >= 0)) {
3031                        tcpu = next_cpu;
3032                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3033                }
3034
3035                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3036                        *rflowp = rflow;
3037                        cpu = tcpu;
3038                        goto done;
3039                }
3040        }
3041
3042        if (map) {
3043                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3044
3045                if (cpu_online(tcpu)) {
3046                        cpu = tcpu;
3047                        goto done;
3048                }
3049        }
3050
3051done:
3052        return cpu;
3053}
3054
3055#ifdef CONFIG_RFS_ACCEL
3056
3057/**
3058 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3059 * @dev: Device on which the filter was set
3060 * @rxq_index: RX queue index
3061 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3062 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3063 *
3064 * Drivers that implement ndo_rx_flow_steer() should periodically call
3065 * this function for each installed filter and remove the filters for
3066 * which it returns %true.
3067 */
3068bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3069                         u32 flow_id, u16 filter_id)
3070{
3071        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3072        struct rps_dev_flow_table *flow_table;
3073        struct rps_dev_flow *rflow;
3074        bool expire = true;
3075        int cpu;
3076
3077        rcu_read_lock();
3078        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3079        if (flow_table && flow_id <= flow_table->mask) {
3080                rflow = &flow_table->flows[flow_id];
3081                cpu = ACCESS_ONCE(rflow->cpu);
3082                if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3083                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3084                           rflow->last_qtail) <
3085                     (int)(10 * flow_table->mask)))
3086                        expire = false;
3087        }
3088        rcu_read_unlock();
3089        return expire;
3090}
3091EXPORT_SYMBOL(rps_may_expire_flow);
3092
3093#endif /* CONFIG_RFS_ACCEL */
3094
3095/* Called from hardirq (IPI) context */
3096static void rps_trigger_softirq(void *data)
3097{
3098        struct softnet_data *sd = data;
3099
3100        ____napi_schedule(sd, &sd->backlog);
3101        sd->received_rps++;
3102}
3103
3104#endif /* CONFIG_RPS */
3105
3106/*
3107 * Check if this softnet_data structure is another cpu one
3108 * If yes, queue it to our IPI list and return 1
3109 * If no, return 0
3110 */
3111static int rps_ipi_queued(struct softnet_data *sd)
3112{
3113#ifdef CONFIG_RPS
3114        struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3115
3116        if (sd != mysd) {
3117                sd->rps_ipi_next = mysd->rps_ipi_list;
3118                mysd->rps_ipi_list = sd;
3119
3120                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3121                return 1;
3122        }
3123#endif /* CONFIG_RPS */
3124        return 0;
3125}
3126
3127#ifdef CONFIG_NET_FLOW_LIMIT
3128int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3129#endif
3130
3131static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3132{
3133#ifdef CONFIG_NET_FLOW_LIMIT
3134        struct sd_flow_limit *fl;
3135        struct softnet_data *sd;
3136        unsigned int old_flow, new_flow;
3137
3138        if (qlen < (netdev_max_backlog >> 1))
3139                return false;
3140
3141        sd = &__get_cpu_var(softnet_data);
3142
3143        rcu_read_lock();
3144        fl = rcu_dereference(sd->flow_limit);
3145        if (fl) {
3146                new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3147                old_flow = fl->history[fl->history_head];
3148                fl->history[fl->history_head] = new_flow;
3149
3150                fl->history_head++;
3151                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3152
3153                if (likely(fl->buckets[old_flow]))
3154                        fl->buckets[old_flow]--;
3155
3156                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3157                        fl->count++;
3158                        rcu_read_unlock();
3159                        return true;
3160                }
3161        }
3162        rcu_read_unlock();
3163#endif
3164        return false;
3165}
3166
3167/*
3168 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3169 * queue (may be a remote CPU queue).
3170 */
3171static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3172                              unsigned int *qtail)
3173{
3174        struct softnet_data *sd;
3175        unsigned long flags;
3176        unsigned int qlen;
3177
3178        sd = &per_cpu(softnet_data, cpu);
3179
3180        local_irq_save(flags);
3181
3182        rps_lock(sd);
3183        qlen = skb_queue_len(&sd->input_pkt_queue);
3184        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3185                if (skb_queue_len(&sd->input_pkt_queue)) {
3186enqueue:
3187                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3188                        input_queue_tail_incr_save(sd, qtail);
3189                        rps_unlock(sd);
3190                        local_irq_restore(flags);
3191                        return NET_RX_SUCCESS;
3192                }
3193
3194                /* Schedule NAPI for backlog device
3195                 * We can use non atomic operation since we own the queue lock
3196                 */
3197                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3198                        if (!rps_ipi_queued(sd))
3199                                ____napi_schedule(sd, &sd->backlog);
3200                }
3201                goto enqueue;
3202        }
3203
3204        sd->dropped++;
3205        rps_unlock(sd);
3206
3207        local_irq_restore(flags);
3208
3209        atomic_long_inc(&skb->dev->rx_dropped);
3210        kfree_skb(skb);
3211        return NET_RX_DROP;
3212}
3213
3214/**
3215 *      netif_rx        -       post buffer to the network code
3216 *      @skb: buffer to post
3217 *
3218 *      This function receives a packet from a device driver and queues it for
3219 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3220 *      may be dropped during processing for congestion control or by the
3221 *      protocol layers.
3222 *
3223 *      return values:
3224 *      NET_RX_SUCCESS  (no congestion)
3225 *      NET_RX_DROP     (packet was dropped)
3226 *
3227 */
3228
3229int netif_rx(struct sk_buff *skb)
3230{
3231        int ret;
3232
3233        /* if netpoll wants it, pretend we never saw it */
3234        if (netpoll_rx(skb))
3235                return NET_RX_DROP;
3236
3237        net_timestamp_check(netdev_tstamp_prequeue, skb);
3238
3239        trace_netif_rx(skb);
3240#ifdef CONFIG_RPS
3241        if (static_key_false(&rps_needed)) {
3242                struct rps_dev_flow voidflow, *rflow = &voidflow;
3243                int cpu;
3244
3245                preempt_disable();
3246                rcu_read_lock();
3247
3248                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3249                if (cpu < 0)
3250                        cpu = smp_processor_id();
3251
3252                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3253
3254                rcu_read_unlock();
3255                preempt_enable();
3256        } else
3257#endif
3258        {
3259                unsigned int qtail;
3260                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3261                put_cpu();
3262        }
3263        return ret;
3264}
3265EXPORT_SYMBOL(netif_rx);
3266
3267int netif_rx_ni(struct sk_buff *skb)
3268{
3269        int err;
3270
3271        preempt_disable();
3272        err = netif_rx(skb);
3273        if (local_softirq_pending())
3274                do_softirq();
3275        preempt_enable();
3276
3277        return err;
3278}
3279EXPORT_SYMBOL(netif_rx_ni);
3280
3281static void net_tx_action(struct softirq_action *h)
3282{
3283        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3284
3285        if (sd->completion_queue) {
3286                struct sk_buff *clist;
3287
3288                local_irq_disable();
3289                clist = sd->completion_queue;
3290                sd->completion_queue = NULL;
3291                local_irq_enable();
3292
3293                while (clist) {
3294                        struct sk_buff *skb = clist;
3295                        clist = clist->next;
3296
3297                        WARN_ON(atomic_read(&skb->users));
3298                        trace_kfree_skb(skb, net_tx_action);
3299                        __kfree_skb(skb);
3300                }
3301        }
3302
3303        if (sd->output_queue) {
3304                struct Qdisc *head;
3305
3306                local_irq_disable();
3307                head = sd->output_queue;
3308                sd->output_queue = NULL;
3309                sd->output_queue_tailp = &sd->output_queue;
3310                local_irq_enable();
3311
3312                while (head) {
3313                        struct Qdisc *q = head;
3314                        spinlock_t *root_lock;
3315
3316                        head = head->next_sched;
3317
3318                        root_lock = qdisc_lock(q);
3319                        if (spin_trylock(root_lock)) {
3320                                smp_mb__before_clear_bit();
3321                                clear_bit(__QDISC_STATE_SCHED,
3322                                          &q->state);
3323                                qdisc_run(q);
3324                                spin_unlock(root_lock);
3325                        } else {
3326                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3327                                              &q->state)) {
3328                                        __netif_reschedule(q);
3329                                } else {
3330                                        smp_mb__before_clear_bit();
3331                                        clear_bit(__QDISC_STATE_SCHED,
3332                                                  &q->state);
3333                                }
3334                        }
3335                }
3336        }
3337}
3338
3339#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3340    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3341/* This hook is defined here for ATM LANE */
3342int (*br_fdb_test_addr_hook)(struct net_device *dev,
3343                             unsigned char *addr) __read_mostly;
3344EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3345#endif
3346
3347#ifdef CONFIG_NET_CLS_ACT
3348/* TODO: Maybe we should just force sch_ingress to be compiled in
3349 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3350 * a compare and 2 stores extra right now if we dont have it on
3351 * but have CONFIG_NET_CLS_ACT
3352 * NOTE: This doesn't stop any functionality; if you dont have
3353 * the ingress scheduler, you just can't add policies on ingress.
3354 *
3355 */
3356static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3357{
3358        struct net_device *dev = skb->dev;
3359        u32 ttl = G_TC_RTTL(skb->tc_verd);
3360        int result = TC_ACT_OK;
3361        struct Qdisc *q;
3362
3363        if (unlikely(MAX_RED_LOOP < ttl++)) {
3364                net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3365                                     skb->skb_iif, dev->ifindex);
3366                return TC_ACT_SHOT;
3367        }
3368
3369        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3370        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3371
3372        q = rxq->qdisc;
3373        if (q != &noop_qdisc) {
3374                spin_lock(qdisc_lock(q));
3375                if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3376                        result = qdisc_enqueue_root(skb, q);
3377                spin_unlock(qdisc_lock(q));
3378        }
3379
3380        return result;
3381}
3382
3383static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3384                                         struct packet_type **pt_prev,
3385                                         int *ret, struct net_device *orig_dev)
3386{
3387        struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3388
3389        if (!rxq || rxq->qdisc == &noop_qdisc)
3390                goto out;
3391
3392        if (*pt_prev) {
3393                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3394                *pt_prev = NULL;
3395        }
3396
3397        switch (ing_filter(skb, rxq)) {
3398        case TC_ACT_SHOT:
3399        case TC_ACT_STOLEN:
3400                kfree_skb(skb);
3401                return NULL;
3402        }
3403
3404out:
3405        skb->tc_verd = 0;
3406        return skb;
3407}
3408#endif
3409
3410/**
3411 *      netdev_rx_handler_register - register receive handler
3412 *      @dev: device to register a handler for
3413 *      @rx_handler: receive handler to register
3414 *      @rx_handler_data: data pointer that is used by rx handler
3415 *
3416 *      Register a receive hander for a device. This handler will then be
3417 *      called from __netif_receive_skb. A negative errno code is returned
3418 *      on a failure.
3419 *
3420 *      The caller must hold the rtnl_mutex.
3421 *
3422 *      For a general description of rx_handler, see enum rx_handler_result.
3423 */
3424int netdev_rx_handler_register(struct net_device *dev,
3425                               rx_handler_func_t *rx_handler,
3426                               void *rx_handler_data)
3427{
3428        ASSERT_RTNL();
3429
3430        if (dev->rx_handler)
3431                return -EBUSY;
3432
3433        /* Note: rx_handler_data must be set before rx_handler */
3434        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3435        rcu_assign_pointer(dev->rx_handler, rx_handler);
3436
3437        return 0;
3438}
3439EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3440
3441/**
3442 *      netdev_rx_handler_unregister - unregister receive handler
3443 *      @dev: device to unregister a handler from
3444 *
3445 *      Unregister a receive handler from a device.
3446 *
3447 *      The caller must hold the rtnl_mutex.
3448 */
3449void netdev_rx_handler_unregister(struct net_device *dev)
3450{
3451
3452        ASSERT_RTNL();
3453        RCU_INIT_POINTER(dev->rx_handler, NULL);
3454        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3455         * section has a guarantee to see a non NULL rx_handler_data
3456         * as well.
3457         */
3458        synchronize_net();
3459        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3460}
3461EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3462
3463/*
3464 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3465 * the special handling of PFMEMALLOC skbs.
3466 */
3467static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3468{
3469        switch (skb->protocol) {
3470        case __constant_htons(ETH_P_ARP):
3471        case __constant_htons(ETH_P_IP):
3472        case __constant_htons(ETH_P_IPV6):
3473        case __constant_htons(ETH_P_8021Q):
3474        case __constant_htons(ETH_P_8021AD):
3475                return true;
3476        default:
3477                return false;
3478        }
3479}
3480
3481static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3482{
3483        struct packet_type *ptype, *pt_prev;
3484        rx_handler_func_t *rx_handler;
3485        struct net_device *orig_dev;
3486        struct net_device *null_or_dev;
3487        bool deliver_exact = false;
3488        int ret = NET_RX_DROP;
3489        __be16 type;
3490
3491        net_timestamp_check(!netdev_tstamp_prequeue, skb);
3492
3493        trace_netif_receive_skb(skb);
3494
3495        /* if we've gotten here through NAPI, check netpoll */
3496        if (netpoll_receive_skb(skb))
3497                goto out;
3498
3499        orig_dev = skb->dev;
3500
3501        skb_reset_network_header(skb);
3502        if (!skb_transport_header_was_set(skb))
3503                skb_reset_transport_header(skb);
3504        skb_reset_mac_len(skb);
3505
3506        pt_prev = NULL;
3507
3508        rcu_read_lock();
3509
3510another_round:
3511        skb->skb_iif = skb->dev->ifindex;
3512
3513        __this_cpu_inc(softnet_data.processed);
3514
3515        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3516            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3517                skb = vlan_untag(skb);
3518                if (unlikely(!skb))
3519                        goto unlock;
3520        }
3521
3522#ifdef CONFIG_NET_CLS_ACT
3523        if (skb->tc_verd & TC_NCLS) {
3524                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3525                goto ncls;
3526        }
3527#endif
3528
3529        if (pfmemalloc)
3530                goto skip_taps;
3531
3532        list_for_each_entry_rcu(ptype, &ptype_all, list) {
3533                if (!ptype->dev || ptype->dev == skb->dev) {
3534                        if (pt_prev)
3535                                ret = deliver_skb(skb, pt_prev, orig_dev);
3536                        pt_prev = ptype;
3537                }
3538        }
3539
3540skip_taps:
3541#ifdef CONFIG_NET_CLS_ACT
3542        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3543        if (!skb)
3544                goto unlock;
3545ncls:
3546#endif
3547
3548        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3549                goto drop;
3550
3551        if (vlan_tx_tag_present(skb)) {
3552                if (pt_prev) {
3553                        ret = deliver_skb(skb, pt_prev, orig_dev);
3554                        pt_prev = NULL;
3555                }
3556                if (vlan_do_receive(&skb))
3557                        goto another_round;
3558                else if (unlikely(!skb))
3559                        goto unlock;
3560        }
3561
3562        rx_handler = rcu_dereference(skb->dev->rx_handler);
3563        if (rx_handler) {
3564                if (pt_prev) {
3565                        ret = deliver_skb(skb, pt_prev, orig_dev);
3566                        pt_prev = NULL;
3567                }
3568                switch (rx_handler(&skb)) {
3569                case RX_HANDLER_CONSUMED:
3570                        ret = NET_RX_SUCCESS;
3571                        goto unlock;
3572                case RX_HANDLER_ANOTHER:
3573                        goto another_round;
3574                case RX_HANDLER_EXACT:
3575                        deliver_exact = true;
3576                case RX_HANDLER_PASS:
3577                        break;
3578                default:
3579                        BUG();
3580                }
3581        }
3582
3583        if (unlikely(vlan_tx_tag_present(skb))) {
3584                if (vlan_tx_tag_get_id(skb))
3585                        skb->pkt_type = PACKET_OTHERHOST;
3586                /* Note: we might in the future use prio bits
3587                 * and set skb->priority like in vlan_do_receive()
3588                 * For the time being, just ignore Priority Code Point
3589                 */
3590                skb->vlan_tci = 0;
3591        }
3592
3593        /* deliver only exact match when indicated */
3594        null_or_dev = deliver_exact ? skb->dev : NULL;
3595
3596        type = skb->protocol;
3597        list_for_each_entry_rcu(ptype,
3598                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3599                if (ptype->type == type &&
3600                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3601                     ptype->dev == orig_dev)) {
3602                        if (pt_prev)
3603                                ret = deliver_skb(skb, pt_prev, orig_dev);
3604                        pt_prev = ptype;
3605                }
3606        }
3607
3608        if (pt_prev) {
3609                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3610                        goto drop;
3611                else
3612                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3613        } else {
3614drop:
3615                atomic_long_inc(&skb->dev->rx_dropped);
3616                kfree_skb(skb);
3617                /* Jamal, now you will not able to escape explaining
3618                 * me how you were going to use this. :-)
3619                 */
3620                ret = NET_RX_DROP;
3621        }
3622
3623unlock:
3624        rcu_read_unlock();
3625out:
3626        return ret;
3627}
3628
3629static int __netif_receive_skb(struct sk_buff *skb)
3630{
3631        int ret;
3632
3633        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3634                unsigned long pflags = current->flags;
3635
3636                /*
3637                 * PFMEMALLOC skbs are special, they should
3638                 * - be delivered to SOCK_MEMALLOC sockets only
3639                 * - stay away from userspace
3640                 * - have bounded memory usage
3641                 *
3642                 * Use PF_MEMALLOC as this saves us from propagating the allocation
3643                 * context down to all allocation sites.
3644                 */
3645                current->flags |= PF_MEMALLOC;
3646                ret = __netif_receive_skb_core(skb, true);
3647                tsk_restore_flags(current, pflags, PF_MEMALLOC);
3648        } else
3649                ret = __netif_receive_skb_core(skb, false);
3650
3651        return ret;
3652}
3653
3654/**
3655 *      netif_receive_skb - process receive buffer from network
3656 *      @skb: buffer to process
3657 *
3658 *      netif_receive_skb() is the main receive data processing function.
3659 *      It always succeeds. The buffer may be dropped during processing
3660 *      for congestion control or by the protocol layers.
3661 *
3662 *      This function may only be called from softirq context and interrupts
3663 *      should be enabled.
3664 *
3665 *      Return values (usually ignored):
3666 *      NET_RX_SUCCESS: no congestion
3667 *      NET_RX_DROP: packet was dropped
3668 */
3669int netif_receive_skb(struct sk_buff *skb)
3670{
3671        net_timestamp_check(netdev_tstamp_prequeue, skb);
3672
3673        if (skb_defer_rx_timestamp(skb))
3674                return NET_RX_SUCCESS;
3675
3676#ifdef CONFIG_RPS
3677        if (static_key_false(&rps_needed)) {
3678                struct rps_dev_flow voidflow, *rflow = &voidflow;
3679                int cpu, ret;
3680
3681                rcu_read_lock();
3682
3683                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3684
3685                if (cpu >= 0) {
3686                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3687                        rcu_read_unlock();
3688                        return ret;
3689                }
3690                rcu_read_unlock();
3691        }
3692#endif
3693        return __netif_receive_skb(skb);
3694}
3695EXPORT_SYMBOL(netif_receive_skb);
3696
3697/* Network device is going away, flush any packets still pending
3698 * Called with irqs disabled.
3699 */
3700static void flush_backlog(void *arg)
3701{
3702        struct net_device *dev = arg;
3703        struct softnet_data *sd = &__get_cpu_var(softnet_data);
3704        struct sk_buff *skb, *tmp;
3705
3706        rps_lock(sd);
3707        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3708                if (skb->dev == dev) {
3709                        __skb_unlink(skb, &sd->input_pkt_queue);
3710                        kfree_skb(skb);
3711                        input_queue_head_incr(sd);
3712                }
3713        }
3714        rps_unlock(sd);
3715
3716        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3717                if (skb->dev == dev) {
3718                        __skb_unlink(skb, &sd->process_queue);
3719                        kfree_skb(skb);
3720                        input_queue_head_incr(sd);
3721                }
3722        }
3723}
3724
3725static int napi_gro_complete(struct sk_buff *skb)
3726{
3727        struct packet_offload *ptype;
3728        __be16 type = skb->protocol;
3729        struct list_head *head = &offload_base;
3730        int err = -ENOENT;
3731
3732        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3733
3734        if (NAPI_GRO_CB(skb)->count == 1) {
3735                skb_shinfo(skb)->gso_size = 0;
3736                goto out;
3737        }
3738
3739        rcu_read_lock();
3740        list_for_each_entry_rcu(ptype, head, list) {
3741                if (ptype->type != type || !ptype->callbacks.gro_complete)
3742                        continue;
3743
3744                err = ptype->callbacks.gro_complete(skb);
3745                break;
3746        }
3747        rcu_read_unlock();
3748
3749        if (err) {
3750                WARN_ON(&ptype->list == head);
3751                kfree_skb(skb);
3752                return NET_RX_SUCCESS;
3753        }
3754
3755out:
3756        return netif_receive_skb(skb);
3757}
3758
3759/* napi->gro_list contains packets ordered by age.
3760 * youngest packets at the head of it.
3761 * Complete skbs in reverse order to reduce latencies.
3762 */
3763void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3764{
3765        struct sk_buff *skb, *prev = NULL;
3766
3767        /* scan list and build reverse chain */
3768        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3769                skb->prev = prev;
3770                prev = skb;
3771        }
3772
3773        for (skb = prev; skb; skb = prev) {
3774                skb->next = NULL;
3775
3776                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3777                        return;
3778
3779                prev = skb->prev;
3780                napi_gro_complete(skb);
3781                napi->gro_count--;
3782        }
3783
3784        napi->gro_list = NULL;
3785}
3786EXPORT_SYMBOL(napi_gro_flush);
3787
3788static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3789{
3790        struct sk_buff *p;
3791        unsigned int maclen = skb->dev->hard_header_len;
3792
3793        for (p = napi->gro_list; p; p = p->next) {
3794                unsigned long diffs;
3795
3796                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3797                diffs |= p->vlan_tci ^ skb->vlan_tci;
3798                if (maclen == ETH_HLEN)
3799                        diffs |= compare_ether_header(skb_mac_header(p),
3800                                                      skb_gro_mac_header(skb));
3801                else if (!diffs)
3802                        diffs = memcmp(skb_mac_header(p),
3803                                       skb_gro_mac_header(skb),
3804                                       maclen);
3805                NAPI_GRO_CB(p)->same_flow = !diffs;
3806                NAPI_GRO_CB(p)->flush = 0;
3807        }
3808}
3809
3810static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3811{
3812        struct sk_buff **pp = NULL;
3813        struct packet_offload *ptype;
3814        __be16 type = skb->protocol;
3815        struct list_head *head = &offload_base;
3816        int same_flow;
3817        enum gro_result ret;
3818
3819        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3820                goto normal;
3821
3822        if (skb_is_gso(skb) || skb_has_frag_list(skb))
3823                goto normal;
3824
3825        gro_list_prepare(napi, skb);
3826
3827        rcu_read_lock();
3828        list_for_each_entry_rcu(ptype, head, list) {
3829                if (ptype->type != type || !ptype->callbacks.gro_receive)
3830                        continue;
3831
3832                skb_set_network_header(skb, skb_gro_offset(skb));
3833                skb_reset_mac_len(skb);
3834                NAPI_GRO_CB(skb)->same_flow = 0;
3835                NAPI_GRO_CB(skb)->flush = 0;
3836                NAPI_GRO_CB(skb)->free = 0;
3837
3838                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3839                break;
3840        }
3841        rcu_read_unlock();
3842
3843        if (&ptype->list == head)
3844                goto normal;
3845
3846        same_flow = NAPI_GRO_CB(skb)->same_flow;
3847        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3848
3849        if (pp) {
3850                struct sk_buff *nskb = *pp;
3851
3852                *pp = nskb->next;
3853                nskb->next = NULL;
3854                napi_gro_complete(nskb);
3855                napi->gro_count--;
3856        }
3857
3858        if (same_flow)
3859                goto ok;
3860
3861        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3862                goto normal;
3863
3864        napi->gro_count++;
3865        NAPI_GRO_CB(skb)->count = 1;
3866        NAPI_GRO_CB(skb)->age = jiffies;
3867        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3868        skb->next = napi->gro_list;
3869        napi->gro_list = skb;
3870        ret = GRO_HELD;
3871
3872pull:
3873        if (skb_headlen(skb) < skb_gro_offset(skb)) {
3874                int grow = skb_gro_offset(skb) - skb_headlen(skb);
3875
3876                BUG_ON(skb->end - skb->tail < grow);
3877
3878                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3879
3880                skb->tail += grow;
3881                skb->data_len -= grow;
3882
3883                skb_shinfo(skb)->frags[0].page_offset += grow;
3884                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3885
3886                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3887                        skb_frag_unref(skb, 0);
3888                        memmove(skb_shinfo(skb)->frags,
3889                                skb_shinfo(skb)->frags + 1,
3890                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3891                }
3892        }
3893
3894ok:
3895        return ret;
3896
3897normal:
3898        ret = GRO_NORMAL;
3899        goto pull;
3900}
3901
3902
3903static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3904{
3905        switch (ret) {
3906        case GRO_NORMAL:
3907                if (netif_receive_skb(skb))
3908                        ret = GRO_DROP;
3909                break;
3910
3911        case GRO_DROP:
3912                kfree_skb(skb);
3913                break;
3914
3915        case GRO_MERGED_FREE:
3916                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3917                        kmem_cache_free(skbuff_head_cache, skb);
3918                else
3919                        __kfree_skb(skb);
3920                break;
3921
3922        case GRO_HELD:
3923        case GRO_MERGED:
3924                break;
3925        }
3926
3927        return ret;
3928}
3929
3930static void skb_gro_reset_offset(struct sk_buff *skb)
3931{
3932        const struct skb_shared_info *pinfo = skb_shinfo(skb);
3933        const skb_frag_t *frag0 = &pinfo->frags[0];
3934
3935        NAPI_GRO_CB(skb)->data_offset = 0;
3936        NAPI_GRO_CB(skb)->frag0 = NULL;
3937        NAPI_GRO_CB(skb)->frag0_len = 0;
3938
3939        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3940            pinfo->nr_frags &&
3941            !PageHighMem(skb_frag_page(frag0))) {
3942                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3943                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3944        }
3945}
3946
3947gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3948{
3949        skb_gro_reset_offset(skb);
3950
3951        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3952}
3953EXPORT_SYMBOL(napi_gro_receive);
3954
3955static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3956{
3957        __skb_pull(skb, skb_headlen(skb));
3958        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3959        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3960        skb->vlan_tci = 0;
3961        skb->dev = napi->dev;
3962        skb->skb_iif = 0;
3963
3964        napi->skb = skb;
3965}
3966
3967struct sk_buff *napi_get_frags(struct napi_struct *napi)
3968{
3969        struct sk_buff *skb = napi->skb;
3970
3971        if (!skb) {
3972                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3973                if (skb)
3974                        napi->skb = skb;
3975        }
3976        return skb;
3977}
3978EXPORT_SYMBOL(napi_get_frags);
3979
3980static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3981                               gro_result_t ret)
3982{
3983        switch (ret) {
3984        case GRO_NORMAL:
3985        case GRO_HELD:
3986                skb->protocol = eth_type_trans(skb, skb->dev);
3987
3988                if (ret == GRO_HELD)
3989                        skb_gro_pull(skb, -ETH_HLEN);
3990                else if (netif_receive_skb(skb))
3991                        ret = GRO_DROP;
3992                break;
3993
3994        case GRO_DROP:
3995        case GRO_MERGED_FREE:
3996                napi_reuse_skb(napi, skb);
3997                break;
3998
3999        case GRO_MERGED:
4000                break;
4001        }
4002
4003        return ret;
4004}
4005
4006static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4007{
4008        struct sk_buff *skb = napi->skb;
4009        struct ethhdr *eth;
4010        unsigned int hlen;
4011        unsigned int off;
4012
4013        napi->skb = NULL;
4014
4015        skb_reset_mac_header(skb);
4016        skb_gro_reset_offset(skb);
4017
4018        off = skb_gro_offset(skb);
4019        hlen = off + sizeof(*eth);
4020        eth = skb_gro_header_fast(skb, off);
4021        if (skb_gro_header_hard(skb, hlen)) {
4022                eth = skb_gro_header_slow(skb, hlen, off);
4023                if (unlikely(!eth)) {
4024                        napi_reuse_skb(napi, skb);
4025                        skb = NULL;
4026                        goto out;
4027                }
4028        }
4029
4030        skb_gro_pull(skb, sizeof(*eth));
4031
4032        /*
4033         * This works because the only protocols we care about don't require
4034         * special handling.  We'll fix it up properly at the end.
4035         */
4036        skb->protocol = eth->h_proto;
4037
4038out:
4039        return skb;
4040}
4041
4042gro_result_t napi_gro_frags(struct napi_struct *napi)
4043{
4044        struct sk_buff *skb = napi_frags_skb(napi);
4045
4046        if (!skb)
4047                return GRO_DROP;
4048
4049        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4050}
4051EXPORT_SYMBOL(napi_gro_frags);
4052
4053/*
4054 * net_rps_action sends any pending IPI's for rps.
4055 * Note: called with local irq disabled, but exits with local irq enabled.
4056 */
4057static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4058{
4059#ifdef CONFIG_RPS
4060        struct softnet_data *remsd = sd->rps_ipi_list;
4061
4062        if (remsd) {
4063                sd->rps_ipi_list = NULL;
4064
4065                local_irq_enable();
4066
4067                /* Send pending IPI's to kick RPS processing on remote cpus. */
4068                while (remsd) {
4069                        struct softnet_data *next = remsd->rps_ipi_next;
4070
4071                        if (cpu_online(remsd->cpu))
4072                                __smp_call_function_single(remsd->cpu,
4073                                                           &remsd->csd, 0);
4074                        remsd = next;
4075                }
4076        } else
4077#endif
4078                local_irq_enable();
4079}
4080
4081static int process_backlog(struct napi_struct *napi, int quota)
4082{
4083        int work = 0;
4084        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4085
4086#ifdef CONFIG_RPS
4087        /* Check if we have pending ipi, its better to send them now,
4088         * not waiting net_rx_action() end.
4089         */
4090        if (sd->rps_ipi_list) {
4091                local_irq_disable();
4092                net_rps_action_and_irq_enable(sd);
4093        }
4094#endif
4095        napi->weight = weight_p;
4096        local_irq_disable();
4097        while (work < quota) {
4098                struct sk_buff *skb;
4099                unsigned int qlen;
4100
4101                while ((skb = __skb_dequeue(&sd->process_queue))) {
4102                        local_irq_enable();
4103                        __netif_receive_skb(skb);
4104                        local_irq_disable();
4105                        input_queue_head_incr(sd);
4106                        if (++work >= quota) {
4107                                local_irq_enable();
4108                                return work;
4109                        }
4110                }
4111
4112                rps_lock(sd);
4113                qlen = skb_queue_len(&sd->input_pkt_queue);
4114                if (qlen)
4115                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
4116                                                   &sd->process_queue);
4117
4118                if (qlen < quota - work) {
4119                        /*
4120                         * Inline a custom version of __napi_complete().
4121                         * only current cpu owns and manipulates this napi,
4122                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4123                         * we can use a plain write instead of clear_bit(),
4124                         * and we dont need an smp_mb() memory barrier.
4125                         */
4126                        list_del(&napi->poll_list);
4127                        napi->state = 0;
4128
4129                        quota = work + qlen;
4130                }
4131                rps_unlock(sd);
4132        }
4133        local_irq_enable();
4134
4135        return work;
4136}
4137
4138/**
4139 * __napi_schedule - schedule for receive
4140 * @n: entry to schedule
4141 *
4142 * The entry's receive function will be scheduled to run
4143 */
4144void __napi_schedule(struct napi_struct *n)
4145{
4146        unsigned long flags;
4147
4148        local_irq_save(flags);
4149        ____napi_schedule(&__get_cpu_var(softnet_data), n);
4150        local_irq_restore(flags);
4151}
4152EXPORT_SYMBOL(__napi_schedule);
4153
4154void __napi_complete(struct napi_struct *n)
4155{
4156        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4157        BUG_ON(n->gro_list);
4158
4159        list_del(&n->poll_list);
4160        smp_mb__before_clear_bit();
4161        clear_bit(NAPI_STATE_SCHED, &n->state);
4162}
4163EXPORT_SYMBOL(__napi_complete);
4164
4165void napi_complete(struct napi_struct *n)
4166{
4167        unsigned long flags;
4168
4169        /*
4170         * don't let napi dequeue from the cpu poll list
4171         * just in case its running on a different cpu
4172         */
4173        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4174                return;
4175
4176        napi_gro_flush(n, false);
4177        local_irq_save(flags);
4178        __napi_complete(n);
4179        local_irq_restore(flags);
4180}
4181EXPORT_SYMBOL(napi_complete);
4182
4183/* must be called under rcu_read_lock(), as we dont take a reference */
4184struct napi_struct *napi_by_id(unsigned int napi_id)
4185{
4186        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4187        struct napi_struct *napi;
4188
4189        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4190                if (napi->napi_id == napi_id)
4191                        return napi;
4192
4193        return NULL;
4194}
4195EXPORT_SYMBOL_GPL(napi_by_id);
4196
4197void napi_hash_add(struct napi_struct *napi)
4198{
4199        if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4200
4201                spin_lock(&napi_hash_lock);
4202
4203                /* 0 is not a valid id, we also skip an id that is taken
4204                 * we expect both events to be extremely rare
4205                 */
4206                napi->napi_id = 0;
4207                while (!napi->napi_id) {
4208                        napi->napi_id = ++napi_gen_id;
4209                        if (napi_by_id(napi->napi_id))
4210                                napi->napi_id = 0;
4211                }
4212
4213                hlist_add_head_rcu(&napi->napi_hash_node,
4214                        &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4215
4216                spin_unlock(&napi_hash_lock);
4217        }
4218}
4219EXPORT_SYMBOL_GPL(napi_hash_add);
4220
4221/* Warning : caller is responsible to make sure rcu grace period
4222 * is respected before freeing memory containing @napi
4223 */
4224void napi_hash_del(struct napi_struct *napi)
4225{
4226        spin_lock(&napi_hash_lock);
4227
4228        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4229                hlist_del_rcu(&napi->napi_hash_node);
4230
4231        spin_unlock(&napi_hash_lock);
4232}
4233EXPORT_SYMBOL_GPL(napi_hash_del);
4234
4235void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4236                    int (*poll)(struct napi_struct *, int), int weight)
4237{
4238        INIT_LIST_HEAD(&napi->poll_list);
4239        napi->gro_count = 0;
4240        napi->gro_list = NULL;
4241        napi->skb = NULL;
4242        napi->poll = poll;
4243        if (weight > NAPI_POLL_WEIGHT)
4244                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4245                            weight, dev->name);
4246        napi->weight = weight;
4247        list_add(&napi->dev_list, &dev->napi_list);
4248        napi->dev = dev;
4249#ifdef CONFIG_NETPOLL
4250        spin_lock_init(&napi->poll_lock);
4251        napi->poll_owner = -1;
4252#endif
4253        set_bit(NAPI_STATE_SCHED, &napi->state);
4254}
4255EXPORT_SYMBOL(netif_napi_add);
4256
4257void netif_napi_del(struct napi_struct *napi)
4258{
4259        struct sk_buff *skb, *next;
4260
4261        list_del_init(&napi->dev_list);
4262        napi_free_frags(napi);
4263
4264        for (skb = napi->gro_list; skb; skb = next) {
4265                next = skb->next;
4266                skb->next = NULL;
4267                kfree_skb(skb);
4268        }
4269
4270        napi->gro_list = NULL;
4271        napi->gro_count = 0;
4272}
4273EXPORT_SYMBOL(netif_napi_del);
4274
4275static void net_rx_action(struct softirq_action *h)
4276{
4277        struct softnet_data *sd = &__get_cpu_var(softnet_data);
4278        unsigned long time_limit = jiffies + 2;
4279        int budget = netdev_budget;
4280        void *have;
4281
4282        local_irq_disable();
4283
4284        while (!list_empty(&sd->poll_list)) {
4285                struct napi_struct *n;
4286                int work, weight;
4287
4288                /* If softirq window is exhuasted then punt.
4289                 * Allow this to run for 2 jiffies since which will allow
4290                 * an average latency of 1.5/HZ.
4291                 */
4292                if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4293                        goto softnet_break;
4294
4295                local_irq_enable();
4296
4297                /* Even though interrupts have been re-enabled, this
4298                 * access is safe because interrupts can only add new
4299                 * entries to the tail of this list, and only ->poll()
4300                 * calls can remove this head entry from the list.
4301                 */
4302                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4303
4304                have = netpoll_poll_lock(n);
4305
4306                weight = n->weight;
4307
4308                /* This NAPI_STATE_SCHED test is for avoiding a race
4309                 * with netpoll's poll_napi().  Only the entity which
4310                 * obtains the lock and sees NAPI_STATE_SCHED set will
4311                 * actually make the ->poll() call.  Therefore we avoid
4312                 * accidentally calling ->poll() when NAPI is not scheduled.
4313                 */
4314                work = 0;
4315                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4316                        work = n->poll(n, weight);
4317                        trace_napi_poll(n);
4318                }
4319
4320                WARN_ON_ONCE(work > weight);
4321
4322                budget -= work;
4323
4324                local_irq_disable();
4325
4326                /* Drivers must not modify the NAPI state if they
4327                 * consume the entire weight.  In such cases this code
4328                 * still "owns" the NAPI instance and therefore can
4329                 * move the instance around on the list at-will.
4330                 */
4331                if (unlikely(work == weight)) {
4332                        if (unlikely(napi_disable_pending(n))) {
4333                                local_irq_enable();
4334                                napi_complete(n);
4335                                local_irq_disable();
4336                        } else {
4337                                if (n->gro_list) {
4338                                        /* flush too old packets
4339                                         * If HZ < 1000, flush all packets.
4340                                         */
4341                                        local_irq_enable();
4342                                        napi_gro_flush(n, HZ >= 1000);
4343                                        local_irq_disable();
4344                                }
4345                                list_move_tail(&n->poll_list, &sd->poll_list);
4346                        }
4347                }
4348
4349                netpoll_poll_unlock(have);
4350        }
4351out:
4352        net_rps_action_and_irq_enable(sd);
4353
4354#ifdef CONFIG_NET_DMA
4355        /*
4356         * There may not be any more sk_buffs coming right now, so push
4357         * any pending DMA copies to hardware
4358         */
4359        dma_issue_pending_all();
4360#endif
4361
4362        return;
4363
4364softnet_break:
4365        sd->time_squeeze++;
4366        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4367        goto out;
4368}
4369
4370struct netdev_upper {
4371        struct net_device *dev;
4372        bool master;
4373        struct list_head list;
4374        struct rcu_head rcu;
4375        struct list_head search_list;
4376};
4377
4378static void __append_search_uppers(struct list_head *search_list,
4379                                   struct net_device *dev)
4380{
4381        struct netdev_upper *upper;
4382
4383        list_for_each_entry(upper, &dev->upper_dev_list, list) {
4384                /* check if this upper is not already in search list */
4385                if (list_empty(&upper->search_list))
4386                        list_add_tail(&upper->search_list, search_list);
4387        }
4388}
4389
4390static bool __netdev_search_upper_dev(struct net_device *dev,
4391                                      struct net_device *upper_dev)
4392{
4393        LIST_HEAD(search_list);
4394        struct netdev_upper *upper;
4395        struct netdev_upper *tmp;
4396        bool ret = false;
4397
4398        __append_search_uppers(&search_list, dev);
4399        list_for_each_entry(upper, &search_list, search_list) {
4400                if (upper->dev == upper_dev) {
4401                        ret = true;
4402                        break;
4403                }
4404                __append_search_uppers(&search_list, upper->dev);
4405        }
4406        list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4407                INIT_LIST_HEAD(&upper->search_list);
4408        return ret;
4409}
4410
4411static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4412                                                struct net_device *upper_dev)
4413{
4414        struct netdev_upper *upper;
4415
4416        list_for_each_entry(upper, &dev->upper_dev_list, list) {
4417                if (upper->dev == upper_dev)
4418                        return upper;
4419        }
4420        return NULL;
4421}
4422
4423/**
4424 * netdev_has_upper_dev - Check if device is linked to an upper device
4425 * @dev: device
4426 * @upper_dev: upper device to check
4427 *
4428 * Find out if a device is linked to specified upper device and return true
4429 * in case it is. Note that this checks only immediate upper device,
4430 * not through a complete stack of devices. The caller must hold the RTNL lock.
4431 */
4432bool netdev_has_upper_dev(struct net_device *dev,
4433                          struct net_device *upper_dev)
4434{
4435        ASSERT_RTNL();
4436
4437        return __netdev_find_upper(dev, upper_dev);
4438}
4439EXPORT_SYMBOL(netdev_has_upper_dev);
4440
4441/**
4442 * netdev_has_any_upper_dev - Check if device is linked to some device
4443 * @dev: device
4444 *
4445 * Find out if a device is linked to an upper device and return true in case
4446 * it is. The caller must hold the RTNL lock.
4447 */
4448bool netdev_has_any_upper_dev(struct net_device *dev)
4449{
4450        ASSERT_RTNL();
4451
4452        return !list_empty(&dev->upper_dev_list);
4453}
4454EXPORT_SYMBOL(netdev_has_any_upper_dev);
4455
4456/**
4457 * netdev_master_upper_dev_get - Get master upper device
4458 * @dev: device
4459 *
4460 * Find a master upper device and return pointer to it or NULL in case
4461 * it's not there. The caller must hold the RTNL lock.
4462 */
4463struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4464{
4465        struct netdev_upper *upper;
4466
4467        ASSERT_RTNL();
4468
4469        if (list_empty(&dev->upper_dev_list))
4470                return NULL;
4471
4472        upper = list_first_entry(&dev->upper_dev_list,
4473                                 struct netdev_upper, list);
4474        if (likely(upper->master))
4475                return upper->dev;
4476        return NULL;
4477}
4478EXPORT_SYMBOL(netdev_master_upper_dev_get);
4479
4480/**
4481 * netdev_master_upper_dev_get_rcu - Get master upper device
4482 * @dev: device
4483 *
4484 * Find a master upper device and return pointer to it or NULL in case
4485 * it's not there. The caller must hold the RCU read lock.
4486 */
4487struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4488{
4489        struct netdev_upper *upper;
4490
4491        upper = list_first_or_null_rcu(&dev->upper_dev_list,
4492                                       struct netdev_upper, list);
4493        if (upper && likely(upper->master))
4494                return upper->dev;
4495        return NULL;
4496}
4497EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4498
4499static int __netdev_upper_dev_link(struct net_device *dev,
4500                                   struct net_device *upper_dev, bool master)
4501{
4502        struct netdev_upper *upper;
4503
4504        ASSERT_RTNL();
4505
4506        if (dev == upper_dev)
4507                return -EBUSY;
4508
4509        /* To prevent loops, check if dev is not upper device to upper_dev. */
4510        if (__netdev_search_upper_dev(upper_dev, dev))
4511                return -EBUSY;
4512
4513        if (__netdev_find_upper(dev, upper_dev))
4514                return -EEXIST;
4515
4516        if (master && netdev_master_upper_dev_get(dev))
4517                return -EBUSY;
4518
4519        upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4520        if (!upper)
4521                return -ENOMEM;
4522
4523        upper->dev = upper_dev;
4524        upper->master = master;
4525        INIT_LIST_HEAD(&upper->search_list);
4526
4527        /* Ensure that master upper link is always the first item in list. */
4528        if (master)
4529                list_add_rcu(&upper->list, &dev->upper_dev_list);
4530        else
4531                list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4532        dev_hold(upper_dev);
4533        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4534        return 0;
4535}
4536
4537/**
4538 * netdev_upper_dev_link - Add a link to the upper device
4539 * @dev: device
4540 * @upper_dev: new upper device
4541 *
4542 * Adds a link to device which is upper to this one. The caller must hold
4543 * the RTNL lock. On a failure a negative errno code is returned.
4544 * On success the reference counts are adjusted and the function
4545 * returns zero.
4546 */
4547int netdev_upper_dev_link(struct net_device *dev,
4548                          struct net_device *upper_dev)
4549{
4550        return __netdev_upper_dev_link(dev, upper_dev, false);
4551}
4552EXPORT_SYMBOL(netdev_upper_dev_link);
4553
4554/**
4555 * netdev_master_upper_dev_link - Add a master link to the upper device
4556 * @dev: device
4557 * @upper_dev: new upper device
4558 *
4559 * Adds a link to device which is upper to this one. In this case, only
4560 * one master upper device can be linked, although other non-master devices
4561 * might be linked as well. The caller must hold the RTNL lock.
4562 * On a failure a negative errno code is returned. On success the reference
4563 * counts are adjusted and the function returns zero.
4564 */
4565int netdev_master_upper_dev_link(struct net_device *dev,
4566                                 struct net_device *upper_dev)
4567{
4568        return __netdev_upper_dev_link(dev, upper_dev, true);
4569}
4570EXPORT_SYMBOL(netdev_master_upper_dev_link);
4571
4572/**
4573 * netdev_upper_dev_unlink - Removes a link to upper device
4574 * @dev: device
4575 * @upper_dev: new upper device
4576 *
4577 * Removes a link to device which is upper to this one. The caller must hold
4578 * the RTNL lock.
4579 */
4580void netdev_upper_dev_unlink(struct net_device *dev,
4581                             struct net_device *upper_dev)
4582{
4583        struct netdev_upper *upper;
4584
4585        ASSERT_RTNL();
4586
4587        upper = __netdev_find_upper(dev, upper_dev);
4588        if (!upper)
4589                return;
4590        list_del_rcu(&upper->list);
4591        dev_put(upper_dev);
4592        kfree_rcu(upper, rcu);
4593        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4594}
4595EXPORT_SYMBOL(netdev_upper_dev_unlink);
4596
4597static void dev_change_rx_flags(struct net_device *dev, int flags)
4598{
4599        const struct net_device_ops *ops = dev->netdev_ops;
4600
4601        if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4602                ops->ndo_change_rx_flags(dev, flags);
4603}
4604
4605static int __dev_set_promiscuity(struct net_device *dev, int inc)
4606{
4607        unsigned int old_flags = dev->flags;
4608        kuid_t uid;
4609        kgid_t gid;
4610
4611        ASSERT_RTNL();
4612
4613        dev->flags |= IFF_PROMISC;
4614        dev->promiscuity += inc;
4615        if (dev->promiscuity == 0) {
4616                /*
4617                 * Avoid overflow.
4618                 * If inc causes overflow, untouch promisc and return error.
4619                 */
4620                if (inc < 0)
4621                        dev->flags &= ~IFF_PROMISC;
4622                else {
4623                        dev->promiscuity -= inc;
4624                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4625                                dev->name);
4626                        return -EOVERFLOW;
4627                }
4628        }
4629        if (dev->flags != old_flags) {
4630                pr_info("device %s %s promiscuous mode\n",
4631                        dev->name,
4632                        dev->flags & IFF_PROMISC ? "entered" : "left");
4633                if (audit_enabled) {
4634                        current_uid_gid(&uid, &gid);
4635                        audit_log(current->audit_context, GFP_ATOMIC,
4636                                AUDIT_ANOM_PROMISCUOUS,
4637                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4638                                dev->name, (dev->flags & IFF_PROMISC),
4639                                (old_flags & IFF_PROMISC),
4640                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
4641                                from_kuid(&init_user_ns, uid),
4642                                from_kgid(&init_user_ns, gid),
4643                                audit_get_sessionid(current));
4644                }
4645
4646                dev_change_rx_flags(dev, IFF_PROMISC);
4647        }
4648        return 0;
4649}
4650
4651/**
4652 *      dev_set_promiscuity     - update promiscuity count on a device
4653 *      @dev: device
4654 *      @inc: modifier
4655 *
4656 *      Add or remove promiscuity from a device. While the count in the device
4657 *      remains above zero the interface remains promiscuous. Once it hits zero
4658 *      the device reverts back to normal filtering operation. A negative inc
4659 *      value is used to drop promiscuity on the device.
4660 *      Return 0 if successful or a negative errno code on error.
4661 */
4662int dev_set_promiscuity(struct net_device *dev, int inc)
4663{
4664        unsigned int old_flags = dev->flags;
4665        int err;
4666
4667        err = __dev_set_promiscuity(dev, inc);
4668        if (err < 0)
4669                return err;
4670        if (dev->flags != old_flags)
4671                dev_set_rx_mode(dev);
4672        return err;
4673}
4674EXPORT_SYMBOL(dev_set_promiscuity);
4675
4676/**
4677 *      dev_set_allmulti        - update allmulti count on a device
4678 *      @dev: device
4679 *      @inc: modifier
4680 *
4681 *      Add or remove reception of all multicast frames to a device. While the
4682 *      count in the device remains above zero the interface remains listening
4683 *      to all interfaces. Once it hits zero the device reverts back to normal
4684 *      filtering operation. A negative @inc value is used to drop the counter
4685 *      when releasing a resource needing all multicasts.
4686 *      Return 0 if successful or a negative errno code on error.
4687 */
4688
4689int dev_set_allmulti(struct net_device *dev, int inc)
4690{
4691        unsigned int old_flags = dev->flags;
4692
4693        ASSERT_RTNL();
4694
4695        dev->flags |= IFF_ALLMULTI;
4696        dev->allmulti += inc;
4697        if (dev->allmulti == 0) {
4698                /*
4699                 * Avoid overflow.
4700                 * If inc causes overflow, untouch allmulti and return error.
4701                 */
4702                if (inc < 0)
4703                        dev->flags &= ~IFF_ALLMULTI;
4704                else {
4705                        dev->allmulti -= inc;
4706                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4707                                dev->name);
4708                        return -EOVERFLOW;
4709                }
4710        }
4711        if (dev->flags ^ old_flags) {
4712                dev_change_rx_flags(dev, IFF_ALLMULTI);
4713                dev_set_rx_mode(dev);
4714        }
4715        return 0;
4716}
4717EXPORT_SYMBOL(dev_set_allmulti);
4718
4719/*
4720 *      Upload unicast and multicast address lists to device and
4721 *      configure RX filtering. When the device doesn't support unicast
4722 *      filtering it is put in promiscuous mode while unicast addresses
4723 *      are present.
4724 */
4725void __dev_set_rx_mode(struct net_device *dev)
4726{
4727        const struct net_device_ops *ops = dev->netdev_ops;
4728
4729        /* dev_open will call this function so the list will stay sane. */
4730        if (!(dev->flags&IFF_UP))
4731                return;
4732
4733        if (!netif_device_present(dev))
4734                return;
4735
4736        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4737                /* Unicast addresses changes may only happen under the rtnl,
4738                 * therefore calling __dev_set_promiscuity here is safe.
4739                 */
4740                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4741                        __dev_set_promiscuity(dev, 1);
4742                        dev->uc_promisc = true;
4743                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4744                        __dev_set_promiscuity(dev, -1);
4745                        dev->uc_promisc = false;
4746                }
4747        }
4748
4749        if (ops->ndo_set_rx_mode)
4750                ops->ndo_set_rx_mode(dev);
4751}
4752
4753void dev_set_rx_mode(struct net_device *dev)
4754{
4755        netif_addr_lock_bh(dev);
4756        __dev_set_rx_mode(dev);
4757        netif_addr_unlock_bh(dev);
4758}
4759
4760/**
4761 *      dev_get_flags - get flags reported to userspace
4762 *      @dev: device
4763 *
4764 *      Get the combination of flag bits exported through APIs to userspace.
4765 */
4766unsigned int dev_get_flags(const struct net_device *dev)
4767{
4768        unsigned int flags;
4769
4770        flags = (dev->flags & ~(IFF_PROMISC |
4771                                IFF_ALLMULTI |
4772                                IFF_RUNNING |
4773                                IFF_LOWER_UP |
4774                                IFF_DORMANT)) |
4775                (dev->gflags & (IFF_PROMISC |
4776                                IFF_ALLMULTI));
4777
4778        if (netif_running(dev)) {
4779                if (netif_oper_up(dev))
4780                        flags |= IFF_RUNNING;
4781                if (netif_carrier_ok(dev))
4782                        flags |= IFF_LOWER_UP;
4783                if (netif_dormant(dev))
4784                        flags |= IFF_DORMANT;
4785        }
4786
4787        return flags;
4788}
4789EXPORT_SYMBOL(dev_get_flags);
4790
4791int __dev_change_flags(struct net_device *dev, unsigned int flags)
4792{
4793        unsigned int old_flags = dev->flags;
4794        int ret;
4795
4796        ASSERT_RTNL();
4797
4798        /*
4799         *      Set the flags on our device.
4800         */
4801
4802        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4803                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4804                               IFF_AUTOMEDIA)) |
4805                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4806                                    IFF_ALLMULTI));
4807
4808        /*
4809         *      Load in the correct multicast list now the flags have changed.
4810         */
4811
4812        if ((old_flags ^ flags) & IFF_MULTICAST)
4813                dev_change_rx_flags(dev, IFF_MULTICAST);
4814
4815        dev_set_rx_mode(dev);
4816
4817        /*
4818         *      Have we downed the interface. We handle IFF_UP ourselves
4819         *      according to user attempts to set it, rather than blindly
4820         *      setting it.
4821         */
4822
4823        ret = 0;
4824        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4825                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4826
4827                if (!ret)
4828                        dev_set_rx_mode(dev);
4829        }
4830
4831        if ((flags ^ dev->gflags) & IFF_PROMISC) {
4832                int inc = (flags & IFF_PROMISC) ? 1 : -1;
4833
4834                dev->gflags ^= IFF_PROMISC;
4835                dev_set_promiscuity(dev, inc);
4836        }
4837
4838        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4839           is important. Some (broken) drivers set IFF_PROMISC, when
4840           IFF_ALLMULTI is requested not asking us and not reporting.
4841         */
4842        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4843                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4844
4845                dev->gflags ^= IFF_ALLMULTI;
4846                dev_set_allmulti(dev, inc);
4847        }
4848
4849        return ret;
4850}
4851
4852void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4853{
4854        unsigned int changes = dev->flags ^ old_flags;
4855
4856        if (changes & IFF_UP) {
4857                if (dev->flags & IFF_UP)
4858                        call_netdevice_notifiers(NETDEV_UP, dev);
4859                else
4860                        call_netdevice_notifiers(NETDEV_DOWN, dev);
4861        }
4862
4863        if (dev->flags & IFF_UP &&
4864            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4865                struct netdev_notifier_change_info change_info;
4866
4867                change_info.flags_changed = changes;
4868                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4869                                              &change_info.info);
4870        }
4871}
4872
4873/**
4874 *      dev_change_flags - change device settings
4875 *      @dev: device
4876 *      @flags: device state flags
4877 *
4878 *      Change settings on device based state flags. The flags are
4879 *      in the userspace exported format.
4880 */
4881int dev_change_flags(struct net_device *dev, unsigned int flags)
4882{
4883        int ret;
4884        unsigned int changes, old_flags = dev->flags;
4885
4886        ret = __dev_change_flags(dev, flags);
4887        if (ret < 0)
4888                return ret;
4889
4890        changes = old_flags ^ dev->flags;
4891        if (changes)
4892                rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4893
4894        __dev_notify_flags(dev, old_flags);
4895        return ret;
4896}
4897EXPORT_SYMBOL(dev_change_flags);
4898
4899/**
4900 *      dev_set_mtu - Change maximum transfer unit
4901 *      @dev: device
4902 *      @new_mtu: new transfer unit
4903 *
4904 *      Change the maximum transfer size of the network device.
4905 */
4906int dev_set_mtu(struct net_device *dev, int new_mtu)
4907{
4908        const struct net_device_ops *ops = dev->netdev_ops;
4909        int err;
4910
4911        if (new_mtu == dev->mtu)
4912                return 0;
4913
4914        /*      MTU must be positive.    */
4915        if (new_mtu < 0)
4916                return -EINVAL;
4917
4918        if (!netif_device_present(dev))
4919                return -ENODEV;
4920
4921        err = 0;
4922        if (ops->ndo_change_mtu)
4923                err = ops->ndo_change_mtu(dev, new_mtu);
4924        else
4925                dev->mtu = new_mtu;
4926
4927        if (!err)
4928                call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4929        return err;
4930}
4931EXPORT_SYMBOL(dev_set_mtu);
4932
4933/**
4934 *      dev_set_group - Change group this device belongs to
4935 *      @dev: device
4936 *      @new_group: group this device should belong to
4937 */
4938void dev_set_group(struct net_device *dev, int new_group)
4939{
4940        dev->group = new_group;
4941}
4942EXPORT_SYMBOL(dev_set_group);
4943
4944/**
4945 *      dev_set_mac_address - Change Media Access Control Address
4946 *      @dev: device
4947 *      @sa: new address
4948 *
4949 *      Change the hardware (MAC) address of the device
4950 */
4951int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4952{
4953        const struct net_device_ops *ops = dev->netdev_ops;
4954        int err;
4955
4956        if (!ops->ndo_set_mac_address)
4957                return -EOPNOTSUPP;
4958        if (sa->sa_family != dev->type)
4959                return -EINVAL;
4960        if (!netif_device_present(dev))
4961                return -ENODEV;
4962        err = ops->ndo_set_mac_address(dev, sa);
4963        if (err)
4964                return err;
4965        dev->addr_assign_type = NET_ADDR_SET;
4966        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4967        add_device_randomness(dev->dev_addr, dev->addr_len);
4968        return 0;
4969}
4970EXPORT_SYMBOL(dev_set_mac_address);
4971
4972/**
4973 *      dev_change_carrier - Change device carrier
4974 *      @dev: device
4975 *      @new_carrier: new value
4976 *
4977 *      Change device carrier
4978 */
4979int dev_change_carrier(struct net_device *dev, bool new_carrier)
4980{
4981        const struct net_device_ops *ops = dev->netdev_ops;
4982
4983        if (!ops->ndo_change_carrier)
4984                return -EOPNOTSUPP;
4985        if (!netif_device_present(dev))
4986                return -ENODEV;
4987        return ops->ndo_change_carrier(dev, new_carrier);
4988}
4989EXPORT_SYMBOL(dev_change_carrier);
4990
4991/**
4992 *      dev_new_index   -       allocate an ifindex
4993 *      @net: the applicable net namespace
4994 *
4995 *      Returns a suitable unique value for a new device interface
4996 *      number.  The caller must hold the rtnl semaphore or the
4997 *      dev_base_lock to be sure it remains unique.
4998 */
4999static int dev_new_index(struct net *net)
5000{
5001        int ifindex = net->ifindex;
5002        for (;;) {
5003                if (++ifindex <= 0)
5004                        ifindex = 1;
5005                if (!__dev_get_by_index(net, ifindex))
5006                        return net->ifindex = ifindex;
5007        }
5008}
5009
5010/* Delayed registration/unregisteration */
5011static LIST_HEAD(net_todo_list);
5012
5013static void net_set_todo(struct net_device *dev)
5014{
5015        list_add_tail(&dev->todo_list, &net_todo_list);
5016}
5017
5018static void rollback_registered_many(struct list_head *head)
5019{
5020        struct net_device *dev, *tmp;
5021
5022        BUG_ON(dev_boot_phase);
5023        ASSERT_RTNL();
5024
5025        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5026                /* Some devices call without registering
5027                 * for initialization unwind. Remove those
5028                 * devices and proceed with the remaining.
5029                 */
5030                if (dev->reg_state == NETREG_UNINITIALIZED) {
5031                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5032                                 dev->name, dev);
5033
5034                        WARN_ON(1);
5035                        list_del(&dev->unreg_list);
5036                        continue;
5037                }
5038                dev->dismantle = true;
5039                BUG_ON(dev->reg_state != NETREG_REGISTERED);
5040        }
5041
5042        /* If device is running, close it first. */
5043        dev_close_many(head);
5044
5045        list_for_each_entry(dev, head, unreg_list) {
5046                /* And unlink it from device chain. */
5047                unlist_netdevice(dev);
5048
5049                dev->reg_state = NETREG_UNREGISTERING;
5050        }
5051
5052        synchronize_net();
5053
5054        list_for_each_entry(dev, head, unreg_list) {
5055                /* Shutdown queueing discipline. */
5056                dev_shutdown(dev);
5057
5058
5059                /* Notify protocols, that we are about to destroy
5060                   this device. They should clean all the things.
5061                */
5062                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5063
5064                if (!dev->rtnl_link_ops ||
5065                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5066                        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5067
5068                /*
5069                 *      Flush the unicast and multicast chains
5070                 */
5071                dev_uc_flush(dev);
5072                dev_mc_flush(dev);
5073
5074                if (dev->netdev_ops->ndo_uninit)
5075                        dev->netdev_ops->ndo_uninit(dev);
5076
5077                /* Notifier chain MUST detach us all upper devices. */
5078                WARN_ON(netdev_has_any_upper_dev(dev));
5079
5080                /* Remove entries from kobject tree */
5081                netdev_unregister_kobject(dev);
5082#ifdef CONFIG_XPS
5083                /* Remove XPS queueing entries */
5084                netif_reset_xps_queues_gt(dev, 0);
5085#endif
5086        }
5087
5088        synchronize_net();
5089
5090        list_for_each_entry(dev, head, unreg_list)
5091                dev_put(dev);
5092}
5093
5094static void rollback_registered(struct net_device *dev)
5095{
5096        LIST_HEAD(single);
5097
5098        list_add(&dev->unreg_list, &single);
5099        rollback_registered_many(&single);
5100        list_del(&single);
5101}
5102
5103static netdev_features_t netdev_fix_features(struct net_device *dev,
5104        netdev_features_t features)
5105{
5106        /* Fix illegal checksum combinations */
5107        if ((features & NETIF_F_HW_CSUM) &&
5108            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5109                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5110                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5111        }
5112
5113        /* TSO requires that SG is present as well. */
5114        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5115                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5116                features &= ~NETIF_F_ALL_TSO;
5117        }
5118
5119        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5120                                        !(features & NETIF_F_IP_CSUM)) {
5121                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5122                features &= ~NETIF_F_TSO;
5123                features &= ~NETIF_F_TSO_ECN;
5124        }
5125
5126        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5127                                         !(features & NETIF_F_IPV6_CSUM)) {
5128                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5129                features &= ~NETIF_F_TSO6;
5130        }
5131
5132        /* TSO ECN requires that TSO is present as well. */
5133        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5134                features &= ~NETIF_F_TSO_ECN;
5135
5136        /* Software GSO depends on SG. */
5137        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5138                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5139                features &= ~NETIF_F_GSO;
5140        }
5141
5142        /* UFO needs SG and checksumming */
5143        if (features & NETIF_F_UFO) {
5144                /* maybe split UFO into V4 and V6? */
5145                if (!((features & NETIF_F_GEN_CSUM) ||
5146                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5147                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5148                        netdev_dbg(dev,
5149                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
5150                        features &= ~NETIF_F_UFO;
5151                }
5152
5153                if (!(features & NETIF_F_SG)) {
5154                        netdev_dbg(dev,
5155                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5156                        features &= ~NETIF_F_UFO;
5157                }
5158        }
5159
5160        return features;
5161}
5162
5163int __netdev_update_features(struct net_device *dev)
5164{
5165        netdev_features_t features;
5166        int err = 0;
5167
5168        ASSERT_RTNL();
5169
5170        features = netdev_get_wanted_features(dev);
5171
5172        if (dev->netdev_ops->ndo_fix_features)
5173                features = dev->netdev_ops->ndo_fix_features(dev, features);
5174
5175        /* driver might be less strict about feature dependencies */
5176        features = netdev_fix_features(dev, features);
5177
5178        if (dev->features == features)
5179                return 0;
5180
5181        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5182                &dev->features, &features);
5183
5184        if (dev->netdev_ops->ndo_set_features)
5185                err = dev->netdev_ops->ndo_set_features(dev, features);
5186
5187        if (unlikely(err < 0)) {
5188                netdev_err(dev,
5189                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
5190                        err, &features, &dev->features);
5191                return -1;
5192        }
5193
5194        if (!err)
5195                dev->features = features;
5196
5197        return 1;
5198}
5199
5200/**
5201 *      netdev_update_features - recalculate device features
5202 *      @dev: the device to check
5203 *
5204 *      Recalculate dev->features set and send notifications if it
5205 *      has changed. Should be called after driver or hardware dependent
5206 *      conditions might have changed that influence the features.
5207 */
5208void netdev_update_features(struct net_device *dev)
5209{
5210        if (__netdev_update_features(dev))
5211                netdev_features_change(dev);
5212}
5213EXPORT_SYMBOL(netdev_update_features);
5214
5215/**
5216 *      netdev_change_features - recalculate device features
5217 *      @dev: the device to check
5218 *
5219 *      Recalculate dev->features set and send notifications even
5220 *      if they have not changed. Should be called instead of
5221 *      netdev_update_features() if also dev->vlan_features might
5222 *      have changed to allow the changes to be propagated to stacked
5223 *      VLAN devices.
5224 */
5225void netdev_change_features(struct net_device *dev)
5226{
5227        __netdev_update_features(dev);
5228        netdev_features_change(dev);
5229}
5230EXPORT_SYMBOL(netdev_change_features);
5231
5232/**
5233 *      netif_stacked_transfer_operstate -      transfer operstate
5234 *      @rootdev: the root or lower level device to transfer state from
5235 *      @dev: the device to transfer operstate to
5236 *
5237 *      Transfer operational state from root to device. This is normally
5238 *      called when a stacking relationship exists between the root
5239 *      device and the device(a leaf device).
5240 */
5241void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5242                                        struct net_device *dev)
5243{
5244        if (rootdev->operstate == IF_OPER_DORMANT)
5245                netif_dormant_on(dev);
5246        else
5247                netif_dormant_off(dev);
5248
5249        if (netif_carrier_ok(rootdev)) {
5250                if (!netif_carrier_ok(dev))
5251                        netif_carrier_on(dev);
5252        } else {
5253                if (netif_carrier_ok(dev))
5254                        netif_carrier_off(dev);
5255        }
5256}
5257EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5258
5259#ifdef CONFIG_RPS
5260static int netif_alloc_rx_queues(struct net_device *dev)
5261{
5262        unsigned int i, count = dev->num_rx_queues;
5263        struct netdev_rx_queue *rx;
5264
5265        BUG_ON(count < 1);
5266
5267        rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5268        if (!rx)
5269                return -ENOMEM;
5270
5271        dev->_rx = rx;
5272
5273        for (i = 0; i < count; i++)
5274                rx[i].dev = dev;
5275        return 0;
5276}
5277#endif
5278
5279static void netdev_init_one_queue(struct net_device *dev,
5280                                  struct netdev_queue *queue, void *_unused)
5281{
5282        /* Initialize queue lock */
5283        spin_lock_init(&queue->_xmit_lock);
5284        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5285        queue->xmit_lock_owner = -1;
5286        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5287        queue->dev = dev;
5288#ifdef CONFIG_BQL
5289        dql_init(&queue->dql, HZ);
5290#endif
5291}
5292
5293static void netif_free_tx_queues(struct net_device *dev)
5294{
5295        if (is_vmalloc_addr(dev->_tx))
5296                vfree(dev->_tx);
5297        else
5298                kfree(dev->_tx);
5299}
5300
5301static int netif_alloc_netdev_queues(struct net_device *dev)
5302{
5303        unsigned int count = dev->num_tx_queues;
5304        struct netdev_queue *tx;
5305        size_t sz = count * sizeof(*tx);
5306
5307        BUG_ON(count < 1 || count > 0xffff);
5308
5309        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5310        if (!tx) {
5311                tx = vzalloc(sz);
5312                if (!tx)
5313                        return -ENOMEM;
5314        }
5315        dev->_tx = tx;
5316
5317        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5318        spin_lock_init(&dev->tx_global_lock);
5319
5320        return 0;
5321}
5322
5323/**
5324 *      register_netdevice      - register a network device
5325 *      @dev: device to register
5326 *
5327 *      Take a completed network device structure and add it to the kernel
5328 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5329 *      chain. 0 is returned on success. A negative errno code is returned
5330 *      on a failure to set up the device, or if the name is a duplicate.
5331 *
5332 *      Callers must hold the rtnl semaphore. You may want
5333 *      register_netdev() instead of this.
5334 *
5335 *      BUGS:
5336 *      The locking appears insufficient to guarantee two parallel registers
5337 *      will not get the same name.
5338 */
5339
5340int register_netdevice(struct net_device *dev)
5341{
5342        int ret;
5343        struct net *net = dev_net(dev);
5344
5345        BUG_ON(dev_boot_phase);
5346        ASSERT_RTNL();
5347
5348        might_sleep();
5349
5350        /* When net_device's are persistent, this will be fatal. */
5351        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5352        BUG_ON(!net);
5353
5354        spin_lock_init(&dev->addr_list_lock);
5355        netdev_set_addr_lockdep_class(dev);
5356
5357        dev->iflink = -1;
5358
5359        ret = dev_get_valid_name(net, dev, dev->name);
5360        if (ret < 0)
5361                goto out;
5362
5363        /* Init, if this function is available */
5364        if (dev->netdev_ops->ndo_init) {
5365                ret = dev->netdev_ops->ndo_init(dev);
5366                if (ret) {
5367                        if (ret > 0)
5368                                ret = -EIO;
5369                        goto out;
5370                }
5371        }
5372
5373        if (((dev->hw_features | dev->features) &
5374             NETIF_F_HW_VLAN_CTAG_FILTER) &&
5375            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5376             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5377                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5378                ret = -EINVAL;
5379                goto err_uninit;
5380        }
5381
5382        ret = -EBUSY;
5383        if (!dev->ifindex)
5384                dev->ifindex = dev_new_index(net);
5385        else if (__dev_get_by_index(net, dev->ifindex))
5386                goto err_uninit;
5387
5388        if (dev->iflink == -1)
5389                dev->iflink = dev->ifindex;
5390
5391        /* Transfer changeable features to wanted_features and enable
5392         * software offloads (GSO and GRO).
5393         */
5394        dev->hw_features |= NETIF_F_SOFT_FEATURES;
5395        dev->features |= NETIF_F_SOFT_FEATURES;
5396        dev->wanted_features = dev->features & dev->hw_features;
5397
5398        /* Turn on no cache copy if HW is doing checksum */
5399        if (!(dev->flags & IFF_LOOPBACK)) {
5400                dev->hw_features |= NETIF_F_NOCACHE_COPY;
5401                if (dev->features & NETIF_F_ALL_CSUM) {
5402                        dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5403                        dev->features |= NETIF_F_NOCACHE_COPY;
5404                }
5405        }
5406
5407        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5408         */
5409        dev->vlan_features |= NETIF_F_HIGHDMA;
5410
5411        /* Make NETIF_F_SG inheritable to tunnel devices.
5412         */
5413        dev->hw_enc_features |= NETIF_F_SG;
5414
5415        /* Make NETIF_F_SG inheritable to MPLS.
5416         */
5417        dev->mpls_features |= NETIF_F_SG;
5418
5419        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5420        ret = notifier_to_errno(ret);
5421        if (ret)
5422                goto err_uninit;
5423
5424        ret = netdev_register_kobject(dev);
5425        if (ret)
5426                goto err_uninit;
5427        dev->reg_state = NETREG_REGISTERED;
5428
5429        __netdev_update_features(dev);
5430
5431        /*
5432         *      Default initial state at registry is that the
5433         *      device is present.
5434         */
5435
5436        set_bit(__LINK_STATE_PRESENT, &dev->state);
5437
5438        linkwatch_init_dev(dev);
5439
5440        dev_init_scheduler(dev);
5441        dev_hold(dev);
5442        list_netdevice(dev);
5443        add_device_randomness(dev->dev_addr, dev->addr_len);
5444
5445        /* If the device has permanent device address, driver should
5446         * set dev_addr and also addr_assign_type should be set to
5447         * NET_ADDR_PERM (default value).
5448         */
5449        if (dev->addr_assign_type == NET_ADDR_PERM)
5450                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5451
5452        /* Notify protocols, that a new device appeared. */
5453        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5454        ret = notifier_to_errno(ret);
5455        if (ret) {
5456                rollback_registered(dev);
5457                dev->reg_state = NETREG_UNREGISTERED;
5458        }
5459        /*
5460         *      Prevent userspace races by waiting until the network
5461         *      device is fully setup before sending notifications.
5462         */
5463        if (!dev->rtnl_link_ops ||
5464            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5465                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5466
5467out:
5468        return ret;
5469
5470err_uninit:
5471        if (dev->netdev_ops->ndo_uninit)
5472                dev->netdev_ops->ndo_uninit(dev);
5473        goto out;
5474}
5475EXPORT_SYMBOL(register_netdevice);
5476
5477/**
5478 *      init_dummy_netdev       - init a dummy network device for NAPI
5479 *      @dev: device to init
5480 *
5481 *      This takes a network device structure and initialize the minimum
5482 *      amount of fields so it can be used to schedule NAPI polls without
5483 *      registering a full blown interface. This is to be used by drivers
5484 *      that need to tie several hardware interfaces to a single NAPI
5485 *      poll scheduler due to HW limitations.
5486 */
5487int init_dummy_netdev(struct net_device *dev)
5488{
5489        /* Clear everything. Note we don't initialize spinlocks
5490         * are they aren't supposed to be taken by any of the
5491         * NAPI code and this dummy netdev is supposed to be
5492         * only ever used for NAPI polls
5493         */
5494        memset(dev, 0, sizeof(struct net_device));
5495
5496        /* make sure we BUG if trying to hit standard
5497         * register/unregister code path
5498         */
5499        dev->reg_state = NETREG_DUMMY;
5500
5501        /* NAPI wants this */
5502        INIT_LIST_HEAD(&dev->napi_list);
5503
5504        /* a dummy interface is started by default */
5505        set_bit(__LINK_STATE_PRESENT, &dev->state);
5506        set_bit(__LINK_STATE_START, &dev->state);
5507
5508        /* Note : We dont allocate pcpu_refcnt for dummy devices,
5509         * because users of this 'device' dont need to change
5510         * its refcount.
5511         */
5512
5513        return 0;
5514}
5515EXPORT_SYMBOL_GPL(init_dummy_netdev);
5516
5517
5518/**
5519 *      register_netdev - register a network device
5520 *      @dev: device to register
5521 *
5522 *      Take a completed network device structure and add it to the kernel
5523 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5524 *      chain. 0 is returned on success. A negative errno code is returned
5525 *      on a failure to set up the device, or if the name is a duplicate.
5526 *
5527 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5528 *      and expands the device name if you passed a format string to
5529 *      alloc_netdev.
5530 */
5531int register_netdev(struct net_device *dev)
5532{
5533        int err;
5534
5535        rtnl_lock();
5536        err = register_netdevice(dev);
5537        rtnl_unlock();
5538        return err;
5539}
5540EXPORT_SYMBOL(register_netdev);
5541
5542int netdev_refcnt_read(const struct net_device *dev)
5543{
5544        int i, refcnt = 0;
5545
5546        for_each_possible_cpu(i)
5547                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5548        return refcnt;
5549}
5550EXPORT_SYMBOL(netdev_refcnt_read);
5551
5552/**
5553 * netdev_wait_allrefs - wait until all references are gone.
5554 * @dev: target net_device
5555 *
5556 * This is called when unregistering network devices.
5557 *
5558 * Any protocol or device that holds a reference should register
5559 * for netdevice notification, and cleanup and put back the
5560 * reference if they receive an UNREGISTER event.
5561 * We can get stuck here if buggy protocols don't correctly
5562 * call dev_put.
5563 */
5564static void netdev_wait_allrefs(struct net_device *dev)
5565{
5566        unsigned long rebroadcast_time, warning_time;
5567        int refcnt;
5568
5569        linkwatch_forget_dev(dev);
5570
5571        rebroadcast_time = warning_time = jiffies;
5572        refcnt = netdev_refcnt_read(dev);
5573
5574        while (refcnt != 0) {
5575                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5576                        rtnl_lock();
5577
5578                        /* Rebroadcast unregister notification */
5579                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5580
5581                        __rtnl_unlock();
5582                        rcu_barrier();
5583                        rtnl_lock();
5584
5585                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5586                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5587                                     &dev->state)) {
5588                                /* We must not have linkwatch events
5589                                 * pending on unregister. If this
5590                                 * happens, we simply run the queue
5591                                 * unscheduled, resulting in a noop
5592                                 * for this device.
5593                                 */
5594                                linkwatch_run_queue();
5595                        }
5596
5597                        __rtnl_unlock();
5598
5599                        rebroadcast_time = jiffies;
5600                }
5601
5602                msleep(250);
5603
5604                refcnt = netdev_refcnt_read(dev);
5605
5606                if (time_after(jiffies, warning_time + 10 * HZ)) {
5607                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5608                                 dev->name, refcnt);
5609                        warning_time = jiffies;
5610                }
5611        }
5612}
5613
5614/* The sequence is:
5615 *
5616 *      rtnl_lock();
5617 *      ...
5618 *      register_netdevice(x1);
5619 *      register_netdevice(x2);
5620 *      ...
5621 *      unregister_netdevice(y1);
5622 *      unregister_netdevice(y2);
5623 *      ...
5624 *      rtnl_unlock();
5625 *      free_netdev(y1);
5626 *      free_netdev(y2);
5627 *
5628 * We are invoked by rtnl_unlock().
5629 * This allows us to deal with problems:
5630 * 1) We can delete sysfs objects which invoke hotplug
5631 *    without deadlocking with linkwatch via keventd.
5632 * 2) Since we run with the RTNL semaphore not held, we can sleep
5633 *    safely in order to wait for the netdev refcnt to drop to zero.
5634 *
5635 * We must not return until all unregister events added during
5636 * the interval the lock was held have been completed.
5637 */
5638void netdev_run_todo(void)
5639{
5640        struct list_head list;
5641
5642        /* Snapshot list, allow later requests */
5643        list_replace_init(&net_todo_list, &list);
5644
5645        __rtnl_unlock();
5646
5647
5648        /* Wait for rcu callbacks to finish before next phase */
5649        if (!list_empty(&list))
5650                rcu_barrier();
5651
5652        while (!list_empty(&list)) {
5653                struct net_device *dev
5654                        = list_first_entry(&list, struct net_device, todo_list);
5655                list_del(&dev->todo_list);
5656
5657                rtnl_lock();
5658                call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5659                __rtnl_unlock();
5660
5661                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5662                        pr_err("network todo '%s' but state %d\n",
5663                               dev->name, dev->reg_state);
5664                        dump_stack();
5665                        continue;
5666                }
5667
5668                dev->reg_state = NETREG_UNREGISTERED;
5669
5670                on_each_cpu(flush_backlog, dev, 1);
5671
5672                netdev_wait_allrefs(dev);
5673
5674                /* paranoia */
5675                BUG_ON(netdev_refcnt_read(dev));
5676                WARN_ON(rcu_access_pointer(dev->ip_ptr));
5677                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5678                WARN_ON(dev->dn_ptr);
5679
5680                if (dev->destructor)
5681                        dev->destructor(dev);
5682
5683                /* Free network device */
5684                kobject_put(&dev->dev.kobj);
5685        }
5686}
5687
5688/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5689 * fields in the same order, with only the type differing.
5690 */
5691void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5692                             const struct net_device_stats *netdev_stats)
5693{
5694#if BITS_PER_LONG == 64
5695        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5696        memcpy(stats64, netdev_stats, sizeof(*stats64));
5697#else
5698        size_t i, n = sizeof(*stats64) / sizeof(u64);
5699        const unsigned long *src = (const unsigned long *)netdev_stats;
5700        u64 *dst = (u64 *)stats64;
5701
5702        BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5703                     sizeof(*stats64) / sizeof(u64));
5704        for (i = 0; i < n; i++)
5705                dst[i] = src[i];
5706#endif
5707}
5708EXPORT_SYMBOL(netdev_stats_to_stats64);
5709
5710/**
5711 *      dev_get_stats   - get network device statistics
5712 *      @dev: device to get statistics from
5713 *      @storage: place to store stats
5714 *
5715 *      Get network statistics from device. Return @storage.
5716 *      The device driver may provide its own method by setting
5717 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5718 *      otherwise the internal statistics structure is used.
5719 */
5720struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5721                                        struct rtnl_link_stats64 *storage)
5722{
5723        const struct net_device_ops *ops = dev->netdev_ops;
5724
5725        if (ops->ndo_get_stats64) {
5726                memset(storage, 0, sizeof(*storage));
5727                ops->ndo_get_stats64(dev, storage);
5728        } else if (ops->ndo_get_stats) {
5729                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5730        } else {
5731                netdev_stats_to_stats64(storage, &dev->stats);
5732        }
5733        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5734        return storage;
5735}
5736EXPORT_SYMBOL(dev_get_stats);
5737
5738struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5739{
5740        struct netdev_queue *queue = dev_ingress_queue(dev);
5741
5742#ifdef CONFIG_NET_CLS_ACT
5743        if (queue)
5744                return queue;
5745        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5746        if (!queue)
5747                return NULL;
5748        netdev_init_one_queue(dev, queue, NULL);
5749        queue->qdisc = &noop_qdisc;
5750        queue->qdisc_sleeping = &noop_qdisc;
5751        rcu_assign_pointer(dev->ingress_queue, queue);
5752#endif
5753        return queue;
5754}
5755
5756static const struct ethtool_ops default_ethtool_ops;
5757
5758void netdev_set_default_ethtool_ops(struct net_device *dev,
5759                                    const struct ethtool_ops *ops)
5760{
5761        if (dev->ethtool_ops == &default_ethtool_ops)
5762                dev->ethtool_ops = ops;
5763}
5764EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5765
5766/**
5767 *      alloc_netdev_mqs - allocate network device
5768 *      @sizeof_priv:   size of private data to allocate space for
5769 *      @name:          device name format string
5770 *      @setup:         callback to initialize device
5771 *      @txqs:          the number of TX subqueues to allocate
5772 *      @rxqs:          the number of RX subqueues to allocate
5773 *
5774 *      Allocates a struct net_device with private data area for driver use
5775 *      and performs basic initialization.  Also allocates subquue structs
5776 *      for each queue on the device.
5777 */
5778struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5779                void (*setup)(struct net_device *),
5780                unsigned int txqs, unsigned int rxqs)
5781{
5782        struct net_device *dev;
5783        size_t alloc_size;
5784        struct net_device *p;
5785
5786        BUG_ON(strlen(name) >= sizeof(dev->name));
5787
5788        if (txqs < 1) {
5789                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5790                return NULL;
5791        }
5792
5793#ifdef CONFIG_RPS
5794        if (rxqs < 1) {
5795                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5796                return NULL;
5797        }
5798#endif
5799
5800        alloc_size = sizeof(struct net_device);
5801        if (sizeof_priv) {
5802                /* ensure 32-byte alignment of private area */
5803                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5804                alloc_size += sizeof_priv;
5805        }
5806        /* ensure 32-byte alignment of whole construct */
5807        alloc_size += NETDEV_ALIGN - 1;
5808
5809        p = kzalloc(alloc_size, GFP_KERNEL);
5810        if (!p)
5811                return NULL;
5812
5813        dev = PTR_ALIGN(p, NETDEV_ALIGN);
5814        dev->padded = (char *)dev - (char *)p;
5815
5816        dev->pcpu_refcnt = alloc_percpu(int);
5817        if (!dev->pcpu_refcnt)
5818                goto free_p;
5819
5820        if (dev_addr_init(dev))
5821                goto free_pcpu;
5822
5823        dev_mc_init(dev);
5824        dev_uc_init(dev);
5825
5826        dev_net_set(dev, &init_net);
5827
5828        dev->gso_max_size = GSO_MAX_SIZE;
5829        dev->gso_max_segs = GSO_MAX_SEGS;
5830
5831        INIT_LIST_HEAD(&dev->napi_list);
5832        INIT_LIST_HEAD(&dev->unreg_list);
5833        INIT_LIST_HEAD(&dev->link_watch_list);
5834        INIT_LIST_HEAD(&dev->upper_dev_list);
5835        dev->priv_flags = IFF_XMIT_DST_RELEASE;
5836        setup(dev);
5837
5838        dev->num_tx_queues = txqs;
5839        dev->real_num_tx_queues = txqs;
5840        if (netif_alloc_netdev_queues(dev))
5841                goto free_all;
5842
5843#ifdef CONFIG_RPS
5844        dev->num_rx_queues = rxqs;
5845        dev->real_num_rx_queues = rxqs;
5846        if (netif_alloc_rx_queues(dev))
5847                goto free_all;
5848#endif
5849
5850        strcpy(dev->name, name);
5851        dev->group = INIT_NETDEV_GROUP;
5852        if (!dev->ethtool_ops)
5853                dev->ethtool_ops = &default_ethtool_ops;
5854        return dev;
5855
5856free_all:
5857        free_netdev(dev);
5858        return NULL;
5859
5860free_pcpu:
5861        free_percpu(dev->pcpu_refcnt);
5862        netif_free_tx_queues(dev);
5863#ifdef CONFIG_RPS
5864        kfree(dev->_rx);
5865#endif
5866
5867free_p:
5868        kfree(p);
5869        return NULL;
5870}
5871EXPORT_SYMBOL(alloc_netdev_mqs);
5872
5873/**
5874 *      free_netdev - free network device
5875 *      @dev: device
5876 *
5877 *      This function does the last stage of destroying an allocated device
5878 *      interface. The reference to the device object is released.
5879 *      If this is the last reference then it will be freed.
5880 */
5881void free_netdev(struct net_device *dev)
5882{
5883        struct napi_struct *p, *n;
5884
5885        release_net(dev_net(dev));
5886
5887        netif_free_tx_queues(dev);
5888#ifdef CONFIG_RPS
5889        kfree(dev->_rx);
5890#endif
5891
5892        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5893
5894        /* Flush device addresses */
5895        dev_addr_flush(dev);
5896
5897        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5898                netif_napi_del(p);
5899
5900        free_percpu(dev->pcpu_refcnt);
5901        dev->pcpu_refcnt = NULL;
5902
5903        /*  Compatibility with error handling in drivers */
5904        if (dev->reg_state == NETREG_UNINITIALIZED) {
5905                kfree((char *)dev - dev->padded);
5906                return;
5907        }
5908
5909        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5910        dev->reg_state = NETREG_RELEASED;
5911
5912        /* will free via device release */
5913        put_device(&dev->dev);
5914}
5915EXPORT_SYMBOL(free_netdev);
5916
5917/**
5918 *      synchronize_net -  Synchronize with packet receive processing
5919 *
5920 *      Wait for packets currently being received to be done.
5921 *      Does not block later packets from starting.
5922 */
5923void synchronize_net(void)
5924{
5925        might_sleep();
5926        if (rtnl_is_locked())
5927                synchronize_rcu_expedited();
5928        else
5929                synchronize_rcu();
5930}
5931EXPORT_SYMBOL(synchronize_net);
5932
5933/**
5934 *      unregister_netdevice_queue - remove device from the kernel
5935 *      @dev: device
5936 *      @head: list
5937 *
5938 *      This function shuts down a device interface and removes it
5939 *      from the kernel tables.
5940 *      If head not NULL, device is queued to be unregistered later.
5941 *
5942 *      Callers must hold the rtnl semaphore.  You may want
5943 *      unregister_netdev() instead of this.
5944 */
5945
5946void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5947{
5948        ASSERT_RTNL();
5949
5950        if (head) {
5951                list_move_tail(&dev->unreg_list, head);
5952        } else {
5953                rollback_registered(dev);
5954                /* Finish processing unregister after unlock */
5955                net_set_todo(dev);
5956        }
5957}
5958EXPORT_SYMBOL(unregister_netdevice_queue);
5959
5960/**
5961 *      unregister_netdevice_many - unregister many devices
5962 *      @head: list of devices
5963 */
5964void unregister_netdevice_many(struct list_head *head)
5965{
5966        struct net_device *dev;
5967
5968        if (!list_empty(head)) {
5969                rollback_registered_many(head);
5970                list_for_each_entry(dev, head, unreg_list)
5971                        net_set_todo(dev);
5972        }
5973}
5974EXPORT_SYMBOL(unregister_netdevice_many);
5975
5976/**
5977 *      unregister_netdev - remove device from the kernel
5978 *      @dev: device
5979 *
5980 *      This function shuts down a device interface and removes it
5981 *      from the kernel tables.
5982 *
5983 *      This is just a wrapper for unregister_netdevice that takes
5984 *      the rtnl semaphore.  In general you want to use this and not
5985 *      unregister_netdevice.
5986 */
5987void unregister_netdev(struct net_device *dev)
5988{
5989        rtnl_lock();
5990        unregister_netdevice(dev);
5991        rtnl_unlock();
5992}
5993EXPORT_SYMBOL(unregister_netdev);
5994
5995/**
5996 *      dev_change_net_namespace - move device to different nethost namespace
5997 *      @dev: device
5998 *      @net: network namespace
5999 *      @pat: If not NULL name pattern to try if the current device name
6000 *            is already taken in the destination network namespace.
6001 *
6002 *      This function shuts down a device interface and moves it
6003 *      to a new network namespace. On success 0 is returned, on
6004 *      a failure a netagive errno code is returned.
6005 *
6006 *      Callers must hold the rtnl semaphore.
6007 */
6008
6009int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6010{
6011        int err;
6012
6013        ASSERT_RTNL();
6014
6015        /* Don't allow namespace local devices to be moved. */
6016        err = -EINVAL;
6017        if (dev->features & NETIF_F_NETNS_LOCAL)
6018                goto out;
6019
6020        /* Ensure the device has been registrered */
6021        if (dev->reg_state != NETREG_REGISTERED)
6022                goto out;
6023
6024        /* Get out if there is nothing todo */
6025        err = 0;
6026        if (net_eq(dev_net(dev), net))
6027                goto out;
6028
6029        /* Pick the destination device name, and ensure
6030         * we can use it in the destination network namespace.
6031         */
6032        err = -EEXIST;
6033        if (__dev_get_by_name(net, dev->name)) {
6034                /* We get here if we can't use the current device name */
6035                if (!pat)
6036                        goto out;
6037                if (dev_get_valid_name(net, dev, pat) < 0)
6038                        goto out;
6039        }
6040
6041        /*
6042         * And now a mini version of register_netdevice unregister_netdevice.
6043         */
6044
6045        /* If device is running close it first. */
6046        dev_close(dev);
6047
6048        /* And unlink it from device chain */
6049        err = -ENODEV;
6050        unlist_netdevice(dev);
6051
6052        synchronize_net();
6053
6054        /* Shutdown queueing discipline. */
6055        dev_shutdown(dev);
6056
6057        /* Notify protocols, that we are about to destroy
6058           this device. They should clean all the things.
6059
6060           Note that dev->reg_state stays at NETREG_REGISTERED.
6061           This is wanted because this way 8021q and macvlan know
6062           the device is just moving and can keep their slaves up.
6063        */
6064        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6065        rcu_barrier();
6066        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6067        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6068
6069        /*
6070         *      Flush the unicast and multicast chains
6071         */
6072        dev_uc_flush(dev);
6073        dev_mc_flush(dev);
6074
6075        /* Send a netdev-removed uevent to the old namespace */
6076        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6077
6078        /* Actually switch the network namespace */
6079        dev_net_set(dev, net);
6080
6081        /* If there is an ifindex conflict assign a new one */
6082        if (__dev_get_by_index(net, dev->ifindex)) {
6083                int iflink = (dev->iflink == dev->ifindex);
6084                dev->ifindex = dev_new_index(net);
6085                if (iflink)
6086                        dev->iflink = dev->ifindex;
6087        }
6088
6089        /* Send a netdev-add uevent to the new namespace */
6090        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6091
6092        /* Fixup kobjects */
6093        err = device_rename(&dev->dev, dev->name);
6094        WARN_ON(err);
6095
6096        /* Add the device back in the hashes */
6097        list_netdevice(dev);
6098
6099        /* Notify protocols, that a new device appeared. */
6100        call_netdevice_notifiers(NETDEV_REGISTER, dev);
6101
6102        /*
6103         *      Prevent userspace races by waiting until the network
6104         *      device is fully setup before sending notifications.
6105         */
6106        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6107
6108        synchronize_net();
6109        err = 0;
6110out:
6111        return err;
6112}
6113EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6114
6115static int dev_cpu_callback(struct notifier_block *nfb,
6116                            unsigned long action,
6117                            void *ocpu)
6118{
6119        struct sk_buff **list_skb;
6120        struct sk_buff *skb;
6121        unsigned int cpu, oldcpu = (unsigned long)ocpu;
6122        struct softnet_data *sd, *oldsd;
6123
6124        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6125                return NOTIFY_OK;
6126
6127        local_irq_disable();
6128        cpu = smp_processor_id();
6129        sd = &per_cpu(softnet_data, cpu);
6130        oldsd = &per_cpu(softnet_data, oldcpu);
6131
6132        /* Find end of our completion_queue. */
6133        list_skb = &sd->completion_queue;
6134        while (*list_skb)
6135                list_skb = &(*list_skb)->next;
6136        /* Append completion queue from offline CPU. */
6137        *list_skb = oldsd->completion_queue;
6138        oldsd->completion_queue = NULL;
6139
6140        /* Append output queue from offline CPU. */
6141        if (oldsd->output_queue) {
6142                *sd->output_queue_tailp = oldsd->output_queue;
6143                sd->output_queue_tailp = oldsd->output_queue_tailp;
6144                oldsd->output_queue = NULL;
6145                oldsd->output_queue_tailp = &oldsd->output_queue;
6146        }
6147        /* Append NAPI poll list from offline CPU. */
6148        if (!list_empty(&oldsd->poll_list)) {
6149                list_splice_init(&oldsd->poll_list, &sd->poll_list);
6150                raise_softirq_irqoff(NET_RX_SOFTIRQ);
6151        }
6152
6153        raise_softirq_irqoff(NET_TX_SOFTIRQ);
6154        local_irq_enable();
6155
6156        /* Process offline CPU's input_pkt_queue */
6157        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6158                netif_rx(skb);
6159                input_queue_head_incr(oldsd);
6160        }
6161        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6162                netif_rx(skb);
6163                input_queue_head_incr(oldsd);
6164        }
6165
6166        return NOTIFY_OK;
6167}
6168
6169
6170/**
6171 *      netdev_increment_features - increment feature set by one
6172 *      @all: current feature set
6173 *      @one: new feature set
6174 *      @mask: mask feature set
6175 *
6176 *      Computes a new feature set after adding a device with feature set
6177 *      @one to the master device with current feature set @all.  Will not
6178 *      enable anything that is off in @mask. Returns the new feature set.
6179 */
6180netdev_features_t netdev_increment_features(netdev_features_t all,
6181        netdev_features_t one, netdev_features_t mask)
6182{
6183        if (mask & NETIF_F_GEN_CSUM)
6184                mask |= NETIF_F_ALL_CSUM;
6185        mask |= NETIF_F_VLAN_CHALLENGED;
6186
6187        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6188        all &= one | ~NETIF_F_ALL_FOR_ALL;
6189
6190        /* If one device supports hw checksumming, set for all. */
6191        if (all & NETIF_F_GEN_CSUM)
6192                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6193
6194        return all;
6195}
6196EXPORT_SYMBOL(netdev_increment_features);
6197
6198static struct hlist_head * __net_init netdev_create_hash(void)
6199{
6200        int i;
6201        struct hlist_head *hash;
6202
6203        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6204        if (hash != NULL)
6205                for (i = 0; i < NETDEV_HASHENTRIES; i++)
6206                        INIT_HLIST_HEAD(&hash[i]);
6207
6208        return hash;
6209}
6210
6211/* Initialize per network namespace state */
6212static int __net_init netdev_init(struct net *net)
6213{
6214        if (net != &init_net)
6215                INIT_LIST_HEAD(&net->dev_base_head);
6216
6217        net->dev_name_head = netdev_create_hash();
6218        if (net->dev_name_head == NULL)
6219                goto err_name;
6220
6221        net->dev_index_head = netdev_create_hash();
6222        if (net->dev_index_head == NULL)
6223                goto err_idx;
6224
6225        return 0;
6226
6227err_idx:
6228        kfree(net->dev_name_head);
6229err_name:
6230        return -ENOMEM;
6231}
6232
6233/**
6234 *      netdev_drivername - network driver for the device
6235 *      @dev: network device
6236 *
6237 *      Determine network driver for device.
6238 */
6239const char *netdev_drivername(const struct net_device *dev)
6240{
6241        const struct device_driver *driver;
6242        const struct device *parent;
6243        const char *empty = "";
6244
6245        parent = dev->dev.parent;
6246        if (!parent)
6247                return empty;
6248
6249        driver = parent->driver;
6250        if (driver && driver->name)
6251                return driver->name;
6252        return empty;
6253}
6254
6255static int __netdev_printk(const char *level, const struct net_device *dev,
6256                           struct va_format *vaf)
6257{
6258        int r;
6259
6260        if (dev && dev->dev.parent) {
6261                r = dev_printk_emit(level[1] - '0',
6262                                    dev->dev.parent,
6263                                    "%s %s %s: %pV",
6264                                    dev_driver_string(dev->dev.parent),
6265                                    dev_name(dev->dev.parent),
6266                                    netdev_name(dev), vaf);
6267        } else if (dev) {
6268                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6269        } else {
6270                r = printk("%s(NULL net_device): %pV", level, vaf);
6271        }
6272
6273        return r;
6274}
6275
6276int netdev_printk(const char *level, const struct net_device *dev,
6277                  const char *format, ...)
6278{
6279        struct va_format vaf;
6280        va_list args;
6281        int r;
6282
6283        va_start(args, format);
6284
6285        vaf.fmt = format;
6286        vaf.va = &args;
6287
6288        r = __netdev_printk(level, dev, &vaf);
6289
6290        va_end(args);
6291
6292        return r;
6293}
6294EXPORT_SYMBOL(netdev_printk);
6295
6296#define define_netdev_printk_level(func, level)                 \
6297int func(const struct net_device *dev, const char *fmt, ...)    \
6298{                                                               \
6299        int r;                                                  \
6300        struct va_format vaf;                                   \
6301        va_list args;                                           \
6302                                                                \
6303        va_start(args, fmt);                                    \
6304                                                                \
6305        vaf.fmt = fmt;                                          \
6306        vaf.va = &args;                                         \
6307                                                                \
6308        r = __netdev_printk(level, dev, &vaf);                  \
6309                                                                \
6310        va_end(args);                                           \
6311                                                                \
6312        return r;                                               \
6313}                                                               \
6314EXPORT_SYMBOL(func);
6315
6316define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6317define_netdev_printk_level(netdev_alert, KERN_ALERT);
6318define_netdev_printk_level(netdev_crit, KERN_CRIT);
6319define_netdev_printk_level(netdev_err, KERN_ERR);
6320define_netdev_printk_level(netdev_warn, KERN_WARNING);
6321define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6322define_netdev_printk_level(netdev_info, KERN_INFO);
6323
6324static void __net_exit netdev_exit(struct net *net)
6325{
6326        kfree(net->dev_name_head);
6327        kfree(net->dev_index_head);
6328}
6329
6330static struct pernet_operations __net_initdata netdev_net_ops = {
6331        .init = netdev_init,
6332        .exit = netdev_exit,
6333};
6334
6335static void __net_exit default_device_exit(struct net *net)
6336{
6337        struct net_device *dev, *aux;
6338        /*
6339         * Push all migratable network devices back to the
6340         * initial network namespace
6341         */
6342        rtnl_lock();
6343        for_each_netdev_safe(net, dev, aux) {
6344                int err;
6345                char fb_name[IFNAMSIZ];
6346
6347                /* Ignore unmoveable devices (i.e. loopback) */
6348                if (dev->features & NETIF_F_NETNS_LOCAL)
6349                        continue;
6350
6351                /* Leave virtual devices for the generic cleanup */
6352                if (dev->rtnl_link_ops)
6353                        continue;
6354
6355                /* Push remaining network devices to init_net */
6356                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6357                err = dev_change_net_namespace(dev, &init_net, fb_name);
6358                if (err) {
6359                        pr_emerg("%s: failed to move %s to init_net: %d\n",
6360                                 __func__, dev->name, err);
6361                        BUG();
6362                }
6363        }
6364        rtnl_unlock();
6365}
6366
6367static void __net_exit default_device_exit_batch(struct list_head *net_list)
6368{
6369        /* At exit all network devices most be removed from a network
6370         * namespace.  Do this in the reverse order of registration.
6371         * Do this across as many network namespaces as possible to
6372         * improve batching efficiency.
6373         */
6374        struct net_device *dev;
6375        struct net *net;
6376        LIST_HEAD(dev_kill_list);
6377
6378        rtnl_lock();
6379        list_for_each_entry(net, net_list, exit_list) {
6380                for_each_netdev_reverse(net, dev) {
6381                        if (dev->rtnl_link_ops)
6382                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6383                        else
6384                                unregister_netdevice_queue(dev, &dev_kill_list);
6385                }
6386        }
6387        unregister_netdevice_many(&dev_kill_list);
6388        list_del(&dev_kill_list);
6389        rtnl_unlock();
6390}
6391
6392static struct pernet_operations __net_initdata default_device_ops = {
6393        .exit = default_device_exit,
6394        .exit_batch = default_device_exit_batch,
6395};
6396
6397/*
6398 *      Initialize the DEV module. At boot time this walks the device list and
6399 *      unhooks any devices that fail to initialise (normally hardware not
6400 *      present) and leaves us with a valid list of present and active devices.
6401 *
6402 */
6403
6404/*
6405 *       This is called single threaded during boot, so no need
6406 *       to take the rtnl semaphore.
6407 */
6408static int __init net_dev_init(void)
6409{
6410        int i, rc = -ENOMEM;
6411
6412        BUG_ON(!dev_boot_phase);
6413
6414        if (dev_proc_init())
6415                goto out;
6416
6417        if (netdev_kobject_init())
6418                goto out;
6419
6420        INIT_LIST_HEAD(&ptype_all);
6421        for (i = 0; i < PTYPE_HASH_SIZE; i++)
6422                INIT_LIST_HEAD(&ptype_base[i]);
6423
6424        INIT_LIST_HEAD(&offload_base);
6425
6426        if (register_pernet_subsys(&netdev_net_ops))
6427                goto out;
6428
6429        /*
6430         *      Initialise the packet receive queues.
6431         */
6432
6433        for_each_possible_cpu(i) {
6434                struct softnet_data *sd = &per_cpu(softnet_data, i);
6435
6436                memset(sd, 0, sizeof(*sd));
6437                skb_queue_head_init(&sd->input_pkt_queue);
6438                skb_queue_head_init(&sd->process_queue);
6439                sd->completion_queue = NULL;
6440                INIT_LIST_HEAD(&sd->poll_list);
6441                sd->output_queue = NULL;
6442                sd->output_queue_tailp = &sd->output_queue;
6443#ifdef CONFIG_RPS
6444                sd->csd.func = rps_trigger_softirq;
6445                sd->csd.info = sd;
6446                sd->csd.flags = 0;
6447                sd->cpu = i;
6448#endif
6449
6450                sd->backlog.poll = process_backlog;
6451                sd->backlog.weight = weight_p;
6452                sd->backlog.gro_list = NULL;
6453                sd->backlog.gro_count = 0;
6454
6455#ifdef CONFIG_NET_FLOW_LIMIT
6456                sd->flow_limit = NULL;
6457#endif
6458        }
6459
6460        dev_boot_phase = 0;
6461
6462        /* The loopback device is special if any other network devices
6463         * is present in a network namespace the loopback device must
6464         * be present. Since we now dynamically allocate and free the
6465         * loopback device ensure this invariant is maintained by
6466         * keeping the loopback device as the first device on the
6467         * list of network devices.  Ensuring the loopback devices
6468         * is the first device that appears and the last network device
6469         * that disappears.
6470         */
6471        if (register_pernet_device(&loopback_net_ops))
6472                goto out;
6473
6474        if (register_pernet_device(&default_device_ops))
6475                goto out;
6476
6477        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6478        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6479
6480        hotcpu_notifier(dev_cpu_callback, 0);
6481        dst_init();
6482        rc = 0;
6483out:
6484        return rc;
6485}
6486
6487subsys_initcall(net_dev_init);
6488