linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <net/busy_poll.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/stat.h>
 102#include <net/dst.h>
 103#include <net/dst_metadata.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/pci.h>
 132#include <linux/inetdevice.h>
 133#include <linux/cpu_rmap.h>
 134#include <linux/static_key.h>
 135#include <linux/hashtable.h>
 136#include <linux/vmalloc.h>
 137#include <linux/if_macvlan.h>
 138#include <linux/errqueue.h>
 139#include <linux/hrtimer.h>
 140#include <linux/netfilter_ingress.h>
 141#include <linux/sctp.h>
 142
 143#include "net-sysfs.h"
 144
 145/* Instead of increasing this, you should create a hash table. */
 146#define MAX_GRO_SKBS 8
 147
 148/* This should be increased if a protocol with a bigger head is added. */
 149#define GRO_MAX_HEAD (MAX_HEADER + 128)
 150
 151static DEFINE_SPINLOCK(ptype_lock);
 152static DEFINE_SPINLOCK(offload_lock);
 153struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 154struct list_head ptype_all __read_mostly;       /* Taps */
 155static struct list_head offload_base __read_mostly;
 156
 157static int netif_rx_internal(struct sk_buff *skb);
 158static int call_netdevice_notifiers_info(unsigned long val,
 159                                         struct net_device *dev,
 160                                         struct netdev_notifier_info *info);
 161
 162/*
 163 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 164 * semaphore.
 165 *
 166 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 167 *
 168 * Writers must hold the rtnl semaphore while they loop through the
 169 * dev_base_head list, and hold dev_base_lock for writing when they do the
 170 * actual updates.  This allows pure readers to access the list even
 171 * while a writer is preparing to update it.
 172 *
 173 * To put it another way, dev_base_lock is held for writing only to
 174 * protect against pure readers; the rtnl semaphore provides the
 175 * protection against other writers.
 176 *
 177 * See, for example usages, register_netdevice() and
 178 * unregister_netdevice(), which must be called with the rtnl
 179 * semaphore held.
 180 */
 181DEFINE_RWLOCK(dev_base_lock);
 182EXPORT_SYMBOL(dev_base_lock);
 183
 184/* protects napi_hash addition/deletion and napi_gen_id */
 185static DEFINE_SPINLOCK(napi_hash_lock);
 186
 187static unsigned int napi_gen_id = NR_CPUS;
 188static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 189
 190static seqcount_t devnet_rename_seq;
 191
 192static inline void dev_base_seq_inc(struct net *net)
 193{
 194        while (++net->dev_base_seq == 0);
 195}
 196
 197static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 198{
 199        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200
 201        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 202}
 203
 204static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 205{
 206        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 207}
 208
 209static inline void rps_lock(struct softnet_data *sd)
 210{
 211#ifdef CONFIG_RPS
 212        spin_lock(&sd->input_pkt_queue.lock);
 213#endif
 214}
 215
 216static inline void rps_unlock(struct softnet_data *sd)
 217{
 218#ifdef CONFIG_RPS
 219        spin_unlock(&sd->input_pkt_queue.lock);
 220#endif
 221}
 222
 223/* Device list insertion */
 224static void list_netdevice(struct net_device *dev)
 225{
 226        struct net *net = dev_net(dev);
 227
 228        ASSERT_RTNL();
 229
 230        write_lock_bh(&dev_base_lock);
 231        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 232        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 233        hlist_add_head_rcu(&dev->index_hlist,
 234                           dev_index_hash(net, dev->ifindex));
 235        write_unlock_bh(&dev_base_lock);
 236
 237        dev_base_seq_inc(net);
 238}
 239
 240/* Device list removal
  241 * caller must respect an RCU grace period before freeing/reusing dev
 242 */
 243static void unlist_netdevice(struct net_device *dev)
 244{
 245        ASSERT_RTNL();
 246
 247        /* Unlink dev from the device chain */
 248        write_lock_bh(&dev_base_lock);
 249        list_del_rcu(&dev->dev_list);
 250        hlist_del_rcu(&dev->name_hlist);
 251        hlist_del_rcu(&dev->index_hlist);
 252        write_unlock_bh(&dev_base_lock);
 253
 254        dev_base_seq_inc(dev_net(dev));
 255}
 256
 257/*
 258 *      Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *      Device drivers call our routines to queue packets here. We empty the
 265 *      queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
 276static const unsigned short netdev_lock_type[] =
 277        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 290         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 291         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 292
 293static const char *const netdev_lock_name[] =
 294        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 295         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 296         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 297         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 298         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 299         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 300         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 301         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 302         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 303         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 304         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 305         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 306         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 307         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 308         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 309
 310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314{
 315        int i;
 316
 317        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318                if (netdev_lock_type[i] == dev_type)
 319                        return i;
 320        /* the last key is used by default */
 321        return ARRAY_SIZE(netdev_lock_type) - 1;
 322}
 323
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                 unsigned short dev_type)
 326{
 327        int i;
 328
 329        i = netdev_lock_pos(dev_type);
 330        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331                                   netdev_lock_name[i]);
 332}
 333
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336        int i;
 337
 338        i = netdev_lock_pos(dev->type);
 339        lockdep_set_class_and_name(&dev->addr_list_lock,
 340                                   &netdev_addr_lock_key[i],
 341                                   netdev_lock_name[i]);
 342}
 343#else
 344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345                                                 unsigned short dev_type)
 346{
 347}
 348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349{
 350}
 351#endif
 352
 353/*******************************************************************************
 354
 355                Protocol management and registration routines
 356
 357*******************************************************************************/
 358
 359/*
 360 *      Add a protocol ID to the list. Now that the input handler is
 361 *      smarter we can dispense with all the messy stuff that used to be
 362 *      here.
 363 *
  364 *      BEWARE!!! Protocol handlers that mangle input packets
  365 *      MUST BE last in the hash buckets, and checking protocol handlers
  366 *      MUST start from the promiscuous ptype_all chain in net_bh.
  367 *      It is true now, do not change it.
  368 *      Explanation follows: if a packet-mangling protocol handler were
  369 *      first on the list, it could not tell that the packet is cloned
  370 *      and should be copied-on-write, so it would modify it in place
  371 *      and subsequent readers would get a broken packet.
 372 *                                                      --ANK (980803)
 373 */
 374
 375static inline struct list_head *ptype_head(const struct packet_type *pt)
 376{
 377        if (pt->type == htons(ETH_P_ALL))
 378                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 379        else
 380                return pt->dev ? &pt->dev->ptype_specific :
 381                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 382}
 383
 384/**
 385 *      dev_add_pack - add packet handler
 386 *      @pt: packet type declaration
 387 *
 388 *      Add a protocol handler to the networking stack. The passed &packet_type
 389 *      is linked into kernel lists and may not be freed until it has been
 390 *      removed from the kernel lists.
 391 *
  392 *      This call does not sleep, therefore it cannot
  393 *      guarantee that all CPUs in the middle of receiving packets
  394 *      will see the new packet type (until the next received packet).
 395 */
 396
 397void dev_add_pack(struct packet_type *pt)
 398{
 399        struct list_head *head = ptype_head(pt);
 400
 401        spin_lock(&ptype_lock);
 402        list_add_rcu(&pt->list, head);
 403        spin_unlock(&ptype_lock);
 404}
 405EXPORT_SYMBOL(dev_add_pack);
 406
 407/**
 408 *      __dev_remove_pack        - remove packet handler
 409 *      @pt: packet type declaration
 410 *
 411 *      Remove a protocol handler that was previously added to the kernel
 412 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *      from the kernel lists and can be freed or reused once this function
 414 *      returns.
 415 *
 416 *      The packet type might still be in use by receivers
  417 *      and must not be freed until after all the CPUs have gone
 418 *      through a quiescent state.
 419 */
 420void __dev_remove_pack(struct packet_type *pt)
 421{
 422        struct list_head *head = ptype_head(pt);
 423        struct packet_type *pt1;
 424
 425        spin_lock(&ptype_lock);
 426
 427        list_for_each_entry(pt1, head, list) {
 428                if (pt == pt1) {
 429                        list_del_rcu(&pt->list);
 430                        goto out;
 431                }
 432        }
 433
 434        pr_warn("dev_remove_pack: %p not found\n", pt);
 435out:
 436        spin_unlock(&ptype_lock);
 437}
 438EXPORT_SYMBOL(__dev_remove_pack);
 439
 440/**
 441 *      dev_remove_pack  - remove packet handler
 442 *      @pt: packet type declaration
 443 *
 444 *      Remove a protocol handler that was previously added to the kernel
 445 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 446 *      from the kernel lists and can be freed or reused once this function
 447 *      returns.
 448 *
 449 *      This call sleeps to guarantee that no CPU is looking at the packet
 450 *      type after return.
 451 */
 452void dev_remove_pack(struct packet_type *pt)
 453{
 454        __dev_remove_pack(pt);
 455
 456        synchronize_net();
 457}
 458EXPORT_SYMBOL(dev_remove_pack);
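
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a module might register a promiscuous tap with dev_add_pack() and
 * later remove it with dev_remove_pack().  The names example_tap_rcv and
 * example_tap are hypothetical.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* A real handler would inspect the packet before dropping it. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL), /* lands on a ptype_all list, see ptype_head() */
        .func = example_tap_rcv,
        /* .dev left NULL: tap packets from all devices */
};

/* Registration/removal: dev_add_pack(&example_tap); ... dev_remove_pack(&example_tap); */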
 459
 460
 461/**
 462 *      dev_add_offload - register offload handlers
 463 *      @po: protocol offload declaration
 464 *
 465 *      Add protocol offload handlers to the networking stack. The passed
 466 *      &proto_offload is linked into kernel lists and may not be freed until
 467 *      it has been removed from the kernel lists.
 468 *
  469 *      This call does not sleep, therefore it cannot
  470 *      guarantee that all CPUs in the middle of receiving packets
  471 *      will see the new offload handlers (until the next received packet).
 472 */
 473void dev_add_offload(struct packet_offload *po)
 474{
 475        struct packet_offload *elem;
 476
 477        spin_lock(&offload_lock);
 478        list_for_each_entry(elem, &offload_base, list) {
 479                if (po->priority < elem->priority)
 480                        break;
 481        }
 482        list_add_rcu(&po->list, elem->list.prev);
 483        spin_unlock(&offload_lock);
 484}
 485EXPORT_SYMBOL(dev_add_offload);
 486
 487/**
 488 *      __dev_remove_offload     - remove offload handler
 489 *      @po: packet offload declaration
 490 *
 491 *      Remove a protocol offload handler that was previously added to the
 492 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 493 *      is removed from the kernel lists and can be freed or reused once this
 494 *      function returns.
 495 *
 496 *      The packet type might still be in use by receivers
  497 *      and must not be freed until after all the CPUs have gone
 498 *      through a quiescent state.
 499 */
 500static void __dev_remove_offload(struct packet_offload *po)
 501{
 502        struct list_head *head = &offload_base;
 503        struct packet_offload *po1;
 504
 505        spin_lock(&offload_lock);
 506
 507        list_for_each_entry(po1, head, list) {
 508                if (po == po1) {
 509                        list_del_rcu(&po->list);
 510                        goto out;
 511                }
 512        }
 513
 514        pr_warn("dev_remove_offload: %p not found\n", po);
 515out:
 516        spin_unlock(&offload_lock);
 517}
 518
 519/**
 520 *      dev_remove_offload       - remove packet offload handler
 521 *      @po: packet offload declaration
 522 *
 523 *      Remove a packet offload handler that was previously added to the kernel
 524 *      offload handlers by dev_add_offload(). The passed &offload_type is
 525 *      removed from the kernel lists and can be freed or reused once this
 526 *      function returns.
 527 *
 528 *      This call sleeps to guarantee that no CPU is looking at the packet
 529 *      type after return.
 530 */
 531void dev_remove_offload(struct packet_offload *po)
 532{
 533        __dev_remove_offload(po);
 534
 535        synchronize_net();
 536}
 537EXPORT_SYMBOL(dev_remove_offload);
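
/*
 * Illustrative sketch (editorial addition): registering a protocol offload
 * entry with dev_add_offload().  The insertion loop above keeps the list
 * sorted by ->priority, so entries with a lower numerical priority sit
 * earlier in offload_base.  The name example_offload and the priority value
 * are hypothetical, and the GRO/GSO callbacks are omitted for brevity.
 */
static struct packet_offload example_offload __read_mostly = {
        .type     = cpu_to_be16(ETH_P_IP),
        .priority = 10,
        /* .callbacks.gro_receive / .gro_complete / .gso_segment go here */
};

/* dev_add_offload(&example_offload); ... dev_remove_offload(&example_offload); */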
 538
 539/******************************************************************************
 540
 541                      Device Boot-time Settings Routines
 542
 543*******************************************************************************/
 544
 545/* Boot time configuration table */
 546static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 547
 548/**
 549 *      netdev_boot_setup_add   - add new setup entry
 550 *      @name: name of the device
 551 *      @map: configured settings for the device
 552 *
  553 *      Adds a new setup entry to the dev_boot_setup list.  The function
  554 *      returns 0 on error and 1 on success.  This is a generic routine for
  555 *      all netdevices.
 556 */
 557static int netdev_boot_setup_add(char *name, struct ifmap *map)
 558{
 559        struct netdev_boot_setup *s;
 560        int i;
 561
 562        s = dev_boot_setup;
 563        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 564                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 565                        memset(s[i].name, 0, sizeof(s[i].name));
 566                        strlcpy(s[i].name, name, IFNAMSIZ);
 567                        memcpy(&s[i].map, map, sizeof(s[i].map));
 568                        break;
 569                }
 570        }
 571
 572        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 573}
 574
 575/**
 576 *      netdev_boot_setup_check - check boot time settings
 577 *      @dev: the netdevice
 578 *
 579 *      Check boot time settings for the device.
 580 *      The found settings are set for the device to be used
 581 *      later in the device probing.
  582 *      Returns 0 if no settings are found, 1 if they are.
 583 */
 584int netdev_boot_setup_check(struct net_device *dev)
 585{
 586        struct netdev_boot_setup *s = dev_boot_setup;
 587        int i;
 588
 589        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 590                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 591                    !strcmp(dev->name, s[i].name)) {
 592                        dev->irq        = s[i].map.irq;
 593                        dev->base_addr  = s[i].map.base_addr;
 594                        dev->mem_start  = s[i].map.mem_start;
 595                        dev->mem_end    = s[i].map.mem_end;
 596                        return 1;
 597                }
 598        }
 599        return 0;
 600}
 601EXPORT_SYMBOL(netdev_boot_setup_check);
 602
 603
 604/**
 605 *      netdev_boot_base        - get address from boot time settings
 606 *      @prefix: prefix for network device
 607 *      @unit: id for network device
 608 *
  609 *      Check boot time settings for the base address of the device.
 610 *      The found settings are set for the device to be used
 611 *      later in the device probing.
 612 *      Returns 0 if no settings found.
 613 */
 614unsigned long netdev_boot_base(const char *prefix, int unit)
 615{
 616        const struct netdev_boot_setup *s = dev_boot_setup;
 617        char name[IFNAMSIZ];
 618        int i;
 619
 620        sprintf(name, "%s%d", prefix, unit);
 621
 622        /*
 623         * If device already registered then return base of 1
 624         * to indicate not to probe for this interface
 625         */
 626        if (__dev_get_by_name(&init_net, name))
 627                return 1;
 628
 629        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 630                if (!strcmp(name, s[i].name))
 631                        return s[i].map.base_addr;
 632        return 0;
 633}
 634
 635/*
 636 * Saves at boot time configured settings for any netdevice.
 637 */
 638int __init netdev_boot_setup(char *str)
 639{
 640        int ints[5];
 641        struct ifmap map;
 642
 643        str = get_options(str, ARRAY_SIZE(ints), ints);
 644        if (!str || !*str)
 645                return 0;
 646
 647        /* Save settings */
 648        memset(&map, 0, sizeof(map));
 649        if (ints[0] > 0)
 650                map.irq = ints[1];
 651        if (ints[0] > 1)
 652                map.base_addr = ints[2];
 653        if (ints[0] > 2)
 654                map.mem_start = ints[3];
 655        if (ints[0] > 3)
 656                map.mem_end = ints[4];
 657
 658        /* Add new entry to the list */
 659        return netdev_boot_setup_add(str, &map);
 660}
 661
 662__setup("netdev=", netdev_boot_setup);
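
/*
 * Editorial example of the boot parameter handled above (assuming the
 * documented "netdev=" format): a command line such as
 *
 *      netdev=5,0x340,0,0,eth1
 *
 * is split by get_options() into irq=5, base_addr=0x340, mem_start=0 and
 * mem_end=0, with the trailing string "eth1" used as the device name, so
 * netdev_boot_setup_check() can later apply these settings to that device.
 */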
 663
 664/*******************************************************************************
 665
 666                            Device Interface Subroutines
 667
 668*******************************************************************************/
 669
 670/**
  671 *      dev_get_iflink  - get 'iflink' value of an interface
 672 *      @dev: targeted interface
 673 *
 674 *      Indicates the ifindex the interface is linked to.
 675 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 676 */
 677
 678int dev_get_iflink(const struct net_device *dev)
 679{
 680        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 681                return dev->netdev_ops->ndo_get_iflink(dev);
 682
 683        return dev->ifindex;
 684}
 685EXPORT_SYMBOL(dev_get_iflink);
 686
 687/**
 688 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 689 *      @dev: targeted interface
 690 *      @skb: The packet.
 691 *
  692 *      For better visibility of tunnel traffic, OVS needs to retrieve
  693 *      egress tunnel information for a packet. The following API allows
  694 *      the user to get this info.
 695 */
 696int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 697{
 698        struct ip_tunnel_info *info;
 699
 700        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 701                return -EINVAL;
 702
 703        info = skb_tunnel_info_unclone(skb);
 704        if (!info)
 705                return -ENOMEM;
 706        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 707                return -EINVAL;
 708
 709        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 710}
 711EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 712
 713/**
 714 *      __dev_get_by_name       - find a device by its name
 715 *      @net: the applicable net namespace
 716 *      @name: name to find
 717 *
 718 *      Find an interface by name. Must be called under RTNL semaphore
 719 *      or @dev_base_lock. If the name is found a pointer to the device
 720 *      is returned. If the name is not found then %NULL is returned. The
 721 *      reference counters are not incremented so the caller must be
 722 *      careful with locks.
 723 */
 724
 725struct net_device *__dev_get_by_name(struct net *net, const char *name)
 726{
 727        struct net_device *dev;
 728        struct hlist_head *head = dev_name_hash(net, name);
 729
 730        hlist_for_each_entry(dev, head, name_hlist)
 731                if (!strncmp(dev->name, name, IFNAMSIZ))
 732                        return dev;
 733
 734        return NULL;
 735}
 736EXPORT_SYMBOL(__dev_get_by_name);
 737
 738/**
 739 *      dev_get_by_name_rcu     - find a device by its name
 740 *      @net: the applicable net namespace
 741 *      @name: name to find
 742 *
 743 *      Find an interface by name.
 744 *      If the name is found a pointer to the device is returned.
 745 *      If the name is not found then %NULL is returned.
 746 *      The reference counters are not incremented so the caller must be
 747 *      careful with locks. The caller must hold RCU lock.
 748 */
 749
 750struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 751{
 752        struct net_device *dev;
 753        struct hlist_head *head = dev_name_hash(net, name);
 754
 755        hlist_for_each_entry_rcu(dev, head, name_hlist)
 756                if (!strncmp(dev->name, name, IFNAMSIZ))
 757                        return dev;
 758
 759        return NULL;
 760}
 761EXPORT_SYMBOL(dev_get_by_name_rcu);
 762
 763/**
 764 *      dev_get_by_name         - find a device by its name
 765 *      @net: the applicable net namespace
 766 *      @name: name to find
 767 *
 768 *      Find an interface by name. This can be called from any
 769 *      context and does its own locking. The returned handle has
 770 *      the usage count incremented and the caller must use dev_put() to
 771 *      release it when it is no longer needed. %NULL is returned if no
 772 *      matching device is found.
 773 */
 774
 775struct net_device *dev_get_by_name(struct net *net, const char *name)
 776{
 777        struct net_device *dev;
 778
 779        rcu_read_lock();
 780        dev = dev_get_by_name_rcu(net, name);
 781        if (dev)
 782                dev_hold(dev);
 783        rcu_read_unlock();
 784        return dev;
 785}
 786EXPORT_SYMBOL(dev_get_by_name);
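
/*
 * Illustrative sketch (editorial addition): the refcounted lookup must be
 * paired with dev_put(), while the _rcu variant takes no reference and is
 * only valid inside an RCU read-side critical section.  The function name
 * example_name_lookups is hypothetical.
 */
static void example_name_lookups(struct net *net)
{
        struct net_device *dev;

        dev = dev_get_by_name(net, "eth0");     /* takes a reference */
        if (dev) {
                /* ... use dev ... */
                dev_put(dev);
        }

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, "eth0"); /* no reference taken */
        /* dev, if non-NULL, is only guaranteed to live until rcu_read_unlock() */
        rcu_read_unlock();
}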
 787
 788/**
 789 *      __dev_get_by_index - find a device by its ifindex
 790 *      @net: the applicable net namespace
 791 *      @ifindex: index of device
 792 *
 793 *      Search for an interface by index. Returns %NULL if the device
 794 *      is not found or a pointer to the device. The device has not
 795 *      had its reference counter increased so the caller must be careful
 796 *      about locking. The caller must hold either the RTNL semaphore
 797 *      or @dev_base_lock.
 798 */
 799
 800struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 801{
 802        struct net_device *dev;
 803        struct hlist_head *head = dev_index_hash(net, ifindex);
 804
 805        hlist_for_each_entry(dev, head, index_hlist)
 806                if (dev->ifindex == ifindex)
 807                        return dev;
 808
 809        return NULL;
 810}
 811EXPORT_SYMBOL(__dev_get_by_index);
 812
 813/**
 814 *      dev_get_by_index_rcu - find a device by its ifindex
 815 *      @net: the applicable net namespace
 816 *      @ifindex: index of device
 817 *
 818 *      Search for an interface by index. Returns %NULL if the device
 819 *      is not found or a pointer to the device. The device has not
 820 *      had its reference counter increased so the caller must be careful
 821 *      about locking. The caller must hold RCU lock.
 822 */
 823
 824struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 825{
 826        struct net_device *dev;
 827        struct hlist_head *head = dev_index_hash(net, ifindex);
 828
 829        hlist_for_each_entry_rcu(dev, head, index_hlist)
 830                if (dev->ifindex == ifindex)
 831                        return dev;
 832
 833        return NULL;
 834}
 835EXPORT_SYMBOL(dev_get_by_index_rcu);
 836
 837
 838/**
 839 *      dev_get_by_index - find a device by its ifindex
 840 *      @net: the applicable net namespace
 841 *      @ifindex: index of device
 842 *
 843 *      Search for an interface by index. Returns NULL if the device
 844 *      is not found or a pointer to the device. The device returned has
 845 *      had a reference added and the pointer is safe until the user calls
 846 *      dev_put to indicate they have finished with it.
 847 */
 848
 849struct net_device *dev_get_by_index(struct net *net, int ifindex)
 850{
 851        struct net_device *dev;
 852
 853        rcu_read_lock();
 854        dev = dev_get_by_index_rcu(net, ifindex);
 855        if (dev)
 856                dev_hold(dev);
 857        rcu_read_unlock();
 858        return dev;
 859}
 860EXPORT_SYMBOL(dev_get_by_index);
 861
 862/**
 863 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 864 *      @net: network namespace
 865 *      @name: a pointer to the buffer where the name will be stored.
 866 *      @ifindex: the ifindex of the interface to get the name from.
 867 *
 868 *      The use of raw_seqcount_begin() and cond_resched() before
 869 *      retrying is required as we want to give the writers a chance
 870 *      to complete when CONFIG_PREEMPT is not set.
 871 */
 872int netdev_get_name(struct net *net, char *name, int ifindex)
 873{
 874        struct net_device *dev;
 875        unsigned int seq;
 876
 877retry:
 878        seq = raw_seqcount_begin(&devnet_rename_seq);
 879        rcu_read_lock();
 880        dev = dev_get_by_index_rcu(net, ifindex);
 881        if (!dev) {
 882                rcu_read_unlock();
 883                return -ENODEV;
 884        }
 885
 886        strcpy(name, dev->name);
 887        rcu_read_unlock();
 888        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 889                cond_resched();
 890                goto retry;
 891        }
 892
 893        return 0;
 894}
 895
 896/**
 897 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 898 *      @net: the applicable net namespace
 899 *      @type: media type of device
 900 *      @ha: hardware address
 901 *
 902 *      Search for an interface by MAC address. Returns NULL if the device
 903 *      is not found or a pointer to the device.
 904 *      The caller must hold RCU or RTNL.
 905 *      The returned device has not had its ref count increased
  906 *      and the caller must therefore be careful about locking.
 907 *
 908 */
 909
 910struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 911                                       const char *ha)
 912{
 913        struct net_device *dev;
 914
 915        for_each_netdev_rcu(net, dev)
 916                if (dev->type == type &&
 917                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 918                        return dev;
 919
 920        return NULL;
 921}
 922EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 923
 924struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 925{
 926        struct net_device *dev;
 927
 928        ASSERT_RTNL();
 929        for_each_netdev(net, dev)
 930                if (dev->type == type)
 931                        return dev;
 932
 933        return NULL;
 934}
 935EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 936
 937struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 938{
 939        struct net_device *dev, *ret = NULL;
 940
 941        rcu_read_lock();
 942        for_each_netdev_rcu(net, dev)
 943                if (dev->type == type) {
 944                        dev_hold(dev);
 945                        ret = dev;
 946                        break;
 947                }
 948        rcu_read_unlock();
 949        return ret;
 950}
 951EXPORT_SYMBOL(dev_getfirstbyhwtype);
 952
 953/**
 954 *      __dev_get_by_flags - find any device with given flags
 955 *      @net: the applicable net namespace
 956 *      @if_flags: IFF_* values
 957 *      @mask: bitmask of bits in if_flags to check
 958 *
 959 *      Search for any interface with the given flags. Returns NULL if a device
 960 *      is not found or a pointer to the device. Must be called inside
 961 *      rtnl_lock(), and result refcount is unchanged.
 962 */
 963
 964struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 965                                      unsigned short mask)
 966{
 967        struct net_device *dev, *ret;
 968
 969        ASSERT_RTNL();
 970
 971        ret = NULL;
 972        for_each_netdev(net, dev) {
 973                if (((dev->flags ^ if_flags) & mask) == 0) {
 974                        ret = dev;
 975                        break;
 976                }
 977        }
 978        return ret;
 979}
 980EXPORT_SYMBOL(__dev_get_by_flags);
 981
 982/**
 983 *      dev_valid_name - check if name is okay for network device
 984 *      @name: name string
 985 *
  986 *      Network device names need to be valid file names
  987 *      to allow sysfs to work.  We also disallow any kind of
 988 *      whitespace.
 989 */
 990bool dev_valid_name(const char *name)
 991{
 992        if (*name == '\0')
 993                return false;
 994        if (strlen(name) >= IFNAMSIZ)
 995                return false;
 996        if (!strcmp(name, ".") || !strcmp(name, ".."))
 997                return false;
 998
 999        while (*name) {
1000                if (*name == '/' || *name == ':' || isspace(*name))
1001                        return false;
1002                name++;
1003        }
1004        return true;
1005}
1006EXPORT_SYMBOL(dev_valid_name);
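
/*
 * Editorial example: names such as "eth0" or "br-lan" pass the check above,
 * while "", ".", "..", names containing '/', ':' or whitespace, and names of
 * IFNAMSIZ (16) characters or more are rejected.
 */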
1007
1008/**
1009 *      __dev_alloc_name - allocate a name for a device
1010 *      @net: network namespace to allocate the device name in
1011 *      @name: name format string
1012 *      @buf:  scratch buffer and result name string
1013 *
 1014 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1015 *      id. It scans the list of devices to build up a free map, then chooses
1016 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1017 *      while allocating the name and adding the device in order to avoid
1018 *      duplicates.
1019 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1020 *      Returns the number of the unit assigned or a negative errno code.
1021 */
1022
1023static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1024{
1025        int i = 0;
1026        const char *p;
1027        const int max_netdevices = 8*PAGE_SIZE;
1028        unsigned long *inuse;
1029        struct net_device *d;
1030
1031        p = strnchr(name, IFNAMSIZ-1, '%');
1032        if (p) {
1033                /*
1034                 * Verify the string as this thing may have come from
1035                 * the user.  There must be either one "%d" and no other "%"
1036                 * characters.
1037                 */
1038                if (p[1] != 'd' || strchr(p + 2, '%'))
1039                        return -EINVAL;
1040
1041                /* Use one page as a bit array of possible slots */
1042                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1043                if (!inuse)
1044                        return -ENOMEM;
1045
1046                for_each_netdev(net, d) {
1047                        if (!sscanf(d->name, name, &i))
1048                                continue;
1049                        if (i < 0 || i >= max_netdevices)
1050                                continue;
1051
1052                        /*  avoid cases where sscanf is not exact inverse of printf */
1053                        snprintf(buf, IFNAMSIZ, name, i);
1054                        if (!strncmp(buf, d->name, IFNAMSIZ))
1055                                set_bit(i, inuse);
1056                }
1057
1058                i = find_first_zero_bit(inuse, max_netdevices);
1059                free_page((unsigned long) inuse);
1060        }
1061
1062        if (buf != name)
1063                snprintf(buf, IFNAMSIZ, name, i);
1064        if (!__dev_get_by_name(net, buf))
1065                return i;
1066
1067        /* It is possible to run out of possible slots
1068         * when the name is long and there isn't enough space left
1069         * for the digits, or if all bits are used.
1070         */
1071        return -ENFILE;
1072}
1073
1074/**
1075 *      dev_alloc_name - allocate a name for a device
1076 *      @dev: device
1077 *      @name: name format string
1078 *
 1079 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1080 *      id. It scans the list of devices to build up a free map, then chooses
1081 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1082 *      while allocating the name and adding the device in order to avoid
1083 *      duplicates.
1084 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085 *      Returns the number of the unit assigned or a negative errno code.
1086 */
1087
1088int dev_alloc_name(struct net_device *dev, const char *name)
1089{
1090        char buf[IFNAMSIZ];
1091        struct net *net;
1092        int ret;
1093
1094        BUG_ON(!dev_net(dev));
1095        net = dev_net(dev);
1096        ret = __dev_alloc_name(net, name, buf);
1097        if (ret >= 0)
1098                strlcpy(dev->name, buf, IFNAMSIZ);
1099        return ret;
1100}
1101EXPORT_SYMBOL(dev_alloc_name);
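
/*
 * Illustrative sketch (editorial addition): a driver lets the core pick the
 * unit number from a format string.  The function name example_pick_name is
 * hypothetical; the caller is assumed to hold RTNL as required above.
 */
static int example_pick_name(struct net_device *dev)
{
        int unit = dev_alloc_name(dev, "wlan%d");       /* e.g. yields "wlan0" */

        if (unit < 0)
                return unit;    /* -EINVAL, -ENFILE or -ENOMEM from __dev_alloc_name() */
        return 0;
}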
1102
1103static int dev_alloc_name_ns(struct net *net,
1104                             struct net_device *dev,
1105                             const char *name)
1106{
1107        char buf[IFNAMSIZ];
1108        int ret;
1109
1110        ret = __dev_alloc_name(net, name, buf);
1111        if (ret >= 0)
1112                strlcpy(dev->name, buf, IFNAMSIZ);
1113        return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117                              struct net_device *dev,
1118                              const char *name)
1119{
1120        BUG_ON(!net);
1121
1122        if (!dev_valid_name(name))
1123                return -EINVAL;
1124
1125        if (strchr(name, '%'))
1126                return dev_alloc_name_ns(net, dev, name);
1127        else if (__dev_get_by_name(net, name))
1128                return -EEXIST;
1129        else if (dev->name != name)
1130                strlcpy(dev->name, name, IFNAMSIZ);
1131
1132        return 0;
1133}
1134
1135/**
1136 *      dev_change_name - change name of a device
1137 *      @dev: device
1138 *      @newname: name (or format string) must be at least IFNAMSIZ
1139 *
 1140 *      Change the name of a device. A format string such as "eth%d"
 1141 *      can be passed for wildcarding.
1142 */
1143int dev_change_name(struct net_device *dev, const char *newname)
1144{
1145        unsigned char old_assign_type;
1146        char oldname[IFNAMSIZ];
1147        int err = 0;
1148        int ret;
1149        struct net *net;
1150
1151        ASSERT_RTNL();
1152        BUG_ON(!dev_net(dev));
1153
1154        net = dev_net(dev);
1155        if (dev->flags & IFF_UP)
1156                return -EBUSY;
1157
1158        write_seqcount_begin(&devnet_rename_seq);
1159
1160        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1161                write_seqcount_end(&devnet_rename_seq);
1162                return 0;
1163        }
1164
1165        memcpy(oldname, dev->name, IFNAMSIZ);
1166
1167        err = dev_get_valid_name(net, dev, newname);
1168        if (err < 0) {
1169                write_seqcount_end(&devnet_rename_seq);
1170                return err;
1171        }
1172
1173        if (oldname[0] && !strchr(oldname, '%'))
1174                netdev_info(dev, "renamed from %s\n", oldname);
1175
1176        old_assign_type = dev->name_assign_type;
1177        dev->name_assign_type = NET_NAME_RENAMED;
1178
1179rollback:
1180        ret = device_rename(&dev->dev, dev->name);
1181        if (ret) {
1182                memcpy(dev->name, oldname, IFNAMSIZ);
1183                dev->name_assign_type = old_assign_type;
1184                write_seqcount_end(&devnet_rename_seq);
1185                return ret;
1186        }
1187
1188        write_seqcount_end(&devnet_rename_seq);
1189
1190        netdev_adjacent_rename_links(dev, oldname);
1191
1192        write_lock_bh(&dev_base_lock);
1193        hlist_del_rcu(&dev->name_hlist);
1194        write_unlock_bh(&dev_base_lock);
1195
1196        synchronize_rcu();
1197
1198        write_lock_bh(&dev_base_lock);
1199        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1200        write_unlock_bh(&dev_base_lock);
1201
1202        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1203        ret = notifier_to_errno(ret);
1204
1205        if (ret) {
1206                /* err >= 0 after dev_alloc_name() or stores the first errno */
1207                if (err >= 0) {
1208                        err = ret;
1209                        write_seqcount_begin(&devnet_rename_seq);
1210                        memcpy(dev->name, oldname, IFNAMSIZ);
1211                        memcpy(oldname, newname, IFNAMSIZ);
1212                        dev->name_assign_type = old_assign_type;
1213                        old_assign_type = NET_NAME_RENAMED;
1214                        goto rollback;
1215                } else {
1216                        pr_err("%s: name change rollback failed: %d\n",
1217                               dev->name, ret);
1218                }
1219        }
1220
1221        return err;
1222}
1223
1224/**
1225 *      dev_set_alias - change ifalias of a device
1226 *      @dev: device
1227 *      @alias: name up to IFALIASZ
1228 *      @len: limit of bytes to copy from info
1229 *
 1230 *      Set the ifalias for a device.
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
1234        char *new_ifalias;
1235
1236        ASSERT_RTNL();
1237
1238        if (len >= IFALIASZ)
1239                return -EINVAL;
1240
1241        if (!len) {
1242                kfree(dev->ifalias);
1243                dev->ifalias = NULL;
1244                return 0;
1245        }
1246
1247        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248        if (!new_ifalias)
1249                return -ENOMEM;
1250        dev->ifalias = new_ifalias;
1251
1252        strlcpy(dev->ifalias, alias, len+1);
1253        return len;
1254}
1255
1256
1257/**
1258 *      netdev_features_change - device changes features
1259 *      @dev: device to cause notification
1260 *
1261 *      Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
1265        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
1270 *      netdev_state_change - device changes state
1271 *      @dev: device to cause notification
1272 *
1273 *      Called to indicate a device has changed state. This function calls
1274 *      the notifier chains for netdev_chain and sends a NEWLINK message
1275 *      to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279        if (dev->flags & IFF_UP) {
1280                struct netdev_notifier_change_info change_info;
1281
1282                change_info.flags_changed = 0;
1283                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284                                              &change_info.info);
1285                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286        }
1287}
1288EXPORT_SYMBOL(netdev_state_change);
1289
1290/**
1291 *      netdev_notify_peers - notify network peers about existence of @dev
1292 *      @dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
1301{
1302        rtnl_lock();
1303        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304        rtnl_unlock();
1305}
1306EXPORT_SYMBOL(netdev_notify_peers);
1307
1308static int __dev_open(struct net_device *dev)
1309{
1310        const struct net_device_ops *ops = dev->netdev_ops;
1311        int ret;
1312
1313        ASSERT_RTNL();
1314
1315        if (!netif_device_present(dev))
1316                return -ENODEV;
1317
1318        /* Block netpoll from trying to do any rx path servicing.
1319         * If we don't do this there is a chance ndo_poll_controller
1320         * or ndo_poll may be running while we open the device
1321         */
1322        netpoll_poll_disable(dev);
1323
1324        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325        ret = notifier_to_errno(ret);
1326        if (ret)
1327                return ret;
1328
1329        set_bit(__LINK_STATE_START, &dev->state);
1330
1331        if (ops->ndo_validate_addr)
1332                ret = ops->ndo_validate_addr(dev);
1333
1334        if (!ret && ops->ndo_open)
1335                ret = ops->ndo_open(dev);
1336
1337        netpoll_poll_enable(dev);
1338
1339        if (ret)
1340                clear_bit(__LINK_STATE_START, &dev->state);
1341        else {
1342                dev->flags |= IFF_UP;
1343                dev_set_rx_mode(dev);
1344                dev_activate(dev);
1345                add_device_randomness(dev->dev_addr, dev->addr_len);
1346        }
1347
1348        return ret;
1349}
1350
1351/**
1352 *      dev_open        - prepare an interface for use.
1353 *      @dev:   device to open
1354 *
1355 *      Takes a device from down to up state. The device's private open
1356 *      function is invoked and then the multicast lists are loaded. Finally
1357 *      the device is moved into the up state and a %NETDEV_UP message is
1358 *      sent to the netdev notifier chain.
1359 *
1360 *      Calling this function on an active interface is a nop. On a failure
1361 *      a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365        int ret;
1366
1367        if (dev->flags & IFF_UP)
1368                return 0;
1369
1370        ret = __dev_open(dev);
1371        if (ret < 0)
1372                return ret;
1373
1374        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375        call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377        return ret;
1378}
1379EXPORT_SYMBOL(dev_open);
1380
1381static int __dev_close_many(struct list_head *head)
1382{
1383        struct net_device *dev;
1384
1385        ASSERT_RTNL();
1386        might_sleep();
1387
1388        list_for_each_entry(dev, head, close_list) {
1389                /* Temporarily disable netpoll until the interface is down */
1390                netpoll_poll_disable(dev);
1391
1392                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1393
1394                clear_bit(__LINK_STATE_START, &dev->state);
1395
 1396                /* Synchronize to scheduled poll. We cannot touch the poll list;
 1397                 * it can even be on a different cpu. So just clear netif_running().
 1398                 *
 1399                 * dev->stop() will invoke napi_disable() on all of its
 1400                 * napi_struct instances on this device.
1401                 */
1402                smp_mb__after_atomic(); /* Commit netif_running(). */
1403        }
1404
1405        dev_deactivate_many(head);
1406
1407        list_for_each_entry(dev, head, close_list) {
1408                const struct net_device_ops *ops = dev->netdev_ops;
1409
1410                /*
 1411                 *      Call the device specific close. This cannot fail.
 1412                 *      It is only done if the device is UP.
1413                 *
1414                 *      We allow it to be called even after a DETACH hot-plug
1415                 *      event.
1416                 */
1417                if (ops->ndo_stop)
1418                        ops->ndo_stop(dev);
1419
1420                dev->flags &= ~IFF_UP;
1421                netpoll_poll_enable(dev);
1422        }
1423
1424        return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
1429        int retval;
1430        LIST_HEAD(single);
1431
1432        list_add(&dev->close_list, &single);
1433        retval = __dev_close_many(&single);
1434        list_del(&single);
1435
1436        return retval;
1437}
1438
1439int dev_close_many(struct list_head *head, bool unlink)
1440{
1441        struct net_device *dev, *tmp;
1442
1443        /* Remove the devices that don't need to be closed */
1444        list_for_each_entry_safe(dev, tmp, head, close_list)
1445                if (!(dev->flags & IFF_UP))
1446                        list_del_init(&dev->close_list);
1447
1448        __dev_close_many(head);
1449
1450        list_for_each_entry_safe(dev, tmp, head, close_list) {
1451                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452                call_netdevice_notifiers(NETDEV_DOWN, dev);
1453                if (unlink)
1454                        list_del_init(&dev->close_list);
1455        }
1456
1457        return 0;
1458}
1459EXPORT_SYMBOL(dev_close_many);
1460
1461/**
1462 *      dev_close - shutdown an interface.
1463 *      @dev: device to shutdown
1464 *
1465 *      This function moves an active device into down state. A
1466 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 *      chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
1472        if (dev->flags & IFF_UP) {
1473                LIST_HEAD(single);
1474
1475                list_add(&dev->close_list, &single);
1476                dev_close_many(&single, true);
1477                list_del(&single);
1478        }
1479        return 0;
1480}
1481EXPORT_SYMBOL(dev_close);
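
/*
 * Illustrative sketch (editorial addition): dev_open() and dev_close() rely
 * on the caller holding the RTNL semaphore (see the ASSERT_RTNL() checks in
 * __dev_open() and __dev_close_many() above).  The function name
 * example_bounce_interface is hypothetical.
 */
static int example_bounce_interface(struct net_device *dev)
{
        int err;

        rtnl_lock();
        dev_close(dev);         /* no-op if the device is already down */
        err = dev_open(dev);    /* returns 0 immediately if already up */
        rtnl_unlock();

        return err;
}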
1482
1483
1484/**
1485 *      dev_disable_lro - disable Large Receive Offload on a device
1486 *      @dev: device
1487 *
1488 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1489 *      called under RTNL.  This is needed if received packets may be
1490 *      forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
1494        struct net_device *lower_dev;
1495        struct list_head *iter;
1496
1497        dev->wanted_features &= ~NETIF_F_LRO;
1498        netdev_update_features(dev);
1499
1500        if (unlikely(dev->features & NETIF_F_LRO))
1501                netdev_WARN(dev, "failed to disable LRO!\n");
1502
1503        netdev_for_each_lower_dev(dev, lower_dev, iter)
1504                dev_disable_lro(lower_dev);
1505}
1506EXPORT_SYMBOL(dev_disable_lro);
1507
1508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509                                   struct net_device *dev)
1510{
1511        struct netdev_notifier_info info;
1512
1513        netdev_notifier_info_init(&info, dev);
1514        return nb->notifier_call(nb, val, &info);
1515}
1516
1517static int dev_boot_phase = 1;
1518
1519/**
1520 *      register_netdevice_notifier - register a network notifier block
1521 *      @nb: notifier
1522 *
1523 *      Register a notifier to be called when network device events occur.
1524 *      The notifier passed is linked into the kernel structures and must
1525 *      not be reused until it has been unregistered. A negative errno code
1526 *      is returned on a failure.
1527 *
 1528 *      When registered, all registration and up events are replayed
 1529 *      to the new notifier to allow it to have a race-free
 1530 *      view of the network device list.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535        struct net_device *dev;
1536        struct net_device *last;
1537        struct net *net;
1538        int err;
1539
1540        rtnl_lock();
1541        err = raw_notifier_chain_register(&netdev_chain, nb);
1542        if (err)
1543                goto unlock;
1544        if (dev_boot_phase)
1545                goto unlock;
1546        for_each_net(net) {
1547                for_each_netdev(net, dev) {
1548                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1549                        err = notifier_to_errno(err);
1550                        if (err)
1551                                goto rollback;
1552
1553                        if (!(dev->flags & IFF_UP))
1554                                continue;
1555
1556                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1557                }
1558        }
1559
1560unlock:
1561        rtnl_unlock();
1562        return err;
1563
1564rollback:
1565        last = dev;
1566        for_each_net(net) {
1567                for_each_netdev(net, dev) {
1568                        if (dev == last)
1569                                goto outroll;
1570
1571                        if (dev->flags & IFF_UP) {
1572                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573                                                        dev);
1574                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1575                        }
1576                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1577                }
1578        }
1579
1580outroll:
1581        raw_notifier_chain_unregister(&netdev_chain, nb);
1582        goto unlock;
1583}
1584EXPORT_SYMBOL(register_netdevice_notifier);
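/* Example (illustrative sketch, not part of this file): a hypothetical
 * module reacting to interfaces coming up. The names "foo_netdev_event"
 * and "foo_netdev_nb" are made up for the example.
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block foo_netdev_nb = {
 *		.notifier_call = foo_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&foo_netdev_nb);
 */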
1585
1586/**
1587 *      unregister_netdevice_notifier - unregister a network notifier block
1588 *      @nb: notifier
1589 *
1590 *      Unregister a notifier previously registered by
1591 *      register_netdevice_notifier(). The notifier is unlinked from the
1592 *      kernel structures and may then be reused. A negative errno code
1593 *      is returned on a failure.
1594 *
1595 *      After unregistering, unregister and down device events are synthesized
1596 *      for all devices on the device list and sent to the removed notifier,
1597 *      removing the need for special-case cleanup code.
1598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
1602        struct net_device *dev;
1603        struct net *net;
1604        int err;
1605
1606        rtnl_lock();
1607        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608        if (err)
1609                goto unlock;
1610
1611        for_each_net(net) {
1612                for_each_netdev(net, dev) {
1613                        if (dev->flags & IFF_UP) {
1614                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615                                                        dev);
1616                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617                        }
1618                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619                }
1620        }
1621unlock:
1622        rtnl_unlock();
1623        return err;
1624}
1625EXPORT_SYMBOL(unregister_netdevice_notifier);
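/* Example (illustrative sketch): the module above would undo its
 * registration on exit, after which "foo_netdev_nb" may be reused or freed.
 *
 *	unregister_netdevice_notifier(&foo_netdev_nb);
 */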
1626
1627/**
1628 *      call_netdevice_notifiers_info - call all network notifier blocks
1629 *      @val: value passed unmodified to notifier function
1630 *      @dev: net_device pointer passed unmodified to notifier function
1631 *      @info: notifier information data
1632 *
1633 *      Call all network notifier blocks.  Parameters and return value
1634 *      are as for raw_notifier_call_chain().
1635 */
1636
1637static int call_netdevice_notifiers_info(unsigned long val,
1638                                         struct net_device *dev,
1639                                         struct netdev_notifier_info *info)
1640{
1641        ASSERT_RTNL();
1642        netdev_notifier_info_init(info, dev);
1643        return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
1645
1646/**
1647 *      call_netdevice_notifiers - call all network notifier blocks
1648 *      @val: value passed unmodified to notifier function
1649 *      @dev: net_device pointer passed unmodified to notifier function
1650 *
1651 *      Call all network notifier blocks.  Parameters and return value
1652 *      are as for raw_notifier_call_chain().
1653 */
1654
1655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1656{
1657        struct netdev_notifier_info info;
1658
1659        return call_netdevice_notifiers_info(val, dev, &info);
1660}
1661EXPORT_SYMBOL(call_netdevice_notifiers);
1662
1663#ifdef CONFIG_NET_INGRESS
1664static struct static_key ingress_needed __read_mostly;
1665
1666void net_inc_ingress_queue(void)
1667{
1668        static_key_slow_inc(&ingress_needed);
1669}
1670EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672void net_dec_ingress_queue(void)
1673{
1674        static_key_slow_dec(&ingress_needed);
1675}
1676EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677#endif
1678
1679#ifdef CONFIG_NET_EGRESS
1680static struct static_key egress_needed __read_mostly;
1681
1682void net_inc_egress_queue(void)
1683{
1684        static_key_slow_inc(&egress_needed);
1685}
1686EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1687
1688void net_dec_egress_queue(void)
1689{
1690        static_key_slow_dec(&egress_needed);
1691}
1692EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1693#endif
1694
1695static struct static_key netstamp_needed __read_mostly;
1696#ifdef HAVE_JUMP_LABEL
1697/* We are not allowed to call static_key_slow_dec() from irq context.
1698 * If net_disable_timestamp() is called from irq context, defer the
1699 * static_key_slow_dec() calls.
1700 */
1701static atomic_t netstamp_needed_deferred;
1702#endif
1703
1704void net_enable_timestamp(void)
1705{
1706#ifdef HAVE_JUMP_LABEL
1707        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708
1709        if (deferred) {
1710                while (--deferred)
1711                        static_key_slow_dec(&netstamp_needed);
1712                return;
1713        }
1714#endif
1715        static_key_slow_inc(&netstamp_needed);
1716}
1717EXPORT_SYMBOL(net_enable_timestamp);
1718
1719void net_disable_timestamp(void)
1720{
1721#ifdef HAVE_JUMP_LABEL
1722        if (in_interrupt()) {
1723                atomic_inc(&netstamp_needed_deferred);
1724                return;
1725        }
1726#endif
1727        static_key_slow_dec(&netstamp_needed);
1728}
1729EXPORT_SYMBOL(net_disable_timestamp);
1730
1731static inline void net_timestamp_set(struct sk_buff *skb)
1732{
1733        skb->tstamp.tv64 = 0;
1734        if (static_key_false(&netstamp_needed))
1735                __net_timestamp(skb);
1736}
1737
1738#define net_timestamp_check(COND, SKB)                  \
1739        if (static_key_false(&netstamp_needed)) {               \
1740                if ((COND) && !(SKB)->tstamp.tv64)      \
1741                        __net_timestamp(SKB);           \
1742        }                                               \
1743
1744bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1745{
1746        unsigned int len;
1747
1748        if (!(dev->flags & IFF_UP))
1749                return false;
1750
1751        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1752        if (skb->len <= len)
1753                return true;
1754
1755        /* if TSO is enabled, we don't care about the length, as the packet
1756         * could be forwarded without being segmented first
1757         */
1758        if (skb_is_gso(skb))
1759                return true;
1760
1761        return false;
1762}
1763EXPORT_SYMBOL_GPL(is_skb_forwardable);
1764
1765int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1766{
1767        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1768            unlikely(!is_skb_forwardable(dev, skb))) {
1769                atomic_long_inc(&dev->rx_dropped);
1770                kfree_skb(skb);
1771                return NET_RX_DROP;
1772        }
1773
1774        skb_scrub_packet(skb, true);
1775        skb->priority = 0;
1776        skb->protocol = eth_type_trans(skb, dev);
1777        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1778
1779        return 0;
1780}
1781EXPORT_SYMBOL_GPL(__dev_forward_skb);
1782
1783/**
1784 * dev_forward_skb - loopback an skb to another netif
1785 *
1786 * @dev: destination network device
1787 * @skb: buffer to forward
1788 *
1789 * return values:
1790 *      NET_RX_SUCCESS  (no congestion)
1791 *      NET_RX_DROP     (packet was dropped, but freed)
1792 *
1793 * dev_forward_skb can be used for injecting an skb from the
1794 * start_xmit function of one device into the receive queue
1795 * of another device.
1796 *
1797 * The receiving device may be in another namespace, so
1798 * we have to clear all information in the skb that could
1799 * impact namespace isolation.
1800 */
1801int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1802{
1803        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1804}
1805EXPORT_SYMBOL_GPL(dev_forward_skb);
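/* Example (illustrative sketch, not from this file): a veth-like driver
 * could hand frames from its ndo_start_xmit over to a peer device. The
 * names "foo_xmit" and "foo_get_peer" are assumptions for the example.
 *
 *	static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = foo_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */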
1806
1807static inline int deliver_skb(struct sk_buff *skb,
1808                              struct packet_type *pt_prev,
1809                              struct net_device *orig_dev)
1810{
1811        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1812                return -ENOMEM;
1813        atomic_inc(&skb->users);
1814        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1815}
1816
1817static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1818                                          struct packet_type **pt,
1819                                          struct net_device *orig_dev,
1820                                          __be16 type,
1821                                          struct list_head *ptype_list)
1822{
1823        struct packet_type *ptype, *pt_prev = *pt;
1824
1825        list_for_each_entry_rcu(ptype, ptype_list, list) {
1826                if (ptype->type != type)
1827                        continue;
1828                if (pt_prev)
1829                        deliver_skb(skb, pt_prev, orig_dev);
1830                pt_prev = ptype;
1831        }
1832        *pt = pt_prev;
1833}
1834
1835static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1836{
1837        if (!ptype->af_packet_priv || !skb->sk)
1838                return false;
1839
1840        if (ptype->id_match)
1841                return ptype->id_match(ptype, skb->sk);
1842        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1843                return true;
1844
1845        return false;
1846}
1847
1848/*
1849 *      Support routine. Sends outgoing frames to any network
1850 *      taps currently in use.
1851 */
1852
1853void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1854{
1855        struct packet_type *ptype;
1856        struct sk_buff *skb2 = NULL;
1857        struct packet_type *pt_prev = NULL;
1858        struct list_head *ptype_list = &ptype_all;
1859
1860        rcu_read_lock();
1861again:
1862        list_for_each_entry_rcu(ptype, ptype_list, list) {
1863                /* Never send packets back to the socket
1864                 * they originated from - MvS (miquels@drinkel.ow.org)
1865                 */
1866                if (skb_loop_sk(ptype, skb))
1867                        continue;
1868
1869                if (pt_prev) {
1870                        deliver_skb(skb2, pt_prev, skb->dev);
1871                        pt_prev = ptype;
1872                        continue;
1873                }
1874
1875                /* need to clone skb, done only once */
1876                skb2 = skb_clone(skb, GFP_ATOMIC);
1877                if (!skb2)
1878                        goto out_unlock;
1879
1880                net_timestamp_set(skb2);
1881
1882                /* The network header should already be correctly
1883                 * set by the sender, so the check below is
1884                 * just protection against buggy protocols.
1885                 */
1886                skb_reset_mac_header(skb2);
1887
1888                if (skb_network_header(skb2) < skb2->data ||
1889                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1890                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1891                                             ntohs(skb2->protocol),
1892                                             dev->name);
1893                        skb_reset_network_header(skb2);
1894                }
1895
1896                skb2->transport_header = skb2->network_header;
1897                skb2->pkt_type = PACKET_OUTGOING;
1898                pt_prev = ptype;
1899        }
1900
1901        if (ptype_list == &ptype_all) {
1902                ptype_list = &dev->ptype_all;
1903                goto again;
1904        }
1905out_unlock:
1906        if (pt_prev)
1907                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1908        rcu_read_unlock();
1909}
1910EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1911
1912/**
1913 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1914 * @dev: Network device
1915 * @txq: number of queues available
1916 *
1917 * If real_num_tx_queues is changed the tc mappings may no longer be
1918 * valid. To resolve this verify that each tc mapping remains valid and,
1919 * if not, reset the mapping to TC0. With no priorities mapping to an
1920 * offset/count pair it will no longer be used. In the worst case, TC0
1921 * itself is invalid and nothing can be done, so priority mappings are
1922 * disabled entirely. It is expected that drivers will fix this mapping
1923 * if they can before calling netif_set_real_num_tx_queues.
1924 */
1925static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1926{
1927        int i;
1928        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1929
1930        /* If TC0 is invalidated disable TC mapping */
1931        if (tc->offset + tc->count > txq) {
1932                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1933                dev->num_tc = 0;
1934                return;
1935        }
1936
1937        /* Reset invalidated prio-to-tc mappings to TC0 */
1938        for (i = 1; i < TC_BITMASK + 1; i++) {
1939                int q = netdev_get_prio_tc_map(dev, i);
1940
1941                tc = &dev->tc_to_txq[q];
1942                if (tc->offset + tc->count > txq) {
1943                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1944                                i, q);
1945                        netdev_set_prio_tc_map(dev, i, 0);
1946                }
1947        }
1948}
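/* Example (illustrative sketch, not from this file): the mappings above are
 * normally set up by a driver before it changes real_num_tx_queues, e.g.
 * two traffic classes of four queues each, with priority 5 steered to TC1.
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);
 *	netdev_set_tc_queue(dev, 1, 4, 4);
 *	netdev_set_prio_tc_map(dev, 5, 1);
 */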
1949
1950#ifdef CONFIG_XPS
1951static DEFINE_MUTEX(xps_map_mutex);
1952#define xmap_dereference(P)             \
1953        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1954
1955static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1956                                        int cpu, u16 index)
1957{
1958        struct xps_map *map = NULL;
1959        int pos;
1960
1961        if (dev_maps)
1962                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1963
1964        for (pos = 0; map && pos < map->len; pos++) {
1965                if (map->queues[pos] == index) {
1966                        if (map->len > 1) {
1967                                map->queues[pos] = map->queues[--map->len];
1968                        } else {
1969                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1970                                kfree_rcu(map, rcu);
1971                                map = NULL;
1972                        }
1973                        break;
1974                }
1975        }
1976
1977        return map;
1978}
1979
1980static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1981{
1982        struct xps_dev_maps *dev_maps;
1983        int cpu, i;
1984        bool active = false;
1985
1986        mutex_lock(&xps_map_mutex);
1987        dev_maps = xmap_dereference(dev->xps_maps);
1988
1989        if (!dev_maps)
1990                goto out_no_maps;
1991
1992        for_each_possible_cpu(cpu) {
1993                for (i = index; i < dev->num_tx_queues; i++) {
1994                        if (!remove_xps_queue(dev_maps, cpu, i))
1995                                break;
1996                }
1997                if (i == dev->num_tx_queues)
1998                        active = true;
1999        }
2000
2001        if (!active) {
2002                RCU_INIT_POINTER(dev->xps_maps, NULL);
2003                kfree_rcu(dev_maps, rcu);
2004        }
2005
2006        for (i = index; i < dev->num_tx_queues; i++)
2007                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2008                                             NUMA_NO_NODE);
2009
2010out_no_maps:
2011        mutex_unlock(&xps_map_mutex);
2012}
2013
2014static struct xps_map *expand_xps_map(struct xps_map *map,
2015                                      int cpu, u16 index)
2016{
2017        struct xps_map *new_map;
2018        int alloc_len = XPS_MIN_MAP_ALLOC;
2019        int i, pos;
2020
2021        for (pos = 0; map && pos < map->len; pos++) {
2022                if (map->queues[pos] != index)
2023                        continue;
2024                return map;
2025        }
2026
2027        /* Need to add queue to this CPU's existing map */
2028        if (map) {
2029                if (pos < map->alloc_len)
2030                        return map;
2031
2032                alloc_len = map->alloc_len * 2;
2033        }
2034
2035        /* Need to allocate a new map to store the queue for this CPU */
2036        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2037                               cpu_to_node(cpu));
2038        if (!new_map)
2039                return NULL;
2040
2041        for (i = 0; i < pos; i++)
2042                new_map->queues[i] = map->queues[i];
2043        new_map->alloc_len = alloc_len;
2044        new_map->len = pos;
2045
2046        return new_map;
2047}
2048
2049int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2050                        u16 index)
2051{
2052        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2053        struct xps_map *map, *new_map;
2054        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2055        int cpu, numa_node_id = -2;
2056        bool active = false;
2057
2058        mutex_lock(&xps_map_mutex);
2059
2060        dev_maps = xmap_dereference(dev->xps_maps);
2061
2062        /* allocate memory for queue storage */
2063        for_each_online_cpu(cpu) {
2064                if (!cpumask_test_cpu(cpu, mask))
2065                        continue;
2066
2067                if (!new_dev_maps)
2068                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2069                if (!new_dev_maps) {
2070                        mutex_unlock(&xps_map_mutex);
2071                        return -ENOMEM;
2072                }
2073
2074                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2075                                 NULL;
2076
2077                map = expand_xps_map(map, cpu, index);
2078                if (!map)
2079                        goto error;
2080
2081                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2082        }
2083
2084        if (!new_dev_maps)
2085                goto out_no_new_maps;
2086
2087        for_each_possible_cpu(cpu) {
2088                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2089                        /* add queue to CPU maps */
2090                        int pos = 0;
2091
2092                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2093                        while ((pos < map->len) && (map->queues[pos] != index))
2094                                pos++;
2095
2096                        if (pos == map->len)
2097                                map->queues[map->len++] = index;
2098#ifdef CONFIG_NUMA
2099                        if (numa_node_id == -2)
2100                                numa_node_id = cpu_to_node(cpu);
2101                        else if (numa_node_id != cpu_to_node(cpu))
2102                                numa_node_id = -1;
2103#endif
2104                } else if (dev_maps) {
2105                        /* fill in the new device map from the old device map */
2106                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2107                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2108                }
2109
2110        }
2111
2112        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2113
2114        /* Cleanup old maps */
2115        if (dev_maps) {
2116                for_each_possible_cpu(cpu) {
2117                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2118                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2119                        if (map && map != new_map)
2120                                kfree_rcu(map, rcu);
2121                }
2122
2123                kfree_rcu(dev_maps, rcu);
2124        }
2125
2126        dev_maps = new_dev_maps;
2127        active = true;
2128
2129out_no_new_maps:
2130        /* update Tx queue numa node */
2131        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2132                                     (numa_node_id >= 0) ? numa_node_id :
2133                                     NUMA_NO_NODE);
2134
2135        if (!dev_maps)
2136                goto out_no_maps;
2137
2138        /* remove the queue from CPUs no longer in the mask */
2139        for_each_possible_cpu(cpu) {
2140                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2141                        continue;
2142
2143                if (remove_xps_queue(dev_maps, cpu, index))
2144                        active = true;
2145        }
2146
2147        /* free map if not active */
2148        if (!active) {
2149                RCU_INIT_POINTER(dev->xps_maps, NULL);
2150                kfree_rcu(dev_maps, rcu);
2151        }
2152
2153out_no_maps:
2154        mutex_unlock(&xps_map_mutex);
2155
2156        return 0;
2157error:
2158        /* remove any maps that we added */
2159        for_each_possible_cpu(cpu) {
2160                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2161                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2162                                 NULL;
2163                if (new_map && new_map != map)
2164                        kfree(new_map);
2165        }
2166
2167        mutex_unlock(&xps_map_mutex);
2168
2169        kfree(new_dev_maps);
2170        return -ENOMEM;
2171}
2172EXPORT_SYMBOL(netif_set_xps_queue);
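/* Example (illustrative sketch, not from this file): a multiqueue driver
 * could pin each transmit queue to one CPU, assuming one queue per online
 * CPU and that the caller already holds the appropriate locks.
 *
 *	int cpu, err;
 *
 *	for_each_online_cpu(cpu) {
 *		err = netif_set_xps_queue(dev, cpumask_of(cpu), cpu);
 *		if (err)
 *			break;
 *	}
 */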
2173
2174#endif
2175/*
2176 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2177 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2178 */
2179int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2180{
2181        int rc;
2182
2183        if (txq < 1 || txq > dev->num_tx_queues)
2184                return -EINVAL;
2185
2186        if (dev->reg_state == NETREG_REGISTERED ||
2187            dev->reg_state == NETREG_UNREGISTERING) {
2188                ASSERT_RTNL();
2189
2190                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2191                                                  txq);
2192                if (rc)
2193                        return rc;
2194
2195                if (dev->num_tc)
2196                        netif_setup_tc(dev, txq);
2197
2198                if (txq < dev->real_num_tx_queues) {
2199                        qdisc_reset_all_tx_gt(dev, txq);
2200#ifdef CONFIG_XPS
2201                        netif_reset_xps_queues_gt(dev, txq);
2202#endif
2203                }
2204        }
2205
2206        dev->real_num_tx_queues = txq;
2207        return 0;
2208}
2209EXPORT_SYMBOL(netif_set_real_num_tx_queues);
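/* Example (illustrative sketch, not from this file): a driver shrinking the
 * number of active transmit queues on a registered device must hold RTNL;
 * "active_queues" is an assumed value not larger than dev->num_tx_queues.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, active_queues);
 *	rtnl_unlock();
 */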
2210
2211#ifdef CONFIG_SYSFS
2212/**
2213 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2214 *      @dev: Network device
2215 *      @rxq: Actual number of RX queues
2216 *
2217 *      This must be called either with the rtnl_lock held or before
2218 *      registration of the net device.  Returns 0 on success, or a
2219 *      negative error code.  If called before registration, it always
2220 *      succeeds.
2221 */
2222int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2223{
2224        int rc;
2225
2226        if (rxq < 1 || rxq > dev->num_rx_queues)
2227                return -EINVAL;
2228
2229        if (dev->reg_state == NETREG_REGISTERED) {
2230                ASSERT_RTNL();
2231
2232                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2233                                                  rxq);
2234                if (rc)
2235                        return rc;
2236        }
2237
2238        dev->real_num_rx_queues = rxq;
2239        return 0;
2240}
2241EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2242#endif
2243
2244/**
2245 * netif_get_num_default_rss_queues - default number of RSS queues
2246 *
2247 * This routine should set an upper limit on the number of RSS queues
2248 * used by default by multiqueue devices.
2249 */
2250int netif_get_num_default_rss_queues(void)
2251{
2252        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2253}
2254EXPORT_SYMBOL(netif_get_num_default_rss_queues);
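/* Example (illustrative sketch, not from this file): during probe, before
 * register_netdev(), a driver could size its RX side from this default;
 * "hw_max_queues" is an assumed hardware limit.
 *
 *	unsigned int n = min_t(unsigned int, hw_max_queues,
 *			       netif_get_num_default_rss_queues());
 *
 *	err = netif_set_real_num_rx_queues(dev, n);
 */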
2255
2256static inline void __netif_reschedule(struct Qdisc *q)
2257{
2258        struct softnet_data *sd;
2259        unsigned long flags;
2260
2261        local_irq_save(flags);
2262        sd = this_cpu_ptr(&softnet_data);
2263        q->next_sched = NULL;
2264        *sd->output_queue_tailp = q;
2265        sd->output_queue_tailp = &q->next_sched;
2266        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2267        local_irq_restore(flags);
2268}
2269
2270void __netif_schedule(struct Qdisc *q)
2271{
2272        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2273                __netif_reschedule(q);
2274}
2275EXPORT_SYMBOL(__netif_schedule);
2276
2277struct dev_kfree_skb_cb {
2278        enum skb_free_reason reason;
2279};
2280
2281static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2282{
2283        return (struct dev_kfree_skb_cb *)skb->cb;
2284}
2285
2286void netif_schedule_queue(struct netdev_queue *txq)
2287{
2288        rcu_read_lock();
2289        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2290                struct Qdisc *q = rcu_dereference(txq->qdisc);
2291
2292                __netif_schedule(q);
2293        }
2294        rcu_read_unlock();
2295}
2296EXPORT_SYMBOL(netif_schedule_queue);
2297
2298/**
2299 *      netif_wake_subqueue - allow sending packets on subqueue
2300 *      @dev: network device
2301 *      @queue_index: sub queue index
2302 *
2303 * Resume individual transmit queue of a device with multiple transmit queues.
2304 */
2305void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2306{
2307        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2308
2309        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2310                struct Qdisc *q;
2311
2312                rcu_read_lock();
2313                q = rcu_dereference(txq->qdisc);
2314                __netif_schedule(q);
2315                rcu_read_unlock();
2316        }
2317}
2318EXPORT_SYMBOL(netif_wake_subqueue);
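/* Example (illustrative sketch, not from this file): a TX completion
 * handler typically re-enables a queue it had stopped once descriptors are
 * available again; "qid" and "foo_tx_ring_has_room" are assumed names.
 *
 *	if (__netif_subqueue_stopped(dev, qid) && foo_tx_ring_has_room(ring))
 *		netif_wake_subqueue(dev, qid);
 */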
2319
2320void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2321{
2322        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2323                struct Qdisc *q;
2324
2325                rcu_read_lock();
2326                q = rcu_dereference(dev_queue->qdisc);
2327                __netif_schedule(q);
2328                rcu_read_unlock();
2329        }
2330}
2331EXPORT_SYMBOL(netif_tx_wake_queue);
2332
2333void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2334{
2335        unsigned long flags;
2336
2337        if (likely(atomic_read(&skb->users) == 1)) {
2338                smp_rmb();
2339                atomic_set(&skb->users, 0);
2340        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2341                return;
2342        }
2343        get_kfree_skb_cb(skb)->reason = reason;
2344        local_irq_save(flags);
2345        skb->next = __this_cpu_read(softnet_data.completion_queue);
2346        __this_cpu_write(softnet_data.completion_queue, skb);
2347        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2348        local_irq_restore(flags);
2349}
2350EXPORT_SYMBOL(__dev_kfree_skb_irq);
2351
2352void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2353{
2354        if (in_irq() || irqs_disabled())
2355                __dev_kfree_skb_irq(skb, reason);
2356        else
2357                dev_kfree_skb(skb);
2358}
2359EXPORT_SYMBOL(__dev_kfree_skb_any);
2360
2361
2362/**
2363 * netif_device_detach - mark device as removed
2364 * @dev: network device
2365 *
2366 * Mark device as removed from system and therefore no longer available.
2367 */
2368void netif_device_detach(struct net_device *dev)
2369{
2370        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2371            netif_running(dev)) {
2372                netif_tx_stop_all_queues(dev);
2373        }
2374}
2375EXPORT_SYMBOL(netif_device_detach);
2376
2377/**
2378 * netif_device_attach - mark device as attached
2379 * @dev: network device
2380 *
2381 * Mark device as attached to the system and restart it if needed.
2382 */
2383void netif_device_attach(struct net_device *dev)
2384{
2385        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2386            netif_running(dev)) {
2387                netif_tx_wake_all_queues(dev);
2388                __netdev_watchdog_up(dev);
2389        }
2390}
2391EXPORT_SYMBOL(netif_device_attach);
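/* Example (illustrative sketch, not from this file): suspend/resume
 * callbacks commonly bracket the power transition with detach/attach so the
 * stack stops handing the driver packets while the hardware is away.
 *
 *	static int foo_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */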
2392
2393/*
2394 * Returns a Tx hash based on the given packet descriptor and a Tx queue
2395 * count to be used as a distribution range.
2396 */
2397u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2398                  unsigned int num_tx_queues)
2399{
2400        u32 hash;
2401        u16 qoffset = 0;
2402        u16 qcount = num_tx_queues;
2403
2404        if (skb_rx_queue_recorded(skb)) {
2405                hash = skb_get_rx_queue(skb);
2406                while (unlikely(hash >= num_tx_queues))
2407                        hash -= num_tx_queues;
2408                return hash;
2409        }
2410
2411        if (dev->num_tc) {
2412                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2413                qoffset = dev->tc_to_txq[tc].offset;
2414                qcount = dev->tc_to_txq[tc].count;
2415        }
2416
2417        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2418}
2419EXPORT_SYMBOL(__skb_tx_hash);
2420
2421static void skb_warn_bad_offload(const struct sk_buff *skb)
2422{
2423        static const netdev_features_t null_features = 0;
2424        struct net_device *dev = skb->dev;
2425        const char *name = "";
2426
2427        if (!net_ratelimit())
2428                return;
2429
2430        if (dev) {
2431                if (dev->dev.parent)
2432                        name = dev_driver_string(dev->dev.parent);
2433                else
2434                        name = netdev_name(dev);
2435        }
2436        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2437             "gso_type=%d ip_summed=%d\n",
2438             name, dev ? &dev->features : &null_features,
2439             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2440             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2441             skb_shinfo(skb)->gso_type, skb->ip_summed);
2442}
2443
2444/*
2445 * Invalidate hardware checksum when packet is to be mangled, and
2446 * complete checksum manually on outgoing path.
2447 */
2448int skb_checksum_help(struct sk_buff *skb)
2449{
2450        __wsum csum;
2451        int ret = 0, offset;
2452
2453        if (skb->ip_summed == CHECKSUM_COMPLETE)
2454                goto out_set_summed;
2455
2456        if (unlikely(skb_shinfo(skb)->gso_size)) {
2457                skb_warn_bad_offload(skb);
2458                return -EINVAL;
2459        }
2460
2461        /* Before computing a checksum, we should make sure no frag could
2462         * be modified by an external entity, or the checksum could be wrong.
2463         */
2464        if (skb_has_shared_frag(skb)) {
2465                ret = __skb_linearize(skb);
2466                if (ret)
2467                        goto out;
2468        }
2469
2470        offset = skb_checksum_start_offset(skb);
2471        BUG_ON(offset >= skb_headlen(skb));
2472        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2473
2474        offset += skb->csum_offset;
2475        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2476
2477        if (skb_cloned(skb) &&
2478            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2479                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2480                if (ret)
2481                        goto out;
2482        }
2483
2484        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2485out_set_summed:
2486        skb->ip_summed = CHECKSUM_NONE;
2487out:
2488        return ret;
2489}
2490EXPORT_SYMBOL(skb_checksum_help);
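/* Example (illustrative sketch, not from this file): a driver whose
 * hardware cannot checksum a given packet can fall back to software before
 * queueing it; "foo_hw_can_csum" is an assumed capability check.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !foo_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */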
2491
2492/* __skb_csum_offload_chk - Driver helper function to determine if a device
2493 * with limited checksum offload capabilities is able to offload the checksum
2494 * for a given packet.
2495 *
2496 * Arguments:
2497 *   skb - sk_buff for the packet in question
2498 *   spec - contains the description of what device can offload
2499 *   csum_encapped - returns true if the checksum being offloaded is
2500 *            encapsulated, i.e. it is the checksum for the transport header
2501 *            in the inner headers.
2502 *   checksum_help - when set indicates that helper function should
2503 *            call skb_checksum_help if offload checks fail
2504 *
2505 * Returns:
2506 *   true: Packet has passed the checksum checks and should be offloadable to
2507 *         the device (a driver may still need to check for additional
2508 *         restrictions of its device)
2509 *   false: Checksum is not offloadable. If checksum_help was set then
2510 *         skb_checksum_help was called to resolve the checksum for non-GSO
2511 *         packets whose IP protocol is not SCTP
2512 */
2513bool __skb_csum_offload_chk(struct sk_buff *skb,
2514                            const struct skb_csum_offl_spec *spec,
2515                            bool *csum_encapped,
2516                            bool csum_help)
2517{
2518        struct iphdr *iph;
2519        struct ipv6hdr *ipv6;
2520        void *nhdr;
2521        int protocol;
2522        u8 ip_proto;
2523
2524        if (skb->protocol == htons(ETH_P_8021Q) ||
2525            skb->protocol == htons(ETH_P_8021AD)) {
2526                if (!spec->vlan_okay)
2527                        goto need_help;
2528        }
2529
2530        /* We check whether the checksum refers to a transport layer checksum in
2531         * the outermost header or an encapsulated transport layer checksum that
2532         * corresponds to the inner headers of the skb. If the checksum is for
2533         * something else in the packet we need help.
2534         */
2535        if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2536                /* Non-encapsulated checksum */
2537                protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2538                nhdr = skb_network_header(skb);
2539                *csum_encapped = false;
2540                if (spec->no_not_encapped)
2541                        goto need_help;
2542        } else if (skb->encapsulation && spec->encap_okay &&
2543                   skb_checksum_start_offset(skb) ==
2544                   skb_inner_transport_offset(skb)) {
2545                /* Encapsulated checksum */
2546                *csum_encapped = true;
2547                switch (skb->inner_protocol_type) {
2548                case ENCAP_TYPE_ETHER:
2549                        protocol = eproto_to_ipproto(skb->inner_protocol);
2550                        break;
2551                case ENCAP_TYPE_IPPROTO:
2552                        protocol = skb->inner_protocol;
2553                        break;
2554                }
2555                nhdr = skb_inner_network_header(skb);
2556        } else {
2557                goto need_help;
2558        }
2559
2560        switch (protocol) {
2561        case IPPROTO_IP:
2562                if (!spec->ipv4_okay)
2563                        goto need_help;
2564                iph = nhdr;
2565                ip_proto = iph->protocol;
2566                if (iph->ihl != 5 && !spec->ip_options_okay)
2567                        goto need_help;
2568                break;
2569        case IPPROTO_IPV6:
2570                if (!spec->ipv6_okay)
2571                        goto need_help;
2572                if (spec->no_encapped_ipv6 && *csum_encapped)
2573                        goto need_help;
2574                ipv6 = nhdr;
2575                nhdr += sizeof(*ipv6);
2576                ip_proto = ipv6->nexthdr;
2577                break;
2578        default:
2579                goto need_help;
2580        }
2581
2582ip_proto_again:
2583        switch (ip_proto) {
2584        case IPPROTO_TCP:
2585                if (!spec->tcp_okay ||
2586                    skb->csum_offset != offsetof(struct tcphdr, check))
2587                        goto need_help;
2588                break;
2589        case IPPROTO_UDP:
2590                if (!spec->udp_okay ||
2591                    skb->csum_offset != offsetof(struct udphdr, check))
2592                        goto need_help;
2593                break;
2594        case IPPROTO_SCTP:
2595                if (!spec->sctp_okay ||
2596                    skb->csum_offset != offsetof(struct sctphdr, checksum))
2597                        goto cant_help;
2598                break;
2599        case NEXTHDR_HOP:
2600        case NEXTHDR_ROUTING:
2601        case NEXTHDR_DEST: {
2602                u8 *opthdr = nhdr;
2603
2604                if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2605                        goto need_help;
2606
2607                ip_proto = opthdr[0];
2608                nhdr += (opthdr[1] + 1) << 3;
2609
2610                goto ip_proto_again;
2611        }
2612        default:
2613                goto need_help;
2614        }
2615
2616        /* Passed the tests for offloading checksum */
2617        return true;
2618
2619need_help:
2620        if (csum_help && !skb_shinfo(skb)->gso_size)
2621                skb_checksum_help(skb);
2622cant_help:
2623        return false;
2624}
2625EXPORT_SYMBOL(__skb_csum_offload_chk);
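/* Example (illustrative sketch, not from this file): a driver with limited
 * offload support describes its abilities in a spec and lets the helper
 * decide; "foo_csum_spec", the particular field choices below and
 * "foo_xmit_without_csum_offload" are all assumptions for the example.
 *
 *	static const struct skb_csum_offl_spec foo_csum_spec = {
 *		.ipv4_okay = 1,
 *		.ipv6_okay = 1,
 *		.tcp_okay = 1,
 *		.udp_okay = 1,
 *	};
 *	bool encapped;
 *
 *	if (!__skb_csum_offload_chk(skb, &foo_csum_spec, &encapped, true))
 *		return foo_xmit_without_csum_offload(skb);
 */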
2626
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629        __be16 type = skb->protocol;
2630
2631        /* Tunnel gso handlers can set protocol to ethernet. */
2632        if (type == htons(ETH_P_TEB)) {
2633                struct ethhdr *eth;
2634
2635                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636                        return 0;
2637
2638                eth = (struct ethhdr *)skb_mac_header(skb);
2639                type = eth->h_proto;
2640        }
2641
2642        return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 *      skb_mac_gso_segment - mac layer segmentation handler.
2647 *      @skb: buffer to segment
2648 *      @features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651                                    netdev_features_t features)
2652{
2653        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654        struct packet_offload *ptype;
2655        int vlan_depth = skb->mac_len;
2656        __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658        if (unlikely(!type))
2659                return ERR_PTR(-EINVAL);
2660
2661        __skb_pull(skb, vlan_depth);
2662
2663        rcu_read_lock();
2664        list_for_each_entry_rcu(ptype, &offload_base, list) {
2665                if (ptype->type == type && ptype->callbacks.gso_segment) {
2666                        segs = ptype->callbacks.gso_segment(skb, features);
2667                        break;
2668                }
2669        }
2670        rcu_read_unlock();
2671
2672        __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674        return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683        if (tx_path)
2684                return skb->ip_summed != CHECKSUM_PARTIAL;
2685        else
2686                return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 *      __skb_gso_segment - Perform segmentation on skb.
2691 *      @skb: buffer to segment
2692 *      @features: features for the output path (see dev->features)
2693 *      @tx_path: whether it is called in TX path
2694 *
2695 *      This function segments the given skb and returns a list of segments.
2696 *
2697 *      It may return NULL if the skb requires no segmentation.  This is
2698 *      only possible when GSO is used for verifying header integrity.
2699 *
2700 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703                                  netdev_features_t features, bool tx_path)
2704{
2705        if (unlikely(skb_needs_check(skb, tx_path))) {
2706                int err;
2707
2708                skb_warn_bad_offload(skb);
2709
2710                err = skb_cow_head(skb, 0);
2711                if (err < 0)
2712                        return ERR_PTR(err);
2713        }
2714
2715        /* Only report GSO partial support if it will enable us to
2716         * support segmentation on this frame without needing additional
2717         * work.
2718         */
2719        if (features & NETIF_F_GSO_PARTIAL) {
2720                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721                struct net_device *dev = skb->dev;
2722
2723                partial_features |= dev->features & dev->gso_partial_features;
2724                if (!skb_gso_ok(skb, features | partial_features))
2725                        features &= ~NETIF_F_GSO_PARTIAL;
2726        }
2727
2728        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732        SKB_GSO_CB(skb)->encap_level = 0;
2733
2734        skb_reset_mac_header(skb);
2735        skb_reset_mac_len(skb);
2736
2737        return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
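/* Example (illustrative sketch, not from this file): a driver that cannot
 * hand a GSO skb to its hardware can segment it in software and transmit
 * the resulting list; "foo_queue_one" is an assumed per-frame helper.
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features);
 *
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 *	do {
 *		struct sk_buff *next = skb->next;
 *
 *		skb->next = NULL;
 *		foo_queue_one(skb);
 *		skb = next;
 *	} while (skb);
 */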
2740
2741/* Take action when hardware reception checksum errors are detected. */
2742#ifdef CONFIG_BUG
2743void netdev_rx_csum_fault(struct net_device *dev)
2744{
2745        if (net_ratelimit()) {
2746                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2747                dump_stack();
2748        }
2749}
2750EXPORT_SYMBOL(netdev_rx_csum_fault);
2751#endif
2752
2753/* Actually, we should eliminate this check as soon as we know that:
2754 * 1. An IOMMU is present and allows mapping all the memory.
2755 * 2. No high memory really exists on this machine.
2756 */
2757
2758static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2759{
2760#ifdef CONFIG_HIGHMEM
2761        int i;
2762        if (!(dev->features & NETIF_F_HIGHDMA)) {
2763                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2764                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2765                        if (PageHighMem(skb_frag_page(frag)))
2766                                return 1;
2767                }
2768        }
2769
2770        if (PCI_DMA_BUS_IS_PHYS) {
2771                struct device *pdev = dev->dev.parent;
2772
2773                if (!pdev)
2774                        return 0;
2775                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2776                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2777                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2778                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2779                                return 1;
2780                }
2781        }
2782#endif
2783        return 0;
2784}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791                                           netdev_features_t features,
2792                                           __be16 type)
2793{
2794        if (eth_p_mpls(type))
2795                features &= skb->dev->mpls_features;
2796
2797        return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801                                           netdev_features_t features,
2802                                           __be16 type)
2803{
2804        return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809        netdev_features_t features)
2810{
2811        int tmp;
2812        __be16 type;
2813
2814        type = skb_network_protocol(skb, &tmp);
2815        features = net_mpls_features(skb, features, type);
2816
2817        if (skb->ip_summed != CHECKSUM_NONE &&
2818            !can_checksum_protocol(features, type)) {
2819                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820        } else if (illegal_highdma(skb->dev, skb)) {
2821                features &= ~NETIF_F_SG;
2822        }
2823
2824        return features;
2825}
2826
2827netdev_features_t passthru_features_check(struct sk_buff *skb,
2828                                          struct net_device *dev,
2829                                          netdev_features_t features)
2830{
2831        return features;
2832}
2833EXPORT_SYMBOL(passthru_features_check);
2834
2835static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2836                                             struct net_device *dev,
2837                                             netdev_features_t features)
2838{
2839        return vlan_features_check(skb, features);
2840}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843                                            struct net_device *dev,
2844                                            netdev_features_t features)
2845{
2846        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848        if (gso_segs > dev->gso_max_segs)
2849                return features & ~NETIF_F_GSO_MASK;
2850
2851        /* Support for GSO partial features requires software
2852         * intervention before we can actually process the packets,
2853         * so we need to strip support for any partial features now;
2854         * we can pull them back in after we have partially
2855         * segmented the frame.
2856         */
2857        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858                features &= ~dev->gso_partial_features;
2859
2860        /* Make sure to clear the IPv4 ID mangling feature if the
2861         * IPv4 header has the potential to be fragmented.
2862         */
2863        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864                struct iphdr *iph = skb->encapsulation ?
2865                                    inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867                if (!(iph->frag_off & htons(IP_DF)))
2868                        features &= ~NETIF_F_TSO_MANGLEID;
2869        }
2870
2871        return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876        struct net_device *dev = skb->dev;
2877        netdev_features_t features = dev->features;
2878
2879        if (skb_is_gso(skb))
2880                features = gso_features_check(skb, dev, features);
2881
2882        /* If encapsulation offload request, verify we are testing
2883         * hardware encapsulation features instead of standard
2884         * features for the netdev
2885         */
2886        if (skb->encapsulation)
2887                features &= dev->hw_enc_features;
2888
2889        if (skb_vlan_tagged(skb))
2890                features = netdev_intersect_features(features,
2891                                                     dev->vlan_features |
2892                                                     NETIF_F_HW_VLAN_CTAG_TX |
2893                                                     NETIF_F_HW_VLAN_STAG_TX);
2894
2895        if (dev->netdev_ops->ndo_features_check)
2896                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897                                                                features);
2898        else
2899                features &= dflt_features_check(skb, dev, features);
2900
2901        return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906                    struct netdev_queue *txq, bool more)
2907{
2908        unsigned int len;
2909        int rc;
2910
2911        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912                dev_queue_xmit_nit(skb, dev);
2913
2914        len = skb->len;
2915        trace_net_dev_start_xmit(skb, dev);
2916        rc = netdev_start_xmit(skb, dev, txq, more);
2917        trace_net_dev_xmit(skb, rc, dev, len);
2918
2919        return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923                                    struct netdev_queue *txq, int *ret)
2924{
2925        struct sk_buff *skb = first;
2926        int rc = NETDEV_TX_OK;
2927
2928        while (skb) {
2929                struct sk_buff *next = skb->next;
2930
2931                skb->next = NULL;
2932                rc = xmit_one(skb, dev, txq, next != NULL);
2933                if (unlikely(!dev_xmit_complete(rc))) {
2934                        skb->next = next;
2935                        goto out;
2936                }
2937
2938                skb = next;
2939                if (netif_xmit_stopped(txq) && skb) {
2940                        rc = NETDEV_TX_BUSY;
2941                        break;
2942                }
2943        }
2944
2945out:
2946        *ret = rc;
2947        return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951                                          netdev_features_t features)
2952{
2953        if (skb_vlan_tag_present(skb) &&
2954            !vlan_hw_offload_capable(features, skb->vlan_proto))
2955                skb = __vlan_hwaccel_push_inside(skb);
2956        return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960{
2961        netdev_features_t features;
2962
2963        features = netif_skb_features(skb);
2964        skb = validate_xmit_vlan(skb, features);
2965        if (unlikely(!skb))
2966                goto out_null;
2967
2968        if (netif_needs_gso(skb, features)) {
2969                struct sk_buff *segs;
2970
2971                segs = skb_gso_segment(skb, features);
2972                if (IS_ERR(segs)) {
2973                        goto out_kfree_skb;
2974                } else if (segs) {
2975                        consume_skb(skb);
2976                        skb = segs;
2977                }
2978        } else {
2979                if (skb_needs_linearize(skb, features) &&
2980                    __skb_linearize(skb))
2981                        goto out_kfree_skb;
2982
2983                /* If packet is not checksummed and device does not
2984                 * support checksumming for this protocol, complete
2985                 * checksumming here.
2986                 */
2987                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988                        if (skb->encapsulation)
2989                                skb_set_inner_transport_header(skb,
2990                                                               skb_checksum_start_offset(skb));
2991                        else
2992                                skb_set_transport_header(skb,
2993                                                         skb_checksum_start_offset(skb));
2994                        if (!(features & NETIF_F_CSUM_MASK) &&
2995                            skb_checksum_help(skb))
2996                                goto out_kfree_skb;
2997                }
2998        }
2999
3000        return skb;
3001
3002out_kfree_skb:
3003        kfree_skb(skb);
3004out_null:
3005        atomic_long_inc(&dev->tx_dropped);
3006        return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011        struct sk_buff *next, *head = NULL, *tail;
3012
3013        for (; skb != NULL; skb = next) {
3014                next = skb->next;
3015                skb->next = NULL;
3016
3017                /* in case skb won't be segmented, point to itself */
3018                skb->prev = skb;
3019
3020                skb = validate_xmit_skb(skb, dev);
3021                if (!skb)
3022                        continue;
3023
3024                if (!head)
3025                        head = skb;
3026                else
3027                        tail->next = skb;
3028                /* If skb was segmented, skb->prev points to
3029                 * the last segment. If not, it still contains skb.
3030                 */
3031                tail = skb->prev;
3032        }
3033        return head;
3034}
3035
3036static void qdisc_pkt_len_init(struct sk_buff *skb)
3037{
3038        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3039
3040        qdisc_skb_cb(skb)->pkt_len = skb->len;
3041
3042        /* To get a more precise estimate of the bytes sent on the wire,
3043         * we add to pkt_len the header size of all segments
3044         */
3045        if (shinfo->gso_size)  {
3046                unsigned int hdr_len;
3047                u16 gso_segs = shinfo->gso_segs;
3048
3049                /* mac layer + network layer */
3050                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3051
3052                /* + transport layer */
3053                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3054                        hdr_len += tcp_hdrlen(skb);
3055                else
3056                        hdr_len += sizeof(struct udphdr);
3057
3058                if (shinfo->gso_type & SKB_GSO_DODGY)
3059                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3060                                                shinfo->gso_size);
3061
3062                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3063        }
3064}
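/* Worked example (not from the original source): a TCP GSO skb with
 * skb->len = 65226, gso_size = 1448 and hdr_len = 66 (14 ethernet +
 * 20 IPv4 + 32 TCP with timestamps) carries gso_segs = 45 segments, so
 * pkt_len becomes 65226 + (45 - 1) * 66 = 68130 bytes, which is what will
 * actually appear on the wire after segmentation.
 */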
3065
3066static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3067                                 struct net_device *dev,
3068                                 struct netdev_queue *txq)
3069{
3070        spinlock_t *root_lock = qdisc_lock(q);
3071        bool contended;
3072        int rc;
3073
3074        qdisc_calculate_pkt_len(skb, q);
3075        /*
3076         * Heuristic to force contended enqueues to serialize on a
3077         * separate lock before trying to get qdisc main lock.
3078         * This permits __QDISC___STATE_RUNNING owner to get the lock more
3079         * often and dequeue packets faster.
3080         */
3081        contended = qdisc_is_running(q);
3082        if (unlikely(contended))
3083                spin_lock(&q->busylock);
3084
3085        spin_lock(root_lock);
3086        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087                kfree_skb(skb);
3088                rc = NET_XMIT_DROP;
3089        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090                   qdisc_run_begin(q)) {
3091                /*
3092                 * This is a work-conserving queue; there are no old skbs
3093                 * waiting to be sent out; and the qdisc is not running -
3094                 * xmit the skb directly.
3095                 */
3096
3097                qdisc_bstats_update(q, skb);
3098
3099                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100                        if (unlikely(contended)) {
3101                                spin_unlock(&q->busylock);
3102                                contended = false;
3103                        }
3104                        __qdisc_run(q);
3105                } else
3106                        qdisc_run_end(q);
3107
3108                rc = NET_XMIT_SUCCESS;
3109        } else {
3110                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3111                if (qdisc_run_begin(q)) {
3112                        if (unlikely(contended)) {
3113                                spin_unlock(&q->busylock);
3114                                contended = false;
3115                        }
3116                        __qdisc_run(q);
3117                }
3118        }
3119        spin_unlock(root_lock);
3120        if (unlikely(contended))
3121                spin_unlock(&q->busylock);
3122        return rc;
3123}
3124
3125#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3126static void skb_update_prio(struct sk_buff *skb)
3127{
3128        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3129
3130        if (!skb->priority && skb->sk && map) {
3131                unsigned int prioidx =
3132                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3133
3134                if (prioidx < map->priomap_len)
3135                        skb->priority = map->priomap[prioidx];
3136        }
3137}
3138#else
3139#define skb_update_prio(skb)
3140#endif
3141
3142DEFINE_PER_CPU(int, xmit_recursion);
3143EXPORT_SYMBOL(xmit_recursion);
3144
3145#define RECURSION_LIMIT 10
3146
3147/**
3148 *      dev_loopback_xmit - loop back @skb
3149 *      @net: network namespace this loopback is happening in
3150 *      @sk:  sk, needed so this function can be used as a netfilter okfn
3151 *      @skb: buffer to transmit
3152 */
3153int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3154{
3155        skb_reset_mac_header(skb);
3156        __skb_pull(skb, skb_network_offset(skb));
3157        skb->pkt_type = PACKET_LOOPBACK;
3158        skb->ip_summed = CHECKSUM_UNNECESSARY;
3159        WARN_ON(!skb_dst(skb));
3160        skb_dst_force(skb);
3161        netif_rx_ni(skb);
3162        return 0;
3163}
3164EXPORT_SYMBOL(dev_loopback_xmit);
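
/* Usage sketch (illustration only, not code from this file): callers such
 * as the IPv4 multicast output path typically pass dev_loopback_xmit() as
 * the okfn of NF_HOOK(), so a local copy is looped back once the
 * POST_ROUTING hook accepts it. The skb must have a dst attached (see the
 * WARN_ON above) and skb->dev set. NF_HOOK() and NF_INET_POST_ROUTING come
 * from <linux/netfilter.h>; example_loop_back_copy() is an invented name.
 */
static int example_loop_back_copy(struct net *net, struct sock *sk,
                                  struct sk_buff *copy)
{
        /* If the hook verdict is ACCEPT, the copy is handed to
         * dev_loopback_xmit(), which re-injects it via netif_rx_ni(). */
        return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, copy,
                       NULL, copy->dev, dev_loopback_xmit);
}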
3165
3166#ifdef CONFIG_NET_EGRESS
3167static struct sk_buff *
3168sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3169{
3170        struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171        struct tcf_result cl_res;
3172
3173        if (!cl)
3174                return skb;
3175
3176        /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177         * earlier by the caller.
3178         */
3179        qdisc_bstats_cpu_update(cl->q, skb);
3180
3181        switch (tc_classify(skb, cl, &cl_res, false)) {
3182        case TC_ACT_OK:
3183        case TC_ACT_RECLASSIFY:
3184                skb->tc_index = TC_H_MIN(cl_res.classid);
3185                break;
3186        case TC_ACT_SHOT:
3187                qdisc_qstats_cpu_drop(cl->q);
3188                *ret = NET_XMIT_DROP;
3189                kfree_skb(skb);
3190                return NULL;
3191        case TC_ACT_STOLEN:
3192        case TC_ACT_QUEUED:
3193                *ret = NET_XMIT_SUCCESS;
3194                consume_skb(skb);
3195                return NULL;
3196        case TC_ACT_REDIRECT:
3197                /* No need to push/pop skb's mac_header here on egress! */
3198                skb_do_redirect(skb);
3199                *ret = NET_XMIT_SUCCESS;
3200                return NULL;
3201        default:
3202                break;
3203        }
3204
3205        return skb;
3206}
3207#endif /* CONFIG_NET_EGRESS */
3208
3209static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3210{
3211#ifdef CONFIG_XPS
3212        struct xps_dev_maps *dev_maps;
3213        struct xps_map *map;
3214        int queue_index = -1;
3215
3216        rcu_read_lock();
3217        dev_maps = rcu_dereference(dev->xps_maps);
3218        if (dev_maps) {
3219                map = rcu_dereference(
3220                    dev_maps->cpu_map[skb->sender_cpu - 1]);
3221                if (map) {
3222                        if (map->len == 1)
3223                                queue_index = map->queues[0];
3224                        else
3225                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3226                                                                           map->len)];
3227                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3228                                queue_index = -1;
3229                }
3230        }
3231        rcu_read_unlock();
3232
3233        return queue_index;
3234#else
3235        return -1;
3236#endif
3237}
3238
3239static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3240{
3241        struct sock *sk = skb->sk;
3242        int queue_index = sk_tx_queue_get(sk);
3243
3244        if (queue_index < 0 || skb->ooo_okay ||
3245            queue_index >= dev->real_num_tx_queues) {
3246                int new_index = get_xps_queue(dev, skb);
3247                if (new_index < 0)
3248                        new_index = skb_tx_hash(dev, skb);
3249
3250                if (queue_index != new_index && sk &&
3251                    sk_fullsock(sk) &&
3252                    rcu_access_pointer(sk->sk_dst_cache))
3253                        sk_tx_queue_set(sk, new_index);
3254
3255                queue_index = new_index;
3256        }
3257
3258        return queue_index;
3259}
3260
3261struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262                                    struct sk_buff *skb,
3263                                    void *accel_priv)
3264{
3265        int queue_index = 0;
3266
3267#ifdef CONFIG_XPS
3268        u32 sender_cpu = skb->sender_cpu - 1;
3269
3270        if (sender_cpu >= (u32)NR_CPUS)
3271                skb->sender_cpu = raw_smp_processor_id() + 1;
3272#endif
3273
3274        if (dev->real_num_tx_queues != 1) {
3275                const struct net_device_ops *ops = dev->netdev_ops;
3276                if (ops->ndo_select_queue)
3277                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3278                                                            __netdev_pick_tx);
3279                else
3280                        queue_index = __netdev_pick_tx(dev, skb);
3281
3282                if (!accel_priv)
3283                        queue_index = netdev_cap_txqueue(dev, queue_index);
3284        }
3285
3286        skb_set_queue_mapping(skb, queue_index);
3287        return netdev_get_tx_queue(dev, queue_index);
3288}
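
/* Hypothetical driver sketch (example_select_queue() is an invented name,
 * not code from this file): a driver that wants its own queue mapping
 * implements ndo_select_queue(); the fallback argument it receives is the
 * __netdev_pick_tx() helper above, which it can defer to for traffic it
 * does not treat specially.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
                                void *accel_priv,
                                select_queue_fallback_t fallback)
{
        /* Arbitrary example policy: pin control traffic to queue 0 and let
         * the stack (XPS or flow hash) pick for everything else. */
        if (skb->priority == TC_PRIO_CONTROL)
                return 0;
        return fallback(dev, skb);
}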
3289
3290/**
3291 *      __dev_queue_xmit - transmit a buffer
3292 *      @skb: buffer to transmit
3293 *      @accel_priv: private data used for L2 forwarding offload
3294 *
3295 *      Queue a buffer for transmission to a network device. The caller must
3296 *      have set the device and priority and built the buffer before calling
3297 *      this function. The function can be called from an interrupt.
3298 *
3299 *      A negative errno code is returned on a failure. A success does not
3300 *      guarantee the frame will be transmitted as it may be dropped due
3301 *      to congestion or traffic shaping.
3302 *
3303 * -----------------------------------------------------------------------------------
3304 *      I notice this method can also return errors from the queue disciplines,
3305 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3306 *      be positive.
3307 *
3308 *      Regardless of the return value, the skb is consumed, so it is currently
3309 *      difficult to retry a send to this method.  (You can bump the ref count
3310 *      before sending to hold a reference for retry if you are careful.)
3311 *
3312 *      When calling this method, interrupts MUST be enabled.  This is because
3313 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3314 *          --BLG
3315 */
3316static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3317{
3318        struct net_device *dev = skb->dev;
3319        struct netdev_queue *txq;
3320        struct Qdisc *q;
3321        int rc = -ENOMEM;
3322
3323        skb_reset_mac_header(skb);
3324
3325        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3327
3328        /* Disable soft irqs for various locks below. Also
3329         * stops preemption for RCU.
3330         */
3331        rcu_read_lock_bh();
3332
3333        skb_update_prio(skb);
3334
3335        qdisc_pkt_len_init(skb);
3336#ifdef CONFIG_NET_CLS_ACT
3337        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338# ifdef CONFIG_NET_EGRESS
3339        if (static_key_false(&egress_needed)) {
3340                skb = sch_handle_egress(skb, &rc, dev);
3341                if (!skb)
3342                        goto out;
3343        }
3344# endif
3345#endif
3346        /* If the device/qdisc doesn't need skb->dst, release it right now
3347         * while it's still hot in this CPU's cache.
3348         */
3349        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3350                skb_dst_drop(skb);
3351        else
3352                skb_dst_force(skb);
3353
3354#ifdef CONFIG_NET_SWITCHDEV
3355        /* Don't forward if offload device already forwarded */
3356        if (skb->offload_fwd_mark &&
3357            skb->offload_fwd_mark == dev->offload_fwd_mark) {
3358                consume_skb(skb);
3359                rc = NET_XMIT_SUCCESS;
3360                goto out;
3361        }
3362#endif
3363
3364        txq = netdev_pick_tx(dev, skb, accel_priv);
3365        q = rcu_dereference_bh(txq->qdisc);
3366
3367        trace_net_dev_queue(skb);
3368        if (q->enqueue) {
3369                rc = __dev_xmit_skb(skb, q, dev, txq);
3370                goto out;
3371        }
3372
3373        /* The device has no queue. This is the common case for software
3374           devices: loopback, all sorts of tunnels...
3375
3376           Really, it is unlikely that netif_tx_lock protection is necessary
3377           here.  (e.g. loopback and IP tunnels are clean, ignoring the
3378           statistics counters.)
3379           However, it is possible that they rely on the protection
3380           we provide here.
3381
3382           Check this and take the lock; it is not prone to deadlocks.
3383           Or shoot the noqueue qdisc instead, which is even simpler 8)
3384         */
3385        if (dev->flags & IFF_UP) {
3386                int cpu = smp_processor_id(); /* ok because BHs are off */
3387
3388                if (txq->xmit_lock_owner != cpu) {
3389
3390                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3391                                goto recursion_alert;
3392
3393                        skb = validate_xmit_skb(skb, dev);
3394                        if (!skb)
3395                                goto out;
3396
3397                        HARD_TX_LOCK(dev, txq, cpu);
3398
3399                        if (!netif_xmit_stopped(txq)) {
3400                                __this_cpu_inc(xmit_recursion);
3401                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3402                                __this_cpu_dec(xmit_recursion);
3403                                if (dev_xmit_complete(rc)) {
3404                                        HARD_TX_UNLOCK(dev, txq);
3405                                        goto out;
3406                                }
3407                        }
3408                        HARD_TX_UNLOCK(dev, txq);
3409                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3410                                             dev->name);
3411                } else {
3412                        /* Recursion is detected! It is possible,
3413                         * unfortunately
3414                         */
3415recursion_alert:
3416                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3417                                             dev->name);
3418                }
3419        }
3420
3421        rc = -ENETDOWN;
3422        rcu_read_unlock_bh();
3423
3424        atomic_long_inc(&dev->tx_dropped);
3425        kfree_skb_list(skb);
3426        return rc;
3427out:
3428        rcu_read_unlock_bh();
3429        return rc;
3430}
3431
3432int dev_queue_xmit(struct sk_buff *skb)
3433{
3434        return __dev_queue_xmit(skb, NULL);
3435}
3436EXPORT_SYMBOL(dev_queue_xmit);
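
/* Minimal caller sketch (illustration only; example_send_skb() is an
 * invented name): per the comment above __dev_queue_xmit(), the caller
 * sets skb->dev and skb->priority and then hands the skb over; the skb is
 * consumed in all cases, and the return value may be a negative errno or
 * a positive NET_XMIT_* verdict from the qdisc.
 */
static int example_send_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        skb->dev = dev;
        skb->priority = TC_PRIO_CONTROL;

        rc = dev_queue_xmit(skb);       /* consumes skb in all cases */
        if (rc < 0)
                net_dbg_ratelimited("example: xmit errno %d\n", rc);
        else if (rc != NET_XMIT_SUCCESS)
                net_dbg_ratelimited("example: qdisc verdict %d\n", rc);
        return rc;
}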
3437
3438int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3439{
3440        return __dev_queue_xmit(skb, accel_priv);
3441}
3442EXPORT_SYMBOL(dev_queue_xmit_accel);
3443
3444
3445/*=======================================================================
3446                        Receiver routines
3447  =======================================================================*/
3448
3449int netdev_max_backlog __read_mostly = 1000;
3450EXPORT_SYMBOL(netdev_max_backlog);
3451
3452int netdev_tstamp_prequeue __read_mostly = 1;
3453int netdev_budget __read_mostly = 300;
3454int weight_p __read_mostly = 64;            /* old backlog weight */
3455
3456/* Called with irq disabled */
3457static inline void ____napi_schedule(struct softnet_data *sd,
3458                                     struct napi_struct *napi)
3459{
3460        list_add_tail(&napi->poll_list, &sd->poll_list);
3461        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3462}
3463
3464#ifdef CONFIG_RPS
3465
3466/* One global table that all flow-based protocols share. */
3467struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3468EXPORT_SYMBOL(rps_sock_flow_table);
3469u32 rps_cpu_mask __read_mostly;
3470EXPORT_SYMBOL(rps_cpu_mask);
3471
3472struct static_key rps_needed __read_mostly;
3473EXPORT_SYMBOL(rps_needed);
3474
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477            struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479        if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481                struct netdev_rx_queue *rxqueue;
3482                struct rps_dev_flow_table *flow_table;
3483                struct rps_dev_flow *old_rflow;
3484                u32 flow_id;
3485                u16 rxq_index;
3486                int rc;
3487
3488                /* Should we steer this flow to a different hardware queue? */
3489                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490                    !(dev->features & NETIF_F_NTUPLE))
3491                        goto out;
3492                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493                if (rxq_index == skb_get_rx_queue(skb))
3494                        goto out;
3495
3496                rxqueue = dev->_rx + rxq_index;
3497                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498                if (!flow_table)
3499                        goto out;
3500                flow_id = skb_get_hash(skb) & flow_table->mask;
3501                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502                                                        rxq_index, flow_id);
3503                if (rc < 0)
3504                        goto out;
3505                old_rflow = rflow;
3506                rflow = &flow_table->flows[flow_id];
3507                rflow->filter = rc;
3508                if (old_rflow->filter == rflow->filter)
3509                        old_rflow->filter = RPS_NO_FILTER;
3510        out:
3511#endif
3512                rflow->last_qtail =
3513                        per_cpu(softnet_data, next_cpu).input_queue_head;
3514        }
3515
3516        rflow->cpu = next_cpu;
3517        return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526                       struct rps_dev_flow **rflowp)
3527{
3528        const struct rps_sock_flow_table *sock_flow_table;
3529        struct netdev_rx_queue *rxqueue = dev->_rx;
3530        struct rps_dev_flow_table *flow_table;
3531        struct rps_map *map;
3532        int cpu = -1;
3533        u32 tcpu;
3534        u32 hash;
3535
3536        if (skb_rx_queue_recorded(skb)) {
3537                u16 index = skb_get_rx_queue(skb);
3538
3539                if (unlikely(index >= dev->real_num_rx_queues)) {
3540                        WARN_ONCE(dev->real_num_rx_queues > 1,
3541                                  "%s received packet on queue %u, but number "
3542                                  "of RX queues is %u\n",
3543                                  dev->name, index, dev->real_num_rx_queues);
3544                        goto done;
3545                }
3546                rxqueue += index;
3547        }
3548
3549        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552        map = rcu_dereference(rxqueue->rps_map);
3553        if (!flow_table && !map)
3554                goto done;
3555
3556        skb_reset_network_header(skb);
3557        hash = skb_get_hash(skb);
3558        if (!hash)
3559                goto done;
3560
3561        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562        if (flow_table && sock_flow_table) {
3563                struct rps_dev_flow *rflow;
3564                u32 next_cpu;
3565                u32 ident;
3566
3567                /* First check the global flow table for a match */
3568                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569                if ((ident ^ hash) & ~rps_cpu_mask)
3570                        goto try_rps;
3571
3572                next_cpu = ident & rps_cpu_mask;
3573
3574                /* OK, now we know there is a match,
3575                 * we can look at the local (per receive queue) flow table
3576                 */
3577                rflow = &flow_table->flows[hash & flow_table->mask];
3578                tcpu = rflow->cpu;
3579
3580                /*
3581                 * If the desired CPU (where last recvmsg was done) is
3582                 * different from current CPU (one in the rx-queue flow
3583                 * table entry), switch if one of the following holds:
3584                 *   - Current CPU is unset (>= nr_cpu_ids).
3585                 *   - Current CPU is offline.
3586                 *   - The current CPU's queue tail has advanced beyond the
3587                 *     last packet that was enqueued using this table entry.
3588                 *     This guarantees that all previous packets for the flow
3589                 *     have been dequeued, thus preserving in order delivery.
3590                 */
3591                if (unlikely(tcpu != next_cpu) &&
3592                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594                      rflow->last_qtail)) >= 0)) {
3595                        tcpu = next_cpu;
3596                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597                }
3598
3599                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600                        *rflowp = rflow;
3601                        cpu = tcpu;
3602                        goto done;
3603                }
3604        }
3605
3606try_rps:
3607
3608        if (map) {
3609                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610                if (cpu_online(tcpu)) {
3611                        cpu = tcpu;
3612                        goto done;
3613                }
3614        }
3615
3616done:
3617        return cpu;
3618}
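
/* Worked example of the global flow table check above (the numbers are
 * illustrative): with 64 possible CPUs, rps_cpu_mask is 0x3f, and
 * rps_record_sock_flow() stores ident = (hash & ~0x3f) | cpu. For
 * hash = 0x12345678 last seen by recvmsg on CPU 5, ident = 0x12345645.
 * A packet of the same flow satisfies
 * ((ident ^ hash) & ~rps_cpu_mask) == 0 and extracts next_cpu = 5, while
 * a different flow that aliases the same table slot fails the masked
 * comparison and falls through to the plain RPS map.
 */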
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634                         u32 flow_id, u16 filter_id)
3635{
3636        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637        struct rps_dev_flow_table *flow_table;
3638        struct rps_dev_flow *rflow;
3639        bool expire = true;
3640        unsigned int cpu;
3641
3642        rcu_read_lock();
3643        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644        if (flow_table && flow_id <= flow_table->mask) {
3645                rflow = &flow_table->flows[flow_id];
3646                cpu = ACCESS_ONCE(rflow->cpu);
3647                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649                           rflow->last_qtail) <
3650                     (int)(10 * flow_table->mask)))
3651                        expire = false;
3652        }
3653        rcu_read_unlock();
3654        return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
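
/* Driver-side sketch (illustration only; the example_* names are invented):
 * a driver implementing ndo_rx_flow_steer() keeps the (flow_id, filter_id)
 * pairs it programmed into hardware and periodically asks
 * rps_may_expire_flow() whether each filter may be torn down.
 */
struct example_rfs_entry {
        u32 flow_id;            /* flow_id passed to ndo_rx_flow_steer()  */
        u16 filter_id;          /* id the driver returned for this filter */
        bool in_use;
};

static void example_expire_rfs(struct net_device *dev, u16 rxq_index,
                               struct example_rfs_entry *tbl, unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                if (!tbl[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, rxq_index, tbl[i].flow_id,
                                        tbl[i].filter_id)) {
                        /* Remove the hardware steering rule here
                         * (device specific), then recycle the slot. */
                        tbl[i].in_use = false;
                }
        }
}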
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663        struct softnet_data *sd = data;
3664
3665        ____napi_schedule(sd, &sd->backlog);
3666        sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
3671/*
3672 * Check if this softnet_data structure belongs to another CPU.
3673 * If so, queue it on our IPI list and return 1;
3674 * otherwise return 0.
3675 */
3676static int rps_ipi_queued(struct softnet_data *sd)
3677{
3678#ifdef CONFIG_RPS
3679        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3680
3681        if (sd != mysd) {
3682                sd->rps_ipi_next = mysd->rps_ipi_list;
3683                mysd->rps_ipi_list = sd;
3684
3685                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3686                return 1;
3687        }
3688#endif /* CONFIG_RPS */
3689        return 0;
3690}
3691
3692#ifdef CONFIG_NET_FLOW_LIMIT
3693int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3694#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699        struct sd_flow_limit *fl;
3700        struct softnet_data *sd;
3701        unsigned int old_flow, new_flow;
3702
3703        if (qlen < (netdev_max_backlog >> 1))
3704                return false;
3705
3706        sd = this_cpu_ptr(&softnet_data);
3707
3708        rcu_read_lock();
3709        fl = rcu_dereference(sd->flow_limit);
3710        if (fl) {
3711                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712                old_flow = fl->history[fl->history_head];
3713                fl->history[fl->history_head] = new_flow;
3714
3715                fl->history_head++;
3716                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718                if (likely(fl->buckets[old_flow]))
3719                        fl->buckets[old_flow]--;
3720
3721                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722                        fl->count++;
3723                        rcu_read_unlock();
3724                        return true;
3725                }
3726        }
3727        rcu_read_unlock();
3728#endif
3729        return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737                              unsigned int *qtail)
3738{
3739        struct softnet_data *sd;
3740        unsigned long flags;
3741        unsigned int qlen;
3742
3743        sd = &per_cpu(softnet_data, cpu);
3744
3745        local_irq_save(flags);
3746
3747        rps_lock(sd);
3748        if (!netif_running(skb->dev))
3749                goto drop;
3750        qlen = skb_queue_len(&sd->input_pkt_queue);
3751        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752                if (qlen) {
3753enqueue:
3754                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3755                        input_queue_tail_incr_save(sd, qtail);
3756                        rps_unlock(sd);
3757                        local_irq_restore(flags);
3758                        return NET_RX_SUCCESS;
3759                }
3760
3761                /* Schedule NAPI for the backlog device.
3762                 * We can use a non-atomic operation since we own the queue lock.
3763                 */
3764                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765                        if (!rps_ipi_queued(sd))
3766                                ____napi_schedule(sd, &sd->backlog);
3767                }
3768                goto enqueue;
3769        }
3770
3771drop:
3772        sd->dropped++;
3773        rps_unlock(sd);
3774
3775        local_irq_restore(flags);
3776
3777        atomic_long_inc(&skb->dev->rx_dropped);
3778        kfree_skb(skb);
3779        return NET_RX_DROP;
3780}
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784        int ret;
3785
3786        net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788        trace_netif_rx(skb);
3789#ifdef CONFIG_RPS
3790        if (static_key_false(&rps_needed)) {
3791                struct rps_dev_flow voidflow, *rflow = &voidflow;
3792                int cpu;
3793
3794                preempt_disable();
3795                rcu_read_lock();
3796
3797                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798                if (cpu < 0)
3799                        cpu = smp_processor_id();
3800
3801                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803                rcu_read_unlock();
3804                preempt_enable();
3805        } else
3806#endif
3807        {
3808                unsigned int qtail;
3809                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810                put_cpu();
3811        }
3812        return ret;
3813}
3814
3815/**
3816 *      netif_rx        -       post buffer to the network code
3817 *      @skb: buffer to post
3818 *
3819 *      This function receives a packet from a device driver and queues it for
3820 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3821 *      may be dropped during processing for congestion control or by the
3822 *      protocol layers.
3823 *
3824 *      return values:
3825 *      NET_RX_SUCCESS  (no congestion)
3826 *      NET_RX_DROP     (packet was dropped)
3827 *
3828 */
3829
3830int netif_rx(struct sk_buff *skb)
3831{
3832        trace_netif_rx_entry(skb);
3833
3834        return netif_rx_internal(skb);
3835}
3836EXPORT_SYMBOL(netif_rx);
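
/* Minimal non-NAPI driver sketch (illustration only; example_rx_frame() is
 * an invented name): build an skb for a received frame, let
 * eth_type_trans() set skb->protocol and skb->dev, then hand the skb to
 * netif_rx() from the receive interrupt.
 */
static void example_rx_frame(struct net_device *dev, const u8 *buf, int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (unlikely(!skb)) {
                dev->stats.rx_dropped++;
                return;
        }
        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);          /* queue to the per-CPU backlog */
}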
3837
3838int netif_rx_ni(struct sk_buff *skb)
3839{
3840        int err;
3841
3842        trace_netif_rx_ni_entry(skb);
3843
3844        preempt_disable();
3845        err = netif_rx_internal(skb);
3846        if (local_softirq_pending())
3847                do_softirq();
3848        preempt_enable();
3849
3850        return err;
3851}
3852EXPORT_SYMBOL(netif_rx_ni);
3853
3854static void net_tx_action(struct softirq_action *h)
3855{
3856        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858        if (sd->completion_queue) {
3859                struct sk_buff *clist;
3860
3861                local_irq_disable();
3862                clist = sd->completion_queue;
3863                sd->completion_queue = NULL;
3864                local_irq_enable();
3865
3866                while (clist) {
3867                        struct sk_buff *skb = clist;
3868                        clist = clist->next;
3869
3870                        WARN_ON(atomic_read(&skb->users));
3871                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872                                trace_consume_skb(skb);
3873                        else
3874                                trace_kfree_skb(skb, net_tx_action);
3875
3876                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877                                __kfree_skb(skb);
3878                        else
3879                                __kfree_skb_defer(skb);
3880                }
3881
3882                __kfree_skb_flush();
3883        }
3884
3885        if (sd->output_queue) {
3886                struct Qdisc *head;
3887
3888                local_irq_disable();
3889                head = sd->output_queue;
3890                sd->output_queue = NULL;
3891                sd->output_queue_tailp = &sd->output_queue;
3892                local_irq_enable();
3893
3894                while (head) {
3895                        struct Qdisc *q = head;
3896                        spinlock_t *root_lock;
3897
3898                        head = head->next_sched;
3899
3900                        root_lock = qdisc_lock(q);
3901                        if (spin_trylock(root_lock)) {
3902                                smp_mb__before_atomic();
3903                                clear_bit(__QDISC_STATE_SCHED,
3904                                          &q->state);
3905                                qdisc_run(q);
3906                                spin_unlock(root_lock);
3907                        } else {
3908                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
3909                                              &q->state)) {
3910                                        __netif_reschedule(q);
3911                                } else {
3912                                        smp_mb__before_atomic();
3913                                        clear_bit(__QDISC_STATE_SCHED,
3914                                                  &q->state);
3915                                }
3916                        }
3917                }
3918        }
3919}
3920
3921#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3922    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3923/* This hook is defined here for ATM LANE */
3924int (*br_fdb_test_addr_hook)(struct net_device *dev,
3925                             unsigned char *addr) __read_mostly;
3926EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3927#endif
3928
3929static inline struct sk_buff *
3930sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3931                   struct net_device *orig_dev)
3932{
3933#ifdef CONFIG_NET_CLS_ACT
3934        struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3935        struct tcf_result cl_res;
3936
3937        /* If there's at least one device with ingress configured somewhere
3938         * (which is why we got here via the enabled static key), the
3939         * remaining devices that have no ingress qdisc configured bail
3940         * out here.
3941         */
3942        if (!cl)
3943                return skb;
3944        if (*pt_prev) {
3945                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3946                *pt_prev = NULL;
3947        }
3948
3949        qdisc_skb_cb(skb)->pkt_len = skb->len;
3950        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3951        qdisc_bstats_cpu_update(cl->q, skb);
3952
3953        switch (tc_classify(skb, cl, &cl_res, false)) {
3954        case TC_ACT_OK:
3955        case TC_ACT_RECLASSIFY:
3956                skb->tc_index = TC_H_MIN(cl_res.classid);
3957                break;
3958        case TC_ACT_SHOT:
3959                qdisc_qstats_cpu_drop(cl->q);
3960                kfree_skb(skb);
3961                return NULL;
3962        case TC_ACT_STOLEN:
3963        case TC_ACT_QUEUED:
3964                consume_skb(skb);
3965                return NULL;
3966        case TC_ACT_REDIRECT:
3967                /* skb_mac_header check was done by cls/act_bpf, so
3968                 * we can safely push the L2 header back before
3969                 * redirecting to another netdev
3970                 */
3971                __skb_push(skb, skb->mac_len);
3972                skb_do_redirect(skb);
3973                return NULL;
3974        default:
3975                break;
3976        }
3977#endif /* CONFIG_NET_CLS_ACT */
3978        return skb;
3979}
3980
3981/**
3982 *      netdev_rx_handler_register - register receive handler
3983 *      @dev: device to register a handler for
3984 *      @rx_handler: receive handler to register
3985 *      @rx_handler_data: data pointer that is used by rx handler
3986 *
3987 *      Register a receive handler for a device. This handler will then be
3988 *      called from __netif_receive_skb. A negative errno code is returned
3989 *      on a failure.
3990 *
3991 *      The caller must hold the rtnl_mutex.
3992 *
3993 *      For a general description of rx_handler, see enum rx_handler_result.
3994 */
3995int netdev_rx_handler_register(struct net_device *dev,
3996                               rx_handler_func_t *rx_handler,
3997                               void *rx_handler_data)
3998{
3999        ASSERT_RTNL();
4000
4001        if (dev->rx_handler)
4002                return -EBUSY;
4003
4004        /* Note: rx_handler_data must be set before rx_handler */
4005        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4006        rcu_assign_pointer(dev->rx_handler, rx_handler);
4007
4008        return 0;
4009}
4010EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
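
/* Usage sketch (illustration only; the example_* names and the ARP policy
 * are invented): an rx_handler has the rx_handler_func_t signature and
 * returns one of the enum rx_handler_result values; registration must be
 * done under the rtnl_mutex, as noted above.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        /* Arbitrary example policy: swallow ARP, pass everything else on. */
        if (skb->protocol == htons(ETH_P_ARP)) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }
        return RX_HANDLER_PASS;
}

static int example_attach_handler(struct net_device *dev, void *priv)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, example_rx_handler, priv);
        rtnl_unlock();
        return err;             /* -EBUSY if another handler is installed */
}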
4011
4012/**
4013 *      netdev_rx_handler_unregister - unregister receive handler
4014 *      @dev: device to unregister a handler from
4015 *
4016 *      Unregister a receive handler from a device.
4017 *
4018 *      The caller must hold the rtnl_mutex.
4019 */
4020void netdev_rx_handler_unregister(struct net_device *dev)
4021{
4022
4023        ASSERT_RTNL();
4024        RCU_INIT_POINTER(dev->rx_handler, NULL);
4025        /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4026         * section is guaranteed to see a non-NULL rx_handler_data
4027         * as well.
4028         */
4029        synchronize_net();
4030        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4031}
4032EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4033
4034/*
4035 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4036 * the special handling of PFMEMALLOC skbs.
4037 */
4038static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4039{
4040        switch (skb->protocol) {
4041        case htons(ETH_P_ARP):
4042        case htons(ETH_P_IP):
4043        case htons(ETH_P_IPV6):
4044        case htons(ETH_P_8021Q):
4045        case htons(ETH_P_8021AD):
4046                return true;
4047        default:
4048                return false;
4049        }
4050}
4051
4052static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4053                             int *ret, struct net_device *orig_dev)
4054{
4055#ifdef CONFIG_NETFILTER_INGRESS
4056        if (nf_hook_ingress_active(skb)) {
4057                if (*pt_prev) {
4058                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4059                        *pt_prev = NULL;
4060                }
4061
4062                return nf_hook_ingress(skb);
4063        }
4064#endif /* CONFIG_NETFILTER_INGRESS */
4065        return 0;
4066}
4067
4068static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4069{
4070        struct packet_type *ptype, *pt_prev;
4071        rx_handler_func_t *rx_handler;
4072        struct net_device *orig_dev;
4073        bool deliver_exact = false;
4074        int ret = NET_RX_DROP;
4075        __be16 type;
4076
4077        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4078
4079        trace_netif_receive_skb(skb);
4080
4081        orig_dev = skb->dev;
4082
4083        skb_reset_network_header(skb);
4084        if (!skb_transport_header_was_set(skb))
4085                skb_reset_transport_header(skb);
4086        skb_reset_mac_len(skb);
4087
4088        pt_prev = NULL;
4089
4090another_round:
4091        skb->skb_iif = skb->dev->ifindex;
4092
4093        __this_cpu_inc(softnet_data.processed);
4094
4095        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4096            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4097                skb = skb_vlan_untag(skb);
4098                if (unlikely(!skb))
4099                        goto out;
4100        }
4101
4102#ifdef CONFIG_NET_CLS_ACT
4103        if (skb->tc_verd & TC_NCLS) {
4104                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4105                goto ncls;
4106        }
4107#endif
4108
4109        if (pfmemalloc)
4110                goto skip_taps;
4111
4112        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4113                if (pt_prev)
4114                        ret = deliver_skb(skb, pt_prev, orig_dev);
4115                pt_prev = ptype;
4116        }
4117
4118        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4119                if (pt_prev)
4120                        ret = deliver_skb(skb, pt_prev, orig_dev);
4121                pt_prev = ptype;
4122        }
4123
4124skip_taps:
4125#ifdef CONFIG_NET_INGRESS
4126        if (static_key_false(&ingress_needed)) {
4127                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4128                if (!skb)
4129                        goto out;
4130
4131                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4132                        goto out;
4133        }
4134#endif
4135#ifdef CONFIG_NET_CLS_ACT
4136        skb->tc_verd = 0;
4137ncls:
4138#endif
4139        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4140                goto drop;
4141
4142        if (skb_vlan_tag_present(skb)) {
4143                if (pt_prev) {
4144                        ret = deliver_skb(skb, pt_prev, orig_dev);
4145                        pt_prev = NULL;
4146                }
4147                if (vlan_do_receive(&skb))
4148                        goto another_round;
4149                else if (unlikely(!skb))
4150                        goto out;
4151        }
4152
4153        rx_handler = rcu_dereference(skb->dev->rx_handler);
4154        if (rx_handler) {
4155                if (pt_prev) {
4156                        ret = deliver_skb(skb, pt_prev, orig_dev);
4157                        pt_prev = NULL;
4158                }
4159                switch (rx_handler(&skb)) {
4160                case RX_HANDLER_CONSUMED:
4161                        ret = NET_RX_SUCCESS;
4162                        goto out;
4163                case RX_HANDLER_ANOTHER:
4164                        goto another_round;
4165                case RX_HANDLER_EXACT:
4166                        deliver_exact = true;
4167                case RX_HANDLER_PASS:
4168                        break;
4169                default:
4170                        BUG();
4171                }
4172        }
4173
4174        if (unlikely(skb_vlan_tag_present(skb))) {
4175                if (skb_vlan_tag_get_id(skb))
4176                        skb->pkt_type = PACKET_OTHERHOST;
4177                /* Note: we might in the future use the prio bits
4178                 * and set skb->priority as in vlan_do_receive().
4179                 * For the time being, just ignore the Priority Code Point.
4180                 */
4181                skb->vlan_tci = 0;
4182        }
4183
4184        type = skb->protocol;
4185
4186        /* deliver only exact match when indicated */
4187        if (likely(!deliver_exact)) {
4188                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4189                                       &ptype_base[ntohs(type) &
4190                                                   PTYPE_HASH_MASK]);
4191        }
4192
4193        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4194                               &orig_dev->ptype_specific);
4195
4196        if (unlikely(skb->dev != orig_dev)) {
4197                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4198                                       &skb->dev->ptype_specific);
4199        }
4200
4201        if (pt_prev) {
4202                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4203                        goto drop;
4204                else
4205                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4206        } else {
4207drop:
4208                if (!deliver_exact)
4209                        atomic_long_inc(&skb->dev->rx_dropped);
4210                else
4211                        atomic_long_inc(&skb->dev->rx_nohandler);
4212                kfree_skb(skb);
4213                /* Jamal, now you will not be able to escape explaining
4214                 * to me how you were going to use this. :-)
4215                 */
4216                ret = NET_RX_DROP;
4217        }
4218
4219out:
4220        return ret;
4221}
4222
4223static int __netif_receive_skb(struct sk_buff *skb)
4224{
4225        int ret;
4226
4227        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4228                unsigned long pflags = current->flags;
4229
4230                /*
4231                 * PFMEMALLOC skbs are special, they should
4232                 * - be delivered to SOCK_MEMALLOC sockets only
4233                 * - stay away from userspace
4234                 * - have bounded memory usage
4235                 *
4236                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4237                 * context down to all allocation sites.
4238                 */
4239                current->flags |= PF_MEMALLOC;
4240                ret = __netif_receive_skb_core(skb, true);
4241                tsk_restore_flags(current, pflags, PF_MEMALLOC);
4242        } else
4243                ret = __netif_receive_skb_core(skb, false);
4244
4245        return ret;
4246}
4247
4248static int netif_receive_skb_internal(struct sk_buff *skb)
4249{
4250        int ret;
4251
4252        net_timestamp_check(netdev_tstamp_prequeue, skb);
4253
4254        if (skb_defer_rx_timestamp(skb))
4255                return NET_RX_SUCCESS;
4256
4257        rcu_read_lock();
4258
4259#ifdef CONFIG_RPS
4260        if (static_key_false(&rps_needed)) {
4261                struct rps_dev_flow voidflow, *rflow = &voidflow;
4262                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4263
4264                if (cpu >= 0) {
4265                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4266                        rcu_read_unlock();
4267                        return ret;
4268                }
4269        }
4270#endif
4271        ret = __netif_receive_skb(skb);
4272        rcu_read_unlock();
4273        return ret;
4274}
4275
4276/**
4277 *      netif_receive_skb - process receive buffer from network
4278 *      @skb: buffer to process
4279 *
4280 *      netif_receive_skb() is the main receive data processing function.
4281 *      It always succeeds. The buffer may be dropped during processing
4282 *      for congestion control or by the protocol layers.
4283 *
4284 *      This function may only be called from softirq context and interrupts
4285 *      should be enabled.
4286 *
4287 *      Return values (usually ignored):
4288 *      NET_RX_SUCCESS: no congestion
4289 *      NET_RX_DROP: packet was dropped
4290 */
4291int netif_receive_skb(struct sk_buff *skb)
4292{
4293        trace_netif_receive_skb_entry(skb);
4294
4295        return netif_receive_skb_internal(skb);
4296}
4297EXPORT_SYMBOL(netif_receive_skb);
4298
4299/* Network device is going away; flush any packets still pending.
4300 * Called with irqs disabled.
4301 */
4302static void flush_backlog(void *arg)
4303{
4304        struct net_device *dev = arg;
4305        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4306        struct sk_buff *skb, *tmp;
4307
4308        rps_lock(sd);
4309        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4310                if (skb->dev == dev) {
4311                        __skb_unlink(skb, &sd->input_pkt_queue);
4312                        kfree_skb(skb);
4313                        input_queue_head_incr(sd);
4314                }
4315        }
4316        rps_unlock(sd);
4317
4318        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4319                if (skb->dev == dev) {
4320                        __skb_unlink(skb, &sd->process_queue);
4321                        kfree_skb(skb);
4322                        input_queue_head_incr(sd);
4323                }
4324        }
4325}
4326
4327static int napi_gro_complete(struct sk_buff *skb)
4328{
4329        struct packet_offload *ptype;
4330        __be16 type = skb->protocol;
4331        struct list_head *head = &offload_base;
4332        int err = -ENOENT;
4333
4334        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4335
4336        if (NAPI_GRO_CB(skb)->count == 1) {
4337                skb_shinfo(skb)->gso_size = 0;
4338                goto out;
4339        }
4340
4341        rcu_read_lock();
4342        list_for_each_entry_rcu(ptype, head, list) {
4343                if (ptype->type != type || !ptype->callbacks.gro_complete)
4344                        continue;
4345
4346                err = ptype->callbacks.gro_complete(skb, 0);
4347                break;
4348        }
4349        rcu_read_unlock();
4350
4351        if (err) {
4352                WARN_ON(&ptype->list == head);
4353                kfree_skb(skb);
4354                return NET_RX_SUCCESS;
4355        }
4356
4357out:
4358        return netif_receive_skb_internal(skb);
4359}
4360
4361/* napi->gro_list contains packets ordered by age, with the
4362 * youngest packets at the head of the list.
4363 * Complete skbs in reverse order to reduce latencies.
4364 */
4365void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4366{
4367        struct sk_buff *skb, *prev = NULL;
4368
4369        /* scan list and build reverse chain */
4370        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4371                skb->prev = prev;
4372                prev = skb;
4373        }
4374
4375        for (skb = prev; skb; skb = prev) {
4376                skb->next = NULL;
4377
4378                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4379                        return;
4380
4381                prev = skb->prev;
4382                napi_gro_complete(skb);
4383                napi->gro_count--;
4384        }
4385
4386        napi->gro_list = NULL;
4387}
4388EXPORT_SYMBOL(napi_gro_flush);
4389
4390static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4391{
4392        struct sk_buff *p;
4393        unsigned int maclen = skb->dev->hard_header_len;
4394        u32 hash = skb_get_hash_raw(skb);
4395
4396        for (p = napi->gro_list; p; p = p->next) {
4397                unsigned long diffs;
4398
4399                NAPI_GRO_CB(p)->flush = 0;
4400
4401                if (hash != skb_get_hash_raw(p)) {
4402                        NAPI_GRO_CB(p)->same_flow = 0;
4403                        continue;
4404                }
4405
4406                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4407                diffs |= p->vlan_tci ^ skb->vlan_tci;
4408                diffs |= skb_metadata_dst_cmp(p, skb);
4409                if (maclen == ETH_HLEN)
4410                        diffs |= compare_ether_header(skb_mac_header(p),
4411                                                      skb_mac_header(skb));
4412                else if (!diffs)
4413                        diffs = memcmp(skb_mac_header(p),
4414                                       skb_mac_header(skb),
4415                                       maclen);
4416                NAPI_GRO_CB(p)->same_flow = !diffs;
4417        }
4418}
4419
4420static void skb_gro_reset_offset(struct sk_buff *skb)
4421{
4422        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4423        const skb_frag_t *frag0 = &pinfo->frags[0];
4424
4425        NAPI_GRO_CB(skb)->data_offset = 0;
4426        NAPI_GRO_CB(skb)->frag0 = NULL;
4427        NAPI_GRO_CB(skb)->frag0_len = 0;
4428
4429        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4430            pinfo->nr_frags &&
4431            !PageHighMem(skb_frag_page(frag0))) {
4432                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4433                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4434        }
4435}
4436
4437static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4438{
4439        struct skb_shared_info *pinfo = skb_shinfo(skb);
4440
4441        BUG_ON(skb->end - skb->tail < grow);
4442
4443        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4444
4445        skb->data_len -= grow;
4446        skb->tail += grow;
4447
4448        pinfo->frags[0].page_offset += grow;
4449        skb_frag_size_sub(&pinfo->frags[0], grow);
4450
4451        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4452                skb_frag_unref(skb, 0);
4453                memmove(pinfo->frags, pinfo->frags + 1,
4454                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4455        }
4456}
4457
4458static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4459{
4460        struct sk_buff **pp = NULL;
4461        struct packet_offload *ptype;
4462        __be16 type = skb->protocol;
4463        struct list_head *head = &offload_base;
4464        int same_flow;
4465        enum gro_result ret;
4466        int grow;
4467
4468        if (!(skb->dev->features & NETIF_F_GRO))
4469                goto normal;
4470
4471        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4472                goto normal;
4473
4474        gro_list_prepare(napi, skb);
4475
4476        rcu_read_lock();
4477        list_for_each_entry_rcu(ptype, head, list) {
4478                if (ptype->type != type || !ptype->callbacks.gro_receive)
4479                        continue;
4480
4481                skb_set_network_header(skb, skb_gro_offset(skb));
4482                skb_reset_mac_len(skb);
4483                NAPI_GRO_CB(skb)->same_flow = 0;
4484                NAPI_GRO_CB(skb)->flush = 0;
4485                NAPI_GRO_CB(skb)->free = 0;
4486                NAPI_GRO_CB(skb)->encap_mark = 0;
4487                NAPI_GRO_CB(skb)->is_fou = 0;
4488                NAPI_GRO_CB(skb)->is_atomic = 1;
4489                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4490
4491                /* Setup for GRO checksum validation */
4492                switch (skb->ip_summed) {
4493                case CHECKSUM_COMPLETE:
4494                        NAPI_GRO_CB(skb)->csum = skb->csum;
4495                        NAPI_GRO_CB(skb)->csum_valid = 1;
4496                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4497                        break;
4498                case CHECKSUM_UNNECESSARY:
4499                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4500                        NAPI_GRO_CB(skb)->csum_valid = 0;
4501                        break;
4502                default:
4503                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4504                        NAPI_GRO_CB(skb)->csum_valid = 0;
4505                }
4506
4507                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4508                break;
4509        }
4510        rcu_read_unlock();
4511
4512        if (&ptype->list == head)
4513                goto normal;
4514
4515        same_flow = NAPI_GRO_CB(skb)->same_flow;
4516        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4517
4518        if (pp) {
4519                struct sk_buff *nskb = *pp;
4520
4521                *pp = nskb->next;
4522                nskb->next = NULL;
4523                napi_gro_complete(nskb);
4524                napi->gro_count--;
4525        }
4526
4527        if (same_flow)
4528                goto ok;
4529
4530        if (NAPI_GRO_CB(skb)->flush)
4531                goto normal;
4532
4533        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4534                struct sk_buff *nskb = napi->gro_list;
4535
4536                /* locate the end of the list to select the 'oldest' flow */
4537                while (nskb->next) {
4538                        pp = &nskb->next;
4539                        nskb = *pp;
4540                }
4541                *pp = NULL;
4542                nskb->next = NULL;
4543                napi_gro_complete(nskb);
4544        } else {
4545                napi->gro_count++;
4546        }
4547        NAPI_GRO_CB(skb)->count = 1;
4548        NAPI_GRO_CB(skb)->age = jiffies;
4549        NAPI_GRO_CB(skb)->last = skb;
4550        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4551        skb->next = napi->gro_list;
4552        napi->gro_list = skb;
4553        ret = GRO_HELD;
4554
4555pull:
4556        grow = skb_gro_offset(skb) - skb_headlen(skb);
4557        if (grow > 0)
4558                gro_pull_from_frag0(skb, grow);
4559ok:
4560        return ret;
4561
4562normal:
4563        ret = GRO_NORMAL;
4564        goto pull;
4565}
4566
4567struct packet_offload *gro_find_receive_by_type(__be16 type)
4568{
4569        struct list_head *offload_head = &offload_base;
4570        struct packet_offload *ptype;
4571
4572        list_for_each_entry_rcu(ptype, offload_head, list) {
4573                if (ptype->type != type || !ptype->callbacks.gro_receive)
4574                        continue;
4575                return ptype;
4576        }
4577        return NULL;
4578}
4579EXPORT_SYMBOL(gro_find_receive_by_type);
4580
4581struct packet_offload *gro_find_complete_by_type(__be16 type)
4582{
4583        struct list_head *offload_head = &offload_base;
4584        struct packet_offload *ptype;
4585
4586        list_for_each_entry_rcu(ptype, offload_head, list) {
4587                if (ptype->type != type || !ptype->callbacks.gro_complete)
4588                        continue;
4589                return ptype;
4590        }
4591        return NULL;
4592}
4593EXPORT_SYMBOL(gro_find_complete_by_type);
4594
4595static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4596{
4597        switch (ret) {
4598        case GRO_NORMAL:
4599                if (netif_receive_skb_internal(skb))
4600                        ret = GRO_DROP;
4601                break;
4602
4603        case GRO_DROP:
4604                kfree_skb(skb);
4605                break;
4606
4607        case GRO_MERGED_FREE:
4608                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4609                        skb_dst_drop(skb);
4610                        kmem_cache_free(skbuff_head_cache, skb);
4611                } else {
4612                        __kfree_skb(skb);
4613                }
4614                break;
4615
4616        case GRO_HELD:
4617        case GRO_MERGED:
4618                break;
4619        }
4620
4621        return ret;
4622}
4623
4624gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4625{
4626        skb_mark_napi_id(skb, napi);
4627        trace_napi_gro_receive_entry(skb);
4628
4629        skb_gro_reset_offset(skb);
4630
4631        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4632}
4633EXPORT_SYMBOL(napi_gro_receive);
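
/* Typical NAPI poll sketch (illustration only; struct example_priv and
 * example_dequeue_rx() are invented stand-ins for a real RX ring): the
 * poll routine feeds completed frames to napi_gro_receive() so they can be
 * merged by GRO before being handed further up the stack.
 */
struct example_priv {
        struct napi_struct napi;
        /* device specific RX ring state would live here */
};

static struct sk_buff *example_dequeue_rx(struct example_priv *priv);

static int example_napi_poll(struct napi_struct *napi, int budget)
{
        struct example_priv *priv = container_of(napi, struct example_priv,
                                                 napi);
        int done = 0;

        while (done < budget) {
                struct sk_buff *skb = example_dequeue_rx(priv);

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, napi->dev);
                napi_gro_receive(napi, skb);
                done++;
        }
        if (done < budget)
                napi_complete(napi);    /* driver re-enables its IRQs here */
        return done;
}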
4634
4635static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4636{
4637        if (unlikely(skb->pfmemalloc)) {
4638                consume_skb(skb);
4639                return;
4640        }
4641        __skb_pull(skb, skb_headlen(skb));
4642        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4643        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4644        skb->vlan_tci = 0;
4645        skb->dev = napi->dev;
4646        skb->skb_iif = 0;
4647        skb->encapsulation = 0;
4648        skb_shinfo(skb)->gso_type = 0;
4649        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4650
4651        napi->skb = skb;
4652}
4653
4654struct sk_buff *napi_get_frags(struct napi_struct *napi)
4655{
4656        struct sk_buff *skb = napi->skb;
4657
4658        if (!skb) {
4659                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4660                if (skb) {
4661                        napi->skb = skb;
4662                        skb_mark_napi_id(skb, napi);
4663                }
4664        }
4665        return skb;
4666}
4667EXPORT_SYMBOL(napi_get_frags);
4668
4669static gro_result_t napi_frags_finish(struct napi_struct *napi,
4670                                      struct sk_buff *skb,
4671                                      gro_result_t ret)
4672{
4673        switch (ret) {
4674        case GRO_NORMAL:
4675        case GRO_HELD:
4676                __skb_push(skb, ETH_HLEN);
4677                skb->protocol = eth_type_trans(skb, skb->dev);
4678                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4679                        ret = GRO_DROP;
4680                break;
4681
4682        case GRO_DROP:
4683        case GRO_MERGED_FREE:
4684                napi_reuse_skb(napi, skb);
4685                break;
4686
4687        case GRO_MERGED:
4688                break;
4689        }
4690
4691        return ret;
4692}
4693
4694/* The upper GRO stack assumes the network header starts at gro_offset=0.
4695 * Drivers may call both napi_gro_frags() and napi_gro_receive(), so we copy
4696 * the Ethernet header into skb->data to have a common layout.
4697 */
4698static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4699{
4700        struct sk_buff *skb = napi->skb;
4701        const struct ethhdr *eth;
4702        unsigned int hlen = sizeof(*eth);
4703
4704        napi->skb = NULL;
4705
4706        skb_reset_mac_header(skb);
4707        skb_gro_reset_offset(skb);
4708
4709        eth = skb_gro_header_fast(skb, 0);
4710        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4711                eth = skb_gro_header_slow(skb, hlen, 0);
4712                if (unlikely(!eth)) {
4713                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4714                                             __func__, napi->dev->name);
4715                        napi_reuse_skb(napi, skb);
4716                        return NULL;
4717                }
4718        } else {
4719                gro_pull_from_frag0(skb, hlen);
4720                NAPI_GRO_CB(skb)->frag0 += hlen;
4721                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4722        }
4723        __skb_pull(skb, hlen);
4724
4725        /*
4726         * This works because the only protocols we care about don't require
4727         * special handling.
4728         * We'll fix it up properly in napi_frags_finish()
4729         */
4730        skb->protocol = eth->h_proto;
4731
4732        return skb;
4733}
4734
4735gro_result_t napi_gro_frags(struct napi_struct *napi)
4736{
4737        struct sk_buff *skb = napi_frags_skb(napi);
4738
4739        if (!skb)
4740                return GRO_DROP;
4741
4742        trace_napi_gro_frags_entry(skb);
4743
4744        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4745}
4746EXPORT_SYMBOL(napi_gro_frags);
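
/* Illustrative sketch, not part of the original file: the frag-based GRO path
 * for drivers that receive directly into pages.  The skb is borrowed from
 * napi_get_frags(), filled with one page fragment and handed back through
 * napi_gro_frags(), which pulls the Ethernet header in napi_frags_skb()
 * above.  foo_rx_frag() and its parameters are assumptions for illustration;
 * a real driver would account the buffer's actual truesize.
 */
static gro_result_t foo_rx_frag(struct napi_struct *napi, struct page *page,
                                unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return GRO_DROP;

        skb_fill_page_desc(skb, 0, page, offset, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += PAGE_SIZE;     /* placeholder accounting */

        return napi_gro_frags(napi);
}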
4747
4748/* Compute the checksum from gro_offset and return the folded value
4749 * after adding in any pseudo checksum.
4750 */
4751__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4752{
4753        __wsum wsum;
4754        __sum16 sum;
4755
4756        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4757
4758        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4759        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4760        if (likely(!sum)) {
4761                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4762                    !skb->csum_complete_sw)
4763                        netdev_rx_csum_fault(skb->dev);
4764        }
4765
4766        NAPI_GRO_CB(skb)->csum = wsum;
4767        NAPI_GRO_CB(skb)->csum_valid = 1;
4768
4769        return sum;
4770}
4771EXPORT_SYMBOL(__skb_gro_checksum_complete);
4772
4773/*
4774 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4775 * Note: called with local irq disabled, but exits with local irq enabled.
4776 */
4777static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4778{
4779#ifdef CONFIG_RPS
4780        struct softnet_data *remsd = sd->rps_ipi_list;
4781
4782        if (remsd) {
4783                sd->rps_ipi_list = NULL;
4784
4785                local_irq_enable();
4786
4787                /* Send pending IPIs to kick RPS processing on remote CPUs. */
4788                while (remsd) {
4789                        struct softnet_data *next = remsd->rps_ipi_next;
4790
4791                        if (cpu_online(remsd->cpu))
4792                                smp_call_function_single_async(remsd->cpu,
4793                                                           &remsd->csd);
4794                        remsd = next;
4795                }
4796        } else
4797#endif
4798                local_irq_enable();
4799}
4800
4801static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4802{
4803#ifdef CONFIG_RPS
4804        return sd->rps_ipi_list != NULL;
4805#else
4806        return false;
4807#endif
4808}
4809
4810static int process_backlog(struct napi_struct *napi, int quota)
4811{
4812        int work = 0;
4813        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4814
4815        /* Check if we have pending IPIs; it's better to send them now
4816         * rather than waiting for net_rx_action() to end.
4817         */
4818        if (sd_has_rps_ipi_waiting(sd)) {
4819                local_irq_disable();
4820                net_rps_action_and_irq_enable(sd);
4821        }
4822
4823        napi->weight = weight_p;
4824        local_irq_disable();
4825        while (1) {
4826                struct sk_buff *skb;
4827
4828                while ((skb = __skb_dequeue(&sd->process_queue))) {
4829                        rcu_read_lock();
4830                        local_irq_enable();
4831                        __netif_receive_skb(skb);
4832                        rcu_read_unlock();
4833                        local_irq_disable();
4834                        input_queue_head_incr(sd);
4835                        if (++work >= quota) {
4836                                local_irq_enable();
4837                                return work;
4838                        }
4839                }
4840
4841                rps_lock(sd);
4842                if (skb_queue_empty(&sd->input_pkt_queue)) {
4843                        /*
4844                         * Inline a custom version of __napi_complete().
4845                         * Only the current CPU owns and manipulates this NAPI,
4846                         * and NAPI_STATE_SCHED is the only possible flag set
4847                         * on backlog.
4848                         * We can use a plain write instead of clear_bit(),
4849                         * and we don't need an smp_mb() memory barrier.
4850                         */
4851                        napi->state = 0;
4852                        rps_unlock(sd);
4853
4854                        break;
4855                }
4856
4857                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4858                                           &sd->process_queue);
4859                rps_unlock(sd);
4860        }
4861        local_irq_enable();
4862
4863        return work;
4864}
4865
4866/**
4867 * __napi_schedule - schedule for receive
4868 * @n: entry to schedule
4869 *
4870 * The entry's receive function will be scheduled to run.
4871 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4872 */
4873void __napi_schedule(struct napi_struct *n)
4874{
4875        unsigned long flags;
4876
4877        local_irq_save(flags);
4878        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4879        local_irq_restore(flags);
4880}
4881EXPORT_SYMBOL(__napi_schedule);
4882
4883/**
4884 * __napi_schedule_irqoff - schedule for receive
4885 * @n: entry to schedule
4886 *
4887 * Variant of __napi_schedule() assuming hard irqs are masked
4888 */
4889void __napi_schedule_irqoff(struct napi_struct *n)
4890{
4891        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4892}
4893EXPORT_SYMBOL(__napi_schedule_irqoff);
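
/* Illustrative sketch, not part of the original file: a typical MSI-X handler
 * defers the work to NAPI.  In a non-threaded handler hard irqs are already
 * masked, so the cheaper __napi_schedule_irqoff() can be used once
 * NAPI_STATE_SCHED has been claimed.  foo_msix_handler() is an assumption for
 * illustration and presumes <linux/interrupt.h> for irqreturn_t.
 */
static irqreturn_t foo_msix_handler(int irq, void *data)
{
        struct napi_struct *napi = data;

        /* A real driver would mask the queue's interrupt here. */
        if (napi_schedule_prep(napi))
                __napi_schedule_irqoff(napi);

        return IRQ_HANDLED;
}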
4894
4895void __napi_complete(struct napi_struct *n)
4896{
4897        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4898
4899        list_del_init(&n->poll_list);
4900        smp_mb__before_atomic();
4901        clear_bit(NAPI_STATE_SCHED, &n->state);
4902}
4903EXPORT_SYMBOL(__napi_complete);
4904
4905void napi_complete_done(struct napi_struct *n, int work_done)
4906{
4907        unsigned long flags;
4908
4909        /*
4910         * Don't let NAPI dequeue from the CPU poll list
4911         * just in case it's running on a different CPU.
4912         */
4913        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4914                return;
4915
4916        if (n->gro_list) {
4917                unsigned long timeout = 0;
4918
4919                if (work_done)
4920                        timeout = n->dev->gro_flush_timeout;
4921
4922                if (timeout)
4923                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4924                                      HRTIMER_MODE_REL_PINNED);
4925                else
4926                        napi_gro_flush(n, false);
4927        }
4928        if (likely(list_empty(&n->poll_list))) {
4929                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4930        } else {
4931                /* If n->poll_list is not empty, we need to mask irqs */
4932                local_irq_save(flags);
4933                __napi_complete(n);
4934                local_irq_restore(flags);
4935        }
4936}
4937EXPORT_SYMBOL(napi_complete_done);
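
/* Illustrative sketch, not part of the original file: the completion side of
 * a NAPI poll routine.  The instance is completed only when less than the
 * full budget was used; returning the whole budget keeps it on the poll list.
 * foo_poll() and foo_clean_rx_ring() are assumptions for illustration, the
 * latter standing in for real descriptor ring processing.
 */
static int foo_clean_rx_ring(struct napi_struct *napi, int budget)
{
        return 0;       /* placeholder for the driver's RX work */
}

static int foo_poll(struct napi_struct *napi, int budget)
{
        int work_done = foo_clean_rx_ring(napi, budget);

        if (work_done < budget)
                napi_complete_done(napi, work_done);

        return work_done;
}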
4938
4939/* must be called under rcu_read_lock(), as we don't take a reference */
4940static struct napi_struct *napi_by_id(unsigned int napi_id)
4941{
4942        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4943        struct napi_struct *napi;
4944
4945        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4946                if (napi->napi_id == napi_id)
4947                        return napi;
4948
4949        return NULL;
4950}
4951
4952#if defined(CONFIG_NET_RX_BUSY_POLL)
4953#define BUSY_POLL_BUDGET 8
4954bool sk_busy_loop(struct sock *sk, int nonblock)
4955{
4956        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4957        int (*busy_poll)(struct napi_struct *dev);
4958        struct napi_struct *napi;
4959        int rc = false;
4960
4961        rcu_read_lock();
4962
4963        napi = napi_by_id(sk->sk_napi_id);
4964        if (!napi)
4965                goto out;
4966
4967        /* Note: ndo_busy_poll method is optional in linux-4.5 */
4968        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4969
4970        do {
4971                rc = 0;
4972                local_bh_disable();
4973                if (busy_poll) {
4974                        rc = busy_poll(napi);
4975                } else if (napi_schedule_prep(napi)) {
4976                        void *have = netpoll_poll_lock(napi);
4977
4978                        if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4979                                rc = napi->poll(napi, BUSY_POLL_BUDGET);
4980                                trace_napi_poll(napi);
4981                                if (rc == BUSY_POLL_BUDGET) {
4982                                        napi_complete_done(napi, rc);
4983                                        napi_schedule(napi);
4984                                }
4985                        }
4986                        netpoll_poll_unlock(have);
4987                }
4988                if (rc > 0)
4989                        __NET_ADD_STATS(sock_net(sk),
4990                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4991                local_bh_enable();
4992
4993                if (rc == LL_FLUSH_FAILED)
4994                        break; /* permanent failure */
4995
4996                cpu_relax();
4997        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4998                 !need_resched() && !busy_loop_timeout(end_time));
4999
5000        rc = !skb_queue_empty(&sk->sk_receive_queue);
5001out:
5002        rcu_read_unlock();
5003        return rc;
5004}
5005EXPORT_SYMBOL(sk_busy_loop);
5006
5007#endif /* CONFIG_NET_RX_BUSY_POLL */
5008
5009void napi_hash_add(struct napi_struct *napi)
5010{
5011        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5012            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5013                return;
5014
5015        spin_lock(&napi_hash_lock);
5016
5017        /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5018        do {
5019                if (unlikely(++napi_gen_id < NR_CPUS + 1))
5020                        napi_gen_id = NR_CPUS + 1;
5021        } while (napi_by_id(napi_gen_id));
5022        napi->napi_id = napi_gen_id;
5023
5024        hlist_add_head_rcu(&napi->napi_hash_node,
5025                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5026
5027        spin_unlock(&napi_hash_lock);
5028}
5029EXPORT_SYMBOL_GPL(napi_hash_add);
5030
5031/* Warning: the caller is responsible for making sure an RCU grace period
5032 * has elapsed before freeing the memory containing @napi.
5033 */
5034bool napi_hash_del(struct napi_struct *napi)
5035{
5036        bool rcu_sync_needed = false;
5037
5038        spin_lock(&napi_hash_lock);
5039
5040        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5041                rcu_sync_needed = true;
5042                hlist_del_rcu(&napi->napi_hash_node);
5043        }
5044        spin_unlock(&napi_hash_lock);
5045        return rcu_sync_needed;
5046}
5047EXPORT_SYMBOL_GPL(napi_hash_del);
5048
5049static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5050{
5051        struct napi_struct *napi;
5052
5053        napi = container_of(timer, struct napi_struct, timer);
5054        if (napi->gro_list)
5055                napi_schedule(napi);
5056
5057        return HRTIMER_NORESTART;
5058}
5059
5060void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5061                    int (*poll)(struct napi_struct *, int), int weight)
5062{
5063        INIT_LIST_HEAD(&napi->poll_list);
5064        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5065        napi->timer.function = napi_watchdog;
5066        napi->gro_count = 0;
5067        napi->gro_list = NULL;
5068        napi->skb = NULL;
5069        napi->poll = poll;
5070        if (weight > NAPI_POLL_WEIGHT)
5071                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5072                            weight, dev->name);
5073        napi->weight = weight;
5074        list_add(&napi->dev_list, &dev->napi_list);
5075        napi->dev = dev;
5076#ifdef CONFIG_NETPOLL
5077        spin_lock_init(&napi->poll_lock);
5078        napi->poll_owner = -1;
5079#endif
5080        set_bit(NAPI_STATE_SCHED, &napi->state);
5081        napi_hash_add(napi);
5082}
5083EXPORT_SYMBOL(netif_napi_add);
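
/* Illustrative sketch, not part of the original file: probe-time NAPI setup.
 * netif_napi_add() registers the poll handler and leaves the instance in the
 * SCHED state, so napi_enable() must be called (typically from ndo_open)
 * before interrupts are allowed to schedule it.  foo_setup_napi() is an
 * assumption for illustration; foo_poll() is the sketch shown earlier.
 */
static void foo_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
        netif_napi_add(dev, napi, foo_poll, NAPI_POLL_WEIGHT);
        napi_enable(napi);
}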
5084
5085void napi_disable(struct napi_struct *n)
5086{
5087        might_sleep();
5088        set_bit(NAPI_STATE_DISABLE, &n->state);
5089
5090        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5091                msleep(1);
5092        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5093                msleep(1);
5094
5095        hrtimer_cancel(&n->timer);
5096
5097        clear_bit(NAPI_STATE_DISABLE, &n->state);
5098}
5099EXPORT_SYMBOL(napi_disable);
5100
5101/* Must be called in process context */
5102void netif_napi_del(struct napi_struct *napi)
5103{
5104        might_sleep();
5105        if (napi_hash_del(napi))
5106                synchronize_net();
5107        list_del_init(&napi->dev_list);
5108        napi_free_frags(napi);
5109
5110        kfree_skb_list(napi->gro_list);
5111        napi->gro_list = NULL;
5112        napi->gro_count = 0;
5113}
5114EXPORT_SYMBOL(netif_napi_del);
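
/* Illustrative sketch, not part of the original file: the matching teardown.
 * napi_disable() may sleep while a poll is still running, so both calls
 * require process context.  foo_teardown_napi() is an assumption for
 * illustration.
 */
static void foo_teardown_napi(struct napi_struct *napi)
{
        napi_disable(napi);
        netif_napi_del(napi);
}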
5115
5116static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5117{
5118        void *have;
5119        int work, weight;
5120
5121        list_del_init(&n->poll_list);
5122
5123        have = netpoll_poll_lock(n);
5124
5125        weight = n->weight;
5126
5127        /* This NAPI_STATE_SCHED test is for avoiding a race
5128         * with netpoll's poll_napi().  Only the entity which
5129         * obtains the lock and sees NAPI_STATE_SCHED set will
5130         * actually make the ->poll() call.  Therefore we avoid
5131         * accidentally calling ->poll() when NAPI is not scheduled.
5132         */
5133        work = 0;
5134        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5135                work = n->poll(n, weight);
5136                trace_napi_poll(n);
5137        }
5138
5139        WARN_ON_ONCE(work > weight);
5140
5141        if (likely(work < weight))
5142                goto out_unlock;
5143
5144        /* Drivers must not modify the NAPI state if they
5145         * consume the entire weight.  In such cases this code
5146         * still "owns" the NAPI instance and therefore can
5147         * move the instance around on the list at-will.
5148         */
5149        if (unlikely(napi_disable_pending(n))) {
5150                napi_complete(n);
5151                goto out_unlock;
5152        }
5153
5154        if (n->gro_list) {
5155                /* flush too old packets
5156                 * If HZ < 1000, flush all packets.
5157                 */
5158                napi_gro_flush(n, HZ >= 1000);
5159        }
5160
5161        /* Some drivers may have called napi_schedule
5162         * prior to exhausting their budget.
5163         */
5164        if (unlikely(!list_empty(&n->poll_list))) {
5165                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5166                             n->dev ? n->dev->name : "backlog");
5167                goto out_unlock;
5168        }
5169
5170        list_add_tail(&n->poll_list, repoll);
5171
5172out_unlock:
5173        netpoll_poll_unlock(have);
5174
5175        return work;
5176}
5177
5178static void net_rx_action(struct softirq_action *h)
5179{
5180        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5181        unsigned long time_limit = jiffies + 2;
5182        int budget = netdev_budget;
5183        LIST_HEAD(list);
5184        LIST_HEAD(repoll);
5185
5186        local_irq_disable();
5187        list_splice_init(&sd->poll_list, &list);
5188        local_irq_enable();
5189
5190        for (;;) {
5191                struct napi_struct *n;
5192
5193                if (list_empty(&list)) {
5194                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5195                                return;
5196                        break;
5197                }
5198
5199                n = list_first_entry(&list, struct napi_struct, poll_list);
5200                budget -= napi_poll(n, &repoll);
5201
5202                /* If the softirq window is exhausted then punt.
5203                 * Allow this to run for 2 jiffies, which allows
5204                 * an average latency of 1.5/HZ.
5205                 */
5206                if (unlikely(budget <= 0 ||
5207                             time_after_eq(jiffies, time_limit))) {
5208                        sd->time_squeeze++;
5209                        break;
5210                }
5211        }
5212
5213        __kfree_skb_flush();
5214        local_irq_disable();
5215
5216        list_splice_tail_init(&sd->poll_list, &list);
5217        list_splice_tail(&repoll, &list);
5218        list_splice(&list, &sd->poll_list);
5219        if (!list_empty(&sd->poll_list))
5220                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5221
5222        net_rps_action_and_irq_enable(sd);
5223}
5224
5225struct netdev_adjacent {
5226        struct net_device *dev;
5227
5228        /* upper master flag, there can only be one master device per list */
5229        bool master;
5230
5231        /* counter for the number of times this device was added to us */
5232        u16 ref_nr;
5233
5234        /* private field for the users */
5235        void *private;
5236
5237        struct list_head list;
5238        struct rcu_head rcu;
5239};
5240
5241static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5242                                                 struct list_head *adj_list)
5243{
5244        struct netdev_adjacent *adj;
5245
5246        list_for_each_entry(adj, adj_list, list) {
5247                if (adj->dev == adj_dev)
5248                        return adj;
5249        }
5250        return NULL;
5251}
5252
5253/**
5254 * netdev_has_upper_dev - Check if device is linked to an upper device
5255 * @dev: device
5256 * @upper_dev: upper device to check
5257 *
5258 * Find out if a device is linked to the specified upper device and return
5259 * true if it is. Note that this checks only the immediate upper device,
5260 * not the complete stack of devices. The caller must hold the RTNL lock.
5261 */
5262bool netdev_has_upper_dev(struct net_device *dev,
5263                          struct net_device *upper_dev)
5264{
5265        ASSERT_RTNL();
5266
5267        return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5268}
5269EXPORT_SYMBOL(netdev_has_upper_dev);
5270
5271/**
5272 * netdev_has_any_upper_dev - Check if device is linked to some device
5273 * @dev: device
5274 *
5275 * Find out if a device is linked to an upper device and return true if
5276 * it is. The caller must hold the RTNL lock.
5277 */
5278static bool netdev_has_any_upper_dev(struct net_device *dev)
5279{
5280        ASSERT_RTNL();
5281
5282        return !list_empty(&dev->all_adj_list.upper);
5283}
5284
5285/**
5286 * netdev_master_upper_dev_get - Get master upper device
5287 * @dev: device
5288 *
5289 * Find a master upper device and return a pointer to it, or NULL if there
5290 * is none. The caller must hold the RTNL lock.
5291 */
5292struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5293{
5294        struct netdev_adjacent *upper;
5295
5296        ASSERT_RTNL();
5297
5298        if (list_empty(&dev->adj_list.upper))
5299                return NULL;
5300
5301        upper = list_first_entry(&dev->adj_list.upper,
5302                                 struct netdev_adjacent, list);
5303        if (likely(upper->master))
5304                return upper->dev;
5305        return NULL;
5306}
5307EXPORT_SYMBOL(netdev_master_upper_dev_get);
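
/* Illustrative sketch, not part of the original file: querying the adjacency
 * lists under RTNL, e.g. to find out whether a port is enslaved to a bonding
 * master.  foo_is_bond_slave() is an assumption for illustration;
 * netif_is_bond_master() is an existing helper.
 */
static bool foo_is_bond_slave(struct net_device *dev)
{
        struct net_device *master;

        ASSERT_RTNL();

        master = netdev_master_upper_dev_get(dev);
        return master && netif_is_bond_master(master);
}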
5308
5309void *netdev_adjacent_get_private(struct list_head *adj_list)
5310{
5311        struct netdev_adjacent *adj;
5312
5313        adj = list_entry(adj_list, struct netdev_adjacent, list);
5314
5315        return adj->private;
5316}
5317EXPORT_SYMBOL(netdev_adjacent_get_private);
5318
5319/**
5320 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5321 * @dev: device
5322 * @iter: list_head ** of the current position
5323 *
5324 * Gets the next device from the dev's upper list, starting from iter
5325 * position. The caller must hold RCU read lock.
5326 */
5327struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5328                                                 struct list_head **iter)
5329{
5330        struct netdev_adjacent *upper;
5331
5332        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5333
5334        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5335
5336        if (&upper->list == &dev->adj_list.upper)
5337                return NULL;
5338
5339        *iter = &upper->list;
5340
5341        return upper->dev;
5342}
5343EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
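
/* Illustrative sketch, not part of the original file: walking the direct
 * upper devices under RCU with the iterator above.  The cursor starts at the
 * list head and the helper returns NULL once the walk reaches it again.
 * foo_count_uppers() is an assumption for illustration.
 */
static unsigned int foo_count_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        unsigned int n = 0;

        rcu_read_lock();
        while (netdev_upper_get_next_dev_rcu(dev, &iter))
                n++;
        rcu_read_unlock();

        return n;
}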
5344
5345/**
5346 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5347 * @dev: device
5348 * @iter: list_head ** of the current position
5349 *
5350 * Gets the next device from the dev's upper list, starting from iter
5351 * position. The caller must hold RCU read lock.
5352 */
5353struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5354                                                     struct list_head **iter)
5355{
5356        struct netdev_adjacent *upper;
5357
5358        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5359
5360        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5361
5362        if (&upper->list == &dev->all_adj_list.upper)
5363                return NULL;
5364
5365        *iter = &upper->list;
5366
5367        return upper->dev;
5368}
5369EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5370
5371/**
5372 * netdev_lower_get_next_private - Get the next ->private from the
5373 *                                 lower neighbour list
5374 * @dev: device
5375 * @iter: list_head ** of the current position
5376 *
5377 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5378 * list, starting from iter position. The caller must hold either the
5379 * RTNL lock or its own locking that guarantees that the neighbour lower
5380 * list will remain unchanged.
5381 */
5382void *netdev_lower_get_next_private(struct net_device *dev,
5383                                    struct list_head **iter)
5384{
5385        struct netdev_adjacent *lower;
5386
5387        lower = list_entry(*iter, struct netdev_adjacent, list);
5388
5389        if (&lower->list == &dev->adj_list.lower)
5390                return NULL;
5391
5392        *iter = lower->list.next;
5393
5394        return lower->private;
5395}
5396EXPORT_SYMBOL(netdev_lower_get_next_private);
5397
5398/**
5399 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5400 *                                     lower neighbour list, RCU
5401 *                                     variant
5402 * @dev: device
5403 * @iter: list_head ** of the current position
5404 *
5405 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5406 * list, starting from iter position. The caller must hold RCU read lock.
5407 */
5408void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5409                                        struct list_head **iter)
5410{
5411        struct netdev_adjacent *lower;
5412
5413        WARN_ON_ONCE(!rcu_read_lock_held());
5414
5415        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5416
5417        if (&lower->list == &dev->adj_list.lower)
5418                return NULL;
5419
5420        *iter = &lower->list;
5421
5422        return lower->private;
5423}
5424EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5425
5426/**
5427 * netdev_lower_get_next - Get the next device from the lower neighbour
5428 *                         list
5429 * @dev: device
5430 * @iter: list_head ** of the current position
5431 *
5432 * Gets the next device from the dev's lower neighbour
5433 * list, starting from iter position. The caller must hold RTNL lock or
5434 * its own locking that guarantees that the neighbour lower
5435 * list will remain unchanged.
5436 */
5437void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5438{
5439        struct netdev_adjacent *lower;
5440
5441        lower = list_entry(*iter, struct netdev_adjacent, list);
5442
5443        if (&lower->list == &dev->adj_list.lower)
5444                return NULL;
5445
5446        *iter = lower->list.next;
5447
5448        return lower->dev;
5449}
5450EXPORT_SYMBOL(netdev_lower_get_next);
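
/* Illustrative sketch, not part of the original file: the lower-list walk is
 * normally wrapped by the netdev_for_each_lower_dev() macro (used by
 * dev_get_nest_level() later in this file), which seeds the cursor with the
 * first list entry.  foo_count_lowers() is an assumption for illustration and
 * must run under RTNL or equivalent locking.
 */
static unsigned int foo_count_lowers(struct net_device *dev)
{
        struct net_device *lower;
        struct list_head *iter;
        unsigned int n = 0;

        ASSERT_RTNL();

        netdev_for_each_lower_dev(dev, lower, iter)
                n++;

        return n;
}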
5451
5452/**
5453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5454 *                                     lower neighbour list, RCU
5455 *                                     variant
5456 * @dev: device
5457 *
5458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5459 * list. The caller must hold RCU read lock.
5460 */
5461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5462{
5463        struct netdev_adjacent *lower;
5464
5465        lower = list_first_or_null_rcu(&dev->adj_list.lower,
5466                        struct netdev_adjacent, list);
5467        if (lower)
5468                return lower->private;
5469        return NULL;
5470}
5471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5472
5473/**
5474 * netdev_master_upper_dev_get_rcu - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return a pointer to it, or NULL if there
5478 * is none. The caller must hold the RCU read lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5481{
5482        struct netdev_adjacent *upper;
5483
5484        upper = list_first_or_null_rcu(&dev->adj_list.upper,
5485                                       struct netdev_adjacent, list);
5486        if (upper && likely(upper->master))
5487                return upper->dev;
5488        return NULL;
5489}
5490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5491
5492static int netdev_adjacent_sysfs_add(struct net_device *dev,
5493                              struct net_device *adj_dev,
5494                              struct list_head *dev_list)
5495{
5496        char linkname[IFNAMSIZ+7];
5497        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5498                "upper_%s" : "lower_%s", adj_dev->name);
5499        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5500                                 linkname);
5501}
5502static void netdev_adjacent_sysfs_del(struct net_device *dev,
5503                               char *name,
5504                               struct list_head *dev_list)
5505{
5506        char linkname[IFNAMSIZ+7];
5507        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5508                "upper_%s" : "lower_%s", name);
5509        sysfs_remove_link(&(dev->dev.kobj), linkname);
5510}
5511
5512static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5513                                                 struct net_device *adj_dev,
5514                                                 struct list_head *dev_list)
5515{
5516        return (dev_list == &dev->adj_list.upper ||
5517                dev_list == &dev->adj_list.lower) &&
5518                net_eq(dev_net(dev), dev_net(adj_dev));
5519}
5520
5521static int __netdev_adjacent_dev_insert(struct net_device *dev,
5522                                        struct net_device *adj_dev,
5523                                        struct list_head *dev_list,
5524                                        void *private, bool master)
5525{
5526        struct netdev_adjacent *adj;
5527        int ret;
5528
5529        adj = __netdev_find_adj(adj_dev, dev_list);
5530
5531        if (adj) {
5532                adj->ref_nr++;
5533                return 0;
5534        }
5535
5536        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5537        if (!adj)
5538                return -ENOMEM;
5539
5540        adj->dev = adj_dev;
5541        adj->master = master;
5542        adj->ref_nr = 1;
5543        adj->private = private;
5544        dev_hold(adj_dev);
5545
5546        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5547                 adj_dev->name, dev->name, adj_dev->name);
5548
5549        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5550                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5551                if (ret)
5552                        goto free_adj;
5553        }
5554
5555        /* Ensure that master link is always the first item in list. */
5556        if (master) {
5557                ret = sysfs_create_link(&(dev->dev.kobj),
5558                                        &(adj_dev->dev.kobj), "master");
5559                if (ret)
5560                        goto remove_symlinks;
5561
5562                list_add_rcu(&adj->list, dev_list);
5563        } else {
5564                list_add_tail_rcu(&adj->list, dev_list);
5565        }
5566
5567        return 0;
5568
5569remove_symlinks:
5570        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5571                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5572free_adj:
5573        kfree(adj);
5574        dev_put(adj_dev);
5575
5576        return ret;
5577}
5578
5579static void __netdev_adjacent_dev_remove(struct net_device *dev,
5580                                         struct net_device *adj_dev,
5581                                         struct list_head *dev_list)
5582{
5583        struct netdev_adjacent *adj;
5584
5585        adj = __netdev_find_adj(adj_dev, dev_list);
5586
5587        if (!adj) {
5588                pr_err("tried to remove device %s from %s\n",
5589                       dev->name, adj_dev->name);
5590                BUG();
5591        }
5592
5593        if (adj->ref_nr > 1) {
5594                pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5595                         adj->ref_nr-1);
5596                adj->ref_nr--;
5597                return;
5598        }
5599
5600        if (adj->master)
5601                sysfs_remove_link(&(dev->dev.kobj), "master");
5602
5603        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5604                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5605
5606        list_del_rcu(&adj->list);
5607        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5608                 adj_dev->name, dev->name, adj_dev->name);
5609        dev_put(adj_dev);
5610        kfree_rcu(adj, rcu);
5611}
5612
5613static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5614                                            struct net_device *upper_dev,
5615                                            struct list_head *up_list,
5616                                            struct list_head *down_list,
5617                                            void *private, bool master)
5618{
5619        int ret;
5620
5621        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5622                                           master);
5623        if (ret)
5624                return ret;
5625
5626        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5627                                           false);
5628        if (ret) {
5629                __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5630                return ret;
5631        }
5632
5633        return 0;
5634}
5635
5636static int __netdev_adjacent_dev_link(struct net_device *dev,
5637                                      struct net_device *upper_dev)
5638{
5639        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5640                                                &dev->all_adj_list.upper,
5641                                                &upper_dev->all_adj_list.lower,
5642                                                NULL, false);
5643}
5644
5645static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5646                                               struct net_device *upper_dev,
5647                                               struct list_head *up_list,
5648                                               struct list_head *down_list)
5649{
5650        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5651        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5652}
5653
5654static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5655                                         struct net_device *upper_dev)
5656{
5657        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5658                                           &dev->all_adj_list.upper,
5659                                           &upper_dev->all_adj_list.lower);
5660}
5661
5662static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5663                                                struct net_device *upper_dev,
5664                                                void *private, bool master)
5665{
5666        int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5667
5668        if (ret)
5669                return ret;
5670
5671        ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5672                                               &dev->adj_list.upper,
5673                                               &upper_dev->adj_list.lower,
5674                                               private, master);
5675        if (ret) {
5676                __netdev_adjacent_dev_unlink(dev, upper_dev);
5677                return ret;
5678        }
5679
5680        return 0;
5681}
5682
5683static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5684                                                   struct net_device *upper_dev)
5685{
5686        __netdev_adjacent_dev_unlink(dev, upper_dev);
5687        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5688                                           &dev->adj_list.upper,
5689                                           &upper_dev->adj_list.lower);
5690}
5691
5692static int __netdev_upper_dev_link(struct net_device *dev,
5693                                   struct net_device *upper_dev, bool master,
5694                                   void *upper_priv, void *upper_info)
5695{
5696        struct netdev_notifier_changeupper_info changeupper_info;
5697        struct netdev_adjacent *i, *j, *to_i, *to_j;
5698        int ret = 0;
5699
5700        ASSERT_RTNL();
5701
5702        if (dev == upper_dev)
5703                return -EBUSY;
5704
5705        /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5706        if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5707                return -EBUSY;
5708
5709        if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5710                return -EEXIST;
5711
5712        if (master && netdev_master_upper_dev_get(dev))
5713                return -EBUSY;
5714
5715        changeupper_info.upper_dev = upper_dev;
5716        changeupper_info.master = master;
5717        changeupper_info.linking = true;
5718        changeupper_info.upper_info = upper_info;
5719
5720        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5721                                            &changeupper_info.info);
5722        ret = notifier_to_errno(ret);
5723        if (ret)
5724                return ret;
5725
5726        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5727                                                   master);
5728        if (ret)
5729                return ret;
5730
5731        /* Now that we linked these devs, make all the upper_dev's
5732         * all_adj_list.upper visible to every dev's all_adj_list.lower and
5733         * vice versa, and don't forget the devices themselves. All of these
5734         * links are non-neighbours.
5735         */
5736        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5737                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5738                        pr_debug("Interlinking %s with %s, non-neighbour\n",
5739                                 i->dev->name, j->dev->name);
5740                        ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5741                        if (ret)
5742                                goto rollback_mesh;
5743                }
5744        }
5745
5746        /* link dev to every upper device of upper_dev */
5747        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5748                pr_debug("linking %s's upper device %s with %s\n",
5749                         upper_dev->name, i->dev->name, dev->name);
5750                ret = __netdev_adjacent_dev_link(dev, i->dev);
5751                if (ret)
5752                        goto rollback_upper_mesh;
5753        }
5754
5755        /* link every lower device of dev to upper_dev */
5756        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5757                pr_debug("linking %s's lower device %s with %s\n", dev->name,
5758                         i->dev->name, upper_dev->name);
5759                ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5760                if (ret)
5761                        goto rollback_lower_mesh;
5762        }
5763
5764        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5765                                            &changeupper_info.info);
5766        ret = notifier_to_errno(ret);
5767        if (ret)
5768                goto rollback_lower_mesh;
5769
5770        return 0;
5771
5772rollback_lower_mesh:
5773        to_i = i;
5774        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5775                if (i == to_i)
5776                        break;
5777                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5778        }
5779
5780        i = NULL;
5781
5782rollback_upper_mesh:
5783        to_i = i;
5784        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5785                if (i == to_i)
5786                        break;
5787                __netdev_adjacent_dev_unlink(dev, i->dev);
5788        }
5789
5790        i = j = NULL;
5791
5792rollback_mesh:
5793        to_i = i;
5794        to_j = j;
5795        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5796                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5797                        if (i == to_i && j == to_j)
5798                                break;
5799                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5800                }
5801                if (i == to_i)
5802                        break;
5803        }
5804
5805        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5806
5807        return ret;
5808}
5809
5810/**
5811 * netdev_upper_dev_link - Add a link to the upper device
5812 * @dev: device
5813 * @upper_dev: new upper device
5814 *
5815 * Adds a link to a device which is upper to this one. The caller must hold
5816 * the RTNL lock. On a failure a negative errno code is returned.
5817 * On success the reference counts are adjusted and the function
5818 * returns zero.
5819 */
5820int netdev_upper_dev_link(struct net_device *dev,
5821                          struct net_device *upper_dev)
5822{
5823        return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5824}
5825EXPORT_SYMBOL(netdev_upper_dev_link);
5826
5827/**
5828 * netdev_master_upper_dev_link - Add a master link to the upper device
5829 * @dev: device
5830 * @upper_dev: new upper device
5831 * @upper_priv: upper device private
5832 * @upper_info: upper info to be passed down via notifier
5833 *
5834 * Adds a link to a device which is upper to this one. In this case, only
5835 * one master upper device can be linked, although other non-master devices
5836 * might be linked as well. The caller must hold the RTNL lock.
5837 * On a failure a negative errno code is returned. On success the reference
5838 * counts are adjusted and the function returns zero.
5839 */
5840int netdev_master_upper_dev_link(struct net_device *dev,
5841                                 struct net_device *upper_dev,
5842                                 void *upper_priv, void *upper_info)
5843{
5844        return __netdev_upper_dev_link(dev, upper_dev, true,
5845                                       upper_priv, upper_info);
5846}
5847EXPORT_SYMBOL(netdev_master_upper_dev_link);
5848
5849/**
5850 * netdev_upper_dev_unlink - Removes a link to upper device
5851 * @dev: device
5852 * @upper_dev: upper device to remove the link to
5853 *
5854 * Removes a link to a device which is upper to this one. The caller must hold
5855 * the RTNL lock.
5856 */
5857void netdev_upper_dev_unlink(struct net_device *dev,
5858                             struct net_device *upper_dev)
5859{
5860        struct netdev_notifier_changeupper_info changeupper_info;
5861        struct netdev_adjacent *i, *j;
5862        ASSERT_RTNL();
5863
5864        changeupper_info.upper_dev = upper_dev;
5865        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5866        changeupper_info.linking = false;
5867
5868        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5869                                      &changeupper_info.info);
5870
5871        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5872
5873        /* Here is the tricky part. We must remove all dev's lower
5874         * devices from all upper_dev's upper devices and vice
5875         * versa, to maintain the graph relationship.
5876         */
5877        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5878                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5879                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5880
5881        /* also remove the devices themselves from the lower/upper device
5882         * lists
5883         */
5884        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5885                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5886
5887        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5888                __netdev_adjacent_dev_unlink(dev, i->dev);
5889
5890        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5891                                      &changeupper_info.info);
5892}
5893EXPORT_SYMBOL(netdev_upper_dev_unlink);
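
/* Illustrative sketch, not part of the original file: how a bonding-like
 * master driver might use the link/unlink API under RTNL.  foo_enslave(),
 * foo_release() and the slave_priv cookie are assumptions for illustration;
 * the private pointer is what the lower-list accessors above hand back to
 * the master later on.
 */
static int foo_enslave(struct net_device *master_dev,
                       struct net_device *slave_dev, void *slave_priv)
{
        ASSERT_RTNL();

        return netdev_master_upper_dev_link(slave_dev, master_dev,
                                            slave_priv, NULL);
}

static void foo_release(struct net_device *master_dev,
                        struct net_device *slave_dev)
{
        ASSERT_RTNL();

        netdev_upper_dev_unlink(slave_dev, master_dev);
}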
5894
5895/**
5896 * netdev_bonding_info_change - Dispatch event about slave change
5897 * @dev: device
5898 * @bonding_info: info to dispatch
5899 *
5900 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5901 * The caller must hold the RTNL lock.
5902 */
5903void netdev_bonding_info_change(struct net_device *dev,
5904                                struct netdev_bonding_info *bonding_info)
5905{
5906        struct netdev_notifier_bonding_info     info;
5907
5908        memcpy(&info.bonding_info, bonding_info,
5909               sizeof(struct netdev_bonding_info));
5910        call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5911                                      &info.info);
5912}
5913EXPORT_SYMBOL(netdev_bonding_info_change);
5914
5915static void netdev_adjacent_add_links(struct net_device *dev)
5916{
5917        struct netdev_adjacent *iter;
5918
5919        struct net *net = dev_net(dev);
5920
5921        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5922                if (!net_eq(net, dev_net(iter->dev)))
5923                        continue;
5924                netdev_adjacent_sysfs_add(iter->dev, dev,
5925                                          &iter->dev->adj_list.lower);
5926                netdev_adjacent_sysfs_add(dev, iter->dev,
5927                                          &dev->adj_list.upper);
5928        }
5929
5930        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5931                if (!net_eq(net, dev_net(iter->dev)))
5932                        continue;
5933                netdev_adjacent_sysfs_add(iter->dev, dev,
5934                                          &iter->dev->adj_list.upper);
5935                netdev_adjacent_sysfs_add(dev, iter->dev,
5936                                          &dev->adj_list.lower);
5937        }
5938}
5939
5940static void netdev_adjacent_del_links(struct net_device *dev)
5941{
5942        struct netdev_adjacent *iter;
5943
5944        struct net *net = dev_net(dev);
5945
5946        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5947                if (!net_eq(net, dev_net(iter->dev)))
5948                        continue;
5949                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5950                                          &iter->dev->adj_list.lower);
5951                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5952                                          &dev->adj_list.upper);
5953        }
5954
5955        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5956                if (!net_eq(net, dev_net(iter->dev)))
5957                        continue;
5958                netdev_adjacent_sysfs_del(iter->dev, dev->name,
5959                                          &iter->dev->adj_list.upper);
5960                netdev_adjacent_sysfs_del(dev, iter->dev->name,
5961                                          &dev->adj_list.lower);
5962        }
5963}
5964
5965void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5966{
5967        struct netdev_adjacent *iter;
5968
5969        struct net *net = dev_net(dev);
5970
5971        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5972                if (!net_eq(net, dev_net(iter->dev)))
5973                        continue;
5974                netdev_adjacent_sysfs_del(iter->dev, oldname,
5975                                          &iter->dev->adj_list.lower);
5976                netdev_adjacent_sysfs_add(iter->dev, dev,
5977                                          &iter->dev->adj_list.lower);
5978        }
5979
5980        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5981                if (!net_eq(net, dev_net(iter->dev)))
5982                        continue;
5983                netdev_adjacent_sysfs_del(iter->dev, oldname,
5984                                          &iter->dev->adj_list.upper);
5985                netdev_adjacent_sysfs_add(iter->dev, dev,
5986                                          &iter->dev->adj_list.upper);
5987        }
5988}
5989
5990void *netdev_lower_dev_get_private(struct net_device *dev,
5991                                   struct net_device *lower_dev)
5992{
5993        struct netdev_adjacent *lower;
5994
5995        if (!lower_dev)
5996                return NULL;
5997        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5998        if (!lower)
5999                return NULL;
6000
6001        return lower->private;
6002}
6003EXPORT_SYMBOL(netdev_lower_dev_get_private);
6004
6005
6006int dev_get_nest_level(struct net_device *dev,
6007                       bool (*type_check)(const struct net_device *dev))
6008{
6009        struct net_device *lower = NULL;
6010        struct list_head *iter;
6011        int max_nest = -1;
6012        int nest;
6013
6014        ASSERT_RTNL();
6015
6016        netdev_for_each_lower_dev(dev, lower, iter) {
6017                nest = dev_get_nest_level(lower, type_check);
6018                if (max_nest < nest)
6019                        max_nest = nest;
6020        }
6021
6022        if (type_check(dev))
6023                max_nest++;
6024
6025        return max_nest;
6026}
6027EXPORT_SYMBOL(dev_get_nest_level);
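
/* Illustrative sketch, not part of the original file: a stacked driver can
 * use dev_get_nest_level() with a type check to derive a lockdep subclass per
 * nesting depth (e.g. macvlan over macvlan).  The call must be made under
 * RTNL.  foo_nest_level() is an assumption for illustration;
 * netif_is_macvlan() is an existing helper.
 */
static int foo_nest_level(struct net_device *dev)
{
        return dev_get_nest_level(dev, netif_is_macvlan);
}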
6028
6029/**
6030 * netdev_lower_state_changed - Dispatch event about lower device state change
6031 * @lower_dev: device
6032 * @lower_state_info: state to dispatch
6033 *
6034 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6035 * The caller must hold the RTNL lock.
6036 */
6037void netdev_lower_state_changed(struct net_device *lower_dev,
6038                                void *lower_state_info)
6039{
6040        struct netdev_notifier_changelowerstate_info changelowerstate_info;
6041
6042        ASSERT_RTNL();
6043        changelowerstate_info.lower_state_info = lower_state_info;
6044        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6045                                      &changelowerstate_info.info);
6046}
6047EXPORT_SYMBOL(netdev_lower_state_changed);
6048
6049static void dev_change_rx_flags(struct net_device *dev, int flags)
6050{
6051        const struct net_device_ops *ops = dev->netdev_ops;
6052
6053        if (ops->ndo_change_rx_flags)
6054                ops->ndo_change_rx_flags(dev, flags);
6055}
6056
6057static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6058{
6059        unsigned int old_flags = dev->flags;
6060        kuid_t uid;
6061        kgid_t gid;
6062
6063        ASSERT_RTNL();
6064
6065        dev->flags |= IFF_PROMISC;
6066        dev->promiscuity += inc;
6067        if (dev->promiscuity == 0) {
6068                /*
6069                 * Avoid overflow.
6070                 * If inc causes overflow, leave promisc untouched and return an error.
6071                 */
6072                if (inc < 0)
6073                        dev->flags &= ~IFF_PROMISC;
6074                else {
6075                        dev->promiscuity -= inc;
6076                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6077                                dev->name);
6078                        return -EOVERFLOW;
6079                }
6080        }
6081        if (dev->flags != old_flags) {
6082                pr_info("device %s %s promiscuous mode\n",
6083                        dev->name,
6084                        dev->flags & IFF_PROMISC ? "entered" : "left");
6085                if (audit_enabled) {
6086                        current_uid_gid(&uid, &gid);
6087                        audit_log(current->audit_context, GFP_ATOMIC,
6088                                AUDIT_ANOM_PROMISCUOUS,
6089                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6090                                dev->name, (dev->flags & IFF_PROMISC),
6091                                (old_flags & IFF_PROMISC),
6092                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
6093                                from_kuid(&init_user_ns, uid),
6094                                from_kgid(&init_user_ns, gid),
6095                                audit_get_sessionid(current));
6096                }
6097
6098                dev_change_rx_flags(dev, IFF_PROMISC);
6099        }
6100        if (notify)
6101                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6102        return 0;
6103}
6104
6105/**
6106 *      dev_set_promiscuity     - update promiscuity count on a device
6107 *      @dev: device
6108 *      @inc: modifier
6109 *
6110 *      Add or remove promiscuity from a device. While the count in the device
6111 *      remains above zero the interface remains promiscuous. Once it hits zero
6112 *      the device reverts to normal filtering operation. A negative @inc
6113 *      value is used to drop promiscuity on the device.
6114 *      Return 0 if successful or a negative errno code on error.
6115 */
6116int dev_set_promiscuity(struct net_device *dev, int inc)
6117{
6118        unsigned int old_flags = dev->flags;
6119        int err;
6120
6121        err = __dev_set_promiscuity(dev, inc, true);
6122        if (err < 0)
6123                return err;
6124        if (dev->flags != old_flags)
6125                dev_set_rx_mode(dev);
6126        return err;
6127}
6128EXPORT_SYMBOL(dev_set_promiscuity);
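
/* Illustrative sketch, not part of the original file: a capture-style user
 * taking and releasing a promiscuity reference.  Both calls require RTNL, and
 * the counter keeps the interface promiscuous until it drops back to zero.
 * foo_capture_start()/foo_capture_stop() are assumptions for illustration;
 * dev_set_allmulti() below follows the same counted pattern.
 */
static int foo_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();

        return err;
}

static void foo_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}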
6129
6130static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6131{
6132        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6133
6134        ASSERT_RTNL();
6135
6136        dev->flags |= IFF_ALLMULTI;
6137        dev->allmulti += inc;
6138        if (dev->allmulti == 0) {
6139                /*
6140                 * Avoid overflow.
6141                 * If inc causes overflow, leave allmulti untouched and return an error.
6142                 */
6143                if (inc < 0)
6144                        dev->flags &= ~IFF_ALLMULTI;
6145                else {
6146                        dev->allmulti -= inc;
6147                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6148                                dev->name);
6149                        return -EOVERFLOW;
6150                }
6151        }
6152        if (dev->flags ^ old_flags) {
6153                dev_change_rx_flags(dev, IFF_ALLMULTI);
6154                dev_set_rx_mode(dev);
6155                if (notify)
6156                        __dev_notify_flags(dev, old_flags,
6157                                           dev->gflags ^ old_gflags);
6158        }
6159        return 0;
6160}
6161
6162/**
6163 *      dev_set_allmulti        - update allmulti count on a device
6164 *      @dev: device
6165 *      @inc: modifier
6166 *
6167 *      Add or remove reception of all multicast frames to a device. While the
6168 *      count in the device remains above zero the interface remains listening
6169 *      to all multicast frames. Once it hits zero the device reverts to normal
6170 *      filtering operation. A negative @inc value is used to drop the counter
6171 *      when releasing a resource needing all multicasts.
6172 *      Return 0 if successful or a negative errno code on error.
6173 */
6174
6175int dev_set_allmulti(struct net_device *dev, int inc)
6176{
6177        return __dev_set_allmulti(dev, inc, true);
6178}
6179EXPORT_SYMBOL(dev_set_allmulti);
6180
6181/*
6182 *      Upload unicast and multicast address lists to device and
6183 *      configure RX filtering. When the device doesn't support unicast
6184 *      filtering it is put in promiscuous mode while unicast addresses
6185 *      are present.
6186 */
6187void __dev_set_rx_mode(struct net_device *dev)
6188{
6189        const struct net_device_ops *ops = dev->netdev_ops;
6190
6191        /* dev_open will call this function so the list will stay sane. */
6192        if (!(dev->flags&IFF_UP))
6193                return;
6194
6195        if (!netif_device_present(dev))
6196                return;
6197
6198        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6199                /* Unicast address changes may only happen under the rtnl,
6200                 * therefore calling __dev_set_promiscuity here is safe.
6201                 */
6202                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6203                        __dev_set_promiscuity(dev, 1, false);
6204                        dev->uc_promisc = true;
6205                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6206                        __dev_set_promiscuity(dev, -1, false);
6207                        dev->uc_promisc = false;
6208                }
6209        }
6210
6211        if (ops->ndo_set_rx_mode)
6212                ops->ndo_set_rx_mode(dev);
6213}
6214
6215void dev_set_rx_mode(struct net_device *dev)
6216{
6217        netif_addr_lock_bh(dev);
6218        __dev_set_rx_mode(dev);
6219        netif_addr_unlock_bh(dev);
6220}
6221
6222/**
6223 *      dev_get_flags - get flags reported to userspace
6224 *      @dev: device
6225 *
6226 *      Get the combination of flag bits exported through APIs to userspace.
6227 */
6228unsigned int dev_get_flags(const struct net_device *dev)
6229{
6230        unsigned int flags;
6231
6232        flags = (dev->flags & ~(IFF_PROMISC |
6233                                IFF_ALLMULTI |
6234                                IFF_RUNNING |
6235                                IFF_LOWER_UP |
6236                                IFF_DORMANT)) |
6237                (dev->gflags & (IFF_PROMISC |
6238                                IFF_ALLMULTI));
6239
6240        if (netif_running(dev)) {
6241                if (netif_oper_up(dev))
6242                        flags |= IFF_RUNNING;
6243                if (netif_carrier_ok(dev))
6244                        flags |= IFF_LOWER_UP;
6245                if (netif_dormant(dev))
6246                        flags |= IFF_DORMANT;
6247        }
6248
6249        return flags;
6250}
6251EXPORT_SYMBOL(dev_get_flags);
6252
6253int __dev_change_flags(struct net_device *dev, unsigned int flags)
6254{
6255        unsigned int old_flags = dev->flags;
6256        int ret;
6257
6258        ASSERT_RTNL();
6259
6260        /*
6261         *      Set the flags on our device.
6262         */
6263
6264        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6265                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6266                               IFF_AUTOMEDIA)) |
6267                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6268                                    IFF_ALLMULTI));
6269
6270        /*
6271         *      Load in the correct multicast list now that the flags have changed.
6272         */
6273
6274        if ((old_flags ^ flags) & IFF_MULTICAST)
6275                dev_change_rx_flags(dev, IFF_MULTICAST);
6276
6277        dev_set_rx_mode(dev);
6278
6279        /*
6280         *      Have we downed the interface? We handle IFF_UP ourselves
6281         *      according to user attempts to set it, rather than blindly
6282         *      setting it.
6283         */
6284
6285        ret = 0;
6286        if ((old_flags ^ flags) & IFF_UP)
6287                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6288
6289        if ((flags ^ dev->gflags) & IFF_PROMISC) {
6290                int inc = (flags & IFF_PROMISC) ? 1 : -1;
6291                unsigned int old_flags = dev->flags;
6292
6293                dev->gflags ^= IFF_PROMISC;
6294
6295                if (__dev_set_promiscuity(dev, inc, false) >= 0)
6296                        if (dev->flags != old_flags)
6297                                dev_set_rx_mode(dev);
6298        }
6299
6300        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6301           is important. Some (broken) drivers set IFF_PROMISC when
6302           IFF_ALLMULTI is requested, without asking us and without reporting it.
6303         */
6304        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6305                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6306
6307                dev->gflags ^= IFF_ALLMULTI;
6308                __dev_set_allmulti(dev, inc, false);
6309        }
6310
6311        return ret;
6312}
6313
6314void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6315                        unsigned int gchanges)
6316{
6317        unsigned int changes = dev->flags ^ old_flags;
6318
6319        if (gchanges)
6320                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6321
6322        if (changes & IFF_UP) {
6323                if (dev->flags & IFF_UP)
6324                        call_netdevice_notifiers(NETDEV_UP, dev);
6325                else
6326                        call_netdevice_notifiers(NETDEV_DOWN, dev);
6327        }
6328
6329        if (dev->flags & IFF_UP &&
6330            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6331                struct netdev_notifier_change_info change_info;
6332
6333                change_info.flags_changed = changes;
6334                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6335                                              &change_info.info);
6336        }
6337}
6338
6339/**
6340 *      dev_change_flags - change device settings
6341 *      @dev: device
6342 *      @flags: device state flags
6343 *
6344 *      Change settings on a device based on the state flags. The flags are
6345 *      in the userspace exported format.
6346 */
6347int dev_change_flags(struct net_device *dev, unsigned int flags)
6348{
6349        int ret;
6350        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6351
6352        ret = __dev_change_flags(dev, flags);
6353        if (ret < 0)
6354                return ret;
6355
6356        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6357        __dev_notify_flags(dev, old_flags, changes);
6358        return ret;
6359}
6360EXPORT_SYMBOL(dev_change_flags);
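/* Illustrative sketch, not part of the original file: bringing a device
 * administratively up from inside the kernel by feeding the userspace
 * flag format (as returned by dev_get_flags()) back through
 * dev_change_flags(), under RTNL.
 */
#if 0	/* example only */
static int example_admin_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}
#endif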
6361
6362static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6363{
6364        const struct net_device_ops *ops = dev->netdev_ops;
6365
6366        if (ops->ndo_change_mtu)
6367                return ops->ndo_change_mtu(dev, new_mtu);
6368
6369        dev->mtu = new_mtu;
6370        return 0;
6371}
6372
6373/**
6374 *      dev_set_mtu - Change maximum transfer unit
6375 *      @dev: device
6376 *      @new_mtu: new transfer unit
6377 *
6378 *      Change the maximum transfer size of the network device.
6379 */
6380int dev_set_mtu(struct net_device *dev, int new_mtu)
6381{
6382        int err, orig_mtu;
6383
6384        if (new_mtu == dev->mtu)
6385                return 0;
6386
6387        /*      MTU must not be negative.        */
6388        if (new_mtu < 0)
6389                return -EINVAL;
6390
6391        if (!netif_device_present(dev))
6392                return -ENODEV;
6393
6394        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6395        err = notifier_to_errno(err);
6396        if (err)
6397                return err;
6398
6399        orig_mtu = dev->mtu;
6400        err = __dev_set_mtu(dev, new_mtu);
6401
6402        if (!err) {
6403                err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6404                err = notifier_to_errno(err);
6405                if (err) {
6406                        /* setting mtu back and notifying everyone again,
6407                         * so that they have a chance to revert changes.
6408                         */
6409                        __dev_set_mtu(dev, orig_mtu);
6410                        call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6411                }
6412        }
6413        return err;
6414}
6415EXPORT_SYMBOL(dev_set_mtu);
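/* Illustrative sketch, not part of the original file: an in-kernel MTU
 * change. The value 9000 is an arbitrary jumbo-frame example; the
 * driver's ndo_change_mtu() or a NETDEV_PRECHANGEMTU listener may still
 * reject it, so the return value must be checked.
 */
#if 0	/* example only */
static int example_set_jumbo_mtu(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        return err;
}
#endif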
6416
6417/**
6418 *      dev_set_group - Change group this device belongs to
6419 *      @dev: device
6420 *      @new_group: group this device should belong to
6421 */
6422void dev_set_group(struct net_device *dev, int new_group)
6423{
6424        dev->group = new_group;
6425}
6426EXPORT_SYMBOL(dev_set_group);
6427
6428/**
6429 *      dev_set_mac_address - Change Media Access Control Address
6430 *      @dev: device
6431 *      @sa: new address
6432 *
6433 *      Change the hardware (MAC) address of the device
6434 */
6435int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6436{
6437        const struct net_device_ops *ops = dev->netdev_ops;
6438        int err;
6439
6440        if (!ops->ndo_set_mac_address)
6441                return -EOPNOTSUPP;
6442        if (sa->sa_family != dev->type)
6443                return -EINVAL;
6444        if (!netif_device_present(dev))
6445                return -ENODEV;
6446        err = ops->ndo_set_mac_address(dev, sa);
6447        if (err)
6448                return err;
6449        dev->addr_assign_type = NET_ADDR_SET;
6450        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6451        add_device_randomness(dev->dev_addr, dev->addr_len);
6452        return 0;
6453}
6454EXPORT_SYMBOL(dev_set_mac_address);
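/* Illustrative sketch, not part of the original file: setting a new MAC
 * address from inside the kernel. sa_family must match dev->type (e.g.
 * ARPHRD_ETHER) or the call returns -EINVAL; dev->addr_len is assumed to
 * fit in sa_data.
 */
#if 0	/* example only */
static int example_set_mac(struct net_device *dev, const u8 *new_addr)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, new_addr, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}
#endif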
6455
6456/**
6457 *      dev_change_carrier - Change device carrier
6458 *      @dev: device
6459 *      @new_carrier: new value
6460 *
6461 *      Change device carrier
6462 */
6463int dev_change_carrier(struct net_device *dev, bool new_carrier)
6464{
6465        const struct net_device_ops *ops = dev->netdev_ops;
6466
6467        if (!ops->ndo_change_carrier)
6468                return -EOPNOTSUPP;
6469        if (!netif_device_present(dev))
6470                return -ENODEV;
6471        return ops->ndo_change_carrier(dev, new_carrier);
6472}
6473EXPORT_SYMBOL(dev_change_carrier);
6474
6475/**
6476 *      dev_get_phys_port_id - Get device physical port ID
6477 *      @dev: device
6478 *      @ppid: port ID
6479 *
6480 *      Get device physical port ID
6481 */
6482int dev_get_phys_port_id(struct net_device *dev,
6483                         struct netdev_phys_item_id *ppid)
6484{
6485        const struct net_device_ops *ops = dev->netdev_ops;
6486
6487        if (!ops->ndo_get_phys_port_id)
6488                return -EOPNOTSUPP;
6489        return ops->ndo_get_phys_port_id(dev, ppid);
6490}
6491EXPORT_SYMBOL(dev_get_phys_port_id);
6492
6493/**
6494 *      dev_get_phys_port_name - Get device physical port name
6495 *      @dev: device
6496 *      @name: port name
6497 *      @len: limit of bytes to copy to name
6498 *
6499 *      Get device physical port name
6500 */
6501int dev_get_phys_port_name(struct net_device *dev,
6502                           char *name, size_t len)
6503{
6504        const struct net_device_ops *ops = dev->netdev_ops;
6505
6506        if (!ops->ndo_get_phys_port_name)
6507                return -EOPNOTSUPP;
6508        return ops->ndo_get_phys_port_name(dev, name, len);
6509}
6510EXPORT_SYMBOL(dev_get_phys_port_name);
6511
6512/**
6513 *      dev_change_proto_down - update protocol port state information
6514 *      @dev: device
6515 *      @proto_down: new value
6516 *
6517 *      This info can be used by switch drivers to set the phys state of the
6518 *      port.
6519 */
6520int dev_change_proto_down(struct net_device *dev, bool proto_down)
6521{
6522        const struct net_device_ops *ops = dev->netdev_ops;
6523
6524        if (!ops->ndo_change_proto_down)
6525                return -EOPNOTSUPP;
6526        if (!netif_device_present(dev))
6527                return -ENODEV;
6528        return ops->ndo_change_proto_down(dev, proto_down);
6529}
6530EXPORT_SYMBOL(dev_change_proto_down);
6531
6532/**
6533 *      dev_new_index   -       allocate an ifindex
6534 *      @net: the applicable net namespace
6535 *
6536 *      Returns a suitable unique value for a new device interface
6537 *      number.  The caller must hold the rtnl semaphore or the
6538 *      dev_base_lock to be sure it remains unique.
6539 */
6540static int dev_new_index(struct net *net)
6541{
6542        int ifindex = net->ifindex;
6543        for (;;) {
6544                if (++ifindex <= 0)
6545                        ifindex = 1;
6546                if (!__dev_get_by_index(net, ifindex))
6547                        return net->ifindex = ifindex;
6548        }
6549}
6550
6551/* Delayed registration/unregistration */
6552static LIST_HEAD(net_todo_list);
6553DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6554
6555static void net_set_todo(struct net_device *dev)
6556{
6557        list_add_tail(&dev->todo_list, &net_todo_list);
6558        dev_net(dev)->dev_unreg_count++;
6559}
6560
6561static void rollback_registered_many(struct list_head *head)
6562{
6563        struct net_device *dev, *tmp;
6564        LIST_HEAD(close_head);
6565
6566        BUG_ON(dev_boot_phase);
6567        ASSERT_RTNL();
6568
6569        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6570                /* Some drivers call this for devices that were never
6571                 * registered, as part of initialization unwind.
6572                 * Remove those devices and proceed with the remaining.
6573                 */
6574                if (dev->reg_state == NETREG_UNINITIALIZED) {
6575                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6576                                 dev->name, dev);
6577
6578                        WARN_ON(1);
6579                        list_del(&dev->unreg_list);
6580                        continue;
6581                }
6582                dev->dismantle = true;
6583                BUG_ON(dev->reg_state != NETREG_REGISTERED);
6584        }
6585
6586        /* If device is running, close it first. */
6587        list_for_each_entry(dev, head, unreg_list)
6588                list_add_tail(&dev->close_list, &close_head);
6589        dev_close_many(&close_head, true);
6590
6591        list_for_each_entry(dev, head, unreg_list) {
6592                /* And unlink it from device chain. */
6593                unlist_netdevice(dev);
6594
6595                dev->reg_state = NETREG_UNREGISTERING;
6596                on_each_cpu(flush_backlog, dev, 1);
6597        }
6598
6599        synchronize_net();
6600
6601        list_for_each_entry(dev, head, unreg_list) {
6602                struct sk_buff *skb = NULL;
6603
6604                /* Shutdown queueing discipline. */
6605                dev_shutdown(dev);
6606
6607
6608                /* Notify protocols that we are about to destroy
6609                   this device. They should clean up all of their state.
6610                */
6611                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6612
6613                if (!dev->rtnl_link_ops ||
6614                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6615                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6616                                                     GFP_KERNEL);
6617
6618                /*
6619                 *      Flush the unicast and multicast chains
6620                 */
6621                dev_uc_flush(dev);
6622                dev_mc_flush(dev);
6623
6624                if (dev->netdev_ops->ndo_uninit)
6625                        dev->netdev_ops->ndo_uninit(dev);
6626
6627                if (skb)
6628                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6629
6630                /* Notifier chain MUST detach all upper devices from us. */
6631                WARN_ON(netdev_has_any_upper_dev(dev));
6632
6633                /* Remove entries from kobject tree */
6634                netdev_unregister_kobject(dev);
6635#ifdef CONFIG_XPS
6636                /* Remove XPS queueing entries */
6637                netif_reset_xps_queues_gt(dev, 0);
6638#endif
6639        }
6640
6641        synchronize_net();
6642
6643        list_for_each_entry(dev, head, unreg_list)
6644                dev_put(dev);
6645}
6646
6647static void rollback_registered(struct net_device *dev)
6648{
6649        LIST_HEAD(single);
6650
6651        list_add(&dev->unreg_list, &single);
6652        rollback_registered_many(&single);
6653        list_del(&single);
6654}
6655
6656static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6657        struct net_device *upper, netdev_features_t features)
6658{
6659        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6660        netdev_features_t feature;
6661        int feature_bit;
6662
6663        for_each_netdev_feature(&upper_disables, feature_bit) {
6664                feature = __NETIF_F_BIT(feature_bit);
6665                if (!(upper->wanted_features & feature)
6666                    && (features & feature)) {
6667                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6668                                   &feature, upper->name);
6669                        features &= ~feature;
6670                }
6671        }
6672
6673        return features;
6674}
6675
6676static void netdev_sync_lower_features(struct net_device *upper,
6677        struct net_device *lower, netdev_features_t features)
6678{
6679        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6680        netdev_features_t feature;
6681        int feature_bit;
6682
6683        for_each_netdev_feature(&upper_disables, feature_bit) {
6684                feature = __NETIF_F_BIT(feature_bit);
6685                if (!(features & feature) && (lower->features & feature)) {
6686                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6687                                   &feature, lower->name);
6688                        lower->wanted_features &= ~feature;
6689                        netdev_update_features(lower);
6690
6691                        if (unlikely(lower->features & feature))
6692                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6693                                            &feature, lower->name);
6694                }
6695        }
6696}
6697
6698static netdev_features_t netdev_fix_features(struct net_device *dev,
6699        netdev_features_t features)
6700{
6701        /* Fix illegal checksum combinations */
6702        if ((features & NETIF_F_HW_CSUM) &&
6703            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6704                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6705                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6706        }
6707
6708        /* TSO requires that SG is present as well. */
6709        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6710                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6711                features &= ~NETIF_F_ALL_TSO;
6712        }
6713
6714        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6715                                        !(features & NETIF_F_IP_CSUM)) {
6716                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6717                features &= ~NETIF_F_TSO;
6718                features &= ~NETIF_F_TSO_ECN;
6719        }
6720
6721        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6722                                         !(features & NETIF_F_IPV6_CSUM)) {
6723                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6724                features &= ~NETIF_F_TSO6;
6725        }
6726
6727        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6728        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6729                features &= ~NETIF_F_TSO_MANGLEID;
6730
6731        /* TSO ECN requires that TSO is present as well. */
6732        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6733                features &= ~NETIF_F_TSO_ECN;
6734
6735        /* Software GSO depends on SG. */
6736        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6737                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6738                features &= ~NETIF_F_GSO;
6739        }
6740
6741        /* UFO needs SG and checksumming */
6742        if (features & NETIF_F_UFO) {
6743                /* maybe split UFO into V4 and V6? */
6744                if (!(features & NETIF_F_HW_CSUM) &&
6745                    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6746                     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6747                        netdev_dbg(dev,
6748                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
6749                        features &= ~NETIF_F_UFO;
6750                }
6751
6752                if (!(features & NETIF_F_SG)) {
6753                        netdev_dbg(dev,
6754                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6755                        features &= ~NETIF_F_UFO;
6756                }
6757        }
6758
6759        /* GSO partial features require GSO partial be set */
6760        if ((features & dev->gso_partial_features) &&
6761            !(features & NETIF_F_GSO_PARTIAL)) {
6762                netdev_dbg(dev,
6763                           "Dropping partially supported GSO features since no GSO partial.\n");
6764                features &= ~dev->gso_partial_features;
6765        }
6766
6767#ifdef CONFIG_NET_RX_BUSY_POLL
6768        if (dev->netdev_ops->ndo_busy_poll)
6769                features |= NETIF_F_BUSY_POLL;
6770        else
6771#endif
6772                features &= ~NETIF_F_BUSY_POLL;
6773
6774        return features;
6775}
6776
6777int __netdev_update_features(struct net_device *dev)
6778{
6779        struct net_device *upper, *lower;
6780        netdev_features_t features;
6781        struct list_head *iter;
6782        int err = -1;
6783
6784        ASSERT_RTNL();
6785
6786        features = netdev_get_wanted_features(dev);
6787
6788        if (dev->netdev_ops->ndo_fix_features)
6789                features = dev->netdev_ops->ndo_fix_features(dev, features);
6790
6791        /* driver might be less strict about feature dependencies */
6792        features = netdev_fix_features(dev, features);
6793
6794        /* some features can't be enabled if they're off on an upper device */
6795        netdev_for_each_upper_dev_rcu(dev, upper, iter)
6796                features = netdev_sync_upper_features(dev, upper, features);
6797
6798        if (dev->features == features)
6799                goto sync_lower;
6800
6801        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6802                &dev->features, &features);
6803
6804        if (dev->netdev_ops->ndo_set_features)
6805                err = dev->netdev_ops->ndo_set_features(dev, features);
6806        else
6807                err = 0;
6808
6809        if (unlikely(err < 0)) {
6810                netdev_err(dev,
6811                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
6812                        err, &features, &dev->features);
6813                /* return non-0 since some features might have changed and
6814                 * it's better to fire a spurious notification than miss it
6815                 */
6816                return -1;
6817        }
6818
6819sync_lower:
6820        /* some features must be disabled on lower devices when disabled
6821         * on an upper device (think: bonding master or bridge)
6822         */
6823        netdev_for_each_lower_dev(dev, lower, iter)
6824                netdev_sync_lower_features(dev, lower, features);
6825
6826        if (!err)
6827                dev->features = features;
6828
6829        return err < 0 ? 0 : 1;
6830}
6831
6832/**
6833 *      netdev_update_features - recalculate device features
6834 *      @dev: the device to check
6835 *
6836 *      Recalculate dev->features set and send notifications if it
6837 *      has changed. Should be called after driver or hardware dependent
6838 *      conditions might have changed that influence the features.
6839 */
6840void netdev_update_features(struct net_device *dev)
6841{
6842        if (__netdev_update_features(dev))
6843                netdev_features_change(dev);
6844}
6845EXPORT_SYMBOL(netdev_update_features);
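/* Illustrative sketch, not part of the original file: a driver hook
 * (example_link_mode_changed() is hypothetical) that asks the core to
 * re-evaluate features after hardware conditions changed; the driver's
 * ndo_fix_features()/ndo_set_features() callbacks are consulted during
 * the recomputation.
 */
#if 0	/* example only */
static void example_link_mode_changed(struct net_device *dev)
{
        rtnl_lock();
        netdev_update_features(dev);    /* recompute and notify if changed */
        rtnl_unlock();
}
#endif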
6846
6847/**
6848 *      netdev_change_features - recalculate device features
6849 *      @dev: the device to check
6850 *
6851 *      Recalculate dev->features set and send notifications even
6852 *      if they have not changed. Should be called instead of
6853 *      netdev_update_features() if also dev->vlan_features might
6854 *      have changed to allow the changes to be propagated to stacked
6855 *      VLAN devices.
6856 */
6857void netdev_change_features(struct net_device *dev)
6858{
6859        __netdev_update_features(dev);
6860        netdev_features_change(dev);
6861}
6862EXPORT_SYMBOL(netdev_change_features);
6863
6864/**
6865 *      netif_stacked_transfer_operstate -      transfer operstate
6866 *      @rootdev: the root or lower level device to transfer state from
6867 *      @dev: the device to transfer operstate to
6868 *
6869 *      Transfer operational state from root to device. This is normally
6870 *      called when a stacking relationship exists between the root
6871 *      device and the device (a leaf device).
6872 */
6873void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6874                                        struct net_device *dev)
6875{
6876        if (rootdev->operstate == IF_OPER_DORMANT)
6877                netif_dormant_on(dev);
6878        else
6879                netif_dormant_off(dev);
6880
6881        if (netif_carrier_ok(rootdev)) {
6882                if (!netif_carrier_ok(dev))
6883                        netif_carrier_on(dev);
6884        } else {
6885                if (netif_carrier_ok(dev))
6886                        netif_carrier_off(dev);
6887        }
6888}
6889EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6890
6891#ifdef CONFIG_SYSFS
6892static int netif_alloc_rx_queues(struct net_device *dev)
6893{
6894        unsigned int i, count = dev->num_rx_queues;
6895        struct netdev_rx_queue *rx;
6896        size_t sz = count * sizeof(*rx);
6897
6898        BUG_ON(count < 1);
6899
6900        rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6901        if (!rx) {
6902                rx = vzalloc(sz);
6903                if (!rx)
6904                        return -ENOMEM;
6905        }
6906        dev->_rx = rx;
6907
6908        for (i = 0; i < count; i++)
6909                rx[i].dev = dev;
6910        return 0;
6911}
6912#endif
6913
6914static void netdev_init_one_queue(struct net_device *dev,
6915                                  struct netdev_queue *queue, void *_unused)
6916{
6917        /* Initialize queue lock */
6918        spin_lock_init(&queue->_xmit_lock);
6919        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6920        queue->xmit_lock_owner = -1;
6921        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6922        queue->dev = dev;
6923#ifdef CONFIG_BQL
6924        dql_init(&queue->dql, HZ);
6925#endif
6926}
6927
6928static void netif_free_tx_queues(struct net_device *dev)
6929{
6930        kvfree(dev->_tx);
6931}
6932
6933static int netif_alloc_netdev_queues(struct net_device *dev)
6934{
6935        unsigned int count = dev->num_tx_queues;
6936        struct netdev_queue *tx;
6937        size_t sz = count * sizeof(*tx);
6938
6939        if (count < 1 || count > 0xffff)
6940                return -EINVAL;
6941
6942        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6943        if (!tx) {
6944                tx = vzalloc(sz);
6945                if (!tx)
6946                        return -ENOMEM;
6947        }
6948        dev->_tx = tx;
6949
6950        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6951        spin_lock_init(&dev->tx_global_lock);
6952
6953        return 0;
6954}
6955
6956void netif_tx_stop_all_queues(struct net_device *dev)
6957{
6958        unsigned int i;
6959
6960        for (i = 0; i < dev->num_tx_queues; i++) {
6961                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6962                netif_tx_stop_queue(txq);
6963        }
6964}
6965EXPORT_SYMBOL(netif_tx_stop_all_queues);
6966
6967/**
6968 *      register_netdevice      - register a network device
6969 *      @dev: device to register
6970 *
6971 *      Take a completed network device structure and add it to the kernel
6972 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6973 *      chain. 0 is returned on success. A negative errno code is returned
6974 *      on a failure to set up the device, or if the name is a duplicate.
6975 *
6976 *      Callers must hold the rtnl semaphore. You may want
6977 *      register_netdev() instead of this.
6978 *
6979 *      BUGS:
6980 *      The locking appears insufficient to guarantee two parallel registers
6981 *      will not get the same name.
6982 */
6983
6984int register_netdevice(struct net_device *dev)
6985{
6986        int ret;
6987        struct net *net = dev_net(dev);
6988
6989        BUG_ON(dev_boot_phase);
6990        ASSERT_RTNL();
6991
6992        might_sleep();
6993
6994        /* When net_device's are persistent, this will be fatal. */
6995        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6996        BUG_ON(!net);
6997
6998        spin_lock_init(&dev->addr_list_lock);
6999        netdev_set_addr_lockdep_class(dev);
7000
7001        ret = dev_get_valid_name(net, dev, dev->name);
7002        if (ret < 0)
7003                goto out;
7004
7005        /* Init, if this function is available */
7006        if (dev->netdev_ops->ndo_init) {
7007                ret = dev->netdev_ops->ndo_init(dev);
7008                if (ret) {
7009                        if (ret > 0)
7010                                ret = -EIO;
7011                        goto out;
7012                }
7013        }
7014
7015        if (((dev->hw_features | dev->features) &
7016             NETIF_F_HW_VLAN_CTAG_FILTER) &&
7017            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7018             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7019                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7020                ret = -EINVAL;
7021                goto err_uninit;
7022        }
7023
7024        ret = -EBUSY;
7025        if (!dev->ifindex)
7026                dev->ifindex = dev_new_index(net);
7027        else if (__dev_get_by_index(net, dev->ifindex))
7028                goto err_uninit;
7029
7030        /* Transfer changeable features to wanted_features and enable
7031         * software offloads (GSO and GRO).
7032         */
7033        dev->hw_features |= NETIF_F_SOFT_FEATURES;
7034        dev->features |= NETIF_F_SOFT_FEATURES;
7035        dev->wanted_features = dev->features & dev->hw_features;
7036
7037        if (!(dev->flags & IFF_LOOPBACK))
7038                dev->hw_features |= NETIF_F_NOCACHE_COPY;
7039
7040        /* If IPv4 TCP segmentation offload is supported we should also
7041         * allow the device to enable segmenting the frame with the option
7042         * of ignoring a static IP ID value.  This doesn't enable the
7043         * feature itself but allows the user to enable it later.
7044         */
7045        if (dev->hw_features & NETIF_F_TSO)
7046                dev->hw_features |= NETIF_F_TSO_MANGLEID;
7047        if (dev->vlan_features & NETIF_F_TSO)
7048                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7049        if (dev->mpls_features & NETIF_F_TSO)
7050                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7051        if (dev->hw_enc_features & NETIF_F_TSO)
7052                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7053
7054        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7055         */
7056        dev->vlan_features |= NETIF_F_HIGHDMA;
7057
7058        /* Make NETIF_F_SG and NETIF_F_GSO_PARTIAL inheritable to tunnel devices.
7059         */
7060        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7061
7062        /* Make NETIF_F_SG inheritable to MPLS.
7063         */
7064        dev->mpls_features |= NETIF_F_SG;
7065
7066        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7067        ret = notifier_to_errno(ret);
7068        if (ret)
7069                goto err_uninit;
7070
7071        ret = netdev_register_kobject(dev);
7072        if (ret)
7073                goto err_uninit;
7074        dev->reg_state = NETREG_REGISTERED;
7075
7076        __netdev_update_features(dev);
7077
7078        /*
7079         *      Default initial state at registration is that the
7080         *      device is present.
7081         */
7082
7083        set_bit(__LINK_STATE_PRESENT, &dev->state);
7084
7085        linkwatch_init_dev(dev);
7086
7087        dev_init_scheduler(dev);
7088        dev_hold(dev);
7089        list_netdevice(dev);
7090        add_device_randomness(dev->dev_addr, dev->addr_len);
7091
7092        /* If the device has a permanent device address, the driver should
7093         * set dev_addr and also set addr_assign_type to
7094         * NET_ADDR_PERM (the default value).
7095         */
7096        if (dev->addr_assign_type == NET_ADDR_PERM)
7097                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7098
7099        /* Notify protocols that a new device appeared. */
7100        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7101        ret = notifier_to_errno(ret);
7102        if (ret) {
7103                rollback_registered(dev);
7104                dev->reg_state = NETREG_UNREGISTERED;
7105        }
7106        /*
7107         *      Prevent userspace races by waiting until the network
7108         *      device is fully set up before sending notifications.
7109         */
7110        if (!dev->rtnl_link_ops ||
7111            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7112                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7113
7114out:
7115        return ret;
7116
7117err_uninit:
7118        if (dev->netdev_ops->ndo_uninit)
7119                dev->netdev_ops->ndo_uninit(dev);
7120        goto out;
7121}
7122EXPORT_SYMBOL(register_netdevice);
7123
7124/**
7125 *      init_dummy_netdev       - init a dummy network device for NAPI
7126 *      @dev: device to init
7127 *
7128 *      This takes a network device structure and initializes the minimum
7129 *      number of fields so it can be used to schedule NAPI polls without
7130 *      registering a full blown interface. This is to be used by drivers
7131 *      that need to tie several hardware interfaces to a single NAPI
7132 *      poll scheduler due to HW limitations.
7133 */
7134int init_dummy_netdev(struct net_device *dev)
7135{
7136        /* Clear everything. Note we don't initialize spinlocks
7137         * as they aren't supposed to be taken by any of the
7138         * NAPI code and this dummy netdev is supposed to be
7139         * only ever used for NAPI polls.
7140         */
7141        memset(dev, 0, sizeof(struct net_device));
7142
7143        /* make sure we BUG if trying to hit standard
7144         * register/unregister code path
7145         */
7146        dev->reg_state = NETREG_DUMMY;
7147
7148        /* NAPI wants this */
7149        INIT_LIST_HEAD(&dev->napi_list);
7150
7151        /* a dummy interface is started by default */
7152        set_bit(__LINK_STATE_PRESENT, &dev->state);
7153        set_bit(__LINK_STATE_START, &dev->state);
7154
7155        /* Note: We don't allocate pcpu_refcnt for dummy devices,
7156         * because users of this 'device' don't need to change
7157         * its refcount.
7158         */
7159
7160        return 0;
7161}
7162EXPORT_SYMBOL_GPL(init_dummy_netdev);
7163
7164
7165/**
7166 *      register_netdev - register a network device
7167 *      @dev: device to register
7168 *
7169 *      Take a completed network device structure and add it to the kernel
7170 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7171 *      chain. 0 is returned on success. A negative errno code is returned
7172 *      on a failure to set up the device, or if the name is a duplicate.
7173 *
7174 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7175 *      and expands the device name if you passed a format string to
7176 *      alloc_netdev.
7177 */
7178int register_netdev(struct net_device *dev)
7179{
7180        int err;
7181
7182        rtnl_lock();
7183        err = register_netdevice(dev);
7184        rtnl_unlock();
7185        return err;
7186}
7187EXPORT_SYMBOL(register_netdev);
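/* Illustrative sketch, not part of the original file: the usual
 * probe-time pattern around register_netdev(). struct example_priv and
 * example_netdev_ops are hypothetical driver symbols.
 */
#if 0	/* example only */
static int example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        dev->netdev_ops = &example_netdev_ops;
        SET_NETDEV_DEV(dev, parent);
        eth_hw_addr_random(dev);

        err = register_netdev(dev);     /* takes and releases RTNL itself */
        if (err)
                free_netdev(dev);
        return err;
}
#endif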
7188
7189int netdev_refcnt_read(const struct net_device *dev)
7190{
7191        int i, refcnt = 0;
7192
7193        for_each_possible_cpu(i)
7194                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7195        return refcnt;
7196}
7197EXPORT_SYMBOL(netdev_refcnt_read);
7198
7199/**
7200 * netdev_wait_allrefs - wait until all references are gone.
7201 * @dev: target net_device
7202 *
7203 * This is called when unregistering network devices.
7204 *
7205 * Any protocol or device that holds a reference should register
7206 * for netdevice notification, and cleanup and put back the
7207 * reference if they receive an UNREGISTER event.
7208 * We can get stuck here if buggy protocols don't correctly
7209 * call dev_put.
7210 */
7211static void netdev_wait_allrefs(struct net_device *dev)
7212{
7213        unsigned long rebroadcast_time, warning_time;
7214        int refcnt;
7215
7216        linkwatch_forget_dev(dev);
7217
7218        rebroadcast_time = warning_time = jiffies;
7219        refcnt = netdev_refcnt_read(dev);
7220
7221        while (refcnt != 0) {
7222                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7223                        rtnl_lock();
7224
7225                        /* Rebroadcast unregister notification */
7226                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7227
7228                        __rtnl_unlock();
7229                        rcu_barrier();
7230                        rtnl_lock();
7231
7232                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7233                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7234                                     &dev->state)) {
7235                                /* We must not have linkwatch events
7236                                 * pending on unregister. If this
7237                                 * happens, we simply run the queue
7238                                 * unscheduled, resulting in a noop
7239                                 * for this device.
7240                                 */
7241                                linkwatch_run_queue();
7242                        }
7243
7244                        __rtnl_unlock();
7245
7246                        rebroadcast_time = jiffies;
7247                }
7248
7249                msleep(250);
7250
7251                refcnt = netdev_refcnt_read(dev);
7252
7253                if (time_after(jiffies, warning_time + 10 * HZ)) {
7254                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7255                                 dev->name, refcnt);
7256                        warning_time = jiffies;
7257                }
7258        }
7259}
7260
7261/* The sequence is:
7262 *
7263 *      rtnl_lock();
7264 *      ...
7265 *      register_netdevice(x1);
7266 *      register_netdevice(x2);
7267 *      ...
7268 *      unregister_netdevice(y1);
7269 *      unregister_netdevice(y2);
7270 *      ...
7271 *      rtnl_unlock();
7272 *      free_netdev(y1);
7273 *      free_netdev(y2);
7274 *
7275 * We are invoked by rtnl_unlock().
7276 * This allows us to deal with problems:
7277 * 1) We can delete sysfs objects which invoke hotplug
7278 *    without deadlocking with linkwatch via keventd.
7279 * 2) Since we run with the RTNL semaphore not held, we can sleep
7280 *    safely in order to wait for the netdev refcnt to drop to zero.
7281 *
7282 * We must not return until all unregister events added during
7283 * the interval the lock was held have been completed.
7284 */
7285void netdev_run_todo(void)
7286{
7287        struct list_head list;
7288
7289        /* Snapshot list, allow later requests */
7290        list_replace_init(&net_todo_list, &list);
7291
7292        __rtnl_unlock();
7293
7294
7295        /* Wait for rcu callbacks to finish before next phase */
7296        if (!list_empty(&list))
7297                rcu_barrier();
7298
7299        while (!list_empty(&list)) {
7300                struct net_device *dev
7301                        = list_first_entry(&list, struct net_device, todo_list);
7302                list_del(&dev->todo_list);
7303
7304                rtnl_lock();
7305                call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7306                __rtnl_unlock();
7307
7308                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7309                        pr_err("network todo '%s' but state %d\n",
7310                               dev->name, dev->reg_state);
7311                        dump_stack();
7312                        continue;
7313                }
7314
7315                dev->reg_state = NETREG_UNREGISTERED;
7316
7317                netdev_wait_allrefs(dev);
7318
7319                /* paranoia */
7320                BUG_ON(netdev_refcnt_read(dev));
7321                BUG_ON(!list_empty(&dev->ptype_all));
7322                BUG_ON(!list_empty(&dev->ptype_specific));
7323                WARN_ON(rcu_access_pointer(dev->ip_ptr));
7324                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7325                WARN_ON(dev->dn_ptr);
7326
7327                if (dev->destructor)
7328                        dev->destructor(dev);
7329
7330                /* Report a network device has been unregistered */
7331                rtnl_lock();
7332                dev_net(dev)->dev_unreg_count--;
7333                __rtnl_unlock();
7334                wake_up(&netdev_unregistering_wq);
7335
7336                /* Free network device */
7337                kobject_put(&dev->dev.kobj);
7338        }
7339}
7340
7341/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7342 * all the same fields in the same order as net_device_stats, with only
7343 * the type differing, but rtnl_link_stats64 may have additional fields
7344 * at the end for newer counters.
7345 */
7346void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7347                             const struct net_device_stats *netdev_stats)
7348{
7349#if BITS_PER_LONG == 64
7350        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7351        memcpy(stats64, netdev_stats, sizeof(*stats64));
7352        /* zero out counters that only exist in rtnl_link_stats64 */
7353        memset((char *)stats64 + sizeof(*netdev_stats), 0,
7354               sizeof(*stats64) - sizeof(*netdev_stats));
7355#else
7356        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7357        const unsigned long *src = (const unsigned long *)netdev_stats;
7358        u64 *dst = (u64 *)stats64;
7359
7360        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7361        for (i = 0; i < n; i++)
7362                dst[i] = src[i];
7363        /* zero out counters that only exist in rtnl_link_stats64 */
7364        memset((char *)stats64 + n * sizeof(u64), 0,
7365               sizeof(*stats64) - n * sizeof(u64));
7366#endif
7367}
7368EXPORT_SYMBOL(netdev_stats_to_stats64);
7369
7370/**
7371 *      dev_get_stats   - get network device statistics
7372 *      @dev: device to get statistics from
7373 *      @storage: place to store stats
7374 *
7375 *      Get network statistics from device. Return @storage.
7376 *      The device driver may provide its own method by setting
7377 *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7378 *      otherwise the internal statistics structure is used.
7379 */
7380struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7381                                        struct rtnl_link_stats64 *storage)
7382{
7383        const struct net_device_ops *ops = dev->netdev_ops;
7384
7385        if (ops->ndo_get_stats64) {
7386                memset(storage, 0, sizeof(*storage));
7387                ops->ndo_get_stats64(dev, storage);
7388        } else if (ops->ndo_get_stats) {
7389                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7390        } else {
7391                netdev_stats_to_stats64(storage, &dev->stats);
7392        }
7393        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7394        storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7395        storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7396        return storage;
7397}
7398EXPORT_SYMBOL(dev_get_stats);
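/* Illustrative sketch, not part of the original file: reading a device's
 * counters through the common entry point; @storage is caller-provided
 * and is fully rewritten by dev_get_stats().
 */
#if 0	/* example only */
static u64 example_total_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        return stats.rx_packets + stats.tx_packets;
}
#endif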
7399
7400struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7401{
7402        struct netdev_queue *queue = dev_ingress_queue(dev);
7403
7404#ifdef CONFIG_NET_CLS_ACT
7405        if (queue)
7406                return queue;
7407        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7408        if (!queue)
7409                return NULL;
7410        netdev_init_one_queue(dev, queue, NULL);
7411        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7412        queue->qdisc_sleeping = &noop_qdisc;
7413        rcu_assign_pointer(dev->ingress_queue, queue);
7414#endif
7415        return queue;
7416}
7417
7418static const struct ethtool_ops default_ethtool_ops;
7419
7420void netdev_set_default_ethtool_ops(struct net_device *dev,
7421                                    const struct ethtool_ops *ops)
7422{
7423        if (dev->ethtool_ops == &default_ethtool_ops)
7424                dev->ethtool_ops = ops;
7425}
7426EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7427
7428void netdev_freemem(struct net_device *dev)
7429{
7430        char *addr = (char *)dev - dev->padded;
7431
7432        kvfree(addr);
7433}
7434
7435/**
7436 *      alloc_netdev_mqs - allocate network device
7437 *      @sizeof_priv:           size of private data to allocate space for
7438 *      @name:                  device name format string
7439 *      @name_assign_type:      origin of device name
7440 *      @setup:                 callback to initialize device
7441 *      @txqs:                  the number of TX subqueues to allocate
7442 *      @rxqs:                  the number of RX subqueues to allocate
7443 *
7444 *      Allocates a struct net_device with private data area for driver use
7445 *      and performs basic initialization.  Also allocates subqueue structs
7446 *      for each queue on the device.
7447 */
7448struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7449                unsigned char name_assign_type,
7450                void (*setup)(struct net_device *),
7451                unsigned int txqs, unsigned int rxqs)
7452{
7453        struct net_device *dev;
7454        size_t alloc_size;
7455        struct net_device *p;
7456
7457        BUG_ON(strlen(name) >= sizeof(dev->name));
7458
7459        if (txqs < 1) {
7460                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7461                return NULL;
7462        }
7463
7464#ifdef CONFIG_SYSFS
7465        if (rxqs < 1) {
7466                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7467                return NULL;
7468        }
7469#endif
7470
7471        alloc_size = sizeof(struct net_device);
7472        if (sizeof_priv) {
7473                /* ensure 32-byte alignment of private area */
7474                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7475                alloc_size += sizeof_priv;
7476        }
7477        /* ensure 32-byte alignment of whole construct */
7478        alloc_size += NETDEV_ALIGN - 1;
7479
7480        p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7481        if (!p)
7482                p = vzalloc(alloc_size);
7483        if (!p)
7484                return NULL;
7485
7486        dev = PTR_ALIGN(p, NETDEV_ALIGN);
7487        dev->padded = (char *)dev - (char *)p;
7488
7489        dev->pcpu_refcnt = alloc_percpu(int);
7490        if (!dev->pcpu_refcnt)
7491                goto free_dev;
7492
7493        if (dev_addr_init(dev))
7494                goto free_pcpu;
7495
7496        dev_mc_init(dev);
7497        dev_uc_init(dev);
7498
7499        dev_net_set(dev, &init_net);
7500
7501        dev->gso_max_size = GSO_MAX_SIZE;
7502        dev->gso_max_segs = GSO_MAX_SEGS;
7503
7504        INIT_LIST_HEAD(&dev->napi_list);
7505        INIT_LIST_HEAD(&dev->unreg_list);
7506        INIT_LIST_HEAD(&dev->close_list);
7507        INIT_LIST_HEAD(&dev->link_watch_list);
7508        INIT_LIST_HEAD(&dev->adj_list.upper);
7509        INIT_LIST_HEAD(&dev->adj_list.lower);
7510        INIT_LIST_HEAD(&dev->all_adj_list.upper);
7511        INIT_LIST_HEAD(&dev->all_adj_list.lower);
7512        INIT_LIST_HEAD(&dev->ptype_all);
7513        INIT_LIST_HEAD(&dev->ptype_specific);
7514        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7515        setup(dev);
7516
7517        if (!dev->tx_queue_len) {
7518                dev->priv_flags |= IFF_NO_QUEUE;
7519                dev->tx_queue_len = 1;
7520        }
7521
7522        dev->num_tx_queues = txqs;
7523        dev->real_num_tx_queues = txqs;
7524        if (netif_alloc_netdev_queues(dev))
7525                goto free_all;
7526
7527#ifdef CONFIG_SYSFS
7528        dev->num_rx_queues = rxqs;
7529        dev->real_num_rx_queues = rxqs;
7530        if (netif_alloc_rx_queues(dev))
7531                goto free_all;
7532#endif
7533
7534        strcpy(dev->name, name);
7535        dev->name_assign_type = name_assign_type;
7536        dev->group = INIT_NETDEV_GROUP;
7537        if (!dev->ethtool_ops)
7538                dev->ethtool_ops = &default_ethtool_ops;
7539
7540        nf_hook_ingress_init(dev);
7541
7542        return dev;
7543
7544free_all:
7545        free_netdev(dev);
7546        return NULL;
7547
7548free_pcpu:
7549        free_percpu(dev->pcpu_refcnt);
7550free_dev:
7551        netdev_freemem(dev);
7552        return NULL;
7553}
7554EXPORT_SYMBOL(alloc_netdev_mqs);
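/* Illustrative sketch, not part of the original file: allocating a
 * multiqueue Ethernet-style device directly through alloc_netdev_mqs().
 * struct example_priv, the "example%d" name template and the queue
 * counts are arbitrary.
 */
#if 0	/* example only */
static struct net_device *example_alloc(void)
{
        return alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
                                NET_NAME_UNKNOWN, ether_setup, 8, 8);
}
#endif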
7555
7556/**
7557 *      free_netdev - free network device
7558 *      @dev: device
7559 *
7560 *      This function does the last stage of destroying an allocated device
7561 *      interface. The reference to the device object is released.
7562 *      If this is the last reference then it will be freed.
7563 *      Must be called in process context.
7564 */
7565void free_netdev(struct net_device *dev)
7566{
7567        struct napi_struct *p, *n;
7568
7569        might_sleep();
7570        netif_free_tx_queues(dev);
7571#ifdef CONFIG_SYSFS
7572        kvfree(dev->_rx);
7573#endif
7574
7575        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7576
7577        /* Flush device addresses */
7578        dev_addr_flush(dev);
7579
7580        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7581                netif_napi_del(p);
7582
7583        free_percpu(dev->pcpu_refcnt);
7584        dev->pcpu_refcnt = NULL;
7585
7586        /*  Compatibility with error handling in drivers */
7587        if (dev->reg_state == NETREG_UNINITIALIZED) {
7588                netdev_freemem(dev);
7589                return;
7590        }
7591
7592        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7593        dev->reg_state = NETREG_RELEASED;
7594
7595        /* will free via device release */
7596        put_device(&dev->dev);
7597}
7598EXPORT_SYMBOL(free_netdev);
7599
7600/**
7601 *      synchronize_net -  Synchronize with packet receive processing
7602 *
7603 *      Wait for packets currently being received to be done.
7604 *      Does not block later packets from starting.
7605 */
7606void synchronize_net(void)
7607{
7608        might_sleep();
7609        if (rtnl_is_locked())
7610                synchronize_rcu_expedited();
7611        else
7612                synchronize_rcu();
7613}
7614EXPORT_SYMBOL(synchronize_net);
7615
7616/**
7617 *      unregister_netdevice_queue - remove device from the kernel
7618 *      @dev: device
7619 *      @head: list
7620 *
7621 *      This function shuts down a device interface and removes it
7622 *      from the kernel tables.
7623 *      If head not NULL, device is queued to be unregistered later.
7624 *
7625 *      Callers must hold the rtnl semaphore.  You may want
7626 *      unregister_netdev() instead of this.
7627 */
7628
7629void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7630{
7631        ASSERT_RTNL();
7632
7633        if (head) {
7634                list_move_tail(&dev->unreg_list, head);
7635        } else {
7636                rollback_registered(dev);
7637                /* Finish processing unregister after unlock */
7638                net_set_todo(dev);
7639        }
7640}
7641EXPORT_SYMBOL(unregister_netdevice_queue);
7642
7643/**
7644 *      unregister_netdevice_many - unregister many devices
7645 *      @head: list of devices
7646 *
7647 *  Note: As most callers use a stack-allocated list_head,
7648 *  we force a list_del() to make sure the stack won't be corrupted later.
7649 */
7650void unregister_netdevice_many(struct list_head *head)
7651{
7652        struct net_device *dev;
7653
7654        if (!list_empty(head)) {
7655                rollback_registered_many(head);
7656                list_for_each_entry(dev, head, unreg_list)
7657                        net_set_todo(dev);
7658                list_del(head);
7659        }
7660}
7661EXPORT_SYMBOL(unregister_netdevice_many);
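/* Illustrative sketch, not part of the original file: tearing down
 * several devices inside one RTNL section so that
 * rollback_registered_many() can batch the close/notify/synchronize
 * work instead of doing it once per device.
 */
#if 0	/* example only */
static void example_destroy_all(struct net_device *devs[], int n)
{
        LIST_HEAD(kill_list);
        int i;

        rtnl_lock();
        for (i = 0; i < n; i++)
                unregister_netdevice_queue(devs[i], &kill_list);
        unregister_netdevice_many(&kill_list);  /* detaches the list head */
        rtnl_unlock();
}
#endif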
7662
7663/**
7664 *      unregister_netdev - remove device from the kernel
7665 *      @dev: device
7666 *
7667 *      This function shuts down a device interface and removes it
7668 *      from the kernel tables.
7669 *
7670 *      This is just a wrapper for unregister_netdevice that takes
7671 *      the rtnl semaphore.  In general you want to use this and not
7672 *      unregister_netdevice.
7673 */
7674void unregister_netdev(struct net_device *dev)
7675{
7676        rtnl_lock();
7677        unregister_netdevice(dev);
7678        rtnl_unlock();
7679}
7680EXPORT_SYMBOL(unregister_netdev);
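/* Illustrative sketch, not part of the original file: the usual
 * remove-time counterpart of the probe pattern shown after
 * register_netdev() above.
 */
#if 0	/* example only */
static void example_remove(struct net_device *dev)
{
        unregister_netdev(dev); /* waits for outstanding references */
        free_netdev(dev);       /* drop the final reference */
}
#endif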
7681
7682/**
7683 *      dev_change_net_namespace - move device to a different network namespace
7684 *      @dev: device
7685 *      @net: network namespace
7686 *      @pat: If not NULL name pattern to try if the current device name
7687 *            is already taken in the destination network namespace.
7688 *
7689 *      This function shuts down a device interface and moves it
7690 *      to a new network namespace. On success 0 is returned, on
7691 *      a failure a netagive errno code is returned.
7692 *      a failure a negative errno code is returned.
7693 *      Callers must hold the rtnl semaphore.
7694 */
7695
7696int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7697{
7698        int err;
7699
7700        ASSERT_RTNL();
7701
7702        /* Don't allow namespace local devices to be moved. */
7703        err = -EINVAL;
7704        if (dev->features & NETIF_F_NETNS_LOCAL)
7705                goto out;
7706
7707        /* Ensure the device has been registered */
7708        if (dev->reg_state != NETREG_REGISTERED)
7709                goto out;
7710
7711        /* Get out if there is nothing to do */
7712        err = 0;
7713        if (net_eq(dev_net(dev), net))
7714                goto out;
7715
7716        /* Pick the destination device name, and ensure
7717         * we can use it in the destination network namespace.
7718         */
7719        err = -EEXIST;
7720        if (__dev_get_by_name(net, dev->name)) {
7721                /* We get here if we can't use the current device name */
7722                if (!pat)
7723                        goto out;
7724                if (dev_get_valid_name(net, dev, pat) < 0)
7725                        goto out;
7726        }
7727
7728        /*
7729         * And now a mini version of register_netdevice() and unregister_netdevice().
7730         */
7731
7732        /* If the device is running, close it first. */
7733        dev_close(dev);
7734
7735        /* And unlink it from device chain */
7736        err = -ENODEV;
7737        unlist_netdevice(dev);
7738
7739        synchronize_net();
7740
7741        /* Shutdown queueing discipline. */
7742        dev_shutdown(dev);
7743
7744        /* Notify protocols that we are about to destroy
7745           this device. They should clean up all of their state.
7746
7747           Note that dev->reg_state stays at NETREG_REGISTERED.
7748           This is intentional, so that 8021q and macvlan know
7749           the device is just moving and can keep their slaves up.
7750        */
7751        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7752        rcu_barrier();
7753        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7754        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7755
7756        /*
7757         *      Flush the unicast and multicast chains
7758         */
7759        dev_uc_flush(dev);
7760        dev_mc_flush(dev);
7761
7762        /* Send a netdev-removed uevent to the old namespace */
7763        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7764        netdev_adjacent_del_links(dev);
7765
7766        /* Actually switch the network namespace */
7767        dev_net_set(dev, net);
7768
7769        /* If there is an ifindex conflict, assign a new one */
7770        if (__dev_get_by_index(net, dev->ifindex))
7771                dev->ifindex = dev_new_index(net);
7772
7773        /* Send a netdev-add uevent to the new namespace */
7774        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7775        netdev_adjacent_add_links(dev);
7776
7777        /* Fixup kobjects */
7778        err = device_rename(&dev->dev, dev->name);
7779        WARN_ON(err);
7780
7781        /* Add the device back in the hashes */
7782        list_netdevice(dev);
7783
7784        /* Notify protocols that a new device appeared. */
7785        call_netdevice_notifiers(NETDEV_REGISTER, dev);
7786
7787        /*
7788         *      Prevent userspace races by waiting until the network
7789         *      device is fully setup before sending notifications.
7790         */
7791        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7792
7793        synchronize_net();
7794        err = 0;
7795out:
7796        return err;
7797}
7798EXPORT_SYMBOL_GPL(dev_change_net_namespace);
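
/*
 * A minimal sketch (editorial example, not part of the kernel source):
 * moving a device into another namespace under the rtnl lock, falling
 * back to a "dev%d"-style name if the current name is already taken in
 * the target namespace.  example_move_dev() is a hypothetical helper;
 * "target" is assumed to have been looked up and reference-counted by
 * the caller, e.g. via get_net_ns_by_fd().
 */
static int example_move_dev(struct net_device *dev, struct net *target)
{
        int err;

        rtnl_lock();
        err = dev_change_net_namespace(dev, target, "dev%d");
        rtnl_unlock();

        return err;
}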
7799
7800static int dev_cpu_callback(struct notifier_block *nfb,
7801                            unsigned long action,
7802                            void *ocpu)
7803{
7804        struct sk_buff **list_skb;
7805        struct sk_buff *skb;
7806        unsigned int cpu, oldcpu = (unsigned long)ocpu;
7807        struct softnet_data *sd, *oldsd;
7808
7809        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7810                return NOTIFY_OK;
7811
7812        local_irq_disable();
7813        cpu = smp_processor_id();
7814        sd = &per_cpu(softnet_data, cpu);
7815        oldsd = &per_cpu(softnet_data, oldcpu);
7816
7817        /* Find end of our completion_queue. */
7818        list_skb = &sd->completion_queue;
7819        while (*list_skb)
7820                list_skb = &(*list_skb)->next;
7821        /* Append completion queue from offline CPU. */
7822        *list_skb = oldsd->completion_queue;
7823        oldsd->completion_queue = NULL;
7824
7825        /* Append output queue from offline CPU. */
7826        if (oldsd->output_queue) {
7827                *sd->output_queue_tailp = oldsd->output_queue;
7828                sd->output_queue_tailp = oldsd->output_queue_tailp;
7829                oldsd->output_queue = NULL;
7830                oldsd->output_queue_tailp = &oldsd->output_queue;
7831        }
7832        /* Append NAPI poll list from offline CPU, with one exception:
7833         * process_backlog() must be called by the cpu owning the percpu backlog.
7834         * We properly handle process_queue & input_pkt_queue later.
7835         */
7836        while (!list_empty(&oldsd->poll_list)) {
7837                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7838                                                            struct napi_struct,
7839                                                            poll_list);
7840
7841                list_del_init(&napi->poll_list);
7842                if (napi->poll == process_backlog)
7843                        napi->state = 0;
7844                else
7845                        ____napi_schedule(sd, napi);
7846        }
7847
7848        raise_softirq_irqoff(NET_TX_SOFTIRQ);
7849        local_irq_enable();
7850
7851        /* Process offline CPU's input_pkt_queue */
7852        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7853                netif_rx_ni(skb);
7854                input_queue_head_incr(oldsd);
7855        }
7856        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7857                netif_rx_ni(skb);
7858                input_queue_head_incr(oldsd);
7859        }
7860
7861        return NOTIFY_OK;
7862}
7863
7864
7865/**
7866 *      netdev_increment_features - increment feature set by one
7867 *      @all: current feature set
7868 *      @one: new feature set
7869 *      @mask: mask feature set
7870 *
7871 *      Computes a new feature set after adding a device with feature set
7872 *      @one to the master device with current feature set @all.  Will not
7873 *      enable anything that is off in @mask. Returns the new feature set.
7874 */
7875netdev_features_t netdev_increment_features(netdev_features_t all,
7876        netdev_features_t one, netdev_features_t mask)
7877{
7878        if (mask & NETIF_F_HW_CSUM)
7879                mask |= NETIF_F_CSUM_MASK;
7880        mask |= NETIF_F_VLAN_CHALLENGED;
7881
7882        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7883        all &= one | ~NETIF_F_ALL_FOR_ALL;
7884
7885        /* If one device supports hw checksumming, set for all. */
7886        if (all & NETIF_F_HW_CSUM)
7887                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7888
7889        return all;
7890}
7891EXPORT_SYMBOL(netdev_increment_features);
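
/*
 * A minimal sketch (editorial example, not part of the kernel source):
 * how an aggregating ("master") device can fold its slaves' feature sets
 * together, in the spirit of what the bonding and team drivers do.  The
 * struct example_slave and its list handling are hypothetical; only the
 * netdev_increment_features() call reflects the API above.
 */
struct example_slave {
        struct net_device *dev;
        struct list_head list;
};

static netdev_features_t example_compute_features(struct list_head *slaves,
                                                   netdev_features_t mask)
{
        struct example_slave *slave;
        netdev_features_t features = mask;      /* start from the full mask */

        list_for_each_entry(slave, slaves, list)
                features = netdev_increment_features(features,
                                                     slave->dev->features,
                                                     mask);
        return features;
}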
7892
7893static struct hlist_head * __net_init netdev_create_hash(void)
7894{
7895        int i;
7896        struct hlist_head *hash;
7897
7898        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7899        if (hash != NULL)
7900                for (i = 0; i < NETDEV_HASHENTRIES; i++)
7901                        INIT_HLIST_HEAD(&hash[i]);
7902
7903        return hash;
7904}
7905
7906/* Initialize per network namespace state */
7907static int __net_init netdev_init(struct net *net)
7908{
7909        if (net != &init_net)
7910                INIT_LIST_HEAD(&net->dev_base_head);
7911
7912        net->dev_name_head = netdev_create_hash();
7913        if (net->dev_name_head == NULL)
7914                goto err_name;
7915
7916        net->dev_index_head = netdev_create_hash();
7917        if (net->dev_index_head == NULL)
7918                goto err_idx;
7919
7920        return 0;
7921
7922err_idx:
7923        kfree(net->dev_name_head);
7924err_name:
7925        return -ENOMEM;
7926}
7927
7928/**
7929 *      netdev_drivername - network driver for the device
7930 *      @dev: network device
7931 *
7932 *      Determine network driver for device.
7933 */
7934const char *netdev_drivername(const struct net_device *dev)
7935{
7936        const struct device_driver *driver;
7937        const struct device *parent;
7938        const char *empty = "";
7939
7940        parent = dev->dev.parent;
7941        if (!parent)
7942                return empty;
7943
7944        driver = parent->driver;
7945        if (driver && driver->name)
7946                return driver->name;
7947        return empty;
7948}
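
/*
 * A minimal sketch (editorial example, not part of the kernel source):
 * netdev_drivername() is mainly useful in diagnostics, e.g. the transmit
 * watchdog names the offending driver when a queue stalls.  The
 * example_report_stall() helper and its message are illustrative only.
 */
static void example_report_stall(struct net_device *dev)
{
        pr_warn("%s (%s): transmit path appears to be stuck\n",
                dev->name, netdev_drivername(dev));
}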
7949
7950static void __netdev_printk(const char *level, const struct net_device *dev,
7951                            struct va_format *vaf)
7952{
7953        if (dev && dev->dev.parent) {
7954                dev_printk_emit(level[1] - '0',
7955                                dev->dev.parent,
7956                                "%s %s %s%s: %pV",
7957                                dev_driver_string(dev->dev.parent),
7958                                dev_name(dev->dev.parent),
7959                                netdev_name(dev), netdev_reg_state(dev),
7960                                vaf);
7961        } else if (dev) {
7962                printk("%s%s%s: %pV",
7963                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7964        } else {
7965                printk("%s(NULL net_device): %pV", level, vaf);
7966        }
7967}
7968
7969void netdev_printk(const char *level, const struct net_device *dev,
7970                   const char *format, ...)
7971{
7972        struct va_format vaf;
7973        va_list args;
7974
7975        va_start(args, format);
7976
7977        vaf.fmt = format;
7978        vaf.va = &args;
7979
7980        __netdev_printk(level, dev, &vaf);
7981
7982        va_end(args);
7983}
7984EXPORT_SYMBOL(netdev_printk);
7985
7986#define define_netdev_printk_level(func, level)                 \
7987void func(const struct net_device *dev, const char *fmt, ...)   \
7988{                                                               \
7989        struct va_format vaf;                                   \
7990        va_list args;                                           \
7991                                                                \
7992        va_start(args, fmt);                                    \
7993                                                                \
7994        vaf.fmt = fmt;                                          \
7995        vaf.va = &args;                                         \
7996                                                                \
7997        __netdev_printk(level, dev, &vaf);                      \
7998                                                                \
7999        va_end(args);                                           \
8000}                                                               \
8001EXPORT_SYMBOL(func);
8002
8003define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8004define_netdev_printk_level(netdev_alert, KERN_ALERT);
8005define_netdev_printk_level(netdev_crit, KERN_CRIT);
8006define_netdev_printk_level(netdev_err, KERN_ERR);
8007define_netdev_printk_level(netdev_warn, KERN_WARNING);
8008define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8009define_netdev_printk_level(netdev_info, KERN_INFO);
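
/*
 * A minimal sketch (editorial example, not part of the kernel source):
 * drivers normally use these helpers rather than raw printk() so that
 * messages carry the parent bus device, driver and interface name.  The
 * example_open() function is a hypothetical ndo_open implementation;
 * netdev_dbg() is declared in <linux/netdevice.h> rather than generated
 * by the macro above.
 */
static int example_open(struct net_device *dev)
{
        if (!netif_carrier_ok(dev))
                netdev_info(dev, "link is not up yet\n");

        netdev_dbg(dev, "opened with %u tx queue(s)\n", dev->num_tx_queues);
        return 0;
}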
8010
8011static void __net_exit netdev_exit(struct net *net)
8012{
8013        kfree(net->dev_name_head);
8014        kfree(net->dev_index_head);
8015}
8016
8017static struct pernet_operations __net_initdata netdev_net_ops = {
8018        .init = netdev_init,
8019        .exit = netdev_exit,
8020};
8021
8022static void __net_exit default_device_exit(struct net *net)
8023{
8024        struct net_device *dev, *aux;
8025        /*
8026         * Push all migratable network devices back to the
8027         * initial network namespace
8028         */
8029        rtnl_lock();
8030        for_each_netdev_safe(net, dev, aux) {
8031                int err;
8032                char fb_name[IFNAMSIZ];
8033
8034                /* Ignore unmovable devices (e.g. loopback) */
8035                if (dev->features & NETIF_F_NETNS_LOCAL)
8036                        continue;
8037
8038                /* Leave virtual devices for the generic cleanup */
8039                if (dev->rtnl_link_ops)
8040                        continue;
8041
8042                /* Push remaining network devices to init_net */
8043                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8044                err = dev_change_net_namespace(dev, &init_net, fb_name);
8045                if (err) {
8046                        pr_emerg("%s: failed to move %s to init_net: %d\n",
8047                                 __func__, dev->name, err);
8048                        BUG();
8049                }
8050        }
8051        rtnl_unlock();
8052}
8053
8054static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8055{
8056        /* Return with the rtnl_lock held when there are no network
8057         * devices unregistering in any network namespace in net_list.
8058         */
8059        struct net *net;
8060        bool unregistering;
8061        DEFINE_WAIT_FUNC(wait, woken_wake_function);
8062
8063        add_wait_queue(&netdev_unregistering_wq, &wait);
8064        for (;;) {
8065                unregistering = false;
8066                rtnl_lock();
8067                list_for_each_entry(net, net_list, exit_list) {
8068                        if (net->dev_unreg_count > 0) {
8069                                unregistering = true;
8070                                break;
8071                        }
8072                }
8073                if (!unregistering)
8074                        break;
8075                __rtnl_unlock();
8076
8077                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8078        }
8079        remove_wait_queue(&netdev_unregistering_wq, &wait);
8080}
8081
8082static void __net_exit default_device_exit_batch(struct list_head *net_list)
8083{
8084        /* At exit all network devices must be removed from a network
8085         * namespace.  Do this in the reverse order of registration.
8086         * Do this across as many network namespaces as possible to
8087         * improve batching efficiency.
8088         */
8089        struct net_device *dev;
8090        struct net *net;
8091        LIST_HEAD(dev_kill_list);
8092
8093        /* To prevent network device cleanup code from dereferencing
8094         * loopback devices or network devices that have been freed,
8095         * wait here for all pending unregistrations to complete
8096         * before unregistering the loopback device and allowing the
8097         * network namespace to be freed.
8098         *
8099         * The netdev todo list containing all network device
8100         * unregistrations that happen in default_device_exit_batch
8101         * will run in the rtnl_unlock() at the end of
8102         * default_device_exit_batch.
8103         */
8104        rtnl_lock_unregistering(net_list);
8105        list_for_each_entry(net, net_list, exit_list) {
8106                for_each_netdev_reverse(net, dev) {
8107                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8108                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8109                        else
8110                                unregister_netdevice_queue(dev, &dev_kill_list);
8111                }
8112        }
8113        unregister_netdevice_many(&dev_kill_list);
8114        rtnl_unlock();
8115}
8116
8117static struct pernet_operations __net_initdata default_device_ops = {
8118        .exit = default_device_exit,
8119        .exit_batch = default_device_exit_batch,
8120};
8121
8122/*
8123 *      Initialize the DEV module. At boot time this walks the device list and
8124 *      unhooks any devices that fail to initialise (normally hardware not
8125 *      present) and leaves us with a valid list of present and active devices.
8126 *
8127 */
8128
8129/*
8130 *       This is called single-threaded during boot, so no need
8131 *       to take the rtnl semaphore.
8132 */
8133static int __init net_dev_init(void)
8134{
8135        int i, rc = -ENOMEM;
8136
8137        BUG_ON(!dev_boot_phase);
8138
8139        if (dev_proc_init())
8140                goto out;
8141
8142        if (netdev_kobject_init())
8143                goto out;
8144
8145        INIT_LIST_HEAD(&ptype_all);
8146        for (i = 0; i < PTYPE_HASH_SIZE; i++)
8147                INIT_LIST_HEAD(&ptype_base[i]);
8148
8149        INIT_LIST_HEAD(&offload_base);
8150
8151        if (register_pernet_subsys(&netdev_net_ops))
8152                goto out;
8153
8154        /*
8155         *      Initialise the packet receive queues.
8156         */
8157
8158        for_each_possible_cpu(i) {
8159                struct softnet_data *sd = &per_cpu(softnet_data, i);
8160
8161                skb_queue_head_init(&sd->input_pkt_queue);
8162                skb_queue_head_init(&sd->process_queue);
8163                INIT_LIST_HEAD(&sd->poll_list);
8164                sd->output_queue_tailp = &sd->output_queue;
8165#ifdef CONFIG_RPS
8166                sd->csd.func = rps_trigger_softirq;
8167                sd->csd.info = sd;
8168                sd->cpu = i;
8169#endif
8170
8171                sd->backlog.poll = process_backlog;
8172                sd->backlog.weight = weight_p;
8173        }
8174
8175        dev_boot_phase = 0;
8176
8177        /* The loopback device is special: if any other network device
8178         * is present in a network namespace, the loopback device must
8179         * be present too.  Since we now dynamically allocate and free
8180         * the loopback device, ensure this invariant is maintained by
8181         * keeping the loopback device as the first device on the
8182         * list of network devices, so that the loopback device
8183         * is the first device that appears and the last network device
8184         * that disappears.
8185         */
8186        if (register_pernet_device(&loopback_net_ops))
8187                goto out;
8188
8189        if (register_pernet_device(&default_device_ops))
8190                goto out;
8191
8192        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8193        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8194
8195        hotcpu_notifier(dev_cpu_callback, 0);
8196        dst_subsys_init();
8197        rc = 0;
8198out:
8199        return rc;
8200}
8201
8202subsys_initcall(net_dev_init);
8203