linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <linux/bpf.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/sctp.h>
 143#include <linux/crash_dump.h>
 144
 145#include "net-sysfs.h"
 146
 147/* Instead of increasing this, you should create a hash table. */
 148#define MAX_GRO_SKBS 8
 149
 150/* This should be increased if a protocol with a bigger head is added. */
 151#define GRO_MAX_HEAD (MAX_HEADER + 128)
 152
 153static DEFINE_SPINLOCK(ptype_lock);
 154static DEFINE_SPINLOCK(offload_lock);
 155struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 156struct list_head ptype_all __read_mostly;       /* Taps */
 157static struct list_head offload_base __read_mostly;
 158
 159static int netif_rx_internal(struct sk_buff *skb);
 160static int call_netdevice_notifiers_info(unsigned long val,
 161                                         struct net_device *dev,
 162                                         struct netdev_notifier_info *info);
 163
 164/*
 165 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 166 * semaphore.
 167 *
 168 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 169 *
 170 * Writers must hold the rtnl semaphore while they loop through the
 171 * dev_base_head list, and hold dev_base_lock for writing when they do the
 172 * actual updates.  This allows pure readers to access the list even
 173 * while a writer is preparing to update it.
 174 *
 175 * To put it another way, dev_base_lock is held for writing only to
 176 * protect against pure readers; the rtnl semaphore provides the
 177 * protection against other writers.
 178 *
  179 * For example usages, see register_netdevice() and
 180 * unregister_netdevice(), which must be called with the rtnl
 181 * semaphore held.
 182 */
 183DEFINE_RWLOCK(dev_base_lock);
 184EXPORT_SYMBOL(dev_base_lock);
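
/*
 * Illustrative sketch (not part of this file): a pure reader walking the
 * device list under dev_base_lock, per the locking rules described above.
 * An RCU reader would instead use rcu_read_lock()/for_each_netdev_rcu().
 *
 *      struct net_device *dev;
 *
 *      read_lock(&dev_base_lock);
 *      for_each_netdev(&init_net, dev)
 *              pr_info("%s\n", dev->name);
 *      read_unlock(&dev_base_lock);
 */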
 185
 186/* protects napi_hash addition/deletion and napi_gen_id */
 187static DEFINE_SPINLOCK(napi_hash_lock);
 188
 189static unsigned int napi_gen_id = NR_CPUS;
 190static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 191
 192static seqcount_t devnet_rename_seq;
 193
 194static inline void dev_base_seq_inc(struct net *net)
 195{
 196        while (++net->dev_base_seq == 0);
 197}
 198
 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200{
 201        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 202
 203        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 204}
 205
 206static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 207{
 208        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 209}
 210
 211static inline void rps_lock(struct softnet_data *sd)
 212{
 213#ifdef CONFIG_RPS
 214        spin_lock(&sd->input_pkt_queue.lock);
 215#endif
 216}
 217
 218static inline void rps_unlock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221        spin_unlock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225/* Device list insertion */
 226static void list_netdevice(struct net_device *dev)
 227{
 228        struct net *net = dev_net(dev);
 229
 230        ASSERT_RTNL();
 231
 232        write_lock_bh(&dev_base_lock);
 233        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 234        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 235        hlist_add_head_rcu(&dev->index_hlist,
 236                           dev_index_hash(net, dev->ifindex));
 237        write_unlock_bh(&dev_base_lock);
 238
 239        dev_base_seq_inc(net);
 240}
 241
 242/* Device list removal
 243 * caller must respect a RCU grace period before freeing/reusing dev
 244 */
 245static void unlist_netdevice(struct net_device *dev)
 246{
 247        ASSERT_RTNL();
 248
 249        /* Unlink dev from the device chain */
 250        write_lock_bh(&dev_base_lock);
 251        list_del_rcu(&dev->dev_list);
 252        hlist_del_rcu(&dev->name_hlist);
 253        hlist_del_rcu(&dev->index_hlist);
 254        write_unlock_bh(&dev_base_lock);
 255
 256        dev_base_seq_inc(dev_net(dev));
 257}
 258
 259/*
 260 *      Our notifier list
 261 */
 262
 263static RAW_NOTIFIER_HEAD(netdev_chain);
 264
 265/*
 266 *      Device drivers call our routines to queue packets here. We empty the
 267 *      queue in the local softnet handler.
 268 */
 269
 270DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 271EXPORT_PER_CPU_SYMBOL(softnet_data);
 272
 273#ifdef CONFIG_LOCKDEP
 274/*
 275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 276 * according to dev->type
 277 */
 278static const unsigned short netdev_lock_type[] =
 279        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 280         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 281         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 282         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 283         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 284         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 285         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 286         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 287         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 288         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 289         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 290         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 291         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 292         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 293         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 294
 295static const char *const netdev_lock_name[] =
 296        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 309         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 310         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 311
 312static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316{
 317        int i;
 318
 319        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                if (netdev_lock_type[i] == dev_type)
 321                        return i;
 322        /* the last key is used by default */
 323        return ARRAY_SIZE(netdev_lock_type) - 1;
 324}
 325
 326static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                 unsigned short dev_type)
 328{
 329        int i;
 330
 331        i = netdev_lock_pos(dev_type);
 332        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                   netdev_lock_name[i]);
 334}
 335
 336static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337{
 338        int i;
 339
 340        i = netdev_lock_pos(dev->type);
 341        lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                   &netdev_addr_lock_key[i],
 343                                   netdev_lock_name[i]);
 344}
 345#else
 346static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                 unsigned short dev_type)
 348{
 349}
 350static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351{
 352}
 353#endif
 354
 355/*******************************************************************************
 356
 357                Protocol management and registration routines
 358
 359*******************************************************************************/
 360
 361/*
 362 *      Add a protocol ID to the list. Now that the input handler is
 363 *      smarter we can dispense with all the messy stuff that used to be
 364 *      here.
 365 *
  366 *      BEWARE!!! Protocol handlers that mangle input packets
  367 *      MUST BE last in the hash buckets, and checking of protocol handlers
  368 *      MUST start from the promiscuous ptype_all chain in net_bh.
  369 *      This is true now; do not change it.
  370 *      Explanation: if a packet-mangling protocol handler were
  371 *      first on the list, it could not sense that the packet
  372 *      is cloned and should be copied-on-write, so it would
  373 *      modify it in place and subsequent readers would get a broken packet.
 374 *                                                      --ANK (980803)
 375 */
 376
 377static inline struct list_head *ptype_head(const struct packet_type *pt)
 378{
 379        if (pt->type == htons(ETH_P_ALL))
 380                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 381        else
 382                return pt->dev ? &pt->dev->ptype_specific :
 383                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384}
 385
 386/**
 387 *      dev_add_pack - add packet handler
 388 *      @pt: packet type declaration
 389 *
 390 *      Add a protocol handler to the networking stack. The passed &packet_type
 391 *      is linked into kernel lists and may not be freed until it has been
 392 *      removed from the kernel lists.
 393 *
  394 *      This call does not sleep, therefore it cannot
  395 *      guarantee that all CPUs that are in the middle of receiving packets
  396 *      will see the new packet type (until the next received packet).
 397 */
 398
 399void dev_add_pack(struct packet_type *pt)
 400{
 401        struct list_head *head = ptype_head(pt);
 402
 403        spin_lock(&ptype_lock);
 404        list_add_rcu(&pt->list, head);
 405        spin_unlock(&ptype_lock);
 406}
 407EXPORT_SYMBOL(dev_add_pack);
 408
 409/**
 410 *      __dev_remove_pack        - remove packet handler
 411 *      @pt: packet type declaration
 412 *
 413 *      Remove a protocol handler that was previously added to the kernel
 414 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415 *      from the kernel lists and can be freed or reused once this function
 416 *      returns.
 417 *
 418 *      The packet type might still be in use by receivers
  419 *      and must not be freed until after all the CPUs have gone
 420 *      through a quiescent state.
 421 */
 422void __dev_remove_pack(struct packet_type *pt)
 423{
 424        struct list_head *head = ptype_head(pt);
 425        struct packet_type *pt1;
 426
 427        spin_lock(&ptype_lock);
 428
 429        list_for_each_entry(pt1, head, list) {
 430                if (pt == pt1) {
 431                        list_del_rcu(&pt->list);
 432                        goto out;
 433                }
 434        }
 435
 436        pr_warn("dev_remove_pack: %p not found\n", pt);
 437out:
 438        spin_unlock(&ptype_lock);
 439}
 440EXPORT_SYMBOL(__dev_remove_pack);
 441
 442/**
 443 *      dev_remove_pack  - remove packet handler
 444 *      @pt: packet type declaration
 445 *
 446 *      Remove a protocol handler that was previously added to the kernel
 447 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448 *      from the kernel lists and can be freed or reused once this function
 449 *      returns.
 450 *
 451 *      This call sleeps to guarantee that no CPU is looking at the packet
 452 *      type after return.
 453 */
 454void dev_remove_pack(struct packet_type *pt)
 455{
 456        __dev_remove_pack(pt);
 457
 458        synchronize_net();
 459}
 460EXPORT_SYMBOL(dev_remove_pack);
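
/*
 * Illustrative sketch (not part of this file): registering and removing a
 * packet handler with dev_add_pack()/dev_remove_pack(). The handler name
 * my_ipv4_rcv() and the packet_type instance are hypothetical.
 *
 *      static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *                             struct packet_type *pt,
 *                             struct net_device *orig_dev)
 *      {
 *              // inspect skb here; we own this reference
 *              kfree_skb(skb);
 *              return NET_RX_SUCCESS;
 *      }
 *
 *      static struct packet_type my_ptype __read_mostly = {
 *              .type = htons(ETH_P_IP),
 *              .func = my_ipv4_rcv,
 *      };
 *
 *      dev_add_pack(&my_ptype);        // e.g. from module init
 *      ...
 *      dev_remove_pack(&my_ptype);     // e.g. from module exit
 */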
 461
 462
 463/**
 464 *      dev_add_offload - register offload handlers
 465 *      @po: protocol offload declaration
 466 *
 467 *      Add protocol offload handlers to the networking stack. The passed
  468 *      &packet_offload is linked into kernel lists and may not be freed until
 469 *      it has been removed from the kernel lists.
 470 *
  471 *      This call does not sleep, therefore it cannot
  472 *      guarantee that all CPUs that are in the middle of receiving packets
  473 *      will see the new offload handlers (until the next received packet).
 474 */
 475void dev_add_offload(struct packet_offload *po)
 476{
 477        struct packet_offload *elem;
 478
 479        spin_lock(&offload_lock);
 480        list_for_each_entry(elem, &offload_base, list) {
 481                if (po->priority < elem->priority)
 482                        break;
 483        }
 484        list_add_rcu(&po->list, elem->list.prev);
 485        spin_unlock(&offload_lock);
 486}
 487EXPORT_SYMBOL(dev_add_offload);
 488
 489/**
 490 *      __dev_remove_offload     - remove offload handler
 491 *      @po: packet offload declaration
 492 *
 493 *      Remove a protocol offload handler that was previously added to the
  494 *      kernel offload handlers by dev_add_offload(). The passed &packet_offload
 495 *      is removed from the kernel lists and can be freed or reused once this
 496 *      function returns.
 497 *
 498 *      The packet type might still be in use by receivers
  499 *      and must not be freed until after all the CPUs have gone
 500 *      through a quiescent state.
 501 */
 502static void __dev_remove_offload(struct packet_offload *po)
 503{
 504        struct list_head *head = &offload_base;
 505        struct packet_offload *po1;
 506
 507        spin_lock(&offload_lock);
 508
 509        list_for_each_entry(po1, head, list) {
 510                if (po == po1) {
 511                        list_del_rcu(&po->list);
 512                        goto out;
 513                }
 514        }
 515
 516        pr_warn("dev_remove_offload: %p not found\n", po);
 517out:
 518        spin_unlock(&offload_lock);
 519}
 520
 521/**
 522 *      dev_remove_offload       - remove packet offload handler
 523 *      @po: packet offload declaration
 524 *
 525 *      Remove a packet offload handler that was previously added to the kernel
  526 *      offload handlers by dev_add_offload(). The passed &packet_offload is
 527 *      removed from the kernel lists and can be freed or reused once this
 528 *      function returns.
 529 *
 530 *      This call sleeps to guarantee that no CPU is looking at the packet
 531 *      type after return.
 532 */
 533void dev_remove_offload(struct packet_offload *po)
 534{
 535        __dev_remove_offload(po);
 536
 537        synchronize_net();
 538}
 539EXPORT_SYMBOL(dev_remove_offload);
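
/*
 * Illustrative sketch (not part of this file): registering GRO/GSO offload
 * callbacks for a protocol. The callback names are hypothetical placeholders;
 * real users (e.g. the IPv4 offload in net/ipv4/af_inet.c) fill in .callbacks
 * with their gso_segment, gro_receive and gro_complete implementations.
 *
 *      static struct packet_offload my_offload __read_mostly = {
 *              .type = htons(ETH_P_IP),
 *              .priority = 0,
 *              .callbacks = {
 *                      .gso_segment  = my_gso_segment,
 *                      .gro_receive  = my_gro_receive,
 *                      .gro_complete = my_gro_complete,
 *              },
 *      };
 *
 *      dev_add_offload(&my_offload);
 *      ...
 *      dev_remove_offload(&my_offload);
 */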
 540
 541/******************************************************************************
 542
 543                      Device Boot-time Settings Routines
 544
 545*******************************************************************************/
 546
 547/* Boot time configuration table */
 548static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 549
 550/**
 551 *      netdev_boot_setup_add   - add new setup entry
 552 *      @name: name of the device
 553 *      @map: configured settings for the device
 554 *
  555 *      Adds a new setup entry to the dev_boot_setup list.  The function
  556 *      returns 0 on error and 1 on success.  This is a generic routine for
  557 *      all netdevices.
 558 */
 559static int netdev_boot_setup_add(char *name, struct ifmap *map)
 560{
 561        struct netdev_boot_setup *s;
 562        int i;
 563
 564        s = dev_boot_setup;
 565        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 566                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 567                        memset(s[i].name, 0, sizeof(s[i].name));
 568                        strlcpy(s[i].name, name, IFNAMSIZ);
 569                        memcpy(&s[i].map, map, sizeof(s[i].map));
 570                        break;
 571                }
 572        }
 573
 574        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 575}
 576
 577/**
 578 *      netdev_boot_setup_check - check boot time settings
 579 *      @dev: the netdevice
 580 *
  581 *      Check boot time settings for the device.
  582 *      Any settings found are applied to the device for use
  583 *      later during device probing.
  584 *      Returns 0 if no settings are found, 1 if they are.
 585 */
 586int netdev_boot_setup_check(struct net_device *dev)
 587{
 588        struct netdev_boot_setup *s = dev_boot_setup;
 589        int i;
 590
 591        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 592                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 593                    !strcmp(dev->name, s[i].name)) {
 594                        dev->irq        = s[i].map.irq;
 595                        dev->base_addr  = s[i].map.base_addr;
 596                        dev->mem_start  = s[i].map.mem_start;
 597                        dev->mem_end    = s[i].map.mem_end;
 598                        return 1;
 599                }
 600        }
 601        return 0;
 602}
 603EXPORT_SYMBOL(netdev_boot_setup_check);
 604
 605
 606/**
 607 *      netdev_boot_base        - get address from boot time settings
 608 *      @prefix: prefix for network device
 609 *      @unit: id for network device
 610 *
  611 *      Check boot time settings for the base address of the device.
  612 *      Any base address found is used for the device
  613 *      later during device probing.
  614 *      Returns 0 if no settings are found.
 615 */
 616unsigned long netdev_boot_base(const char *prefix, int unit)
 617{
 618        const struct netdev_boot_setup *s = dev_boot_setup;
 619        char name[IFNAMSIZ];
 620        int i;
 621
 622        sprintf(name, "%s%d", prefix, unit);
 623
 624        /*
 625         * If device already registered then return base of 1
 626         * to indicate not to probe for this interface
 627         */
 628        if (__dev_get_by_name(&init_net, name))
 629                return 1;
 630
 631        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 632                if (!strcmp(name, s[i].name))
 633                        return s[i].map.base_addr;
 634        return 0;
 635}
 636
 637/*
  638 * Saves the settings configured at boot time for any netdevice.
 639 */
 640int __init netdev_boot_setup(char *str)
 641{
 642        int ints[5];
 643        struct ifmap map;
 644
 645        str = get_options(str, ARRAY_SIZE(ints), ints);
 646        if (!str || !*str)
 647                return 0;
 648
 649        /* Save settings */
 650        memset(&map, 0, sizeof(map));
 651        if (ints[0] > 0)
 652                map.irq = ints[1];
 653        if (ints[0] > 1)
 654                map.base_addr = ints[2];
 655        if (ints[0] > 2)
 656                map.mem_start = ints[3];
 657        if (ints[0] > 3)
 658                map.mem_end = ints[4];
 659
 660        /* Add new entry to the list */
 661        return netdev_boot_setup_add(str, &map);
 662}
 663
 664__setup("netdev=", netdev_boot_setup);
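
/*
 * Illustrative sketch (not part of this file): a possible "netdev=" boot
 * parameter as parsed by netdev_boot_setup() above. The leading integers
 * land in ints[1..4] (irq, base_addr, mem_start, mem_end) and the trailing
 * string is the device name:
 *
 *      netdev=9,0x300,0,0,eth1
 *
 * records irq 9 and I/O base 0x300 (with zero mem_start/mem_end) for a
 * device that will later pick these up via netdev_boot_setup_check().
 */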
 665
 666/*******************************************************************************
 667
 668                            Device Interface Subroutines
 669
 670*******************************************************************************/
 671
 672/**
  673 *      dev_get_iflink  - get 'iflink' value of an interface
 674 *      @dev: targeted interface
 675 *
 676 *      Indicates the ifindex the interface is linked to.
 677 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 678 */
 679
 680int dev_get_iflink(const struct net_device *dev)
 681{
 682        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 683                return dev->netdev_ops->ndo_get_iflink(dev);
 684
 685        return dev->ifindex;
 686}
 687EXPORT_SYMBOL(dev_get_iflink);
 688
 689/**
 690 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 691 *      @dev: targeted interface
 692 *      @skb: The packet.
 693 *
  694 *      For better visibility of tunnel traffic, OVS needs to retrieve
  695 *      egress tunnel information for a packet. The following API allows
  696 *      the user to get this info.
 697 */
 698int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 699{
 700        struct ip_tunnel_info *info;
 701
 702        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 703                return -EINVAL;
 704
 705        info = skb_tunnel_info_unclone(skb);
 706        if (!info)
 707                return -ENOMEM;
 708        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 709                return -EINVAL;
 710
 711        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 712}
 713EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 714
 715/**
 716 *      __dev_get_by_name       - find a device by its name
 717 *      @net: the applicable net namespace
 718 *      @name: name to find
 719 *
 720 *      Find an interface by name. Must be called under RTNL semaphore
 721 *      or @dev_base_lock. If the name is found a pointer to the device
 722 *      is returned. If the name is not found then %NULL is returned. The
 723 *      reference counters are not incremented so the caller must be
 724 *      careful with locks.
 725 */
 726
 727struct net_device *__dev_get_by_name(struct net *net, const char *name)
 728{
 729        struct net_device *dev;
 730        struct hlist_head *head = dev_name_hash(net, name);
 731
 732        hlist_for_each_entry(dev, head, name_hlist)
 733                if (!strncmp(dev->name, name, IFNAMSIZ))
 734                        return dev;
 735
 736        return NULL;
 737}
 738EXPORT_SYMBOL(__dev_get_by_name);
 739
 740/**
 741 *      dev_get_by_name_rcu     - find a device by its name
 742 *      @net: the applicable net namespace
 743 *      @name: name to find
 744 *
 745 *      Find an interface by name.
 746 *      If the name is found a pointer to the device is returned.
 747 *      If the name is not found then %NULL is returned.
 748 *      The reference counters are not incremented so the caller must be
 749 *      careful with locks. The caller must hold RCU lock.
 750 */
 751
 752struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 753{
 754        struct net_device *dev;
 755        struct hlist_head *head = dev_name_hash(net, name);
 756
 757        hlist_for_each_entry_rcu(dev, head, name_hlist)
 758                if (!strncmp(dev->name, name, IFNAMSIZ))
 759                        return dev;
 760
 761        return NULL;
 762}
 763EXPORT_SYMBOL(dev_get_by_name_rcu);
 764
 765/**
 766 *      dev_get_by_name         - find a device by its name
 767 *      @net: the applicable net namespace
 768 *      @name: name to find
 769 *
 770 *      Find an interface by name. This can be called from any
 771 *      context and does its own locking. The returned handle has
 772 *      the usage count incremented and the caller must use dev_put() to
 773 *      release it when it is no longer needed. %NULL is returned if no
 774 *      matching device is found.
 775 */
 776
 777struct net_device *dev_get_by_name(struct net *net, const char *name)
 778{
 779        struct net_device *dev;
 780
 781        rcu_read_lock();
 782        dev = dev_get_by_name_rcu(net, name);
 783        if (dev)
 784                dev_hold(dev);
 785        rcu_read_unlock();
 786        return dev;
 787}
 788EXPORT_SYMBOL(dev_get_by_name);
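
/*
 * Illustrative sketch (not part of this file): looking a device up by name
 * from process context. dev_get_by_name() takes a reference that must be
 * dropped with dev_put(); the RCU variant above avoids the refcount but is
 * only valid inside the rcu_read_lock() section.
 *
 *      struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *      if (dev) {
 *              pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
 *              dev_put(dev);
 *      }
 */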
 789
 790/**
 791 *      __dev_get_by_index - find a device by its ifindex
 792 *      @net: the applicable net namespace
 793 *      @ifindex: index of device
 794 *
  795 *      Search for an interface by index. Returns a pointer to the device,
  796 *      or %NULL if it is not found. The device has not
 797 *      had its reference counter increased so the caller must be careful
 798 *      about locking. The caller must hold either the RTNL semaphore
 799 *      or @dev_base_lock.
 800 */
 801
 802struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 803{
 804        struct net_device *dev;
 805        struct hlist_head *head = dev_index_hash(net, ifindex);
 806
 807        hlist_for_each_entry(dev, head, index_hlist)
 808                if (dev->ifindex == ifindex)
 809                        return dev;
 810
 811        return NULL;
 812}
 813EXPORT_SYMBOL(__dev_get_by_index);
 814
 815/**
 816 *      dev_get_by_index_rcu - find a device by its ifindex
 817 *      @net: the applicable net namespace
 818 *      @ifindex: index of device
 819 *
  820 *      Search for an interface by index. Returns a pointer to the device,
  821 *      or %NULL if it is not found. The device has not
 822 *      had its reference counter increased so the caller must be careful
 823 *      about locking. The caller must hold RCU lock.
 824 */
 825
 826struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 827{
 828        struct net_device *dev;
 829        struct hlist_head *head = dev_index_hash(net, ifindex);
 830
 831        hlist_for_each_entry_rcu(dev, head, index_hlist)
 832                if (dev->ifindex == ifindex)
 833                        return dev;
 834
 835        return NULL;
 836}
 837EXPORT_SYMBOL(dev_get_by_index_rcu);
 838
 839
 840/**
 841 *      dev_get_by_index - find a device by its ifindex
 842 *      @net: the applicable net namespace
 843 *      @ifindex: index of device
 844 *
  845 *      Search for an interface by index. Returns a pointer to the device,
  846 *      or NULL if it is not found. The device returned has
 847 *      had a reference added and the pointer is safe until the user calls
 848 *      dev_put to indicate they have finished with it.
 849 */
 850
 851struct net_device *dev_get_by_index(struct net *net, int ifindex)
 852{
 853        struct net_device *dev;
 854
 855        rcu_read_lock();
 856        dev = dev_get_by_index_rcu(net, ifindex);
 857        if (dev)
 858                dev_hold(dev);
 859        rcu_read_unlock();
 860        return dev;
 861}
 862EXPORT_SYMBOL(dev_get_by_index);
 863
 864/**
 865 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 866 *      @net: network namespace
 867 *      @name: a pointer to the buffer where the name will be stored.
 868 *      @ifindex: the ifindex of the interface to get the name from.
 869 *
 870 *      The use of raw_seqcount_begin() and cond_resched() before
 871 *      retrying is required as we want to give the writers a chance
 872 *      to complete when CONFIG_PREEMPT is not set.
 873 */
 874int netdev_get_name(struct net *net, char *name, int ifindex)
 875{
 876        struct net_device *dev;
 877        unsigned int seq;
 878
 879retry:
 880        seq = raw_seqcount_begin(&devnet_rename_seq);
 881        rcu_read_lock();
 882        dev = dev_get_by_index_rcu(net, ifindex);
 883        if (!dev) {
 884                rcu_read_unlock();
 885                return -ENODEV;
 886        }
 887
 888        strcpy(name, dev->name);
 889        rcu_read_unlock();
 890        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 891                cond_resched();
 892                goto retry;
 893        }
 894
 895        return 0;
 896}
 897
 898/**
 899 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 900 *      @net: the applicable net namespace
 901 *      @type: media type of device
 902 *      @ha: hardware address
 903 *
  904 *      Search for an interface by MAC address. Returns a pointer to the
  905 *      device, or NULL if it is not found.
 906 *      The caller must hold RCU or RTNL.
 907 *      The returned device has not had its ref count increased
 908 *      and the caller must therefore be careful about locking
 909 *
 910 */
 911
 912struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 913                                       const char *ha)
 914{
 915        struct net_device *dev;
 916
 917        for_each_netdev_rcu(net, dev)
 918                if (dev->type == type &&
 919                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 920                        return dev;
 921
 922        return NULL;
 923}
 924EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
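
/*
 * Illustrative sketch (not part of this file): finding a device by MAC
 * address. The caller holds RCU and may only use the returned pointer
 * inside the read-side critical section, since no reference is taken.
 *
 *      static const char mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *      struct net_device *dev;
 *
 *      rcu_read_lock();
 *      dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *      if (dev)
 *              pr_info("found %s\n", dev->name);
 *      rcu_read_unlock();
 */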
 925
 926struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 927{
 928        struct net_device *dev;
 929
 930        ASSERT_RTNL();
 931        for_each_netdev(net, dev)
 932                if (dev->type == type)
 933                        return dev;
 934
 935        return NULL;
 936}
 937EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 938
 939struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 940{
 941        struct net_device *dev, *ret = NULL;
 942
 943        rcu_read_lock();
 944        for_each_netdev_rcu(net, dev)
 945                if (dev->type == type) {
 946                        dev_hold(dev);
 947                        ret = dev;
 948                        break;
 949                }
 950        rcu_read_unlock();
 951        return ret;
 952}
 953EXPORT_SYMBOL(dev_getfirstbyhwtype);
 954
 955/**
 956 *      __dev_get_by_flags - find any device with given flags
 957 *      @net: the applicable net namespace
 958 *      @if_flags: IFF_* values
 959 *      @mask: bitmask of bits in if_flags to check
 960 *
  961 *      Search for any interface with the given flags. Returns a pointer to
  962 *      the first matching device, or NULL if none is found. Must be called inside
 963 *      rtnl_lock(), and result refcount is unchanged.
 964 */
 965
 966struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 967                                      unsigned short mask)
 968{
 969        struct net_device *dev, *ret;
 970
 971        ASSERT_RTNL();
 972
 973        ret = NULL;
 974        for_each_netdev(net, dev) {
 975                if (((dev->flags ^ if_flags) & mask) == 0) {
 976                        ret = dev;
 977                        break;
 978                }
 979        }
 980        return ret;
 981}
 982EXPORT_SYMBOL(__dev_get_by_flags);
 983
 984/**
 985 *      dev_valid_name - check if name is okay for network device
 986 *      @name: name string
 987 *
  988 *      Network device names need to be valid file names
  989 *      to allow sysfs to work.  We also disallow any kind of
  990 *      whitespace.
 991 */
 992bool dev_valid_name(const char *name)
 993{
 994        if (*name == '\0')
 995                return false;
 996        if (strlen(name) >= IFNAMSIZ)
 997                return false;
 998        if (!strcmp(name, ".") || !strcmp(name, ".."))
 999                return false;
1000
1001        while (*name) {
1002                if (*name == '/' || *name == ':' || isspace(*name))
1003                        return false;
1004                name++;
1005        }
1006        return true;
1007}
1008EXPORT_SYMBOL(dev_valid_name);
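
/*
 * Illustrative examples (not part of this file) of what dev_valid_name()
 * accepts and rejects:
 *
 *      dev_valid_name("eth0")          -> true
 *      dev_valid_name("br-lan")        -> true
 *      dev_valid_name("")              -> false (empty)
 *      dev_valid_name(".")             -> false (reserved)
 *      dev_valid_name("a/b")           -> false ('/' not allowed)
 *      dev_valid_name("my eth")        -> false (whitespace)
 *      dev_valid_name(<16+ chars>)     -> false (>= IFNAMSIZ)
 */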
1009
1010/**
1011 *      __dev_alloc_name - allocate a name for a device
1012 *      @net: network namespace to allocate the device name in
1013 *      @name: name format string
1014 *      @buf:  scratch buffer and result name string
1015 *
1016 *      Passed a format string - eg "lt%d" it will try and find a suitable
1017 *      id. It scans list of devices to build up a free map, then chooses
1018 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019 *      while allocating the name and adding the device in order to avoid
1020 *      duplicates.
1021 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022 *      Returns the number of the unit assigned or a negative errno code.
1023 */
1024
1025static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1026{
1027        int i = 0;
1028        const char *p;
1029        const int max_netdevices = 8*PAGE_SIZE;
1030        unsigned long *inuse;
1031        struct net_device *d;
1032
1033        p = strnchr(name, IFNAMSIZ-1, '%');
1034        if (p) {
1035                /*
1036                 * Verify the string as this thing may have come from
1037                 * the user.  There must be either one "%d" and no other "%"
1038                 * characters.
1039                 */
1040                if (p[1] != 'd' || strchr(p + 2, '%'))
1041                        return -EINVAL;
1042
1043                /* Use one page as a bit array of possible slots */
1044                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1045                if (!inuse)
1046                        return -ENOMEM;
1047
1048                for_each_netdev(net, d) {
1049                        if (!sscanf(d->name, name, &i))
1050                                continue;
1051                        if (i < 0 || i >= max_netdevices)
1052                                continue;
1053
1054                        /*  avoid cases where sscanf is not exact inverse of printf */
1055                        snprintf(buf, IFNAMSIZ, name, i);
1056                        if (!strncmp(buf, d->name, IFNAMSIZ))
1057                                set_bit(i, inuse);
1058                }
1059
1060                i = find_first_zero_bit(inuse, max_netdevices);
1061                free_page((unsigned long) inuse);
1062        }
1063
1064        if (buf != name)
1065                snprintf(buf, IFNAMSIZ, name, i);
1066        if (!__dev_get_by_name(net, buf))
1067                return i;
1068
1069        /* It is possible to run out of possible slots
1070         * when the name is long and there isn't enough space left
1071         * for the digits, or if all bits are used.
1072         */
1073        return -ENFILE;
1074}
1075
1076/**
1077 *      dev_alloc_name - allocate a name for a device
1078 *      @dev: device
1079 *      @name: name format string
1080 *
1081 *      Passed a format string - eg "lt%d" it will try and find a suitable
1082 *      id. It scans list of devices to build up a free map, then chooses
1083 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1084 *      while allocating the name and adding the device in order to avoid
1085 *      duplicates.
1086 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1087 *      Returns the number of the unit assigned or a negative errno code.
1088 */
1089
1090int dev_alloc_name(struct net_device *dev, const char *name)
1091{
1092        char buf[IFNAMSIZ];
1093        struct net *net;
1094        int ret;
1095
1096        BUG_ON(!dev_net(dev));
1097        net = dev_net(dev);
1098        ret = __dev_alloc_name(net, name, buf);
1099        if (ret >= 0)
1100                strlcpy(dev->name, buf, IFNAMSIZ);
1101        return ret;
1102}
1103EXPORT_SYMBOL(dev_alloc_name);
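
/*
 * Illustrative sketch (not part of this file): typical use of
 * dev_alloc_name() when setting up a device with a wildcard name.
 *
 *      err = dev_alloc_name(dev, "dummy%d");
 *      if (err < 0)
 *              goto out_free;
 *      // dev->name now holds e.g. "dummy0", and err is the unit number
 */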
1104
1105static int dev_alloc_name_ns(struct net *net,
1106                             struct net_device *dev,
1107                             const char *name)
1108{
1109        char buf[IFNAMSIZ];
1110        int ret;
1111
1112        ret = __dev_alloc_name(net, name, buf);
1113        if (ret >= 0)
1114                strlcpy(dev->name, buf, IFNAMSIZ);
1115        return ret;
1116}
1117
1118static int dev_get_valid_name(struct net *net,
1119                              struct net_device *dev,
1120                              const char *name)
1121{
1122        BUG_ON(!net);
1123
1124        if (!dev_valid_name(name))
1125                return -EINVAL;
1126
1127        if (strchr(name, '%'))
1128                return dev_alloc_name_ns(net, dev, name);
1129        else if (__dev_get_by_name(net, name))
1130                return -EEXIST;
1131        else if (dev->name != name)
1132                strlcpy(dev->name, name, IFNAMSIZ);
1133
1134        return 0;
1135}
1136
1137/**
1138 *      dev_change_name - change name of a device
1139 *      @dev: device
1140 *      @newname: name (or format string) must be at least IFNAMSIZ
1141 *
 1142 *      Change the name of a device. A format string such as "eth%d" can
 1143 *      be passed for wildcarding.
1144 */
1145int dev_change_name(struct net_device *dev, const char *newname)
1146{
1147        unsigned char old_assign_type;
1148        char oldname[IFNAMSIZ];
1149        int err = 0;
1150        int ret;
1151        struct net *net;
1152
1153        ASSERT_RTNL();
1154        BUG_ON(!dev_net(dev));
1155
1156        net = dev_net(dev);
1157        if (dev->flags & IFF_UP)
1158                return -EBUSY;
1159
1160        write_seqcount_begin(&devnet_rename_seq);
1161
1162        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1163                write_seqcount_end(&devnet_rename_seq);
1164                return 0;
1165        }
1166
1167        memcpy(oldname, dev->name, IFNAMSIZ);
1168
1169        err = dev_get_valid_name(net, dev, newname);
1170        if (err < 0) {
1171                write_seqcount_end(&devnet_rename_seq);
1172                return err;
1173        }
1174
1175        if (oldname[0] && !strchr(oldname, '%'))
1176                netdev_info(dev, "renamed from %s\n", oldname);
1177
1178        old_assign_type = dev->name_assign_type;
1179        dev->name_assign_type = NET_NAME_RENAMED;
1180
1181rollback:
1182        ret = device_rename(&dev->dev, dev->name);
1183        if (ret) {
1184                memcpy(dev->name, oldname, IFNAMSIZ);
1185                dev->name_assign_type = old_assign_type;
1186                write_seqcount_end(&devnet_rename_seq);
1187                return ret;
1188        }
1189
1190        write_seqcount_end(&devnet_rename_seq);
1191
1192        netdev_adjacent_rename_links(dev, oldname);
1193
1194        write_lock_bh(&dev_base_lock);
1195        hlist_del_rcu(&dev->name_hlist);
1196        write_unlock_bh(&dev_base_lock);
1197
1198        synchronize_rcu();
1199
1200        write_lock_bh(&dev_base_lock);
1201        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1202        write_unlock_bh(&dev_base_lock);
1203
1204        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1205        ret = notifier_to_errno(ret);
1206
1207        if (ret) {
1208                /* err >= 0 after dev_alloc_name() or stores the first errno */
1209                if (err >= 0) {
1210                        err = ret;
1211                        write_seqcount_begin(&devnet_rename_seq);
1212                        memcpy(dev->name, oldname, IFNAMSIZ);
1213                        memcpy(oldname, newname, IFNAMSIZ);
1214                        dev->name_assign_type = old_assign_type;
1215                        old_assign_type = NET_NAME_RENAMED;
1216                        goto rollback;
1217                } else {
1218                        pr_err("%s: name change rollback failed: %d\n",
1219                               dev->name, ret);
1220                }
1221        }
1222
1223        return err;
1224}
1225
1226/**
1227 *      dev_set_alias - change ifalias of a device
1228 *      @dev: device
1229 *      @alias: name up to IFALIASZ
1230 *      @len: limit of bytes to copy from info
1231 *
 1232 *      Set the ifalias for a device.
1233 */
1234int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1235{
1236        char *new_ifalias;
1237
1238        ASSERT_RTNL();
1239
1240        if (len >= IFALIASZ)
1241                return -EINVAL;
1242
1243        if (!len) {
1244                kfree(dev->ifalias);
1245                dev->ifalias = NULL;
1246                return 0;
1247        }
1248
1249        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1250        if (!new_ifalias)
1251                return -ENOMEM;
1252        dev->ifalias = new_ifalias;
1253
1254        strlcpy(dev->ifalias, alias, len+1);
1255        return len;
1256}
1257
1258
1259/**
1260 *      netdev_features_change - device changes features
1261 *      @dev: device to cause notification
1262 *
1263 *      Called to indicate a device has changed features.
1264 */
1265void netdev_features_change(struct net_device *dev)
1266{
1267        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1268}
1269EXPORT_SYMBOL(netdev_features_change);
1270
1271/**
1272 *      netdev_state_change - device changes state
1273 *      @dev: device to cause notification
1274 *
1275 *      Called to indicate a device has changed state. This function calls
1276 *      the notifier chains for netdev_chain and sends a NEWLINK message
1277 *      to the routing socket.
1278 */
1279void netdev_state_change(struct net_device *dev)
1280{
1281        if (dev->flags & IFF_UP) {
1282                struct netdev_notifier_change_info change_info;
1283
1284                change_info.flags_changed = 0;
1285                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1286                                              &change_info.info);
1287                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1288        }
1289}
1290EXPORT_SYMBOL(netdev_state_change);
1291
1292/**
1293 *      netdev_notify_peers - notify network peers about existence of @dev
1294 *      @dev: network device
1295 *
1296 * Generate traffic such that interested network peers are aware of
1297 * @dev, such as by generating a gratuitous ARP. This may be used when
1298 * a device wants to inform the rest of the network about some sort of
1299 * reconfiguration such as a failover event or virtual machine
1300 * migration.
1301 */
1302void netdev_notify_peers(struct net_device *dev)
1303{
1304        rtnl_lock();
1305        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1306        rtnl_unlock();
1307}
1308EXPORT_SYMBOL(netdev_notify_peers);
1309
1310static int __dev_open(struct net_device *dev)
1311{
1312        const struct net_device_ops *ops = dev->netdev_ops;
1313        int ret;
1314
1315        ASSERT_RTNL();
1316
1317        if (!netif_device_present(dev))
1318                return -ENODEV;
1319
1320        /* Block netpoll from trying to do any rx path servicing.
1321         * If we don't do this there is a chance ndo_poll_controller
1322         * or ndo_poll may be running while we open the device
1323         */
1324        netpoll_poll_disable(dev);
1325
1326        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1327        ret = notifier_to_errno(ret);
1328        if (ret)
1329                return ret;
1330
1331        set_bit(__LINK_STATE_START, &dev->state);
1332
1333        if (ops->ndo_validate_addr)
1334                ret = ops->ndo_validate_addr(dev);
1335
1336        if (!ret && ops->ndo_open)
1337                ret = ops->ndo_open(dev);
1338
1339        netpoll_poll_enable(dev);
1340
1341        if (ret)
1342                clear_bit(__LINK_STATE_START, &dev->state);
1343        else {
1344                dev->flags |= IFF_UP;
1345                dev_set_rx_mode(dev);
1346                dev_activate(dev);
1347                add_device_randomness(dev->dev_addr, dev->addr_len);
1348        }
1349
1350        return ret;
1351}
1352
1353/**
1354 *      dev_open        - prepare an interface for use.
1355 *      @dev:   device to open
1356 *
1357 *      Takes a device from down to up state. The device's private open
1358 *      function is invoked and then the multicast lists are loaded. Finally
1359 *      the device is moved into the up state and a %NETDEV_UP message is
1360 *      sent to the netdev notifier chain.
1361 *
1362 *      Calling this function on an active interface is a nop. On a failure
1363 *      a negative errno code is returned.
1364 */
1365int dev_open(struct net_device *dev)
1366{
1367        int ret;
1368
1369        if (dev->flags & IFF_UP)
1370                return 0;
1371
1372        ret = __dev_open(dev);
1373        if (ret < 0)
1374                return ret;
1375
1376        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1377        call_netdevice_notifiers(NETDEV_UP, dev);
1378
1379        return ret;
1380}
1381EXPORT_SYMBOL(dev_open);
1382
1383static int __dev_close_many(struct list_head *head)
1384{
1385        struct net_device *dev;
1386
1387        ASSERT_RTNL();
1388        might_sleep();
1389
1390        list_for_each_entry(dev, head, close_list) {
1391                /* Temporarily disable netpoll until the interface is down */
1392                netpoll_poll_disable(dev);
1393
1394                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1395
1396                clear_bit(__LINK_STATE_START, &dev->state);
1397
 1398                /* Synchronize to the scheduled poll. We cannot touch the poll
 1399                 * list; it can even be on a different CPU. So just clear netif_running().
1400                 *
 1401                 * dev->stop() will invoke napi_disable() on all of its
1402                 * napi_struct instances on this device.
1403                 */
1404                smp_mb__after_atomic(); /* Commit netif_running(). */
1405        }
1406
1407        dev_deactivate_many(head);
1408
1409        list_for_each_entry(dev, head, close_list) {
1410                const struct net_device_ops *ops = dev->netdev_ops;
1411
1412                /*
 1413                 *      Call the device specific close. This cannot fail
 1414                 *      and is only done if the device is UP.
1415                 *
1416                 *      We allow it to be called even after a DETACH hot-plug
1417                 *      event.
1418                 */
1419                if (ops->ndo_stop)
1420                        ops->ndo_stop(dev);
1421
1422                dev->flags &= ~IFF_UP;
1423                netpoll_poll_enable(dev);
1424        }
1425
1426        return 0;
1427}
1428
1429static int __dev_close(struct net_device *dev)
1430{
1431        int retval;
1432        LIST_HEAD(single);
1433
1434        list_add(&dev->close_list, &single);
1435        retval = __dev_close_many(&single);
1436        list_del(&single);
1437
1438        return retval;
1439}
1440
1441int dev_close_many(struct list_head *head, bool unlink)
1442{
1443        struct net_device *dev, *tmp;
1444
1445        /* Remove the devices that don't need to be closed */
1446        list_for_each_entry_safe(dev, tmp, head, close_list)
1447                if (!(dev->flags & IFF_UP))
1448                        list_del_init(&dev->close_list);
1449
1450        __dev_close_many(head);
1451
1452        list_for_each_entry_safe(dev, tmp, head, close_list) {
1453                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1454                call_netdevice_notifiers(NETDEV_DOWN, dev);
1455                if (unlink)
1456                        list_del_init(&dev->close_list);
1457        }
1458
1459        return 0;
1460}
1461EXPORT_SYMBOL(dev_close_many);
1462
1463/**
1464 *      dev_close - shutdown an interface.
1465 *      @dev: device to shutdown
1466 *
1467 *      This function moves an active device into down state. A
1468 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1469 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1470 *      chain.
1471 */
1472int dev_close(struct net_device *dev)
1473{
1474        if (dev->flags & IFF_UP) {
1475                LIST_HEAD(single);
1476
1477                list_add(&dev->close_list, &single);
1478                dev_close_many(&single, true);
1479                list_del(&single);
1480        }
1481        return 0;
1482}
1483EXPORT_SYMBOL(dev_close);
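
/*
 * Illustrative sketch (not part of this file): bringing an interface up and
 * down from kernel code. Both dev_open() and dev_close() expect the RTNL
 * lock to be held by the caller.
 *
 *      rtnl_lock();
 *      err = dev_open(dev);
 *      if (!err) {
 *              ...
 *              dev_close(dev);
 *      }
 *      rtnl_unlock();
 */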
1484
1485
1486/**
1487 *      dev_disable_lro - disable Large Receive Offload on a device
1488 *      @dev: device
1489 *
1490 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1491 *      called under RTNL.  This is needed if received packets may be
1492 *      forwarded to another interface.
1493 */
1494void dev_disable_lro(struct net_device *dev)
1495{
1496        struct net_device *lower_dev;
1497        struct list_head *iter;
1498
1499        dev->wanted_features &= ~NETIF_F_LRO;
1500        netdev_update_features(dev);
1501
1502        if (unlikely(dev->features & NETIF_F_LRO))
1503                netdev_WARN(dev, "failed to disable LRO!\n");
1504
1505        netdev_for_each_lower_dev(dev, lower_dev, iter)
1506                dev_disable_lro(lower_dev);
1507}
1508EXPORT_SYMBOL(dev_disable_lro);
1509
1510static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1511                                   struct net_device *dev)
1512{
1513        struct netdev_notifier_info info;
1514
1515        netdev_notifier_info_init(&info, dev);
1516        return nb->notifier_call(nb, val, &info);
1517}
1518
1519static int dev_boot_phase = 1;
1520
1521/**
1522 *      register_netdevice_notifier - register a network notifier block
1523 *      @nb: notifier
1524 *
1525 *      Register a notifier to be called when network device events occur.
1526 *      The notifier passed is linked into the kernel structures and must
1527 *      not be reused until it has been unregistered. A negative errno code
1528 *      is returned on a failure.
1529 *
 1530 *      When registered, all registration and up events are replayed
 1531 *      to the new notifier to allow it to have a race-free
 1532 *      view of the network device list.
1533 */
1534
1535int register_netdevice_notifier(struct notifier_block *nb)
1536{
1537        struct net_device *dev;
1538        struct net_device *last;
1539        struct net *net;
1540        int err;
1541
1542        rtnl_lock();
1543        err = raw_notifier_chain_register(&netdev_chain, nb);
1544        if (err)
1545                goto unlock;
1546        if (dev_boot_phase)
1547                goto unlock;
1548        for_each_net(net) {
1549                for_each_netdev(net, dev) {
1550                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1551                        err = notifier_to_errno(err);
1552                        if (err)
1553                                goto rollback;
1554
1555                        if (!(dev->flags & IFF_UP))
1556                                continue;
1557
1558                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1559                }
1560        }
1561
1562unlock:
1563        rtnl_unlock();
1564        return err;
1565
1566rollback:
1567        last = dev;
1568        for_each_net(net) {
1569                for_each_netdev(net, dev) {
1570                        if (dev == last)
1571                                goto outroll;
1572
1573                        if (dev->flags & IFF_UP) {
1574                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1575                                                        dev);
1576                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1577                        }
1578                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1579                }
1580        }
1581
1582outroll:
1583        raw_notifier_chain_unregister(&netdev_chain, nb);
1584        goto unlock;
1585}
1586EXPORT_SYMBOL(register_netdevice_notifier);
1587
1588/**
1589 *      unregister_netdevice_notifier - unregister a network notifier block
1590 *      @nb: notifier
1591 *
1592 *      Unregister a notifier previously registered by
1593 *      register_netdevice_notifier(). The notifier is unlinked from the
1594 *      kernel structures and may then be reused. A negative errno code
1595 *      is returned on a failure.
1596 *
1597 *      After unregistering, unregister and down device events are synthesized
1598 *      for all devices on the device list and sent to the removed notifier,
1599 *      removing the need for special case cleanup code.
1600 */
1601
1602int unregister_netdevice_notifier(struct notifier_block *nb)
1603{
1604        struct net_device *dev;
1605        struct net *net;
1606        int err;
1607
1608        rtnl_lock();
1609        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1610        if (err)
1611                goto unlock;
1612
1613        for_each_net(net) {
1614                for_each_netdev(net, dev) {
1615                        if (dev->flags & IFF_UP) {
1616                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1617                                                        dev);
1618                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1619                        }
1620                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1621                }
1622        }
1623unlock:
1624        rtnl_unlock();
1625        return err;
1626}
1627EXPORT_SYMBOL(unregister_netdevice_notifier);
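
/* Example (added for illustration, not part of the original source): a
 * minimal sketch of how a module might use the two notifier entry points
 * above.  The function and variable names are hypothetical; registration
 * and up events for already-present devices are replayed to the new
 * notifier at registration time, as described above.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_init(void)
{
	return register_netdevice_notifier(&example_netdev_nb);
}

static void __exit example_exit(void)
{
	unregister_netdevice_notifier(&example_netdev_nb);
}
#endif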
1628
1629/**
1630 *      call_netdevice_notifiers_info - call all network notifier blocks
1631 *      @val: value passed unmodified to notifier function
1632 *      @dev: net_device pointer passed unmodified to notifier function
1633 *      @info: notifier information data
1634 *
1635 *      Call all network notifier blocks.  Parameters and return value
1636 *      are as for raw_notifier_call_chain().
1637 */
1638
1639static int call_netdevice_notifiers_info(unsigned long val,
1640                                         struct net_device *dev,
1641                                         struct netdev_notifier_info *info)
1642{
1643        ASSERT_RTNL();
1644        netdev_notifier_info_init(info, dev);
1645        return raw_notifier_call_chain(&netdev_chain, val, info);
1646}
1647
1648/**
1649 *      call_netdevice_notifiers - call all network notifier blocks
1650 *      @val: value passed unmodified to notifier function
1651 *      @dev: net_device pointer passed unmodified to notifier function
1652 *
1653 *      Call all network notifier blocks.  Parameters and return value
1654 *      are as for raw_notifier_call_chain().
1655 */
1656
1657int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1658{
1659        struct netdev_notifier_info info;
1660
1661        return call_netdevice_notifiers_info(val, dev, &info);
1662}
1663EXPORT_SYMBOL(call_netdevice_notifiers);
1664
1665#ifdef CONFIG_NET_INGRESS
1666static struct static_key ingress_needed __read_mostly;
1667
1668void net_inc_ingress_queue(void)
1669{
1670        static_key_slow_inc(&ingress_needed);
1671}
1672EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1673
1674void net_dec_ingress_queue(void)
1675{
1676        static_key_slow_dec(&ingress_needed);
1677}
1678EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1679#endif
1680
1681#ifdef CONFIG_NET_EGRESS
1682static struct static_key egress_needed __read_mostly;
1683
1684void net_inc_egress_queue(void)
1685{
1686        static_key_slow_inc(&egress_needed);
1687}
1688EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1689
1690void net_dec_egress_queue(void)
1691{
1692        static_key_slow_dec(&egress_needed);
1693}
1694EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1695#endif
1696
1697static struct static_key netstamp_needed __read_mostly;
1698#ifdef HAVE_JUMP_LABEL
1699/* We are not allowed to call static_key_slow_dec() from irq context.
1700 * If net_disable_timestamp() is called from irq context, defer the
1701 * static_key_slow_dec() calls.
1702 */
1703static atomic_t netstamp_needed_deferred;
1704#endif
1705
1706void net_enable_timestamp(void)
1707{
1708#ifdef HAVE_JUMP_LABEL
1709        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1710
1711        if (deferred) {
1712                while (--deferred)
1713                        static_key_slow_dec(&netstamp_needed);
1714                return;
1715        }
1716#endif
1717        static_key_slow_inc(&netstamp_needed);
1718}
1719EXPORT_SYMBOL(net_enable_timestamp);
1720
1721void net_disable_timestamp(void)
1722{
1723#ifdef HAVE_JUMP_LABEL
1724        if (in_interrupt()) {
1725                atomic_inc(&netstamp_needed_deferred);
1726                return;
1727        }
1728#endif
1729        static_key_slow_dec(&netstamp_needed);
1730}
1731EXPORT_SYMBOL(net_disable_timestamp);
1732
1733static inline void net_timestamp_set(struct sk_buff *skb)
1734{
1735        skb->tstamp.tv64 = 0;
1736        if (static_key_false(&netstamp_needed))
1737                __net_timestamp(skb);
1738}
1739
1740#define net_timestamp_check(COND, SKB)                  \
1741        if (static_key_false(&netstamp_needed)) {               \
1742                if ((COND) && !(SKB)->tstamp.tv64)      \
1743                        __net_timestamp(SKB);           \
1744        }                                               \
1745
1746bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1747{
1748        unsigned int len;
1749
1750        if (!(dev->flags & IFF_UP))
1751                return false;
1752
1753        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1754        if (skb->len <= len)
1755                return true;
1756
1757        /* if TSO is enabled, we don't care about the length as the packet
1758         * could be forwarded without being segmented before
1759         */
1760        if (skb_is_gso(skb))
1761                return true;
1762
1763        return false;
1764}
1765EXPORT_SYMBOL_GPL(is_skb_forwardable);
1766
1767int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1768{
1769        if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1770            unlikely(!is_skb_forwardable(dev, skb))) {
1771                atomic_long_inc(&dev->rx_dropped);
1772                kfree_skb(skb);
1773                return NET_RX_DROP;
1774        }
1775
1776        skb_scrub_packet(skb, true);
1777        skb->priority = 0;
1778        skb->protocol = eth_type_trans(skb, dev);
1779        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1780
1781        return 0;
1782}
1783EXPORT_SYMBOL_GPL(__dev_forward_skb);
1784
1785/**
1786 * dev_forward_skb - loopback an skb to another netif
1787 *
1788 * @dev: destination network device
1789 * @skb: buffer to forward
1790 *
1791 * return values:
1792 *      NET_RX_SUCCESS  (no congestion)
1793 *      NET_RX_DROP     (packet was dropped, but freed)
1794 *
1795 * dev_forward_skb can be used for injecting an skb from the
1796 * start_xmit function of one device into the receive queue
1797 * of another device.
1798 *
1799 * The receiving device may be in another namespace, so
1800 * we have to clear all information in the skb that could
1801 * impact namespace isolation.
1802 */
1803int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1804{
1805        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1806}
1807EXPORT_SYMBOL_GPL(dev_forward_skb);
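
/* Example (added for illustration, not part of the original source): a
 * sketch of how a pair device, veth-style, might hand frames from its
 * ndo_start_xmit to its peer.  example_get_peer() is a hypothetical helper.
 * Note the skb is always consumed, even on NET_RX_DROP, so no extra
 * kfree_skb() is needed here.
 */
#if 0	/* illustrative sketch, not compiled */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */

	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif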
1808
1809static inline int deliver_skb(struct sk_buff *skb,
1810                              struct packet_type *pt_prev,
1811                              struct net_device *orig_dev)
1812{
1813        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1814                return -ENOMEM;
1815        atomic_inc(&skb->users);
1816        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1817}
1818
1819static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1820                                          struct packet_type **pt,
1821                                          struct net_device *orig_dev,
1822                                          __be16 type,
1823                                          struct list_head *ptype_list)
1824{
1825        struct packet_type *ptype, *pt_prev = *pt;
1826
1827        list_for_each_entry_rcu(ptype, ptype_list, list) {
1828                if (ptype->type != type)
1829                        continue;
1830                if (pt_prev)
1831                        deliver_skb(skb, pt_prev, orig_dev);
1832                pt_prev = ptype;
1833        }
1834        *pt = pt_prev;
1835}
1836
1837static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1838{
1839        if (!ptype->af_packet_priv || !skb->sk)
1840                return false;
1841
1842        if (ptype->id_match)
1843                return ptype->id_match(ptype, skb->sk);
1844        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1845                return true;
1846
1847        return false;
1848}
1849
1850/*
1851 *      Support routine. Sends outgoing frames to any network
1852 *      taps currently in use.
1853 */
1854
1855void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1856{
1857        struct packet_type *ptype;
1858        struct sk_buff *skb2 = NULL;
1859        struct packet_type *pt_prev = NULL;
1860        struct list_head *ptype_list = &ptype_all;
1861
1862        rcu_read_lock();
1863again:
1864        list_for_each_entry_rcu(ptype, ptype_list, list) {
1865                /* Never send packets back to the socket
1866                 * they originated from - MvS (miquels@drinkel.ow.org)
1867                 */
1868                if (skb_loop_sk(ptype, skb))
1869                        continue;
1870
1871                if (pt_prev) {
1872                        deliver_skb(skb2, pt_prev, skb->dev);
1873                        pt_prev = ptype;
1874                        continue;
1875                }
1876
1877                /* need to clone skb, done only once */
1878                skb2 = skb_clone(skb, GFP_ATOMIC);
1879                if (!skb2)
1880                        goto out_unlock;
1881
1882                net_timestamp_set(skb2);
1883
1884                /* The network header should be correctly set by the
1885                 * sender, so the check below is just protection against
1886                 * buggy protocols.
1887                 */
1888                skb_reset_mac_header(skb2);
1889
1890                if (skb_network_header(skb2) < skb2->data ||
1891                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1892                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1893                                             ntohs(skb2->protocol),
1894                                             dev->name);
1895                        skb_reset_network_header(skb2);
1896                }
1897
1898                skb2->transport_header = skb2->network_header;
1899                skb2->pkt_type = PACKET_OUTGOING;
1900                pt_prev = ptype;
1901        }
1902
1903        if (ptype_list == &ptype_all) {
1904                ptype_list = &dev->ptype_all;
1905                goto again;
1906        }
1907out_unlock:
1908        if (pt_prev)
1909                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1910        rcu_read_unlock();
1911}
1912EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
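
/* Example (added for illustration, not part of the original source): the
 * taps serviced above are packet_type entries registered for ETH_P_ALL via
 * dev_add_pack().  A minimal, hypothetical tap that simply discards the
 * cloned frames might look like this.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* Clones delivered from dev_queue_xmit_nit() have
	 * skb->pkt_type == PACKET_OUTGOING.
	 */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_tap __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),
	.func	= example_tap_rcv,
};

/* dev_add_pack(&example_tap); ... dev_remove_pack(&example_tap); */
#endif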
1913
1914/**
1915 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1916 * @dev: Network device
1917 * @txq: number of queues available
1918 *
1919 * If real_num_tx_queues is changed the tc mappings may no longer be
1920 * valid. To resolve this verify the tc mapping remains valid and if
1921 * not NULL the mapping. With no priorities mapping to this
1922 * offset/count pair it will no longer be used. In the worst case TC0
1923 * is invalid nothing can be done so disable priority mappings. If is
1924 * expected that drivers will fix this mapping if they can before
1925 * calling netif_set_real_num_tx_queues.
1926 */
1927static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1928{
1929        int i;
1930        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1931
1932        /* If TC0 is invalidated disable TC mapping */
1933        if (tc->offset + tc->count > txq) {
1934                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1935                dev->num_tc = 0;
1936                return;
1937        }
1938
1939        /* Invalidated prio to tc mappings set to TC0 */
1940        for (i = 1; i < TC_BITMASK + 1; i++) {
1941                int q = netdev_get_prio_tc_map(dev, i);
1942
1943                tc = &dev->tc_to_txq[q];
1944                if (tc->offset + tc->count > txq) {
1945                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1946                                i, q);
1947                        netdev_set_prio_tc_map(dev, i, 0);
1948                }
1949        }
1950}
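
/* Example (added for illustration, not part of the original source): a
 * sketch of how a driver might have set up the tc_to_txq and prio_tc_map
 * tables that netif_setup_tc() revalidates.  The queue split and the
 * priorities chosen below are hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static void example_setup_tc(struct net_device *dev)
{
	/* Hypothetical: 8 TX queues split across two traffic classes. */
	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0 -> queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1 -> queues 4-7 */
	netdev_set_prio_tc_map(dev, 0, 0);	/* priority 0 -> TC0 */
	netdev_set_prio_tc_map(dev, 5, 1);	/* priority 5 -> TC1 */
}
#endif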
1951
1952#ifdef CONFIG_XPS
1953static DEFINE_MUTEX(xps_map_mutex);
1954#define xmap_dereference(P)             \
1955        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1956
1957static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1958                                        int cpu, u16 index)
1959{
1960        struct xps_map *map = NULL;
1961        int pos;
1962
1963        if (dev_maps)
1964                map = xmap_dereference(dev_maps->cpu_map[cpu]);
1965
1966        for (pos = 0; map && pos < map->len; pos++) {
1967                if (map->queues[pos] == index) {
1968                        if (map->len > 1) {
1969                                map->queues[pos] = map->queues[--map->len];
1970                        } else {
1971                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1972                                kfree_rcu(map, rcu);
1973                                map = NULL;
1974                        }
1975                        break;
1976                }
1977        }
1978
1979        return map;
1980}
1981
1982static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1983{
1984        struct xps_dev_maps *dev_maps;
1985        int cpu, i;
1986        bool active = false;
1987
1988        mutex_lock(&xps_map_mutex);
1989        dev_maps = xmap_dereference(dev->xps_maps);
1990
1991        if (!dev_maps)
1992                goto out_no_maps;
1993
1994        for_each_possible_cpu(cpu) {
1995                for (i = index; i < dev->num_tx_queues; i++) {
1996                        if (!remove_xps_queue(dev_maps, cpu, i))
1997                                break;
1998                }
1999                if (i == dev->num_tx_queues)
2000                        active = true;
2001        }
2002
2003        if (!active) {
2004                RCU_INIT_POINTER(dev->xps_maps, NULL);
2005                kfree_rcu(dev_maps, rcu);
2006        }
2007
2008        for (i = index; i < dev->num_tx_queues; i++)
2009                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2010                                             NUMA_NO_NODE);
2011
2012out_no_maps:
2013        mutex_unlock(&xps_map_mutex);
2014}
2015
2016static struct xps_map *expand_xps_map(struct xps_map *map,
2017                                      int cpu, u16 index)
2018{
2019        struct xps_map *new_map;
2020        int alloc_len = XPS_MIN_MAP_ALLOC;
2021        int i, pos;
2022
2023        for (pos = 0; map && pos < map->len; pos++) {
2024                if (map->queues[pos] != index)
2025                        continue;
2026                return map;
2027        }
2028
2029        /* Need to add queue to this CPU's existing map */
2030        if (map) {
2031                if (pos < map->alloc_len)
2032                        return map;
2033
2034                alloc_len = map->alloc_len * 2;
2035        }
2036
2037        /* Need to allocate a new map to store the queue on this CPU */
2038        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2039                               cpu_to_node(cpu));
2040        if (!new_map)
2041                return NULL;
2042
2043        for (i = 0; i < pos; i++)
2044                new_map->queues[i] = map->queues[i];
2045        new_map->alloc_len = alloc_len;
2046        new_map->len = pos;
2047
2048        return new_map;
2049}
2050
2051int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2052                        u16 index)
2053{
2054        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2055        struct xps_map *map, *new_map;
2056        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2057        int cpu, numa_node_id = -2;
2058        bool active = false;
2059
2060        mutex_lock(&xps_map_mutex);
2061
2062        dev_maps = xmap_dereference(dev->xps_maps);
2063
2064        /* allocate memory for queue storage */
2065        for_each_online_cpu(cpu) {
2066                if (!cpumask_test_cpu(cpu, mask))
2067                        continue;
2068
2069                if (!new_dev_maps)
2070                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2071                if (!new_dev_maps) {
2072                        mutex_unlock(&xps_map_mutex);
2073                        return -ENOMEM;
2074                }
2075
2076                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2077                                 NULL;
2078
2079                map = expand_xps_map(map, cpu, index);
2080                if (!map)
2081                        goto error;
2082
2083                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2084        }
2085
2086        if (!new_dev_maps)
2087                goto out_no_new_maps;
2088
2089        for_each_possible_cpu(cpu) {
2090                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2091                        /* add queue to CPU maps */
2092                        int pos = 0;
2093
2094                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2095                        while ((pos < map->len) && (map->queues[pos] != index))
2096                                pos++;
2097
2098                        if (pos == map->len)
2099                                map->queues[map->len++] = index;
2100#ifdef CONFIG_NUMA
2101                        if (numa_node_id == -2)
2102                                numa_node_id = cpu_to_node(cpu);
2103                        else if (numa_node_id != cpu_to_node(cpu))
2104                                numa_node_id = -1;
2105#endif
2106                } else if (dev_maps) {
2107                        /* fill in the new device map from the old device map */
2108                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2109                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2110                }
2111
2112        }
2113
2114        rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2115
2116        /* Cleanup old maps */
2117        if (dev_maps) {
2118                for_each_possible_cpu(cpu) {
2119                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2120                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
2121                        if (map && map != new_map)
2122                                kfree_rcu(map, rcu);
2123                }
2124
2125                kfree_rcu(dev_maps, rcu);
2126        }
2127
2128        dev_maps = new_dev_maps;
2129        active = true;
2130
2131out_no_new_maps:
2132        /* update Tx queue numa node */
2133        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2134                                     (numa_node_id >= 0) ? numa_node_id :
2135                                     NUMA_NO_NODE);
2136
2137        if (!dev_maps)
2138                goto out_no_maps;
2139
2140        /* removes queue from unused CPUs */
2141        for_each_possible_cpu(cpu) {
2142                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2143                        continue;
2144
2145                if (remove_xps_queue(dev_maps, cpu, index))
2146                        active = true;
2147        }
2148
2149        /* free map if not active */
2150        if (!active) {
2151                RCU_INIT_POINTER(dev->xps_maps, NULL);
2152                kfree_rcu(dev_maps, rcu);
2153        }
2154
2155out_no_maps:
2156        mutex_unlock(&xps_map_mutex);
2157
2158        return 0;
2159error:
2160        /* remove any maps that we added */
2161        for_each_possible_cpu(cpu) {
2162                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2163                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2164                                 NULL;
2165                if (new_map && new_map != map)
2166                        kfree(new_map);
2167        }
2168
2169        mutex_unlock(&xps_map_mutex);
2170
2171        kfree(new_dev_maps);
2172        return -ENOMEM;
2173}
2174EXPORT_SYMBOL(netif_set_xps_queue);
2175
2176#endif
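
/* Example (added for illustration, not part of the original source): a
 * sketch of a multiqueue driver steering transmit queue i to CPU i with
 * netif_set_xps_queue(), typically at open time.  The one-queue-per-CPU
 * layout is hypothetical and assumes the queue count does not exceed the
 * number of CPUs.
 */
#if 0	/* illustrative sketch, not compiled */
static void example_set_xps(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++)
		netif_set_xps_queue(dev, cpumask_of(i), i);
}
#endif
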
2177/*
2178 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2179 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2180 */
2181int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2182{
2183        int rc;
2184
2185        if (txq < 1 || txq > dev->num_tx_queues)
2186                return -EINVAL;
2187
2188        if (dev->reg_state == NETREG_REGISTERED ||
2189            dev->reg_state == NETREG_UNREGISTERING) {
2190                ASSERT_RTNL();
2191
2192                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2193                                                  txq);
2194                if (rc)
2195                        return rc;
2196
2197                if (dev->num_tc)
2198                        netif_setup_tc(dev, txq);
2199
2200                if (txq < dev->real_num_tx_queues) {
2201                        qdisc_reset_all_tx_gt(dev, txq);
2202#ifdef CONFIG_XPS
2203                        netif_reset_xps_queues_gt(dev, txq);
2204#endif
2205                }
2206        }
2207
2208        dev->real_num_tx_queues = txq;
2209        return 0;
2210}
2211EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2212
2213#ifdef CONFIG_SYSFS
2214/**
2215 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2216 *      @dev: Network device
2217 *      @rxq: Actual number of RX queues
2218 *
2219 *      This must be called either with the rtnl_lock held or before
2220 *      registration of the net device.  Returns 0 on success, or a
2221 *      negative error code.  If called before registration, it always
2222 *      succeeds.
2223 */
2224int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2225{
2226        int rc;
2227
2228        if (rxq < 1 || rxq > dev->num_rx_queues)
2229                return -EINVAL;
2230
2231        if (dev->reg_state == NETREG_REGISTERED) {
2232                ASSERT_RTNL();
2233
2234                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2235                                                  rxq);
2236                if (rc)
2237                        return rc;
2238        }
2239
2240        dev->real_num_rx_queues = rxq;
2241        return 0;
2242}
2243EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2244#endif
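
/* Example (added for illustration, not part of the original source): a
 * sketch of a driver resizing its active queue counts, e.g. from an
 * ethtool set_channels handler.  It must run under the RTNL; the requested
 * counts are hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_set_channels(struct net_device *dev,
				unsigned int txq, unsigned int rxq)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, rxq);
}
#endif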
2245
2246/**
2247 * netif_get_num_default_rss_queues - default number of RSS queues
2248 *
2249 * This routine should set an upper limit on the number of RSS queues
2250 * used by default by multiqueue devices.
2251 */
2252int netif_get_num_default_rss_queues(void)
2253{
2254        return is_kdump_kernel() ?
2255                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2256}
2257EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2258
2259static void __netif_reschedule(struct Qdisc *q)
2260{
2261        struct softnet_data *sd;
2262        unsigned long flags;
2263
2264        local_irq_save(flags);
2265        sd = this_cpu_ptr(&softnet_data);
2266        q->next_sched = NULL;
2267        *sd->output_queue_tailp = q;
2268        sd->output_queue_tailp = &q->next_sched;
2269        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2270        local_irq_restore(flags);
2271}
2272
2273void __netif_schedule(struct Qdisc *q)
2274{
2275        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2276                __netif_reschedule(q);
2277}
2278EXPORT_SYMBOL(__netif_schedule);
2279
2280struct dev_kfree_skb_cb {
2281        enum skb_free_reason reason;
2282};
2283
2284static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2285{
2286        return (struct dev_kfree_skb_cb *)skb->cb;
2287}
2288
2289void netif_schedule_queue(struct netdev_queue *txq)
2290{
2291        rcu_read_lock();
2292        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2293                struct Qdisc *q = rcu_dereference(txq->qdisc);
2294
2295                __netif_schedule(q);
2296        }
2297        rcu_read_unlock();
2298}
2299EXPORT_SYMBOL(netif_schedule_queue);
2300
2301/**
2302 *      netif_wake_subqueue - allow sending packets on subqueue
2303 *      @dev: network device
2304 *      @queue_index: sub queue index
2305 *
2306 * Resume individual transmit queue of a device with multiple transmit queues.
2307 */
2308void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2309{
2310        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2311
2312        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2313                struct Qdisc *q;
2314
2315                rcu_read_lock();
2316                q = rcu_dereference(txq->qdisc);
2317                __netif_schedule(q);
2318                rcu_read_unlock();
2319        }
2320}
2321EXPORT_SYMBOL(netif_wake_subqueue);
2322
2323void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2324{
2325        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2326                struct Qdisc *q;
2327
2328                rcu_read_lock();
2329                q = rcu_dereference(dev_queue->qdisc);
2330                __netif_schedule(q);
2331                rcu_read_unlock();
2332        }
2333}
2334EXPORT_SYMBOL(netif_tx_wake_queue);
2335
2336void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2337{
2338        unsigned long flags;
2339
2340        if (likely(atomic_read(&skb->users) == 1)) {
2341                smp_rmb();
2342                atomic_set(&skb->users, 0);
2343        } else if (likely(!atomic_dec_and_test(&skb->users))) {
2344                return;
2345        }
2346        get_kfree_skb_cb(skb)->reason = reason;
2347        local_irq_save(flags);
2348        skb->next = __this_cpu_read(softnet_data.completion_queue);
2349        __this_cpu_write(softnet_data.completion_queue, skb);
2350        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2351        local_irq_restore(flags);
2352}
2353EXPORT_SYMBOL(__dev_kfree_skb_irq);
2354
2355void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2356{
2357        if (in_irq() || irqs_disabled())
2358                __dev_kfree_skb_irq(skb, reason);
2359        else
2360                dev_kfree_skb(skb);
2361}
2362EXPORT_SYMBOL(__dev_kfree_skb_any);
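
/* Example (added for illustration, not part of the original source):
 * drivers normally reach the helper above through the dev_kfree_skb_any()
 * and dev_consume_skb_any() wrappers, which select the DROPPED or CONSUMED
 * reason.  A hypothetical TX completion path that may run in hard-IRQ
 * context could look like this.
 */
#if 0	/* illustrative sketch, not compiled */
static void example_tx_complete(struct sk_buff *skb, bool success)
{
	if (success)
		dev_consume_skb_any(skb);	/* normal completion */
	else
		dev_kfree_skb_any(skb);		/* drop, visible to drop monitors */
}
#endif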
2363
2364
2365/**
2366 * netif_device_detach - mark device as removed
2367 * @dev: network device
2368 *
2369 * Mark device as removed from the system and therefore no longer available.
2370 */
2371void netif_device_detach(struct net_device *dev)
2372{
2373        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2374            netif_running(dev)) {
2375                netif_tx_stop_all_queues(dev);
2376        }
2377}
2378EXPORT_SYMBOL(netif_device_detach);
2379
2380/**
2381 * netif_device_attach - mark device as attached
2382 * @dev: network device
2383 *
2384 * Mark device as attached to the system and restart if needed.
2385 */
2386void netif_device_attach(struct net_device *dev)
2387{
2388        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2389            netif_running(dev)) {
2390                netif_tx_wake_all_queues(dev);
2391                __netdev_watchdog_up(dev);
2392        }
2393}
2394EXPORT_SYMBOL(netif_device_attach);
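
/* Example (added for illustration, not part of the original source): the
 * usual detach/attach pairing in a driver's hypothetical suspend and
 * resume callbacks.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);
	/* ... quiesce DMA, save device state, power down ... */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* ... power up, restore device state, re-enable DMA ... */
	netif_device_attach(dev);
	return 0;
}
#endif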
2395
2396/*
2397 * Returns a Tx hash based on the given packet descriptor and the number
2398 * of Tx queues to be used as a distribution range.
2399 */
2400u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2401                  unsigned int num_tx_queues)
2402{
2403        u32 hash;
2404        u16 qoffset = 0;
2405        u16 qcount = num_tx_queues;
2406
2407        if (skb_rx_queue_recorded(skb)) {
2408                hash = skb_get_rx_queue(skb);
2409                while (unlikely(hash >= num_tx_queues))
2410                        hash -= num_tx_queues;
2411                return hash;
2412        }
2413
2414        if (dev->num_tc) {
2415                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2416                qoffset = dev->tc_to_txq[tc].offset;
2417                qcount = dev->tc_to_txq[tc].count;
2418        }
2419
2420        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2421}
2422EXPORT_SYMBOL(__skb_tx_hash);
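
/*
 * Note (added for clarity): reciprocal_scale(val, qcount) maps a 32-bit
 * value into [0, qcount) as (u32)(((u64)val * qcount) >> 32), avoiding a
 * modulo.  For example, with skb_get_hash() == 0xC0000000 and qcount == 8,
 * (0xC0000000ULL * 8) >> 32 == 6, so the skb is hashed to queue qoffset + 6.
 */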
2423
2424static void skb_warn_bad_offload(const struct sk_buff *skb)
2425{
2426        static const netdev_features_t null_features;
2427        struct net_device *dev = skb->dev;
2428        const char *name = "";
2429
2430        if (!net_ratelimit())
2431                return;
2432
2433        if (dev) {
2434                if (dev->dev.parent)
2435                        name = dev_driver_string(dev->dev.parent);
2436                else
2437                        name = netdev_name(dev);
2438        }
2439        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2440             "gso_type=%d ip_summed=%d\n",
2441             name, dev ? &dev->features : &null_features,
2442             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2443             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2444             skb_shinfo(skb)->gso_type, skb->ip_summed);
2445}
2446
2447/*
2448 * Invalidate hardware checksum when packet is to be mangled, and
2449 * complete checksum manually on outgoing path.
2450 */
2451int skb_checksum_help(struct sk_buff *skb)
2452{
2453        __wsum csum;
2454        int ret = 0, offset;
2455
2456        if (skb->ip_summed == CHECKSUM_COMPLETE)
2457                goto out_set_summed;
2458
2459        if (unlikely(skb_shinfo(skb)->gso_size)) {
2460                skb_warn_bad_offload(skb);
2461                return -EINVAL;
2462        }
2463
2464        /* Before computing a checksum, we should make sure no frag could
2465         * be modified by an external entity: the checksum could be wrong.
2466         */
2467        if (skb_has_shared_frag(skb)) {
2468                ret = __skb_linearize(skb);
2469                if (ret)
2470                        goto out;
2471        }
2472
2473        offset = skb_checksum_start_offset(skb);
2474        BUG_ON(offset >= skb_headlen(skb));
2475        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2476
2477        offset += skb->csum_offset;
2478        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2479
2480        if (skb_cloned(skb) &&
2481            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2482                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2483                if (ret)
2484                        goto out;
2485        }
2486
2487        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2488out_set_summed:
2489        skb->ip_summed = CHECKSUM_NONE;
2490out:
2491        return ret;
2492}
2493EXPORT_SYMBOL(skb_checksum_help);
2494
2495/* __skb_csum_offload_chk - Driver helper function to determine if a device
2496 * with limited checksum offload capabilities is able to offload the checksum
2497 * for a given packet.
2498 *
2499 * Arguments:
2500 *   skb - sk_buff for the packet in question
2501 *   spec - contains the description of what the device can offload
2502 *   csum_encapped - returns true if the checksum being offloaded is
2503 *            encapsulated. That is, it is the checksum for the transport
2504 *            header in the inner headers.
2505 *   checksum_help - when set indicates that helper function should
2506 *            call skb_checksum_help if offload checks fail
2507 *
2508 * Returns:
2509 *   true: Packet has passed the checksum checks and should be offloadable to
2510 *         the device (a driver may still need to check for additional
2511 *         restrictions of its device)
2512 *   false: Checksum is not offloadable. If checksum_help was set then
2513 *         skb_checksum_help was called to resolve the checksum for non-GSO
2514 *         packets whose IP protocol is not SCTP
2515 */
2516bool __skb_csum_offload_chk(struct sk_buff *skb,
2517                            const struct skb_csum_offl_spec *spec,
2518                            bool *csum_encapped,
2519                            bool csum_help)
2520{
2521        struct iphdr *iph;
2522        struct ipv6hdr *ipv6;
2523        void *nhdr;
2524        int protocol;
2525        u8 ip_proto;
2526
2527        if (skb->protocol == htons(ETH_P_8021Q) ||
2528            skb->protocol == htons(ETH_P_8021AD)) {
2529                if (!spec->vlan_okay)
2530                        goto need_help;
2531        }
2532
2533        /* We check whether the checksum refers to a transport layer checksum in
2534         * the outermost header or an encapsulated transport layer checksum that
2535         * corresponds to the inner headers of the skb. If the checksum is for
2536         * something else in the packet we need help.
2537         */
2538        if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2539                /* Non-encapsulated checksum */
2540                protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2541                nhdr = skb_network_header(skb);
2542                *csum_encapped = false;
2543                if (spec->no_not_encapped)
2544                        goto need_help;
2545        } else if (skb->encapsulation && spec->encap_okay &&
2546                   skb_checksum_start_offset(skb) ==
2547                   skb_inner_transport_offset(skb)) {
2548                /* Encapsulated checksum */
2549                *csum_encapped = true;
2550                switch (skb->inner_protocol_type) {
2551                case ENCAP_TYPE_ETHER:
2552                        protocol = eproto_to_ipproto(skb->inner_protocol);
2553                        break;
2554                case ENCAP_TYPE_IPPROTO:
2555                        protocol = skb->inner_protocol;
2556                        break;
2557                }
2558                nhdr = skb_inner_network_header(skb);
2559        } else {
2560                goto need_help;
2561        }
2562
2563        switch (protocol) {
2564        case IPPROTO_IP:
2565                if (!spec->ipv4_okay)
2566                        goto need_help;
2567                iph = nhdr;
2568                ip_proto = iph->protocol;
2569                if (iph->ihl != 5 && !spec->ip_options_okay)
2570                        goto need_help;
2571                break;
2572        case IPPROTO_IPV6:
2573                if (!spec->ipv6_okay)
2574                        goto need_help;
2575                if (spec->no_encapped_ipv6 && *csum_encapped)
2576                        goto need_help;
2577                ipv6 = nhdr;
2578                nhdr += sizeof(*ipv6);
2579                ip_proto = ipv6->nexthdr;
2580                break;
2581        default:
2582                goto need_help;
2583        }
2584
2585ip_proto_again:
2586        switch (ip_proto) {
2587        case IPPROTO_TCP:
2588                if (!spec->tcp_okay ||
2589                    skb->csum_offset != offsetof(struct tcphdr, check))
2590                        goto need_help;
2591                break;
2592        case IPPROTO_UDP:
2593                if (!spec->udp_okay ||
2594                    skb->csum_offset != offsetof(struct udphdr, check))
2595                        goto need_help;
2596                break;
2597        case IPPROTO_SCTP:
2598                if (!spec->sctp_okay ||
2599                    skb->csum_offset != offsetof(struct sctphdr, checksum))
2600                        goto cant_help;
2601                break;
2602        case NEXTHDR_HOP:
2603        case NEXTHDR_ROUTING:
2604        case NEXTHDR_DEST: {
2605                u8 *opthdr = nhdr;
2606
2607                if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2608                        goto need_help;
2609
2610                ip_proto = opthdr[0];
2611                nhdr += (opthdr[1] + 1) << 3;
2612
2613                goto ip_proto_again;
2614        }
2615        default:
2616                goto need_help;
2617        }
2618
2619        /* Passed the tests for offloading checksum */
2620        return true;
2621
2622need_help:
2623        if (csum_help && !skb_shinfo(skb)->gso_size)
2624                skb_checksum_help(skb);
2625cant_help:
2626        return false;
2627}
2628EXPORT_SYMBOL(__skb_csum_offload_chk);
2629
2630__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2631{
2632        __be16 type = skb->protocol;
2633
2634        /* Tunnel gso handlers can set protocol to ethernet. */
2635        if (type == htons(ETH_P_TEB)) {
2636                struct ethhdr *eth;
2637
2638                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2639                        return 0;
2640
2641                eth = (struct ethhdr *)skb_mac_header(skb);
2642                type = eth->h_proto;
2643        }
2644
2645        return __vlan_get_protocol(skb, type, depth);
2646}
2647
2648/**
2649 *      skb_mac_gso_segment - mac layer segmentation handler.
2650 *      @skb: buffer to segment
2651 *      @features: features for the output path (see dev->features)
2652 */
2653struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2654                                    netdev_features_t features)
2655{
2656        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2657        struct packet_offload *ptype;
2658        int vlan_depth = skb->mac_len;
2659        __be16 type = skb_network_protocol(skb, &vlan_depth);
2660
2661        if (unlikely(!type))
2662                return ERR_PTR(-EINVAL);
2663
2664        __skb_pull(skb, vlan_depth);
2665
2666        rcu_read_lock();
2667        list_for_each_entry_rcu(ptype, &offload_base, list) {
2668                if (ptype->type == type && ptype->callbacks.gso_segment) {
2669                        segs = ptype->callbacks.gso_segment(skb, features);
2670                        break;
2671                }
2672        }
2673        rcu_read_unlock();
2674
2675        __skb_push(skb, skb->data - skb_mac_header(skb));
2676
2677        return segs;
2678}
2679EXPORT_SYMBOL(skb_mac_gso_segment);
2680
2681
2682/* openvswitch calls this on rx path, so we need a different check.
2683 */
2684static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2685{
2686        if (tx_path)
2687                return skb->ip_summed != CHECKSUM_PARTIAL;
2688        else
2689                return skb->ip_summed == CHECKSUM_NONE;
2690}
2691
2692/**
2693 *      __skb_gso_segment - Perform segmentation on skb.
2694 *      @skb: buffer to segment
2695 *      @features: features for the output path (see dev->features)
2696 *      @tx_path: whether it is called in TX path
2697 *
2698 *      This function segments the given skb and returns a list of segments.
2699 *
2700 *      It may return NULL if the skb requires no segmentation.  This is
2701 *      only possible when GSO is used for verifying header integrity.
2702 *
2703 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2704 */
2705struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2706                                  netdev_features_t features, bool tx_path)
2707{
2708        if (unlikely(skb_needs_check(skb, tx_path))) {
2709                int err;
2710
2711                skb_warn_bad_offload(skb);
2712
2713                err = skb_cow_head(skb, 0);
2714                if (err < 0)
2715                        return ERR_PTR(err);
2716        }
2717
2718        /* Only report GSO partial support if it will enable us to
2719         * support segmentation on this frame without needing additional
2720         * work.
2721         */
2722        if (features & NETIF_F_GSO_PARTIAL) {
2723                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2724                struct net_device *dev = skb->dev;
2725
2726                partial_features |= dev->features & dev->gso_partial_features;
2727                if (!skb_gso_ok(skb, features | partial_features))
2728                        features &= ~NETIF_F_GSO_PARTIAL;
2729        }
2730
2731        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2732                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2733
2734        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2735        SKB_GSO_CB(skb)->encap_level = 0;
2736
2737        skb_reset_mac_header(skb);
2738        skb_reset_mac_len(skb);
2739
2740        return skb_mac_gso_segment(skb, features);
2741}
2742EXPORT_SYMBOL(__skb_gso_segment);
2743
2744/* Take action when hardware reception checksum errors are detected. */
2745#ifdef CONFIG_BUG
2746void netdev_rx_csum_fault(struct net_device *dev)
2747{
2748        if (net_ratelimit()) {
2749                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2750                dump_stack();
2751        }
2752}
2753EXPORT_SYMBOL(netdev_rx_csum_fault);
2754#endif
2755
2756/* Actually, we should eliminate this check as soon as we know that:
2757 * 1. An IOMMU is present and allows mapping all the memory.
2758 * 2. No high memory really exists on this machine.
2759 */
2760
2761static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2762{
2763#ifdef CONFIG_HIGHMEM
2764        int i;
2765        if (!(dev->features & NETIF_F_HIGHDMA)) {
2766                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2767                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2768                        if (PageHighMem(skb_frag_page(frag)))
2769                                return 1;
2770                }
2771        }
2772
2773        if (PCI_DMA_BUS_IS_PHYS) {
2774                struct device *pdev = dev->dev.parent;
2775
2776                if (!pdev)
2777                        return 0;
2778                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2779                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2780                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2781                        if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2782                                return 1;
2783                }
2784        }
2785#endif
2786        return 0;
2787}
2788
2789/* If MPLS offload request, verify we are testing hardware MPLS features
2790 * instead of standard features for the netdev.
2791 */
2792#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2793static netdev_features_t net_mpls_features(struct sk_buff *skb,
2794                                           netdev_features_t features,
2795                                           __be16 type)
2796{
2797        if (eth_p_mpls(type))
2798                features &= skb->dev->mpls_features;
2799
2800        return features;
2801}
2802#else
2803static netdev_features_t net_mpls_features(struct sk_buff *skb,
2804                                           netdev_features_t features,
2805                                           __be16 type)
2806{
2807        return features;
2808}
2809#endif
2810
2811static netdev_features_t harmonize_features(struct sk_buff *skb,
2812        netdev_features_t features)
2813{
2814        int tmp;
2815        __be16 type;
2816
2817        type = skb_network_protocol(skb, &tmp);
2818        features = net_mpls_features(skb, features, type);
2819
2820        if (skb->ip_summed != CHECKSUM_NONE &&
2821            !can_checksum_protocol(features, type)) {
2822                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2823        } else if (illegal_highdma(skb->dev, skb)) {
2824                features &= ~NETIF_F_SG;
2825        }
2826
2827        return features;
2828}
2829
2830netdev_features_t passthru_features_check(struct sk_buff *skb,
2831                                          struct net_device *dev,
2832                                          netdev_features_t features)
2833{
2834        return features;
2835}
2836EXPORT_SYMBOL(passthru_features_check);
2837
2838static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2839                                             struct net_device *dev,
2840                                             netdev_features_t features)
2841{
2842        return vlan_features_check(skb, features);
2843}
2844
2845static netdev_features_t gso_features_check(const struct sk_buff *skb,
2846                                            struct net_device *dev,
2847                                            netdev_features_t features)
2848{
2849        u16 gso_segs = skb_shinfo(skb)->gso_segs;
2850
2851        if (gso_segs > dev->gso_max_segs)
2852                return features & ~NETIF_F_GSO_MASK;
2853
2854        /* Support for GSO partial features requires software
2855         * intervention before we can actually process the packets
2856         * so we need to strip support for any partial features now
2857         * and we can pull them back in after we have partially
2858         * segmented the frame.
2859         */
2860        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2861                features &= ~dev->gso_partial_features;
2862
2863        /* Make sure to clear the IPv4 ID mangling feature if the
2864         * IPv4 header has the potential to be fragmented.
2865         */
2866        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2867                struct iphdr *iph = skb->encapsulation ?
2868                                    inner_ip_hdr(skb) : ip_hdr(skb);
2869
2870                if (!(iph->frag_off & htons(IP_DF)))
2871                        features &= ~NETIF_F_TSO_MANGLEID;
2872        }
2873
2874        return features;
2875}
2876
2877netdev_features_t netif_skb_features(struct sk_buff *skb)
2878{
2879        struct net_device *dev = skb->dev;
2880        netdev_features_t features = dev->features;
2881
2882        if (skb_is_gso(skb))
2883                features = gso_features_check(skb, dev, features);
2884
2885        /* If encapsulation offload request, verify we are testing
2886         * hardware encapsulation features instead of standard
2887         * features for the netdev
2888         */
2889        if (skb->encapsulation)
2890                features &= dev->hw_enc_features;
2891
2892        if (skb_vlan_tagged(skb))
2893                features = netdev_intersect_features(features,
2894                                                     dev->vlan_features |
2895                                                     NETIF_F_HW_VLAN_CTAG_TX |
2896                                                     NETIF_F_HW_VLAN_STAG_TX);
2897
2898        if (dev->netdev_ops->ndo_features_check)
2899                features &= dev->netdev_ops->ndo_features_check(skb, dev,
2900                                                                features);
2901        else
2902                features &= dflt_features_check(skb, dev, features);
2903
2904        return harmonize_features(skb, features);
2905}
2906EXPORT_SYMBOL(netif_skb_features);
2907
2908static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2909                    struct netdev_queue *txq, bool more)
2910{
2911        unsigned int len;
2912        int rc;
2913
2914        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2915                dev_queue_xmit_nit(skb, dev);
2916
2917        len = skb->len;
2918        trace_net_dev_start_xmit(skb, dev);
2919        rc = netdev_start_xmit(skb, dev, txq, more);
2920        trace_net_dev_xmit(skb, rc, dev, len);
2921
2922        return rc;
2923}
2924
2925struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2926                                    struct netdev_queue *txq, int *ret)
2927{
2928        struct sk_buff *skb = first;
2929        int rc = NETDEV_TX_OK;
2930
2931        while (skb) {
2932                struct sk_buff *next = skb->next;
2933
2934                skb->next = NULL;
2935                rc = xmit_one(skb, dev, txq, next != NULL);
2936                if (unlikely(!dev_xmit_complete(rc))) {
2937                        skb->next = next;
2938                        goto out;
2939                }
2940
2941                skb = next;
2942                if (netif_xmit_stopped(txq) && skb) {
2943                        rc = NETDEV_TX_BUSY;
2944                        break;
2945                }
2946        }
2947
2948out:
2949        *ret = rc;
2950        return skb;
2951}
2952
2953static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2954                                          netdev_features_t features)
2955{
2956        if (skb_vlan_tag_present(skb) &&
2957            !vlan_hw_offload_capable(features, skb->vlan_proto))
2958                skb = __vlan_hwaccel_push_inside(skb);
2959        return skb;
2960}
2961
2962static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2963{
2964        netdev_features_t features;
2965
2966        features = netif_skb_features(skb);
2967        skb = validate_xmit_vlan(skb, features);
2968        if (unlikely(!skb))
2969                goto out_null;
2970
2971        if (netif_needs_gso(skb, features)) {
2972                struct sk_buff *segs;
2973
2974                segs = skb_gso_segment(skb, features);
2975                if (IS_ERR(segs)) {
2976                        goto out_kfree_skb;
2977                } else if (segs) {
2978                        consume_skb(skb);
2979                        skb = segs;
2980                }
2981        } else {
2982                if (skb_needs_linearize(skb, features) &&
2983                    __skb_linearize(skb))
2984                        goto out_kfree_skb;
2985
2986                /* If packet is not checksummed and device does not
2987                 * support checksumming for this protocol, complete
2988                 * checksumming here.
2989                 */
2990                if (skb->ip_summed == CHECKSUM_PARTIAL) {
2991                        if (skb->encapsulation)
2992                                skb_set_inner_transport_header(skb,
2993                                                               skb_checksum_start_offset(skb));
2994                        else
2995                                skb_set_transport_header(skb,
2996                                                         skb_checksum_start_offset(skb));
2997                        if (!(features & NETIF_F_CSUM_MASK) &&
2998                            skb_checksum_help(skb))
2999                                goto out_kfree_skb;
3000                }
3001        }
3002
3003        return skb;
3004
3005out_kfree_skb:
3006        kfree_skb(skb);
3007out_null:
3008        atomic_long_inc(&dev->tx_dropped);
3009        return NULL;
3010}
3011
3012struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3013{
3014        struct sk_buff *next, *head = NULL, *tail;
3015
3016        for (; skb != NULL; skb = next) {
3017                next = skb->next;
3018                skb->next = NULL;
3019
3020                /* in case skb won't be segmented, point to itself */
3021                skb->prev = skb;
3022
3023                skb = validate_xmit_skb(skb, dev);
3024                if (!skb)
3025                        continue;
3026
3027                if (!head)
3028                        head = skb;
3029                else
3030                        tail->next = skb;
3031                /* If skb was segmented, skb->prev points to
3032                 * the last segment. If not, it still contains skb.
3033                 */
3034                tail = skb->prev;
3035        }
3036        return head;
3037}
3038
3039static void qdisc_pkt_len_init(struct sk_buff *skb)
3040{
3041        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3042
3043        qdisc_skb_cb(skb)->pkt_len = skb->len;
3044
3045        /* To get a more precise estimation of bytes sent on the wire,
3046         * we add to pkt_len the header size of all segments
3047         */
3048        if (shinfo->gso_size)  {
3049                unsigned int hdr_len;
3050                u16 gso_segs = shinfo->gso_segs;
3051
3052                /* mac layer + network layer */
3053                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3054
3055                /* + transport layer */
3056                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3057                        hdr_len += tcp_hdrlen(skb);
3058                else
3059                        hdr_len += sizeof(struct udphdr);
3060
3061                if (shinfo->gso_type & SKB_GSO_DODGY)
3062                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3063                                                shinfo->gso_size);
3064
3065                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3066        }
3067}
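
/*
 * Worked example (added for clarity, values hypothetical): a TSO skb with
 * skb->len = 65186, hdr_len = 66 (14 Ethernet + 20 IPv4 + 32 TCP) and
 * gso_segs = 45 ends up with pkt_len = 65186 + 44 * 66 = 68090, roughly the
 * number of bytes that will actually reach the wire once segmented.
 */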
3068
3069static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3070                                 struct net_device *dev,
3071                                 struct netdev_queue *txq)
3072{
3073        spinlock_t *root_lock = qdisc_lock(q);
3074        struct sk_buff *to_free = NULL;
3075        bool contended;
3076        int rc;
3077
3078        qdisc_calculate_pkt_len(skb, q);
3079        /*
3080         * Heuristic to force contended enqueues to serialize on a
3081         * separate lock before trying to get the qdisc main lock.
3082         * This permits the qdisc->running owner to get the lock more
3083         * often and dequeue packets faster.
3084         */
3085        contended = qdisc_is_running(q);
3086        if (unlikely(contended))
3087                spin_lock(&q->busylock);
3088
3089        spin_lock(root_lock);
3090        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3091                __qdisc_drop(skb, &to_free);
3092                rc = NET_XMIT_DROP;
3093        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3094                   qdisc_run_begin(q)) {
3095                /*
3096                 * This is a work-conserving queue; there are no old skbs
3097                 * waiting to be sent out; and the qdisc is not running -
3098                 * xmit the skb directly.
3099                 */
3100
3101                qdisc_bstats_update(q, skb);
3102
3103                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3104                        if (unlikely(contended)) {
3105                                spin_unlock(&q->busylock);
3106                                contended = false;
3107                        }
3108                        __qdisc_run(q);
3109                } else
3110                        qdisc_run_end(q);
3111
3112                rc = NET_XMIT_SUCCESS;
3113        } else {
3114                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3115                if (qdisc_run_begin(q)) {
3116                        if (unlikely(contended)) {
3117                                spin_unlock(&q->busylock);
3118                                contended = false;
3119                        }
3120                        __qdisc_run(q);
3121                }
3122        }
3123        spin_unlock(root_lock);
3124        if (unlikely(to_free))
3125                kfree_skb_list(to_free);
3126        if (unlikely(contended))
3127                spin_unlock(&q->busylock);
3128        return rc;
3129}
3130
3131#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3132static void skb_update_prio(struct sk_buff *skb)
3133{
3134        struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3135
3136        if (!skb->priority && skb->sk && map) {
3137                unsigned int prioidx =
3138                        sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3139
3140                if (prioidx < map->priomap_len)
3141                        skb->priority = map->priomap[prioidx];
3142        }
3143}
3144#else
3145#define skb_update_prio(skb)
3146#endif
3147
3148DEFINE_PER_CPU(int, xmit_recursion);
3149EXPORT_SYMBOL(xmit_recursion);
3150
3151/**
3152 *      dev_loopback_xmit - loop back @skb
3153 *      @net: network namespace this loopback is happening in
3154 *      @sk:  sk needed to be a netfilter okfn
3155 *      @skb: buffer to transmit
3156 */
3157int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3158{
3159        skb_reset_mac_header(skb);
3160        __skb_pull(skb, skb_network_offset(skb));
3161        skb->pkt_type = PACKET_LOOPBACK;
3162        skb->ip_summed = CHECKSUM_UNNECESSARY;
3163        WARN_ON(!skb_dst(skb));
3164        skb_dst_force(skb);
3165        netif_rx_ni(skb);
3166        return 0;
3167}
3168EXPORT_SYMBOL(dev_loopback_xmit);
3169
3170#ifdef CONFIG_NET_EGRESS
3171static struct sk_buff *
3172sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3173{
3174        struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3175        struct tcf_result cl_res;
3176
3177        if (!cl)
3178                return skb;
3179
3180        /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3181         * earlier by the caller.
3182         */
3183        qdisc_bstats_cpu_update(cl->q, skb);
3184
3185        switch (tc_classify(skb, cl, &cl_res, false)) {
3186        case TC_ACT_OK:
3187        case TC_ACT_RECLASSIFY:
3188                skb->tc_index = TC_H_MIN(cl_res.classid);
3189                break;
3190        case TC_ACT_SHOT:
3191                qdisc_qstats_cpu_drop(cl->q);
3192                *ret = NET_XMIT_DROP;
3193                kfree_skb(skb);
3194                return NULL;
3195        case TC_ACT_STOLEN:
3196        case TC_ACT_QUEUED:
3197                *ret = NET_XMIT_SUCCESS;
3198                consume_skb(skb);
3199                return NULL;
3200        case TC_ACT_REDIRECT:
3201                /* No need to push/pop skb's mac_header here on egress! */
3202                skb_do_redirect(skb);
3203                *ret = NET_XMIT_SUCCESS;
3204                return NULL;
3205        default:
3206                break;
3207        }
3208
3209        return skb;
3210}
3211#endif /* CONFIG_NET_EGRESS */
3212
3213static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3214{
3215#ifdef CONFIG_XPS
3216        struct xps_dev_maps *dev_maps;
3217        struct xps_map *map;
3218        int queue_index = -1;
3219
3220        rcu_read_lock();
3221        dev_maps = rcu_dereference(dev->xps_maps);
3222        if (dev_maps) {
3223                map = rcu_dereference(
3224                    dev_maps->cpu_map[skb->sender_cpu - 1]);
3225                if (map) {
3226                        if (map->len == 1)
3227                                queue_index = map->queues[0];
3228                        else
3229                                queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3230                                                                           map->len)];
3231                        if (unlikely(queue_index >= dev->real_num_tx_queues))
3232                                queue_index = -1;
3233                }
3234        }
3235        rcu_read_unlock();
3236
3237        return queue_index;
3238#else
3239        return -1;
3240#endif
3241}
3242
3243static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3244{
3245        struct sock *sk = skb->sk;
3246        int queue_index = sk_tx_queue_get(sk);
3247
3248        if (queue_index < 0 || skb->ooo_okay ||
3249            queue_index >= dev->real_num_tx_queues) {
3250                int new_index = get_xps_queue(dev, skb);
3251                if (new_index < 0)
3252                        new_index = skb_tx_hash(dev, skb);
3253
3254                if (queue_index != new_index && sk &&
3255                    sk_fullsock(sk) &&
3256                    rcu_access_pointer(sk->sk_dst_cache))
3257                        sk_tx_queue_set(sk, new_index);
3258
3259                queue_index = new_index;
3260        }
3261
3262        return queue_index;
3263}
3264
3265struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3266                                    struct sk_buff *skb,
3267                                    void *accel_priv)
3268{
3269        int queue_index = 0;
3270
3271#ifdef CONFIG_XPS
3272        u32 sender_cpu = skb->sender_cpu - 1;
3273
3274        if (sender_cpu >= (u32)NR_CPUS)
3275                skb->sender_cpu = raw_smp_processor_id() + 1;
3276#endif
3277
3278        if (dev->real_num_tx_queues != 1) {
3279                const struct net_device_ops *ops = dev->netdev_ops;
3280                if (ops->ndo_select_queue)
3281                        queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3282                                                            __netdev_pick_tx);
3283                else
3284                        queue_index = __netdev_pick_tx(dev, skb);
3285
3286                if (!accel_priv)
3287                        queue_index = netdev_cap_txqueue(dev, queue_index);
3288        }
3289
3290        skb_set_queue_mapping(skb, queue_index);
3291        return netdev_get_tx_queue(dev, queue_index);
3292}
3293
3294/**
3295 *      __dev_queue_xmit - transmit a buffer
3296 *      @skb: buffer to transmit
3297 *      @accel_priv: private data used for L2 forwarding offload
3298 *
3299 *      Queue a buffer for transmission to a network device. The caller must
3300 *      have set the device and priority and built the buffer before calling
3301 *      this function. The function can be called from an interrupt.
3302 *
3303 *      A negative errno code is returned on a failure. A success does not
3304 *      guarantee the frame will be transmitted as it may be dropped due
3305 *      to congestion or traffic shaping.
3306 *
3307 * -----------------------------------------------------------------------------------
3308 *      I notice this method can also return errors from the queue disciplines,
3309 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3310 *      be positive.
3311 *
3312 *      Regardless of the return value, the skb is consumed, so it is currently
3313 *      difficult to retry a send to this method.  (You can bump the ref count
3314 *      before sending to hold a reference for retry if you are careful.)
3315 *
3316 *      When calling this method, interrupts MUST be enabled.  This is because
3317 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3318 *          --BLG
3319 */
3320static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3321{
3322        struct net_device *dev = skb->dev;
3323        struct netdev_queue *txq;
3324        struct Qdisc *q;
3325        int rc = -ENOMEM;
3326
3327        skb_reset_mac_header(skb);
3328
3329        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3330                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3331
3332        /* Disable soft irqs for various locks below. Also
3333         * stops preemption for RCU.
3334         */
3335        rcu_read_lock_bh();
3336
3337        skb_update_prio(skb);
3338
3339        qdisc_pkt_len_init(skb);
3340#ifdef CONFIG_NET_CLS_ACT
3341        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3342# ifdef CONFIG_NET_EGRESS
3343        if (static_key_false(&egress_needed)) {
3344                skb = sch_handle_egress(skb, &rc, dev);
3345                if (!skb)
3346                        goto out;
3347        }
3348# endif
3349#endif
3350        /* If the device/qdisc doesn't need skb->dst, release it right now
3351         * while it's still hot in this CPU's cache.
3352         */
3353        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3354                skb_dst_drop(skb);
3355        else
3356                skb_dst_force(skb);
3357
3358#ifdef CONFIG_NET_SWITCHDEV
3359        /* Don't forward if offload device already forwarded */
3360        if (skb->offload_fwd_mark &&
3361            skb->offload_fwd_mark == dev->offload_fwd_mark) {
3362                consume_skb(skb);
3363                rc = NET_XMIT_SUCCESS;
3364                goto out;
3365        }
3366#endif
3367
3368        txq = netdev_pick_tx(dev, skb, accel_priv);
3369        q = rcu_dereference_bh(txq->qdisc);
3370
3371        trace_net_dev_queue(skb);
3372        if (q->enqueue) {
3373                rc = __dev_xmit_skb(skb, q, dev, txq);
3374                goto out;
3375        }
3376
3377        /* The device has no queue. This is the common case for software
3378           devices: loopback, all sorts of tunnels...
3379
3380           Really, it is unlikely that netif_tx_lock protection is necessary
3381           here (e.g. loopback and IP tunnels are clean, ignoring statistics
3382           counters). However, it is possible that they rely on the
3383           protection we provide here.
3384
3385           So check this and take the lock anyway; it is not prone to
3386           deadlocks. Or simply shoot the noqueue qdisc; that is even
3387           simpler. 8)
3388         */
3389        if (dev->flags & IFF_UP) {
3390                int cpu = smp_processor_id(); /* ok because BHs are off */
3391
3392                if (txq->xmit_lock_owner != cpu) {
3393                        if (unlikely(__this_cpu_read(xmit_recursion) >
3394                                     XMIT_RECURSION_LIMIT))
3395                                goto recursion_alert;
3396
3397                        skb = validate_xmit_skb(skb, dev);
3398                        if (!skb)
3399                                goto out;
3400
3401                        HARD_TX_LOCK(dev, txq, cpu);
3402
3403                        if (!netif_xmit_stopped(txq)) {
3404                                __this_cpu_inc(xmit_recursion);
3405                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3406                                __this_cpu_dec(xmit_recursion);
3407                                if (dev_xmit_complete(rc)) {
3408                                        HARD_TX_UNLOCK(dev, txq);
3409                                        goto out;
3410                                }
3411                        }
3412                        HARD_TX_UNLOCK(dev, txq);
3413                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3414                                             dev->name);
3415                } else {
3416                        /* Recursion is detected! It is possible,
3417                         * unfortunately
3418                         */
3419recursion_alert:
3420                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3421                                             dev->name);
3422                }
3423        }
3424
3425        rc = -ENETDOWN;
3426        rcu_read_unlock_bh();
3427
3428        atomic_long_inc(&dev->tx_dropped);
3429        kfree_skb_list(skb);
3430        return rc;
3431out:
3432        rcu_read_unlock_bh();
3433        return rc;
3434}
3435
3436int dev_queue_xmit(struct sk_buff *skb)
3437{
3438        return __dev_queue_xmit(skb, NULL);
3439}
3440EXPORT_SYMBOL(dev_queue_xmit);
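
/*
 * A minimal sketch (not a complete driver) of a caller that respects the
 * constraints documented above: skb->dev set, interrupts enabled, and the
 * buffer built before the call.  The frame contents and the choice of
 * ETH_P_IP are assumptions for illustration only.
 */
static int example_xmit_frame(struct net_device *dev, const void *frame,
                              unsigned int len)
{
        struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

        if (!skb)
                return -ENOMEM;

        memcpy(skb_put(skb, len), frame, len);  /* complete L2 frame */
        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);        /* assumed payload type */

        /* The skb is consumed whatever happens; the return value may be a
         * negative errno or a positive NET_XMIT_* code (see above).
         */
        return dev_queue_xmit(skb);
}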
3441
3442int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3443{
3444        return __dev_queue_xmit(skb, accel_priv);
3445}
3446EXPORT_SYMBOL(dev_queue_xmit_accel);
3447
3448
3449/*=======================================================================
3450                        Receiver routines
3451  =======================================================================*/
3452
3453int netdev_max_backlog __read_mostly = 1000;
3454EXPORT_SYMBOL(netdev_max_backlog);
3455
3456int netdev_tstamp_prequeue __read_mostly = 1;
3457int netdev_budget __read_mostly = 300;
3458int weight_p __read_mostly = 64;            /* old backlog weight */
3459
3460/* Called with irq disabled */
3461static inline void ____napi_schedule(struct softnet_data *sd,
3462                                     struct napi_struct *napi)
3463{
3464        list_add_tail(&napi->poll_list, &sd->poll_list);
3465        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3466}
3467
3468#ifdef CONFIG_RPS
3469
3470/* One global table that all flow-based protocols share. */
3471struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3472EXPORT_SYMBOL(rps_sock_flow_table);
3473u32 rps_cpu_mask __read_mostly;
3474EXPORT_SYMBOL(rps_cpu_mask);
3475
3476struct static_key rps_needed __read_mostly;
3477EXPORT_SYMBOL(rps_needed);
3478
3479static struct rps_dev_flow *
3480set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3481            struct rps_dev_flow *rflow, u16 next_cpu)
3482{
3483        if (next_cpu < nr_cpu_ids) {
3484#ifdef CONFIG_RFS_ACCEL
3485                struct netdev_rx_queue *rxqueue;
3486                struct rps_dev_flow_table *flow_table;
3487                struct rps_dev_flow *old_rflow;
3488                u32 flow_id;
3489                u16 rxq_index;
3490                int rc;
3491
3492                /* Should we steer this flow to a different hardware queue? */
3493                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3494                    !(dev->features & NETIF_F_NTUPLE))
3495                        goto out;
3496                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3497                if (rxq_index == skb_get_rx_queue(skb))
3498                        goto out;
3499
3500                rxqueue = dev->_rx + rxq_index;
3501                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3502                if (!flow_table)
3503                        goto out;
3504                flow_id = skb_get_hash(skb) & flow_table->mask;
3505                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3506                                                        rxq_index, flow_id);
3507                if (rc < 0)
3508                        goto out;
3509                old_rflow = rflow;
3510                rflow = &flow_table->flows[flow_id];
3511                rflow->filter = rc;
3512                if (old_rflow->filter == rflow->filter)
3513                        old_rflow->filter = RPS_NO_FILTER;
3514        out:
3515#endif
3516                rflow->last_qtail =
3517                        per_cpu(softnet_data, next_cpu).input_queue_head;
3518        }
3519
3520        rflow->cpu = next_cpu;
3521        return rflow;
3522}
3523
3524/*
3525 * get_rps_cpu is called from netif_receive_skb and returns the target
3526 * CPU from the RPS map of the receiving queue for a given skb.
3527 * rcu_read_lock must be held on entry.
3528 */
3529static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3530                       struct rps_dev_flow **rflowp)
3531{
3532        const struct rps_sock_flow_table *sock_flow_table;
3533        struct netdev_rx_queue *rxqueue = dev->_rx;
3534        struct rps_dev_flow_table *flow_table;
3535        struct rps_map *map;
3536        int cpu = -1;
3537        u32 tcpu;
3538        u32 hash;
3539
3540        if (skb_rx_queue_recorded(skb)) {
3541                u16 index = skb_get_rx_queue(skb);
3542
3543                if (unlikely(index >= dev->real_num_rx_queues)) {
3544                        WARN_ONCE(dev->real_num_rx_queues > 1,
3545                                  "%s received packet on queue %u, but number "
3546                                  "of RX queues is %u\n",
3547                                  dev->name, index, dev->real_num_rx_queues);
3548                        goto done;
3549                }
3550                rxqueue += index;
3551        }
3552
3553        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3554
3555        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3556        map = rcu_dereference(rxqueue->rps_map);
3557        if (!flow_table && !map)
3558                goto done;
3559
3560        skb_reset_network_header(skb);
3561        hash = skb_get_hash(skb);
3562        if (!hash)
3563                goto done;
3564
3565        sock_flow_table = rcu_dereference(rps_sock_flow_table);
3566        if (flow_table && sock_flow_table) {
3567                struct rps_dev_flow *rflow;
3568                u32 next_cpu;
3569                u32 ident;
3570
3571                /* First check the global flow table for a match */
3572                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3573                if ((ident ^ hash) & ~rps_cpu_mask)
3574                        goto try_rps;
3575
3576                next_cpu = ident & rps_cpu_mask;
3577
3578                /* OK, now we know there is a match,
3579                 * we can look at the local (per receive queue) flow table
3580                 */
3581                rflow = &flow_table->flows[hash & flow_table->mask];
3582                tcpu = rflow->cpu;
3583
3584                /*
3585                 * If the desired CPU (where last recvmsg was done) is
3586                 * different from current CPU (one in the rx-queue flow
3587                 * table entry), switch if one of the following holds:
3588                 *   - Current CPU is unset (>= nr_cpu_ids).
3589                 *   - Current CPU is offline.
3590                 *   - The current CPU's queue tail has advanced beyond the
3591                 *     last packet that was enqueued using this table entry.
3592                 *     This guarantees that all previous packets for the flow
3593                 *     have been dequeued, thus preserving in order delivery.
3594                 */
3595                if (unlikely(tcpu != next_cpu) &&
3596                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3597                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3598                      rflow->last_qtail)) >= 0)) {
3599                        tcpu = next_cpu;
3600                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3601                }
3602
3603                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3604                        *rflowp = rflow;
3605                        cpu = tcpu;
3606                        goto done;
3607                }
3608        }
3609
3610try_rps:
3611
3612        if (map) {
3613                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3614                if (cpu_online(tcpu)) {
3615                        cpu = tcpu;
3616                        goto done;
3617                }
3618        }
3619
3620done:
3621        return cpu;
3622}
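
/*
 * Illustration with hypothetical values: with 32 possible CPUs,
 * rps_cpu_mask is 0x1f, so rps_record_sock_flow() stores
 * ident = (hash & ~0x1f) | cpu.  A flow with hash 0xabcd1234 last
 * recvmsg'd on CPU 7 is recorded as 0xabcd1227; an arriving packet with
 * the same hash passes the ((ident ^ hash) & ~rps_cpu_mask) check and is
 * steered towards CPU 7, while a slot overwritten by an unrelated flow
 * fails the check and we fall back to the plain RPS map below.
 */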
3623
3624#ifdef CONFIG_RFS_ACCEL
3625
3626/**
3627 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3628 * @dev: Device on which the filter was set
3629 * @rxq_index: RX queue index
3630 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3631 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3632 *
3633 * Drivers that implement ndo_rx_flow_steer() should periodically call
3634 * this function for each installed filter and remove the filters for
3635 * which it returns %true.
3636 */
3637bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3638                         u32 flow_id, u16 filter_id)
3639{
3640        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3641        struct rps_dev_flow_table *flow_table;
3642        struct rps_dev_flow *rflow;
3643        bool expire = true;
3644        unsigned int cpu;
3645
3646        rcu_read_lock();
3647        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3648        if (flow_table && flow_id <= flow_table->mask) {
3649                rflow = &flow_table->flows[flow_id];
3650                cpu = ACCESS_ONCE(rflow->cpu);
3651                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3652                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3653                           rflow->last_qtail) <
3654                     (int)(10 * flow_table->mask)))
3655                        expire = false;
3656        }
3657        rcu_read_unlock();
3658        return expire;
3659}
3660EXPORT_SYMBOL(rps_may_expire_flow);
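
/*
 * A rough sketch of the periodic scan suggested above, assuming a
 * hypothetical per-queue filter table kept by the driver (struct
 * example_filter and its fields are illustrative, not a real API).
 */
struct example_filter {
        bool in_use;
        u32 flow_id;            /* value passed to ndo_rx_flow_steer() */
        u16 filter_id;          /* value returned by ndo_rx_flow_steer() */
};

static void example_expire_filters(struct net_device *dev, u16 rxq_index,
                                   struct example_filter *filters,
                                   unsigned int count)
{
        unsigned int i;

        for (i = 0; i < count; i++) {
                if (!filters[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, rxq_index, filters[i].flow_id,
                                        filters[i].filter_id)) {
                        /* ... ask the hardware to remove the filter ... */
                        filters[i].in_use = false;
                }
        }
}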
3661
3662#endif /* CONFIG_RFS_ACCEL */
3663
3664/* Called from hardirq (IPI) context */
3665static void rps_trigger_softirq(void *data)
3666{
3667        struct softnet_data *sd = data;
3668
3669        ____napi_schedule(sd, &sd->backlog);
3670        sd->received_rps++;
3671}
3672
3673#endif /* CONFIG_RPS */
3674
3675/*
3676 * Check whether this softnet_data structure belongs to another CPU.
3677 * If so, queue it on our IPI list and return 1.
3678 * If not, return 0.
3679 */
3680static int rps_ipi_queued(struct softnet_data *sd)
3681{
3682#ifdef CONFIG_RPS
3683        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3684
3685        if (sd != mysd) {
3686                sd->rps_ipi_next = mysd->rps_ipi_list;
3687                mysd->rps_ipi_list = sd;
3688
3689                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3690                return 1;
3691        }
3692#endif /* CONFIG_RPS */
3693        return 0;
3694}
3695
3696#ifdef CONFIG_NET_FLOW_LIMIT
3697int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3698#endif
3699
3700static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3701{
3702#ifdef CONFIG_NET_FLOW_LIMIT
3703        struct sd_flow_limit *fl;
3704        struct softnet_data *sd;
3705        unsigned int old_flow, new_flow;
3706
3707        if (qlen < (netdev_max_backlog >> 1))
3708                return false;
3709
3710        sd = this_cpu_ptr(&softnet_data);
3711
3712        rcu_read_lock();
3713        fl = rcu_dereference(sd->flow_limit);
3714        if (fl) {
3715                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3716                old_flow = fl->history[fl->history_head];
3717                fl->history[fl->history_head] = new_flow;
3718
3719                fl->history_head++;
3720                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3721
3722                if (likely(fl->buckets[old_flow]))
3723                        fl->buckets[old_flow]--;
3724
3725                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3726                        fl->count++;
3727                        rcu_read_unlock();
3728                        return true;
3729                }
3730        }
3731        rcu_read_unlock();
3732#endif
3733        return false;
3734}
3735
3736/*
3737 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3738 * queue (which may be a remote CPU's queue).
3739 */
3740static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3741                              unsigned int *qtail)
3742{
3743        struct softnet_data *sd;
3744        unsigned long flags;
3745        unsigned int qlen;
3746
3747        sd = &per_cpu(softnet_data, cpu);
3748
3749        local_irq_save(flags);
3750
3751        rps_lock(sd);
3752        if (!netif_running(skb->dev))
3753                goto drop;
3754        qlen = skb_queue_len(&sd->input_pkt_queue);
3755        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3756                if (qlen) {
3757enqueue:
3758                        __skb_queue_tail(&sd->input_pkt_queue, skb);
3759                        input_queue_tail_incr_save(sd, qtail);
3760                        rps_unlock(sd);
3761                        local_irq_restore(flags);
3762                        return NET_RX_SUCCESS;
3763                }
3764
3765                /* Schedule NAPI for the backlog device.
3766                 * We can use a non-atomic operation since we own the queue lock.
3767                 */
3768                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3769                        if (!rps_ipi_queued(sd))
3770                                ____napi_schedule(sd, &sd->backlog);
3771                }
3772                goto enqueue;
3773        }
3774
3775drop:
3776        sd->dropped++;
3777        rps_unlock(sd);
3778
3779        local_irq_restore(flags);
3780
3781        atomic_long_inc(&skb->dev->rx_dropped);
3782        kfree_skb(skb);
3783        return NET_RX_DROP;
3784}
3785
3786static int netif_rx_internal(struct sk_buff *skb)
3787{
3788        int ret;
3789
3790        net_timestamp_check(netdev_tstamp_prequeue, skb);
3791
3792        trace_netif_rx(skb);
3793#ifdef CONFIG_RPS
3794        if (static_key_false(&rps_needed)) {
3795                struct rps_dev_flow voidflow, *rflow = &voidflow;
3796                int cpu;
3797
3798                preempt_disable();
3799                rcu_read_lock();
3800
3801                cpu = get_rps_cpu(skb->dev, skb, &rflow);
3802                if (cpu < 0)
3803                        cpu = smp_processor_id();
3804
3805                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3806
3807                rcu_read_unlock();
3808                preempt_enable();
3809        } else
3810#endif
3811        {
3812                unsigned int qtail;
3813                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3814                put_cpu();
3815        }
3816        return ret;
3817}
3818
3819/**
3820 *      netif_rx        -       post buffer to the network code
3821 *      @skb: buffer to post
3822 *
3823 *      This function receives a packet from a device driver and queues it for
3824 *      the upper (protocol) levels to process.  It always succeeds. The buffer
3825 *      may be dropped during processing for congestion control or by the
3826 *      protocol layers.
3827 *
3828 *      return values:
3829 *      NET_RX_SUCCESS  (no congestion)
3830 *      NET_RX_DROP     (packet was dropped)
3831 *
3832 */
3833
3834int netif_rx(struct sk_buff *skb)
3835{
3836        trace_netif_rx_entry(skb);
3837
3838        return netif_rx_internal(skb);
3839}
3840EXPORT_SYMBOL(netif_rx);
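
/*
 * A minimal sketch of the classic (non-NAPI) receive path that ends in
 * netif_rx(): a hypothetical interrupt handler copies one received frame
 * out of the hardware and posts it to the stack.  The frame pointer and
 * length are assumed to come from device-specific code.
 */
static void example_rx_one(struct net_device *dev, const void *data,
                           unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        memcpy(skb_put(skb, len), data, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */

        netif_rx(skb);          /* NET_RX_SUCCESS or NET_RX_DROP */
}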
3841
3842int netif_rx_ni(struct sk_buff *skb)
3843{
3844        int err;
3845
3846        trace_netif_rx_ni_entry(skb);
3847
3848        preempt_disable();
3849        err = netif_rx_internal(skb);
3850        if (local_softirq_pending())
3851                do_softirq();
3852        preempt_enable();
3853
3854        return err;
3855}
3856EXPORT_SYMBOL(netif_rx_ni);
3857
3858static void net_tx_action(struct softirq_action *h)
3859{
3860        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3861
3862        if (sd->completion_queue) {
3863                struct sk_buff *clist;
3864
3865                local_irq_disable();
3866                clist = sd->completion_queue;
3867                sd->completion_queue = NULL;
3868                local_irq_enable();
3869
3870                while (clist) {
3871                        struct sk_buff *skb = clist;
3872                        clist = clist->next;
3873
3874                        WARN_ON(atomic_read(&skb->users));
3875                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3876                                trace_consume_skb(skb);
3877                        else
3878                                trace_kfree_skb(skb, net_tx_action);
3879
3880                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3881                                __kfree_skb(skb);
3882                        else
3883                                __kfree_skb_defer(skb);
3884                }
3885
3886                __kfree_skb_flush();
3887        }
3888
3889        if (sd->output_queue) {
3890                struct Qdisc *head;
3891
3892                local_irq_disable();
3893                head = sd->output_queue;
3894                sd->output_queue = NULL;
3895                sd->output_queue_tailp = &sd->output_queue;
3896                local_irq_enable();
3897
3898                while (head) {
3899                        struct Qdisc *q = head;
3900                        spinlock_t *root_lock;
3901
3902                        head = head->next_sched;
3903
3904                        root_lock = qdisc_lock(q);
3905                        spin_lock(root_lock);
3906                        /* We need to make sure head->next_sched is read
3907                         * before clearing __QDISC_STATE_SCHED
3908                         */
3909                        smp_mb__before_atomic();
3910                        clear_bit(__QDISC_STATE_SCHED, &q->state);
3911                        qdisc_run(q);
3912                        spin_unlock(root_lock);
3913                }
3914        }
3915}
3916
3917#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3918    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3919/* This hook is defined here for ATM LANE */
3920int (*br_fdb_test_addr_hook)(struct net_device *dev,
3921                             unsigned char *addr) __read_mostly;
3922EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3923#endif
3924
3925static inline struct sk_buff *
3926sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3927                   struct net_device *orig_dev)
3928{
3929#ifdef CONFIG_NET_CLS_ACT
3930        struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3931        struct tcf_result cl_res;
3932
3933        /* If there's at least one ingress present somewhere (so
3934         * we get here via enabled static key), remaining devices
3935         * that are not configured with an ingress qdisc will bail
3936         * out here.
3937         */
3938        if (!cl)
3939                return skb;
3940        if (*pt_prev) {
3941                *ret = deliver_skb(skb, *pt_prev, orig_dev);
3942                *pt_prev = NULL;
3943        }
3944
3945        qdisc_skb_cb(skb)->pkt_len = skb->len;
3946        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3947        qdisc_bstats_cpu_update(cl->q, skb);
3948
3949        switch (tc_classify(skb, cl, &cl_res, false)) {
3950        case TC_ACT_OK:
3951        case TC_ACT_RECLASSIFY:
3952                skb->tc_index = TC_H_MIN(cl_res.classid);
3953                break;
3954        case TC_ACT_SHOT:
3955                qdisc_qstats_cpu_drop(cl->q);
3956                kfree_skb(skb);
3957                return NULL;
3958        case TC_ACT_STOLEN:
3959        case TC_ACT_QUEUED:
3960                consume_skb(skb);
3961                return NULL;
3962        case TC_ACT_REDIRECT:
3963                /* skb_mac_header check was done by cls/act_bpf, so
3964                 * we can safely push the L2 header back before
3965                 * redirecting to another netdev
3966                 */
3967                __skb_push(skb, skb->mac_len);
3968                skb_do_redirect(skb);
3969                return NULL;
3970        default:
3971                break;
3972        }
3973#endif /* CONFIG_NET_CLS_ACT */
3974        return skb;
3975}
3976
3977/**
3978 *      netdev_is_rx_handler_busy - check if receive handler is registered
3979 *      @dev: device to check
3980 *
3981 *      Check if a receive handler is already registered for a given device.
3982 *      Return true if there is one.
3983 *
3984 *      The caller must hold the rtnl_mutex.
3985 */
3986bool netdev_is_rx_handler_busy(struct net_device *dev)
3987{
3988        ASSERT_RTNL();
3989        return dev && rtnl_dereference(dev->rx_handler);
3990}
3991EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3992
3993/**
3994 *      netdev_rx_handler_register - register receive handler
3995 *      @dev: device to register a handler for
3996 *      @rx_handler: receive handler to register
3997 *      @rx_handler_data: data pointer that is used by rx handler
3998 *
3999 *      Register a receive handler for a device. This handler will then be
4000 *      called from __netif_receive_skb. A negative errno code is returned
4001 *      on a failure.
4002 *
4003 *      The caller must hold the rtnl_mutex.
4004 *
4005 *      For a general description of rx_handler, see enum rx_handler_result.
4006 */
4007int netdev_rx_handler_register(struct net_device *dev,
4008                               rx_handler_func_t *rx_handler,
4009                               void *rx_handler_data)
4010{
4011        ASSERT_RTNL();
4012
4013        if (dev->rx_handler)
4014                return -EBUSY;
4015
4016        /* Note: rx_handler_data must be set before rx_handler */
4017        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4018        rcu_assign_pointer(dev->rx_handler, rx_handler);
4019
4020        return 0;
4021}
4022EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4023
4024/**
4025 *      netdev_rx_handler_unregister - unregister receive handler
4026 *      @dev: device to unregister a handler from
4027 *
4028 *      Unregister a receive handler from a device.
4029 *
4030 *      The caller must hold the rtnl_mutex.
4031 */
4032void netdev_rx_handler_unregister(struct net_device *dev)
4033{
4034
4035        ASSERT_RTNL();
4036        RCU_INIT_POINTER(dev->rx_handler, NULL);
4037        /* A reader that sees a non-NULL rx_handler inside an
4038         * rcu_read_lock() section is guaranteed to also see a
4039         * non-NULL rx_handler_data.
4040         */
4041        synchronize_net();
4042        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4043}
4044EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
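
/*
 * A compact sketch, loosely in the style of bridge/macvlan, of an
 * rx_handler plus its registration under RTNL.  struct example_port and
 * the "enabled" policy are hypothetical.
 */
struct example_port {
        bool enabled;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

        if (!port->enabled) {
                /* Consume (here: drop) the frame ourselves. */
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }
        /* Let the normal protocol stack see the frame. */
        return RX_HANDLER_PASS;
}

static int example_port_attach(struct net_device *dev, struct example_port *port)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, example_handle_frame, port);
        rtnl_unlock();
        return err;     /* tear down later with netdev_rx_handler_unregister() */
}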
4045
4046/*
4047 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4048 * the special handling of PFMEMALLOC skbs.
4049 */
4050static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4051{
4052        switch (skb->protocol) {
4053        case htons(ETH_P_ARP):
4054        case htons(ETH_P_IP):
4055        case htons(ETH_P_IPV6):
4056        case htons(ETH_P_8021Q):
4057        case htons(ETH_P_8021AD):
4058                return true;
4059        default:
4060                return false;
4061        }
4062}
4063
4064static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4065                             int *ret, struct net_device *orig_dev)
4066{
4067#ifdef CONFIG_NETFILTER_INGRESS
4068        if (nf_hook_ingress_active(skb)) {
4069                if (*pt_prev) {
4070                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4071                        *pt_prev = NULL;
4072                }
4073
4074                return nf_hook_ingress(skb);
4075        }
4076#endif /* CONFIG_NETFILTER_INGRESS */
4077        return 0;
4078}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4081{
4082        struct packet_type *ptype, *pt_prev;
4083        rx_handler_func_t *rx_handler;
4084        struct net_device *orig_dev;
4085        bool deliver_exact = false;
4086        int ret = NET_RX_DROP;
4087        __be16 type;
4088
4089        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091        trace_netif_receive_skb(skb);
4092
4093        orig_dev = skb->dev;
4094
4095        skb_reset_network_header(skb);
4096        if (!skb_transport_header_was_set(skb))
4097                skb_reset_transport_header(skb);
4098        skb_reset_mac_len(skb);
4099
4100        pt_prev = NULL;
4101
4102another_round:
4103        skb->skb_iif = skb->dev->ifindex;
4104
4105        __this_cpu_inc(softnet_data.processed);
4106
4107        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4109                skb = skb_vlan_untag(skb);
4110                if (unlikely(!skb))
4111                        goto out;
4112        }
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115        if (skb->tc_verd & TC_NCLS) {
4116                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117                goto ncls;
4118        }
4119#endif
4120
4121        if (pfmemalloc)
4122                goto skip_taps;
4123
4124        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125                if (pt_prev)
4126                        ret = deliver_skb(skb, pt_prev, orig_dev);
4127                pt_prev = ptype;
4128        }
4129
4130        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131                if (pt_prev)
4132                        ret = deliver_skb(skb, pt_prev, orig_dev);
4133                pt_prev = ptype;
4134        }
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138        if (static_key_false(&ingress_needed)) {
4139                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4140                if (!skb)
4141                        goto out;
4142
4143                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144                        goto out;
4145        }
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148        skb->tc_verd = 0;
4149ncls:
4150#endif
4151        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152                goto drop;
4153
4154        if (skb_vlan_tag_present(skb)) {
4155                if (pt_prev) {
4156                        ret = deliver_skb(skb, pt_prev, orig_dev);
4157                        pt_prev = NULL;
4158                }
4159                if (vlan_do_receive(&skb))
4160                        goto another_round;
4161                else if (unlikely(!skb))
4162                        goto out;
4163        }
4164
4165        rx_handler = rcu_dereference(skb->dev->rx_handler);
4166        if (rx_handler) {
4167                if (pt_prev) {
4168                        ret = deliver_skb(skb, pt_prev, orig_dev);
4169                        pt_prev = NULL;
4170                }
4171                switch (rx_handler(&skb)) {
4172                case RX_HANDLER_CONSUMED:
4173                        ret = NET_RX_SUCCESS;
4174                        goto out;
4175                case RX_HANDLER_ANOTHER:
4176                        goto another_round;
4177                case RX_HANDLER_EXACT:
4178                        deliver_exact = true;
4179                case RX_HANDLER_PASS:
4180                        break;
4181                default:
4182                        BUG();
4183                }
4184        }
4185
4186        if (unlikely(skb_vlan_tag_present(skb))) {
4187                if (skb_vlan_tag_get_id(skb))
4188                        skb->pkt_type = PACKET_OTHERHOST;
4189                /* Note: we might in the future use prio bits
4190                 * and set skb->priority like in vlan_do_receive().
4191                 * For the time being, just ignore the Priority Code Point.
4192                 */
4193                skb->vlan_tci = 0;
4194        }
4195
4196        type = skb->protocol;
4197
4198        /* deliver only exact match when indicated */
4199        if (likely(!deliver_exact)) {
4200                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201                                       &ptype_base[ntohs(type) &
4202                                                   PTYPE_HASH_MASK]);
4203        }
4204
4205        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206                               &orig_dev->ptype_specific);
4207
4208        if (unlikely(skb->dev != orig_dev)) {
4209                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210                                       &skb->dev->ptype_specific);
4211        }
4212
4213        if (pt_prev) {
4214                if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215                        goto drop;
4216                else
4217                        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218        } else {
4219drop:
4220                if (!deliver_exact)
4221                        atomic_long_inc(&skb->dev->rx_dropped);
4222                else
4223                        atomic_long_inc(&skb->dev->rx_nohandler);
4224                kfree_skb(skb);
4225                /* Jamal, now you will not be able to escape explaining
4226                 * to me how you were going to use this. :-)
4227                 */
4228                ret = NET_RX_DROP;
4229        }
4230
4231out:
4232        return ret;
4233}
4234
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237        int ret;
4238
4239        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240                unsigned long pflags = current->flags;
4241
4242                /*
4243                 * PFMEMALLOC skbs are special, they should
4244                 * - be delivered to SOCK_MEMALLOC sockets only
4245                 * - stay away from userspace
4246                 * - have bounded memory usage
4247                 *
4248                 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249                 * context down to all allocation sites.
4250                 */
4251                current->flags |= PF_MEMALLOC;
4252                ret = __netif_receive_skb_core(skb, true);
4253                tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254        } else
4255                ret = __netif_receive_skb_core(skb, false);
4256
4257        return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262        int ret;
4263
4264        net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266        if (skb_defer_rx_timestamp(skb))
4267                return NET_RX_SUCCESS;
4268
4269        rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272        if (static_key_false(&rps_needed)) {
4273                struct rps_dev_flow voidflow, *rflow = &voidflow;
4274                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276                if (cpu >= 0) {
4277                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278                        rcu_read_unlock();
4279                        return ret;
4280                }
4281        }
4282#endif
4283        ret = __netif_receive_skb(skb);
4284        rcu_read_unlock();
4285        return ret;
4286}
4287
4288/**
4289 *      netif_receive_skb - process receive buffer from network
4290 *      @skb: buffer to process
4291 *
4292 *      netif_receive_skb() is the main receive data processing function.
4293 *      It always succeeds. The buffer may be dropped during processing
4294 *      for congestion control or by the protocol layers.
4295 *
4296 *      This function may only be called from softirq context and interrupts
4297 *      should be enabled.
4298 *
4299 *      Return values (usually ignored):
4300 *      NET_RX_SUCCESS: no congestion
4301 *      NET_RX_DROP: packet was dropped
4302 */
4303int netif_receive_skb(struct sk_buff *skb)
4304{
4305        trace_netif_receive_skb_entry(skb);
4306
4307        return netif_receive_skb_internal(skb);
4308}
4309EXPORT_SYMBOL(netif_receive_skb);
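
/*
 * A minimal sketch of the softirq-context caller this implies: a
 * hypothetical NAPI poll routine feeding completed buffers to the stack.
 * example_hw_next_skb() stands in for device-specific dequeue code.
 */
static struct sk_buff *example_hw_next_skb(struct napi_struct *napi);

static int example_napi_poll(struct napi_struct *napi, int budget)
{
        int work = 0;

        while (work < budget) {
                struct sk_buff *skb = example_hw_next_skb(napi);

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, napi->dev);
                netif_receive_skb(skb);
                work++;
        }
        if (work < budget)
                napi_complete(napi);
        return work;
}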
4310
4311/* Network device is going away, flush any packets still pending
4312 * Called with irqs disabled.
4313 */
4314static void flush_backlog(void *arg)
4315{
4316        struct net_device *dev = arg;
4317        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4318        struct sk_buff *skb, *tmp;
4319
4320        rps_lock(sd);
4321        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4322                if (skb->dev == dev) {
4323                        __skb_unlink(skb, &sd->input_pkt_queue);
4324                        kfree_skb(skb);
4325                        input_queue_head_incr(sd);
4326                }
4327        }
4328        rps_unlock(sd);
4329
4330        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4331                if (skb->dev == dev) {
4332                        __skb_unlink(skb, &sd->process_queue);
4333                        kfree_skb(skb);
4334                        input_queue_head_incr(sd);
4335                }
4336        }
4337}
4338
4339static int napi_gro_complete(struct sk_buff *skb)
4340{
4341        struct packet_offload *ptype;
4342        __be16 type = skb->protocol;
4343        struct list_head *head = &offload_base;
4344        int err = -ENOENT;
4345
4346        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4347
4348        if (NAPI_GRO_CB(skb)->count == 1) {
4349                skb_shinfo(skb)->gso_size = 0;
4350                goto out;
4351        }
4352
4353        rcu_read_lock();
4354        list_for_each_entry_rcu(ptype, head, list) {
4355                if (ptype->type != type || !ptype->callbacks.gro_complete)
4356                        continue;
4357
4358                err = ptype->callbacks.gro_complete(skb, 0);
4359                break;
4360        }
4361        rcu_read_unlock();
4362
4363        if (err) {
4364                WARN_ON(&ptype->list == head);
4365                kfree_skb(skb);
4366                return NET_RX_SUCCESS;
4367        }
4368
4369out:
4370        return netif_receive_skb_internal(skb);
4371}
4372
4373/* napi->gro_list contains packets ordered by age, with the
4374 * youngest packets at the head of the list.
4375 * Complete skbs in reverse order to reduce latencies.
4376 */
4377void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4378{
4379        struct sk_buff *skb, *prev = NULL;
4380
4381        /* scan list and build reverse chain */
4382        for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4383                skb->prev = prev;
4384                prev = skb;
4385        }
4386
4387        for (skb = prev; skb; skb = prev) {
4388                skb->next = NULL;
4389
4390                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4391                        return;
4392
4393                prev = skb->prev;
4394                napi_gro_complete(skb);
4395                napi->gro_count--;
4396        }
4397
4398        napi->gro_list = NULL;
4399}
4400EXPORT_SYMBOL(napi_gro_flush);
4401
4402static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4403{
4404        struct sk_buff *p;
4405        unsigned int maclen = skb->dev->hard_header_len;
4406        u32 hash = skb_get_hash_raw(skb);
4407
4408        for (p = napi->gro_list; p; p = p->next) {
4409                unsigned long diffs;
4410
4411                NAPI_GRO_CB(p)->flush = 0;
4412
4413                if (hash != skb_get_hash_raw(p)) {
4414                        NAPI_GRO_CB(p)->same_flow = 0;
4415                        continue;
4416                }
4417
4418                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4419                diffs |= p->vlan_tci ^ skb->vlan_tci;
4420                diffs |= skb_metadata_dst_cmp(p, skb);
4421                if (maclen == ETH_HLEN)
4422                        diffs |= compare_ether_header(skb_mac_header(p),
4423                                                      skb_mac_header(skb));
4424                else if (!diffs)
4425                        diffs = memcmp(skb_mac_header(p),
4426                                       skb_mac_header(skb),
4427                                       maclen);
4428                NAPI_GRO_CB(p)->same_flow = !diffs;
4429        }
4430}
4431
4432static void skb_gro_reset_offset(struct sk_buff *skb)
4433{
4434        const struct skb_shared_info *pinfo = skb_shinfo(skb);
4435        const skb_frag_t *frag0 = &pinfo->frags[0];
4436
4437        NAPI_GRO_CB(skb)->data_offset = 0;
4438        NAPI_GRO_CB(skb)->frag0 = NULL;
4439        NAPI_GRO_CB(skb)->frag0_len = 0;
4440
4441        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4442            pinfo->nr_frags &&
4443            !PageHighMem(skb_frag_page(frag0))) {
4444                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4445                NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4446        }
4447}
4448
4449static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4450{
4451        struct skb_shared_info *pinfo = skb_shinfo(skb);
4452
4453        BUG_ON(skb->end - skb->tail < grow);
4454
4455        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4456
4457        skb->data_len -= grow;
4458        skb->tail += grow;
4459
4460        pinfo->frags[0].page_offset += grow;
4461        skb_frag_size_sub(&pinfo->frags[0], grow);
4462
4463        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4464                skb_frag_unref(skb, 0);
4465                memmove(pinfo->frags, pinfo->frags + 1,
4466                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4467        }
4468}
4469
4470static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4471{
4472        struct sk_buff **pp = NULL;
4473        struct packet_offload *ptype;
4474        __be16 type = skb->protocol;
4475        struct list_head *head = &offload_base;
4476        int same_flow;
4477        enum gro_result ret;
4478        int grow;
4479
4480        if (!(skb->dev->features & NETIF_F_GRO))
4481                goto normal;
4482
4483        if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4484                goto normal;
4485
4486        gro_list_prepare(napi, skb);
4487
4488        rcu_read_lock();
4489        list_for_each_entry_rcu(ptype, head, list) {
4490                if (ptype->type != type || !ptype->callbacks.gro_receive)
4491                        continue;
4492
4493                skb_set_network_header(skb, skb_gro_offset(skb));
4494                skb_reset_mac_len(skb);
4495                NAPI_GRO_CB(skb)->same_flow = 0;
4496                NAPI_GRO_CB(skb)->flush = 0;
4497                NAPI_GRO_CB(skb)->free = 0;
4498                NAPI_GRO_CB(skb)->encap_mark = 0;
4499                NAPI_GRO_CB(skb)->is_fou = 0;
4500                NAPI_GRO_CB(skb)->is_atomic = 1;
4501                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4502
4503                /* Setup for GRO checksum validation */
4504                switch (skb->ip_summed) {
4505                case CHECKSUM_COMPLETE:
4506                        NAPI_GRO_CB(skb)->csum = skb->csum;
4507                        NAPI_GRO_CB(skb)->csum_valid = 1;
4508                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4509                        break;
4510                case CHECKSUM_UNNECESSARY:
4511                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4512                        NAPI_GRO_CB(skb)->csum_valid = 0;
4513                        break;
4514                default:
4515                        NAPI_GRO_CB(skb)->csum_cnt = 0;
4516                        NAPI_GRO_CB(skb)->csum_valid = 0;
4517                }
4518
4519                pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4520                break;
4521        }
4522        rcu_read_unlock();
4523
4524        if (&ptype->list == head)
4525                goto normal;
4526
4527        same_flow = NAPI_GRO_CB(skb)->same_flow;
4528        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4529
4530        if (pp) {
4531                struct sk_buff *nskb = *pp;
4532
4533                *pp = nskb->next;
4534                nskb->next = NULL;
4535                napi_gro_complete(nskb);
4536                napi->gro_count--;
4537        }
4538
4539        if (same_flow)
4540                goto ok;
4541
4542        if (NAPI_GRO_CB(skb)->flush)
4543                goto normal;
4544
4545        if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4546                struct sk_buff *nskb = napi->gro_list;
4547
4548                /* locate the end of the list to select the 'oldest' flow */
4549                while (nskb->next) {
4550                        pp = &nskb->next;
4551                        nskb = *pp;
4552                }
4553                *pp = NULL;
4554                nskb->next = NULL;
4555                napi_gro_complete(nskb);
4556        } else {
4557                napi->gro_count++;
4558        }
4559        NAPI_GRO_CB(skb)->count = 1;
4560        NAPI_GRO_CB(skb)->age = jiffies;
4561        NAPI_GRO_CB(skb)->last = skb;
4562        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4563        skb->next = napi->gro_list;
4564        napi->gro_list = skb;
4565        ret = GRO_HELD;
4566
4567pull:
4568        grow = skb_gro_offset(skb) - skb_headlen(skb);
4569        if (grow > 0)
4570                gro_pull_from_frag0(skb, grow);
4571ok:
4572        return ret;
4573
4574normal:
4575        ret = GRO_NORMAL;
4576        goto pull;
4577}
4578
4579struct packet_offload *gro_find_receive_by_type(__be16 type)
4580{
4581        struct list_head *offload_head = &offload_base;
4582        struct packet_offload *ptype;
4583
4584        list_for_each_entry_rcu(ptype, offload_head, list) {
4585                if (ptype->type != type || !ptype->callbacks.gro_receive)
4586                        continue;
4587                return ptype;
4588        }
4589        return NULL;
4590}
4591EXPORT_SYMBOL(gro_find_receive_by_type);
4592
4593struct packet_offload *gro_find_complete_by_type(__be16 type)
4594{
4595        struct list_head *offload_head = &offload_base;
4596        struct packet_offload *ptype;
4597
4598        list_for_each_entry_rcu(ptype, offload_head, list) {
4599                if (ptype->type != type || !ptype->callbacks.gro_complete)
4600                        continue;
4601                return ptype;
4602        }
4603        return NULL;
4604}
4605EXPORT_SYMBOL(gro_find_complete_by_type);
4606
4607static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4608{
4609        switch (ret) {
4610        case GRO_NORMAL:
4611                if (netif_receive_skb_internal(skb))
4612                        ret = GRO_DROP;
4613                break;
4614
4615        case GRO_DROP:
4616                kfree_skb(skb);
4617                break;
4618
4619        case GRO_MERGED_FREE:
4620                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4621                        skb_dst_drop(skb);
4622                        kmem_cache_free(skbuff_head_cache, skb);
4623                } else {
4624                        __kfree_skb(skb);
4625                }
4626                break;
4627
4628        case GRO_HELD:
4629        case GRO_MERGED:
4630                break;
4631        }
4632
4633        return ret;
4634}
4635
4636gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4637{
4638        skb_mark_napi_id(skb, napi);
4639        trace_napi_gro_receive_entry(skb);
4640
4641        skb_gro_reset_offset(skb);
4642
4643        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4644}
4645EXPORT_SYMBOL(napi_gro_receive);
4646
4647static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4648{
4649        if (unlikely(skb->pfmemalloc)) {
4650                consume_skb(skb);
4651                return;
4652        }
4653        __skb_pull(skb, skb_headlen(skb));
4654        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4655        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4656        skb->vlan_tci = 0;
4657        skb->dev = napi->dev;
4658        skb->skb_iif = 0;
4659        skb->encapsulation = 0;
4660        skb_shinfo(skb)->gso_type = 0;
4661        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4662
4663        napi->skb = skb;
4664}
4665
4666struct sk_buff *napi_get_frags(struct napi_struct *napi)
4667{
4668        struct sk_buff *skb = napi->skb;
4669
4670        if (!skb) {
4671                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4672                if (skb) {
4673                        napi->skb = skb;
4674                        skb_mark_napi_id(skb, napi);
4675                }
4676        }
4677        return skb;
4678}
4679EXPORT_SYMBOL(napi_get_frags);
4680
4681static gro_result_t napi_frags_finish(struct napi_struct *napi,
4682                                      struct sk_buff *skb,
4683                                      gro_result_t ret)
4684{
4685        switch (ret) {
4686        case GRO_NORMAL:
4687        case GRO_HELD:
4688                __skb_push(skb, ETH_HLEN);
4689                skb->protocol = eth_type_trans(skb, skb->dev);
4690                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4691                        ret = GRO_DROP;
4692                break;
4693
4694        case GRO_DROP:
4695        case GRO_MERGED_FREE:
4696                napi_reuse_skb(napi, skb);
4697                break;
4698
4699        case GRO_MERGED:
4700                break;
4701        }
4702
4703        return ret;
4704}
4705
4706/* The upper GRO stack assumes the network header starts at gro_offset=0.
4707 * Drivers may call both napi_gro_frags() and napi_gro_receive().
4708 * We copy the Ethernet header into skb->data to have a common layout.
4709 */
4710static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4711{
4712        struct sk_buff *skb = napi->skb;
4713        const struct ethhdr *eth;
4714        unsigned int hlen = sizeof(*eth);
4715
4716        napi->skb = NULL;
4717
4718        skb_reset_mac_header(skb);
4719        skb_gro_reset_offset(skb);
4720
4721        eth = skb_gro_header_fast(skb, 0);
4722        if (unlikely(skb_gro_header_hard(skb, hlen))) {
4723                eth = skb_gro_header_slow(skb, hlen, 0);
4724                if (unlikely(!eth)) {
4725                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4726                                             __func__, napi->dev->name);
4727                        napi_reuse_skb(napi, skb);
4728                        return NULL;
4729                }
4730        } else {
4731                gro_pull_from_frag0(skb, hlen);
4732                NAPI_GRO_CB(skb)->frag0 += hlen;
4733                NAPI_GRO_CB(skb)->frag0_len -= hlen;
4734        }
4735        __skb_pull(skb, hlen);
4736
4737        /*
4738         * This works because the only protocols we care about don't require
4739         * special handling.
4740         * We'll fix it up properly in napi_frags_finish()
4741         */
4742        skb->protocol = eth->h_proto;
4743
4744        return skb;
4745}
4746
4747gro_result_t napi_gro_frags(struct napi_struct *napi)
4748{
4749        struct sk_buff *skb = napi_frags_skb(napi);
4750
4751        if (!skb)
4752                return GRO_DROP;
4753
4754        trace_napi_gro_frags_entry(skb);
4755
4756        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4757}
4758EXPORT_SYMBOL(napi_gro_frags);
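
/* Editor's sketch (not part of dev.c): the napi_get_frags()/napi_gro_frags()
 * pattern used by drivers that receive directly into pages.  "my_rx_frag"
 * and the origin of the page/offset/len/truesize values are hypothetical.
 */
static void my_rx_frag(struct napi_struct *napi, struct page *page,
                       unsigned int offset, unsigned int len,
                       unsigned int truesize)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb))
                return;

        /* Attach the payload as a page fragment; no header copy is needed,
         * napi_frags_skb() pulls the Ethernet header itself.
         */
        skb_add_rx_frag(skb, 0, page, offset, len, truesize);

        napi_gro_frags(napi);
}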
4759
4760/* Compute the checksum from gro_offset and return the folded value
4761 * after adding in any pseudo checksum.
4762 */
4763__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4764{
4765        __wsum wsum;
4766        __sum16 sum;
4767
4768        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4769
4770        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4771        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4772        if (likely(!sum)) {
4773                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4774                    !skb->csum_complete_sw)
4775                        netdev_rx_csum_fault(skb->dev);
4776        }
4777
4778        NAPI_GRO_CB(skb)->csum = wsum;
4779        NAPI_GRO_CB(skb)->csum_valid = 1;
4780
4781        return sum;
4782}
4783EXPORT_SYMBOL(__skb_gro_checksum_complete);
4784
4785/*
4786 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4787 * Note: called with local irq disabled, but exits with local irq enabled.
4788 */
4789static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4790{
4791#ifdef CONFIG_RPS
4792        struct softnet_data *remsd = sd->rps_ipi_list;
4793
4794        if (remsd) {
4795                sd->rps_ipi_list = NULL;
4796
4797                local_irq_enable();
4798
4799                /* Send pending IPI's to kick RPS processing on remote cpus. */
4800                while (remsd) {
4801                        struct softnet_data *next = remsd->rps_ipi_next;
4802
4803                        if (cpu_online(remsd->cpu))
4804                                smp_call_function_single_async(remsd->cpu,
4805                                                           &remsd->csd);
4806                        remsd = next;
4807                }
4808        } else
4809#endif
4810                local_irq_enable();
4811}
4812
4813static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4814{
4815#ifdef CONFIG_RPS
4816        return sd->rps_ipi_list != NULL;
4817#else
4818        return false;
4819#endif
4820}
4821
4822static int process_backlog(struct napi_struct *napi, int quota)
4823{
4824        int work = 0;
4825        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4826
4827        /* Check if we have pending IPIs; it's better to send them now
4828         * rather than waiting for net_rx_action() to end.
4829         */
4830        if (sd_has_rps_ipi_waiting(sd)) {
4831                local_irq_disable();
4832                net_rps_action_and_irq_enable(sd);
4833        }
4834
4835        napi->weight = weight_p;
4836        local_irq_disable();
4837        while (1) {
4838                struct sk_buff *skb;
4839
4840                while ((skb = __skb_dequeue(&sd->process_queue))) {
4841                        rcu_read_lock();
4842                        local_irq_enable();
4843                        __netif_receive_skb(skb);
4844                        rcu_read_unlock();
4845                        local_irq_disable();
4846                        input_queue_head_incr(sd);
4847                        if (++work >= quota) {
4848                                local_irq_enable();
4849                                return work;
4850                        }
4851                }
4852
4853                rps_lock(sd);
4854                if (skb_queue_empty(&sd->input_pkt_queue)) {
4855                        /*
4856                         * Inline a custom version of __napi_complete().
4857                         * Only the current CPU owns and manipulates this NAPI,
4858                         * and NAPI_STATE_SCHED is the only possible flag set
4859                         * on the backlog.
4860                         * We can use a plain write instead of clear_bit(),
4861                         * and we don't need an smp_mb() memory barrier.
4862                         */
4863                        napi->state = 0;
4864                        rps_unlock(sd);
4865
4866                        break;
4867                }
4868
4869                skb_queue_splice_tail_init(&sd->input_pkt_queue,
4870                                           &sd->process_queue);
4871                rps_unlock(sd);
4872        }
4873        local_irq_enable();
4874
4875        return work;
4876}
4877
4878/**
4879 * __napi_schedule - schedule for receive
4880 * @n: entry to schedule
4881 *
4882 * The entry's receive function will be scheduled to run.
4883 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4884 */
4885void __napi_schedule(struct napi_struct *n)
4886{
4887        unsigned long flags;
4888
4889        local_irq_save(flags);
4890        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4891        local_irq_restore(flags);
4892}
4893EXPORT_SYMBOL(__napi_schedule);
4894
4895/**
4896 * __napi_schedule_irqoff - schedule for receive
4897 * @n: entry to schedule
4898 *
4899 * Variant of __napi_schedule() assuming hard irqs are masked
4900 */
4901void __napi_schedule_irqoff(struct napi_struct *n)
4902{
4903        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4904}
4905EXPORT_SYMBOL(__napi_schedule_irqoff);
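
/* Editor's sketch (not part of dev.c): scheduling NAPI from a hard interrupt
 * handler.  In a non-threaded handler hard irqs are already masked, so
 * __napi_schedule_irqoff() can be used directly.  "my_priv" and "my_intr"
 * are hypothetical driver names.
 */
struct my_priv {
        struct napi_struct napi;
        /* ... device-private state ... */
};

static irqreturn_t my_intr(int irq, void *dev_id)
{
        struct my_priv *priv = dev_id;

        /* a real driver would first mask/ack the device's RX interrupt */
        if (napi_schedule_prep(&priv->napi))
                __napi_schedule_irqoff(&priv->napi);

        return IRQ_HANDLED;
}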
4906
4907void __napi_complete(struct napi_struct *n)
4908{
4909        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4910
4911        list_del_init(&n->poll_list);
4912        smp_mb__before_atomic();
4913        clear_bit(NAPI_STATE_SCHED, &n->state);
4914}
4915EXPORT_SYMBOL(__napi_complete);
4916
4917void napi_complete_done(struct napi_struct *n, int work_done)
4918{
4919        unsigned long flags;
4920
4921        /*
4922         * Don't let NAPI dequeue from the CPU poll list
4923         * just in case it's running on a different CPU.
4924         */
4925        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4926                return;
4927
4928        if (n->gro_list) {
4929                unsigned long timeout = 0;
4930
4931                if (work_done)
4932                        timeout = n->dev->gro_flush_timeout;
4933
4934                if (timeout)
4935                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
4936                                      HRTIMER_MODE_REL_PINNED);
4937                else
4938                        napi_gro_flush(n, false);
4939        }
4940        if (likely(list_empty(&n->poll_list))) {
4941                WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4942        } else {
4943                /* If n->poll_list is not empty, we need to mask irqs */
4944                local_irq_save(flags);
4945                __napi_complete(n);
4946                local_irq_restore(flags);
4947        }
4948}
4949EXPORT_SYMBOL(napi_complete_done);
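
/* Editor's sketch (not part of dev.c): the poll() side of the contract,
 * assuming the hypothetical struct my_priv above (napi_struct embedded) and
 * a hypothetical my_clean_rx_ring() helper.  The driver processes at most
 * @budget packets and completes NAPI only when it did less work than the
 * budget; the work_done hint lets napi_complete_done() decide whether to arm
 * the gro_flush_timeout timer instead of flushing GRO immediately.
 */
static int my_poll(struct napi_struct *napi, int budget)
{
        struct my_priv *priv = container_of(napi, struct my_priv, napi);
        int work = my_clean_rx_ring(priv, budget);      /* hypothetical helper */

        if (work < budget) {
                napi_complete_done(napi, work);
                /* a real driver re-enables its RX interrupt here */
        }
        return work;
}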
4950
4951/* must be called under rcu_read_lock(), as we dont take a reference */
4952static struct napi_struct *napi_by_id(unsigned int napi_id)
4953{
4954        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4955        struct napi_struct *napi;
4956
4957        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4958                if (napi->napi_id == napi_id)
4959                        return napi;
4960
4961        return NULL;
4962}
4963
4964#if defined(CONFIG_NET_RX_BUSY_POLL)
4965#define BUSY_POLL_BUDGET 8
4966bool sk_busy_loop(struct sock *sk, int nonblock)
4967{
4968        unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4969        int (*busy_poll)(struct napi_struct *dev);
4970        struct napi_struct *napi;
4971        int rc = false;
4972
4973        rcu_read_lock();
4974
4975        napi = napi_by_id(sk->sk_napi_id);
4976        if (!napi)
4977                goto out;
4978
4979        /* Note: ndo_busy_poll method is optional in linux-4.5 */
4980        busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4981
4982        do {
4983                rc = 0;
4984                local_bh_disable();
4985                if (busy_poll) {
4986                        rc = busy_poll(napi);
4987                } else if (napi_schedule_prep(napi)) {
4988                        void *have = netpoll_poll_lock(napi);
4989
4990                        if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4991                                rc = napi->poll(napi, BUSY_POLL_BUDGET);
4992                                trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
4993                                if (rc == BUSY_POLL_BUDGET) {
4994                                        napi_complete_done(napi, rc);
4995                                        napi_schedule(napi);
4996                                }
4997                        }
4998                        netpoll_poll_unlock(have);
4999                }
5000                if (rc > 0)
5001                        __NET_ADD_STATS(sock_net(sk),
5002                                        LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5003                local_bh_enable();
5004
5005                if (rc == LL_FLUSH_FAILED)
5006                        break; /* permanent failure */
5007
5008                cpu_relax();
5009        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5010                 !need_resched() && !busy_loop_timeout(end_time));
5011
5012        rc = !skb_queue_empty(&sk->sk_receive_queue);
5013out:
5014        rcu_read_unlock();
5015        return rc;
5016}
5017EXPORT_SYMBOL(sk_busy_loop);
5018
5019#endif /* CONFIG_NET_RX_BUSY_POLL */
5020
5021void napi_hash_add(struct napi_struct *napi)
5022{
5023        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5024            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5025                return;
5026
5027        spin_lock(&napi_hash_lock);
5028
5029        /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5030        do {
5031                if (unlikely(++napi_gen_id < NR_CPUS + 1))
5032                        napi_gen_id = NR_CPUS + 1;
5033        } while (napi_by_id(napi_gen_id));
5034        napi->napi_id = napi_gen_id;
5035
5036        hlist_add_head_rcu(&napi->napi_hash_node,
5037                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5038
5039        spin_unlock(&napi_hash_lock);
5040}
5041EXPORT_SYMBOL_GPL(napi_hash_add);
5042
5043/* Warning: the caller is responsible for making sure an RCU grace period
5044 * has elapsed before freeing the memory containing @napi.
5045 */
5046bool napi_hash_del(struct napi_struct *napi)
5047{
5048        bool rcu_sync_needed = false;
5049
5050        spin_lock(&napi_hash_lock);
5051
5052        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5053                rcu_sync_needed = true;
5054                hlist_del_rcu(&napi->napi_hash_node);
5055        }
5056        spin_unlock(&napi_hash_lock);
5057        return rcu_sync_needed;
5058}
5059EXPORT_SYMBOL_GPL(napi_hash_del);
5060
5061static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5062{
5063        struct napi_struct *napi;
5064
5065        napi = container_of(timer, struct napi_struct, timer);
5066        if (napi->gro_list)
5067                napi_schedule(napi);
5068
5069        return HRTIMER_NORESTART;
5070}
5071
5072void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5073                    int (*poll)(struct napi_struct *, int), int weight)
5074{
5075        INIT_LIST_HEAD(&napi->poll_list);
5076        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5077        napi->timer.function = napi_watchdog;
5078        napi->gro_count = 0;
5079        napi->gro_list = NULL;
5080        napi->skb = NULL;
5081        napi->poll = poll;
5082        if (weight > NAPI_POLL_WEIGHT)
5083                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5084                            weight, dev->name);
5085        napi->weight = weight;
5086        list_add(&napi->dev_list, &dev->napi_list);
5087        napi->dev = dev;
5088#ifdef CONFIG_NETPOLL
5089        spin_lock_init(&napi->poll_lock);
5090        napi->poll_owner = -1;
5091#endif
5092        set_bit(NAPI_STATE_SCHED, &napi->state);
5093        napi_hash_add(napi);
5094}
5095EXPORT_SYMBOL(netif_napi_add);
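
/* Editor's sketch (not part of dev.c): typical NAPI registration at probe
 * time.  netif_napi_add() leaves NAPI_STATE_SCHED set, so the instance stays
 * effectively disabled until napi_enable() is called (usually from ndo_open).
 * "my_probe_napi", "my_open" and "my_poll" are hypothetical names.
 */
static void my_probe_napi(struct net_device *netdev, struct my_priv *priv)
{
        netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
}

static int my_open(struct net_device *netdev)
{
        struct my_priv *priv = netdev_priv(netdev);

        napi_enable(&priv->napi);
        /* ... bring up queues, enable interrupts ... */
        return 0;
}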
5096
5097void napi_disable(struct napi_struct *n)
5098{
5099        might_sleep();
5100        set_bit(NAPI_STATE_DISABLE, &n->state);
5101
5102        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5103                msleep(1);
5104        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5105                msleep(1);
5106
5107        hrtimer_cancel(&n->timer);
5108
5109        clear_bit(NAPI_STATE_DISABLE, &n->state);
5110}
5111EXPORT_SYMBOL(napi_disable);
5112
5113/* Must be called in process context */
5114void netif_napi_del(struct napi_struct *napi)
5115{
5116        might_sleep();
5117        if (napi_hash_del(napi))
5118                synchronize_net();
5119        list_del_init(&napi->dev_list);
5120        napi_free_frags(napi);
5121
5122        kfree_skb_list(napi->gro_list);
5123        napi->gro_list = NULL;
5124        napi->gro_count = 0;
5125}
5126EXPORT_SYMBOL(netif_napi_del);
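
/* Editor's sketch (not part of dev.c): the matching teardown order.  Polling
 * is stopped with napi_disable() (typically in ndo_stop) before
 * netif_napi_del() releases the instance from process context.  "my_stop"
 * and "my_remove" are hypothetical names.
 */
static int my_stop(struct net_device *netdev)
{
        struct my_priv *priv = netdev_priv(netdev);

        /* interrupts should already be masked so nothing reschedules NAPI */
        napi_disable(&priv->napi);
        return 0;
}

static void my_remove(struct my_priv *priv)
{
        netif_napi_del(&priv->napi);    /* may sleep: waits for RCU if hashed */
}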
5127
5128static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5129{
5130        void *have;
5131        int work, weight;
5132
5133        list_del_init(&n->poll_list);
5134
5135        have = netpoll_poll_lock(n);
5136
5137        weight = n->weight;
5138
5139        /* This NAPI_STATE_SCHED test is for avoiding a race
5140         * with netpoll's poll_napi().  Only the entity which
5141         * obtains the lock and sees NAPI_STATE_SCHED set will
5142         * actually make the ->poll() call.  Therefore we avoid
5143         * accidentally calling ->poll() when NAPI is not scheduled.
5144         */
5145        work = 0;
5146        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5147                work = n->poll(n, weight);
5148                trace_napi_poll(n, work, weight);
5149        }
5150
5151        WARN_ON_ONCE(work > weight);
5152
5153        if (likely(work < weight))
5154                goto out_unlock;
5155
5156        /* Drivers must not modify the NAPI state if they
5157         * consume the entire weight.  In such cases this code
5158         * still "owns" the NAPI instance and therefore can
5159         * move the instance around on the list at-will.
5160         */
5161        if (unlikely(napi_disable_pending(n))) {
5162                napi_complete(n);
5163                goto out_unlock;
5164        }
5165
5166        if (n->gro_list) {
5167                /* Flush packets that are too old.
5168                 * If HZ < 1000, flush all packets.
5169                 */
5170                napi_gro_flush(n, HZ >= 1000);
5171        }
5172
5173        /* Some drivers may have called napi_schedule
5174         * prior to exhausting their budget.
5175         */
5176        if (unlikely(!list_empty(&n->poll_list))) {
5177                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5178                             n->dev ? n->dev->name : "backlog");
5179                goto out_unlock;
5180        }
5181
5182        list_add_tail(&n->poll_list, repoll);
5183
5184out_unlock:
5185        netpoll_poll_unlock(have);
5186
5187        return work;
5188}
5189
5190static void net_rx_action(struct softirq_action *h)
5191{
5192        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5193        unsigned long time_limit = jiffies + 2;
5194        int budget = netdev_budget;
5195        LIST_HEAD(list);
5196        LIST_HEAD(repoll);
5197
5198        local_irq_disable();
5199        list_splice_init(&sd->poll_list, &list);
5200        local_irq_enable();
5201
5202        for (;;) {
5203                struct napi_struct *n;
5204
5205                if (list_empty(&list)) {
5206                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5207                                return;
5208                        break;
5209                }
5210
5211                n = list_first_entry(&list, struct napi_struct, poll_list);
5212                budget -= napi_poll(n, &repoll);
5213
5214                /* If softirq window is exhausted then punt.
5215                 * Allow this to run for 2 jiffies, which allows
5216                 * an average latency of 1.5/HZ.
5217                 */
5218                if (unlikely(budget <= 0 ||
5219                             time_after_eq(jiffies, time_limit))) {
5220                        sd->time_squeeze++;
5221                        break;
5222                }
5223        }
5224
5225        __kfree_skb_flush();
5226        local_irq_disable();
5227
5228        list_splice_tail_init(&sd->poll_list, &list);
5229        list_splice_tail(&repoll, &list);
5230        list_splice(&list, &sd->poll_list);
5231        if (!list_empty(&sd->poll_list))
5232                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5233
5234        net_rps_action_and_irq_enable(sd);
5235}
5236
5237struct netdev_adjacent {
5238        struct net_device *dev;
5239
5240        /* upper master flag, there can only be one master device per list */
5241        bool master;
5242
5243        /* counter for the number of times this device was added to us */
5244        u16 ref_nr;
5245
5246        /* private field for the users */
5247        void *private;
5248
5249        struct list_head list;
5250        struct rcu_head rcu;
5251};
5252
5253static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5254                                                 struct list_head *adj_list)
5255{
5256        struct netdev_adjacent *adj;
5257
5258        list_for_each_entry(adj, adj_list, list) {
5259                if (adj->dev == adj_dev)
5260                        return adj;
5261        }
5262        return NULL;
5263}
5264
5265/**
5266 * netdev_has_upper_dev - Check if device is linked to an upper device
5267 * @dev: device
5268 * @upper_dev: upper device to check
5269 *
5270 * Find out if a device is linked to the specified upper device and return
5271 * true if it is. Note that this searches the whole all_adj_list.upper graph,
5272 * not only the immediate upper device. The caller must hold the RTNL lock.
5273 */
5274bool netdev_has_upper_dev(struct net_device *dev,
5275                          struct net_device *upper_dev)
5276{
5277        ASSERT_RTNL();
5278
5279        return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5280}
5281EXPORT_SYMBOL(netdev_has_upper_dev);
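
/* Editor's sketch (not part of dev.c): caller-side view of the locking rule
 * above -- the adjacency lists may only be inspected under RTNL (or RCU for
 * the _rcu accessors).  "my_dev_is_under" is a hypothetical helper.
 */
static bool my_dev_is_under(struct net_device *dev, struct net_device *upper)
{
        bool linked;

        rtnl_lock();
        linked = netdev_has_upper_dev(dev, upper);
        rtnl_unlock();

        return linked;
}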
5282
5283/**
5284 * netdev_has_any_upper_dev - Check if device is linked to some device
5285 * @dev: device
5286 *
5287 * Find out if a device is linked to an upper device and return true in case
5288 * it is. The caller must hold the RTNL lock.
5289 */
5290static bool netdev_has_any_upper_dev(struct net_device *dev)
5291{
5292        ASSERT_RTNL();
5293
5294        return !list_empty(&dev->all_adj_list.upper);
5295}
5296
5297/**
5298 * netdev_master_upper_dev_get - Get master upper device
5299 * @dev: device
5300 *
5301 * Find a master upper device and return pointer to it or NULL in case
5302 * it's not there. The caller must hold the RTNL lock.
5303 */
5304struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5305{
5306        struct netdev_adjacent *upper;
5307
5308        ASSERT_RTNL();
5309
5310        if (list_empty(&dev->adj_list.upper))
5311                return NULL;
5312
5313        upper = list_first_entry(&dev->adj_list.upper,
5314                                 struct netdev_adjacent, list);
5315        if (likely(upper->master))
5316                return upper->dev;
5317        return NULL;
5318}
5319EXPORT_SYMBOL(netdev_master_upper_dev_get);
5320
5321void *netdev_adjacent_get_private(struct list_head *adj_list)
5322{
5323        struct netdev_adjacent *adj;
5324
5325        adj = list_entry(adj_list, struct netdev_adjacent, list);
5326
5327        return adj->private;
5328}
5329EXPORT_SYMBOL(netdev_adjacent_get_private);
5330
5331/**
5332 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5333 * @dev: device
5334 * @iter: list_head ** of the current position
5335 *
5336 * Gets the next device from the dev's upper list, starting from iter
5337 * position. The caller must hold RCU read lock.
5338 */
5339struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5340                                                 struct list_head **iter)
5341{
5342        struct netdev_adjacent *upper;
5343
5344        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5345
5346        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5347
5348        if (&upper->list == &dev->adj_list.upper)
5349                return NULL;
5350
5351        *iter = &upper->list;
5352
5353        return upper->dev;
5354}
5355EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
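
/* Editor's sketch (not part of dev.c): walking the immediate upper list
 * under RCU.  The iterator must start at &dev->adj_list.upper; the
 * netdev_for_each_upper_dev_rcu() helper in netdevice.h wraps this pattern.
 * "my_walk_uppers" is a hypothetical name.
 */
static void my_walk_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        struct net_device *upper;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
                pr_debug("%s is an upper dev of %s\n", upper->name, dev->name);
        rcu_read_unlock();
}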
5356
5357/**
5358 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5359 * @dev: device
5360 * @iter: list_head ** of the current position
5361 *
5362 * Gets the next device from the dev's upper list, starting from iter
5363 * position. The caller must hold RCU read lock.
5364 */
5365struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5366                                                     struct list_head **iter)
5367{
5368        struct netdev_adjacent *upper;
5369
5370        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5371
5372        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5373
5374        if (&upper->list == &dev->all_adj_list.upper)
5375                return NULL;
5376
5377        *iter = &upper->list;
5378
5379        return upper->dev;
5380}
5381EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5382
5383/**
5384 * netdev_lower_get_next_private - Get the next ->private from the
5385 *                                 lower neighbour list
5386 * @dev: device
5387 * @iter: list_head ** of the current position
5388 *
5389 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5390 * list, starting from iter position. The caller must either hold the
5391 * RTNL lock or its own locking that guarantees that the neighbour lower
5392 * list will remain unchanged.
5393 */
5394void *netdev_lower_get_next_private(struct net_device *dev,
5395                                    struct list_head **iter)
5396{
5397        struct netdev_adjacent *lower;
5398
5399        lower = list_entry(*iter, struct netdev_adjacent, list);
5400
5401        if (&lower->list == &dev->adj_list.lower)
5402                return NULL;
5403
5404        *iter = lower->list.next;
5405
5406        return lower->private;
5407}
5408EXPORT_SYMBOL(netdev_lower_get_next_private);
5409
5410/**
5411 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5412 *                                     lower neighbour list, RCU
5413 *                                     variant
5414 * @dev: device
5415 * @iter: list_head ** of the current position
5416 *
5417 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5418 * list, starting from iter position. The caller must hold RCU read lock.
5419 */
5420void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5421                                        struct list_head **iter)
5422{
5423        struct netdev_adjacent *lower;
5424
5425        WARN_ON_ONCE(!rcu_read_lock_held());
5426
5427        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5428
5429        if (&lower->list == &dev->adj_list.lower)
5430                return NULL;
5431
5432        *iter = &lower->list;
5433
5434        return lower->private;
5435}
5436EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5437
5438/**
5439 * netdev_lower_get_next - Get the next device from the lower neighbour
5440 *                         list
5441 * @dev: device
5442 * @iter: list_head ** of the current position
5443 *
5444 * Gets the next netdev_adjacent from the dev's lower neighbour
5445 * list, starting from iter position. The caller must hold RTNL lock or
5446 * its own locking that guarantees that the neighbour lower
5447 * list will remain unchanged.
5448 */
5449void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5450{
5451        struct netdev_adjacent *lower;
5452
5453        lower = list_entry(*iter, struct netdev_adjacent, list);
5454
5455        if (&lower->list == &dev->adj_list.lower)
5456                return NULL;
5457
5458        *iter = lower->list.next;
5459
5460        return lower->dev;
5461}
5462EXPORT_SYMBOL(netdev_lower_get_next);
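
/* Editor's sketch (not part of dev.c): the lower-list counterpart, using the
 * netdev_for_each_lower_dev() helper (also used further down in this file),
 * which is built on netdev_lower_get_next().  The caller holds RTNL here.
 * "my_walk_lowers" is a hypothetical name.
 */
static void my_walk_lowers(struct net_device *dev)
{
        struct net_device *lower;
        struct list_head *iter;

        ASSERT_RTNL();
        netdev_for_each_lower_dev(dev, lower, iter)
                pr_debug("%s is a lower dev of %s\n", lower->name, dev->name);
}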
5463
5464/**
5465 * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5466 * @dev: device
5467 * @iter: list_head ** of the current position
5468 *
5469 * Gets the next netdev_adjacent from the dev's all lower neighbour
5470 * list, starting from iter position. The caller must hold RTNL lock or
5471 * its own locking that guarantees that the neighbour all lower
5472 * list will remain unchanged.
5473 */
5474struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5475{
5476        struct netdev_adjacent *lower;
5477
5478        lower = list_entry(*iter, struct netdev_adjacent, list);
5479
5480        if (&lower->list == &dev->all_adj_list.lower)
5481                return NULL;
5482
5483        *iter = lower->list.next;
5484
5485        return lower->dev;
5486}
5487EXPORT_SYMBOL(netdev_all_lower_get_next);
5488
5489/**
5490 * netdev_all_lower_get_next_rcu - Get the next device from all
5491 *                                 lower neighbour list, RCU variant
5492 * @dev: device
5493 * @iter: list_head ** of the current position
5494 *
5495 * Gets the next netdev_adjacent from the dev's all lower neighbour
5496 * list, starting from iter position. The caller must hold RCU read lock.
5497 */
5498struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5499                                                 struct list_head **iter)
5500{
5501        struct netdev_adjacent *lower;
5502
5503        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5504        if (&lower->list == &dev->all_adj_list.lower)
5505                return NULL;
5506        *iter = &lower->list;
            return lower->dev;
5507}
5508EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5509
5510/**
5511 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5512 *                                     lower neighbour list, RCU
5513 *                                     variant
5514 * @dev: device
5515 *
5516 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5517 * list. The caller must hold RCU read lock.
5518 */
5519void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5520{
5521        struct netdev_adjacent *lower;
5522
5523        lower = list_first_or_null_rcu(&dev->adj_list.lower,
5524                        struct netdev_adjacent, list);
5525        if (lower)
5526                return lower->private;
5527        return NULL;
5528}
5529EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5530
5531/**
5532 * netdev_master_upper_dev_get_rcu - Get master upper device
5533 * @dev: device
5534 *
5535 * Find a master upper device and return pointer to it or NULL in case
5536 * it's not there. The caller must hold the RCU read lock.
5537 */
5538struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5539{
5540        struct netdev_adjacent *upper;
5541
5542        upper = list_first_or_null_rcu(&dev->adj_list.upper,
5543                                       struct netdev_adjacent, list);
5544        if (upper && likely(upper->master))
5545                return upper->dev;
5546        return NULL;
5547}
5548EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
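
/* Editor's sketch (not part of dev.c): the RCU variant lets code peek at the
 * master without RTNL; the returned pointer is only safe to dereference
 * inside the read-side critical section.  "my_dev_has_master" is hypothetical.
 */
static bool my_dev_has_master(struct net_device *dev)
{
        bool ret;

        rcu_read_lock();
        ret = netdev_master_upper_dev_get_rcu(dev) != NULL;
        rcu_read_unlock();

        return ret;
}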
5549
5550static int netdev_adjacent_sysfs_add(struct net_device *dev,
5551                              struct net_device *adj_dev,
5552                              struct list_head *dev_list)
5553{
5554        char linkname[IFNAMSIZ+7];
5555        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5556                "upper_%s" : "lower_%s", adj_dev->name);
5557        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5558                                 linkname);
5559}
5560static void netdev_adjacent_sysfs_del(struct net_device *dev,
5561                               char *name,
5562                               struct list_head *dev_list)
5563{
5564        char linkname[IFNAMSIZ+7];
5565        sprintf(linkname, dev_list == &dev->adj_list.upper ?
5566                "upper_%s" : "lower_%s", name);
5567        sysfs_remove_link(&(dev->dev.kobj), linkname);
5568}
5569
5570static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5571                                                 struct net_device *adj_dev,
5572                                                 struct list_head *dev_list)
5573{
5574        return (dev_list == &dev->adj_list.upper ||
5575                dev_list == &dev->adj_list.lower) &&
5576                net_eq(dev_net(dev), dev_net(adj_dev));
5577}
5578
5579static int __netdev_adjacent_dev_insert(struct net_device *dev,
5580                                        struct net_device *adj_dev,
5581                                        struct list_head *dev_list,
5582                                        void *private, bool master)
5583{
5584        struct netdev_adjacent *adj;
5585        int ret;
5586
5587        adj = __netdev_find_adj(adj_dev, dev_list);
5588
5589        if (adj) {
5590                adj->ref_nr++;
5591                return 0;
5592        }
5593
5594        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5595        if (!adj)
5596                return -ENOMEM;
5597
5598        adj->dev = adj_dev;
5599        adj->master = master;
5600        adj->ref_nr = 1;
5601        adj->private = private;
5602        dev_hold(adj_dev);
5603
5604        pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5605                 adj_dev->name, dev->name, adj_dev->name);
5606
5607        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5608                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5609                if (ret)
5610                        goto free_adj;
5611        }
5612
5613        /* Ensure that master link is always the first item in list. */
5614        if (master) {
5615                ret = sysfs_create_link(&(dev->dev.kobj),
5616                                        &(adj_dev->dev.kobj), "master");
5617                if (ret)
5618                        goto remove_symlinks;
5619
5620                list_add_rcu(&adj->list, dev_list);
5621        } else {
5622                list_add_tail_rcu(&adj->list, dev_list);
5623        }
5624
5625        return 0;
5626
5627remove_symlinks:
5628        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5629                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5630free_adj:
5631        kfree(adj);
5632        dev_put(adj_dev);
5633
5634        return ret;
5635}
5636
5637static void __netdev_adjacent_dev_remove(struct net_device *dev,
5638                                         struct net_device *adj_dev,
5639                                         struct list_head *dev_list)
5640{
5641        struct netdev_adjacent *adj;
5642
5643        adj = __netdev_find_adj(adj_dev, dev_list);
5644
5645        if (!adj) {
5646                pr_err("tried to remove device %s from %s\n",
5647                       dev->name, adj_dev->name);
5648                BUG();
5649        }
5650
5651        if (adj->ref_nr > 1) {
5652                pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5653                         adj->ref_nr-1);
5654                adj->ref_nr--;
5655                return;
5656        }
5657
5658        if (adj->master)
5659                sysfs_remove_link(&(dev->dev.kobj), "master");
5660
5661        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5662                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5663
5664        list_del_rcu(&adj->list);
5665        pr_debug("dev_put for %s, because link removed from %s to %s\n",
5666                 adj_dev->name, dev->name, adj_dev->name);
5667        dev_put(adj_dev);
5668        kfree_rcu(adj, rcu);
5669}
5670
5671static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5672                                            struct net_device *upper_dev,
5673                                            struct list_head *up_list,
5674                                            struct list_head *down_list,
5675                                            void *private, bool master)
5676{
5677        int ret;
5678
5679        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5680                                           master);
5681        if (ret)
5682                return ret;
5683
5684        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5685                                           false);
5686        if (ret) {
5687                __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5688                return ret;
5689        }
5690
5691        return 0;
5692}
5693
5694static int __netdev_adjacent_dev_link(struct net_device *dev,
5695                                      struct net_device *upper_dev)
5696{
5697        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5698                                                &dev->all_adj_list.upper,
5699                                                &upper_dev->all_adj_list.lower,
5700                                                NULL, false);
5701}
5702
5703static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5704                                               struct net_device *upper_dev,
5705                                               struct list_head *up_list,
5706                                               struct list_head *down_list)
5707{
5708        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5709        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5710}
5711
5712static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5713                                         struct net_device *upper_dev)
5714{
5715        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5716                                           &dev->all_adj_list.upper,
5717                                           &upper_dev->all_adj_list.lower);
5718}
5719
5720static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5721                                                struct net_device *upper_dev,
5722                                                void *private, bool master)
5723{
5724        int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5725
5726        if (ret)
5727                return ret;
5728
5729        ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5730                                               &dev->adj_list.upper,
5731                                               &upper_dev->adj_list.lower,
5732                                               private, master);
5733        if (ret) {
5734                __netdev_adjacent_dev_unlink(dev, upper_dev);
5735                return ret;
5736        }
5737
5738        return 0;
5739}
5740
5741static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5742                                                   struct net_device *upper_dev)
5743{
5744        __netdev_adjacent_dev_unlink(dev, upper_dev);
5745        __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5746                                           &dev->adj_list.upper,
5747                                           &upper_dev->adj_list.lower);
5748}
5749
5750static int __netdev_upper_dev_link(struct net_device *dev,
5751                                   struct net_device *upper_dev, bool master,
5752                                   void *upper_priv, void *upper_info)
5753{
5754        struct netdev_notifier_changeupper_info changeupper_info;
5755        struct netdev_adjacent *i, *j, *to_i, *to_j;
5756        int ret = 0;
5757
5758        ASSERT_RTNL();
5759
5760        if (dev == upper_dev)
5761                return -EBUSY;
5762
5763        /* To prevent loops, check if dev is not upper device to upper_dev. */
5764        if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5765                return -EBUSY;
5766
5767        if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5768                return -EEXIST;
5769
5770        if (master && netdev_master_upper_dev_get(dev))
5771                return -EBUSY;
5772
5773        changeupper_info.upper_dev = upper_dev;
5774        changeupper_info.master = master;
5775        changeupper_info.linking = true;
5776        changeupper_info.upper_info = upper_info;
5777
5778        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5779                                            &changeupper_info.info);
5780        ret = notifier_to_errno(ret);
5781        if (ret)
5782                return ret;
5783
5784        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5785                                                   master);
5786        if (ret)
5787                return ret;
5788
5789        /* Now that we linked these devs, make all the upper_dev's
5790         * all_adj_list.upper visible to every dev's all_adj_list.lower and
5791         * vice versa, and don't forget the devices themselves. All of these
5792         * links are non-neighbours.
5793         */
5794        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5795                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5796                        pr_debug("Interlinking %s with %s, non-neighbour\n",
5797                                 i->dev->name, j->dev->name);
5798                        ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5799                        if (ret)
5800                                goto rollback_mesh;
5801                }
5802        }
5803
5804        /* add dev to every upper_dev's upper device */
5805        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5806                pr_debug("linking %s's upper device %s with %s\n",
5807                         upper_dev->name, i->dev->name, dev->name);
5808                ret = __netdev_adjacent_dev_link(dev, i->dev);
5809                if (ret)
5810                        goto rollback_upper_mesh;
5811        }
5812
5813        /* add upper_dev to every dev's lower device */
5814        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5815                pr_debug("linking %s's lower device %s with %s\n", dev->name,
5816                         i->dev->name, upper_dev->name);
5817                ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5818                if (ret)
5819                        goto rollback_lower_mesh;
5820        }
5821
5822        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5823                                            &changeupper_info.info);
5824        ret = notifier_to_errno(ret);
5825        if (ret)
5826                goto rollback_lower_mesh;
5827
5828        return 0;
5829
5830rollback_lower_mesh:
5831        to_i = i;
5832        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5833                if (i == to_i)
5834                        break;
5835                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5836        }
5837
5838        i = NULL;
5839
5840rollback_upper_mesh:
5841        to_i = i;
5842        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5843                if (i == to_i)
5844                        break;
5845                __netdev_adjacent_dev_unlink(dev, i->dev);
5846        }
5847
5848        i = j = NULL;
5849
5850rollback_mesh:
5851        to_i = i;
5852        to_j = j;
5853        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5854                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5855                        if (i == to_i && j == to_j)
5856                                break;
5857                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5858                }
5859                if (i == to_i)
5860                        break;
5861        }
5862
5863        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5864
5865        return ret;
5866}
5867
5868/**
5869 * netdev_upper_dev_link - Add a link to the upper device
5870 * @dev: device
5871 * @upper_dev: new upper device
5872 *
5873 * Adds a link to device which is upper to this one. The caller must hold
5874 * the RTNL lock. On a failure a negative errno code is returned.
5875 * On success the reference counts are adjusted and the function
5876 * returns zero.
5877 */
5878int netdev_upper_dev_link(struct net_device *dev,
5879                          struct net_device *upper_dev)
5880{
5881        return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5882}
5883EXPORT_SYMBOL(netdev_upper_dev_link);
5884
5885/**
5886 * netdev_master_upper_dev_link - Add a master link to the upper device
5887 * @dev: device
5888 * @upper_dev: new upper device
5889 * @upper_priv: upper device private
5890 * @upper_info: upper info to be passed down via notifier
5891 *
5892 * Adds a link to device which is upper to this one. In this case, only
5893 * one master upper device can be linked, although other non-master devices
5894 * might be linked as well. The caller must hold the RTNL lock.
5895 * On a failure a negative errno code is returned. On success the reference
5896 * counts are adjusted and the function returns zero.
5897 */
5898int netdev_master_upper_dev_link(struct net_device *dev,
5899                                 struct net_device *upper_dev,
5900                                 void *upper_priv, void *upper_info)
5901{
5902        return __netdev_upper_dev_link(dev, upper_dev, true,
5903                                       upper_priv, upper_info);
5904}
5905EXPORT_SYMBOL(netdev_master_upper_dev_link);
5906
5907/**
5908 * netdev_upper_dev_unlink - Removes a link to upper device
5909 * @dev: device
5910 * @upper_dev: upper device to remove the link to
5911 *
5912 * Removes a link to device which is upper to this one. The caller must hold
5913 * the RTNL lock.
5914 */
5915void netdev_upper_dev_unlink(struct net_device *dev,
5916                             struct net_device *upper_dev)
5917{
5918        struct netdev_notifier_changeupper_info changeupper_info;
5919        struct netdev_adjacent *i, *j;
5920        ASSERT_RTNL();
5921
5922        changeupper_info.upper_dev = upper_dev;
5923        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5924        changeupper_info.linking = false;
5925
5926        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5927                                      &changeupper_info.info);
5928
5929        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5930
5931        /* Here is the tricky part. We must remove all dev's lower
5932         * devices from all upper_dev's upper devices and vice
5933         * versa, to maintain the graph relationship.
5934         */
5935        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5936                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5937                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
5938
5939        /* Also remove the devices themselves from the lower/upper device
5940         * lists.
5941         */
5942        list_for_each_entry(i, &dev->all_adj_list.lower, list)
5943                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5944
5945        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5946                __netdev_adjacent_dev_unlink(dev, i->dev);
5947
5948        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5949                                      &changeupper_info.info);
5950}
5951EXPORT_SYMBOL(netdev_upper_dev_unlink);
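
/* Editor's sketch (not part of dev.c): how a master-type driver (bond/team
 * style) would use these helpers when enslaving and releasing a port.  The
 * per-port structure and function names are hypothetical; the private
 * pointer passed at link time can later be retrieved on the master's side
 * with netdev_lower_dev_get_private(master_dev, port_dev).
 */
struct my_port;         /* hypothetical per-port state */

static int my_master_add_port(struct net_device *master_dev,
                              struct net_device *port_dev,
                              struct my_port *port)
{
        ASSERT_RTNL();

        return netdev_master_upper_dev_link(port_dev, master_dev,
                                            port, NULL);
}

static void my_master_del_port(struct net_device *master_dev,
                               struct net_device *port_dev)
{
        ASSERT_RTNL();

        netdev_upper_dev_unlink(port_dev, master_dev);
}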
5952
5953/**
5954 * netdev_bonding_info_change - Dispatch event about slave change
5955 * @dev: device
5956 * @bonding_info: info to dispatch
5957 *
5958 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5959 * The caller must hold the RTNL lock.
5960 */
5961void netdev_bonding_info_change(struct net_device *dev,
5962                                struct netdev_bonding_info *bonding_info)
5963{
5964        struct netdev_notifier_bonding_info     info;
5965
5966        memcpy(&info.bonding_info, bonding_info,
5967               sizeof(struct netdev_bonding_info));
5968        call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5969                                      &info.info);
5970}
5971EXPORT_SYMBOL(netdev_bonding_info_change);
5972
5973static void netdev_adjacent_add_links(struct net_device *dev)
5974{
5975        struct netdev_adjacent *iter;
5976
5977        struct net *net = dev_net(dev);
5978
5979        list_for_each_entry(iter, &dev->adj_list.upper, list) {
5980                if (!net_eq(net, dev_net(iter->dev)))
5981                        continue;
5982                netdev_adjacent_sysfs_add(iter->dev, dev,
5983                                          &iter->dev->adj_list.lower);
5984                netdev_adjacent_sysfs_add(dev, iter->dev,
5985                                          &dev->adj_list.upper);
5986        }
5987
5988        list_for_each_entry(iter, &dev->adj_list.lower, list) {
5989                if (!net_eq(net, dev_net(iter->dev)))
5990                        continue;
5991                netdev_adjacent_sysfs_add(iter->dev, dev,
5992                                          &iter->dev->adj_list.upper);
5993                netdev_adjacent_sysfs_add(dev, iter->dev,
5994                                          &dev->adj_list.lower);
5995        }
5996}
5997
5998static void netdev_adjacent_del_links(struct net_device *dev)
5999{
6000        struct netdev_adjacent *iter;
6001
6002        struct net *net = dev_net(dev);
6003
6004        list_for_each_entry(iter, &dev->adj_list.upper, list) {
6005                if (!net_eq(net, dev_net(iter->dev)))
6006                        continue;
6007                netdev_adjacent_sysfs_del(iter->dev, dev->name,
6008                                          &iter->dev->adj_list.lower);
6009                netdev_adjacent_sysfs_del(dev, iter->dev->name,
6010                                          &dev->adj_list.upper);
6011        }
6012
6013        list_for_each_entry(iter, &dev->adj_list.lower, list) {
6014                if (!net_eq(net, dev_net(iter->dev)))
6015                        continue;
6016                netdev_adjacent_sysfs_del(iter->dev, dev->name,
6017                                          &iter->dev->adj_list.upper);
6018                netdev_adjacent_sysfs_del(dev, iter->dev->name,
6019                                          &dev->adj_list.lower);
6020        }
6021}
6022
6023void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6024{
6025        struct netdev_adjacent *iter;
6026
6027        struct net *net = dev_net(dev);
6028
6029        list_for_each_entry(iter, &dev->adj_list.upper, list) {
6030                if (!net_eq(net, dev_net(iter->dev)))
6031                        continue;
6032                netdev_adjacent_sysfs_del(iter->dev, oldname,
6033                                          &iter->dev->adj_list.lower);
6034                netdev_adjacent_sysfs_add(iter->dev, dev,
6035                                          &iter->dev->adj_list.lower);
6036        }
6037
6038        list_for_each_entry(iter, &dev->adj_list.lower, list) {
6039                if (!net_eq(net, dev_net(iter->dev)))
6040                        continue;
6041                netdev_adjacent_sysfs_del(iter->dev, oldname,
6042                                          &iter->dev->adj_list.upper);
6043                netdev_adjacent_sysfs_add(iter->dev, dev,
6044                                          &iter->dev->adj_list.upper);
6045        }
6046}
6047
6048void *netdev_lower_dev_get_private(struct net_device *dev,
6049                                   struct net_device *lower_dev)
6050{
6051        struct netdev_adjacent *lower;
6052
6053        if (!lower_dev)
6054                return NULL;
6055        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6056        if (!lower)
6057                return NULL;
6058
6059        return lower->private;
6060}
6061EXPORT_SYMBOL(netdev_lower_dev_get_private);
6062
6063
6064int dev_get_nest_level(struct net_device *dev)
6065{
6066        struct net_device *lower = NULL;
6067        struct list_head *iter;
6068        int max_nest = -1;
6069        int nest;
6070
6071        ASSERT_RTNL();
6072
6073        netdev_for_each_lower_dev(dev, lower, iter) {
6074                nest = dev_get_nest_level(lower);
6075                if (max_nest < nest)
6076                        max_nest = nest;
6077        }
6078
6079        return max_nest + 1;
6080}
6081EXPORT_SYMBOL(dev_get_nest_level);
6082
6083/**
6084 * netdev_lower_state_changed - Dispatch event about lower device state change
6085 * @lower_dev: device
6086 * @lower_state_info: state to dispatch
6087 *
6088 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6089 * The caller must hold the RTNL lock.
6090 */
6091void netdev_lower_state_changed(struct net_device *lower_dev,
6092                                void *lower_state_info)
6093{
6094        struct netdev_notifier_changelowerstate_info changelowerstate_info;
6095
6096        ASSERT_RTNL();
6097        changelowerstate_info.lower_state_info = lower_state_info;
6098        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6099                                      &changelowerstate_info.info);
6100}
6101EXPORT_SYMBOL(netdev_lower_state_changed);
6102
6103int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6104                                           struct neighbour *n)
6105{
6106        struct net_device *lower_dev, *stop_dev;
6107        struct list_head *iter;
6108        int err;
6109
6110        netdev_for_each_lower_dev(dev, lower_dev, iter) {
6111                if (!lower_dev->netdev_ops->ndo_neigh_construct)
6112                        continue;
6113                err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6114                if (err) {
6115                        stop_dev = lower_dev;
6116                        goto rollback;
6117                }
6118        }
6119        return 0;
6120
6121rollback:
6122        netdev_for_each_lower_dev(dev, lower_dev, iter) {
6123                if (lower_dev == stop_dev)
6124                        break;
6125                if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6126                        continue;
6127                lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6128        }
6129        return err;
6130}
6131EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6132
6133void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6134                                          struct neighbour *n)
6135{
6136        struct net_device *lower_dev;
6137        struct list_head *iter;
6138
6139        netdev_for_each_lower_dev(dev, lower_dev, iter) {
6140                if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6141                        continue;
6142                lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6143        }
6144}
6145EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6146
6147static void dev_change_rx_flags(struct net_device *dev, int flags)
6148{
6149        const struct net_device_ops *ops = dev->netdev_ops;
6150
6151        if (ops->ndo_change_rx_flags)
6152                ops->ndo_change_rx_flags(dev, flags);
6153}
6154
6155static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6156{
6157        unsigned int old_flags = dev->flags;
6158        kuid_t uid;
6159        kgid_t gid;
6160
6161        ASSERT_RTNL();
6162
6163        dev->flags |= IFF_PROMISC;
6164        dev->promiscuity += inc;
6165        if (dev->promiscuity == 0) {
6166                /*
6167                 * Avoid overflow.
6168                 * If inc causes an overflow, leave promiscuity untouched and return an error.
6169                 */
6170                if (inc < 0)
6171                        dev->flags &= ~IFF_PROMISC;
6172                else {
6173                        dev->promiscuity -= inc;
6174                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6175                                dev->name);
6176                        return -EOVERFLOW;
6177                }
6178        }
6179        if (dev->flags != old_flags) {
6180                pr_info("device %s %s promiscuous mode\n",
6181                        dev->name,
6182                        dev->flags & IFF_PROMISC ? "entered" : "left");
6183                if (audit_enabled) {
6184                        current_uid_gid(&uid, &gid);
6185                        audit_log(current->audit_context, GFP_ATOMIC,
6186                                AUDIT_ANOM_PROMISCUOUS,
6187                                "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6188                                dev->name, (dev->flags & IFF_PROMISC),
6189                                (old_flags & IFF_PROMISC),
6190                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
6191                                from_kuid(&init_user_ns, uid),
6192                                from_kgid(&init_user_ns, gid),
6193                                audit_get_sessionid(current));
6194                }
6195
6196                dev_change_rx_flags(dev, IFF_PROMISC);
6197        }
6198        if (notify)
6199                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6200        return 0;
6201}
6202
6203/**
6204 *      dev_set_promiscuity     - update promiscuity count on a device
6205 *      @dev: device
6206 *      @inc: modifier
6207 *
6208 *      Add or remove promiscuity from a device. While the count in the device
6209 *      remains above zero the interface remains promiscuous. Once it hits zero
6210 *      the device reverts back to normal filtering operation. A negative @inc
6211 *      value is used to drop promiscuity on the device.
6212 *      Return 0 if successful or a negative errno code on error.
6213 */
6214int dev_set_promiscuity(struct net_device *dev, int inc)
6215{
6216        unsigned int old_flags = dev->flags;
6217        int err;
6218
6219        err = __dev_set_promiscuity(dev, inc, true);
6220        if (err < 0)
6221                return err;
6222        if (dev->flags != old_flags)
6223                dev_set_rx_mode(dev);
6224        return err;
6225}
6226EXPORT_SYMBOL(dev_set_promiscuity);
6227
6228static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6229{
6230        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6231
6232        ASSERT_RTNL();
6233
6234        dev->flags |= IFF_ALLMULTI;
6235        dev->allmulti += inc;
6236        if (dev->allmulti == 0) {
6237                /*
6238                 * Avoid overflow.
6239                 * If inc causes overflow, leave allmulti untouched and return an error.
6240                 */
6241                if (inc < 0)
6242                        dev->flags &= ~IFF_ALLMULTI;
6243                else {
6244                        dev->allmulti -= inc;
6245                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6246                                dev->name);
6247                        return -EOVERFLOW;
6248                }
6249        }
6250        if (dev->flags ^ old_flags) {
6251                dev_change_rx_flags(dev, IFF_ALLMULTI);
6252                dev_set_rx_mode(dev);
6253                if (notify)
6254                        __dev_notify_flags(dev, old_flags,
6255                                           dev->gflags ^ old_gflags);
6256        }
6257        return 0;
6258}
6259
6260/**
6261 *      dev_set_allmulti        - update allmulti count on a device
6262 *      @dev: device
6263 *      @inc: modifier
6264 *
6265 *      Add or remove reception of all multicast frames to a device. While the
6266 *      count in the device remains above zero the interface remains listening
6267 *      to all interfaces. Once it hits zero the device reverts back to normal
6268 *      to all multicast frames. Once it hits zero the device reverts back to normal
6269 *      when releasing a resource needing all multicasts.
6270 *      Return 0 if successful or a negative errno code on error.
6271 */
6272
6273int dev_set_allmulti(struct net_device *dev, int inc)
6274{
6275        return __dev_set_allmulti(dev, inc, true);
6276}
6277EXPORT_SYMBOL(dev_set_allmulti);
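/*
 * Illustrative sketch (not part of dev.c): how a packet-capture style
 * module might bump the promiscuity and allmulti counters on an existing
 * device.  Both helpers must be called under RTNL; the function name and
 * the rollback policy here are assumptions made for the example only.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_enable_capture(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* count up, enter promisc */
        if (!err) {
                err = dev_set_allmulti(dev, 1); /* also accept all multicast */
                if (err)
                        dev_set_promiscuity(dev, -1);   /* undo on failure */
        }
        rtnl_unlock();
        return err;
}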
6278
6279/*
6280 *      Upload unicast and multicast address lists to device and
6281 *      configure RX filtering. When the device doesn't support unicast
6282 *      filtering it is put in promiscuous mode while unicast addresses
6283 *      are present.
6284 */
6285void __dev_set_rx_mode(struct net_device *dev)
6286{
6287        const struct net_device_ops *ops = dev->netdev_ops;
6288
6289        /* dev_open will call this function so the list will stay sane. */
6290        if (!(dev->flags&IFF_UP))
6291                return;
6292
6293        if (!netif_device_present(dev))
6294                return;
6295
6296        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6297                /* Unicast address changes may only happen under the rtnl,
6298                 * therefore calling __dev_set_promiscuity here is safe.
6299                 */
6300                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6301                        __dev_set_promiscuity(dev, 1, false);
6302                        dev->uc_promisc = true;
6303                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6304                        __dev_set_promiscuity(dev, -1, false);
6305                        dev->uc_promisc = false;
6306                }
6307        }
6308
6309        if (ops->ndo_set_rx_mode)
6310                ops->ndo_set_rx_mode(dev);
6311}
6312
6313void dev_set_rx_mode(struct net_device *dev)
6314{
6315        netif_addr_lock_bh(dev);
6316        __dev_set_rx_mode(dev);
6317        netif_addr_unlock_bh(dev);
6318}
6319
6320/**
6321 *      dev_get_flags - get flags reported to userspace
6322 *      @dev: device
6323 *
6324 *      Get the combination of flag bits exported through APIs to userspace.
6325 */
6326unsigned int dev_get_flags(const struct net_device *dev)
6327{
6328        unsigned int flags;
6329
6330        flags = (dev->flags & ~(IFF_PROMISC |
6331                                IFF_ALLMULTI |
6332                                IFF_RUNNING |
6333                                IFF_LOWER_UP |
6334                                IFF_DORMANT)) |
6335                (dev->gflags & (IFF_PROMISC |
6336                                IFF_ALLMULTI));
6337
6338        if (netif_running(dev)) {
6339                if (netif_oper_up(dev))
6340                        flags |= IFF_RUNNING;
6341                if (netif_carrier_ok(dev))
6342                        flags |= IFF_LOWER_UP;
6343                if (netif_dormant(dev))
6344                        flags |= IFF_DORMANT;
6345        }
6346
6347        return flags;
6348}
6349EXPORT_SYMBOL(dev_get_flags);
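/*
 * Illustrative sketch (not part of dev.c): reading the userspace view of
 * the flags, e.g. to log whether an interface is administratively up and
 * operationally running.  The helper name and the message are assumptions.
 */
#include <linux/netdevice.h>

static void example_log_flags(const struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        pr_info("%s: %s, %s\n", dev->name,
                flags & IFF_UP ? "admin up" : "admin down",
                flags & IFF_RUNNING ? "oper up" : "oper down");
}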
6350
6351int __dev_change_flags(struct net_device *dev, unsigned int flags)
6352{
6353        unsigned int old_flags = dev->flags;
6354        int ret;
6355
6356        ASSERT_RTNL();
6357
6358        /*
6359         *      Set the flags on our device.
6360         */
6361
6362        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6363                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6364                               IFF_AUTOMEDIA)) |
6365                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6366                                    IFF_ALLMULTI));
6367
6368        /*
6369         *      Load in the correct multicast list now the flags have changed.
6370         */
6371
6372        if ((old_flags ^ flags) & IFF_MULTICAST)
6373                dev_change_rx_flags(dev, IFF_MULTICAST);
6374
6375        dev_set_rx_mode(dev);
6376
6377        /*
6378         *      Have we downed the interface? We handle IFF_UP ourselves
6379         *      according to user attempts to set it, rather than blindly
6380         *      setting it.
6381         */
6382
6383        ret = 0;
6384        if ((old_flags ^ flags) & IFF_UP)
6385                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6386
6387        if ((flags ^ dev->gflags) & IFF_PROMISC) {
6388                int inc = (flags & IFF_PROMISC) ? 1 : -1;
6389                unsigned int old_flags = dev->flags;
6390
6391                dev->gflags ^= IFF_PROMISC;
6392
6393                if (__dev_set_promiscuity(dev, inc, false) >= 0)
6394                        if (dev->flags != old_flags)
6395                                dev_set_rx_mode(dev);
6396        }
6397
6398        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6399           is important. Some (broken) drivers set IFF_PROMISC when
6400           IFF_ALLMULTI is requested, without asking us and without reporting it.
6401         */
6402        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6403                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6404
6405                dev->gflags ^= IFF_ALLMULTI;
6406                __dev_set_allmulti(dev, inc, false);
6407        }
6408
6409        return ret;
6410}
6411
6412void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6413                        unsigned int gchanges)
6414{
6415        unsigned int changes = dev->flags ^ old_flags;
6416
6417        if (gchanges)
6418                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6419
6420        if (changes & IFF_UP) {
6421                if (dev->flags & IFF_UP)
6422                        call_netdevice_notifiers(NETDEV_UP, dev);
6423                else
6424                        call_netdevice_notifiers(NETDEV_DOWN, dev);
6425        }
6426
6427        if (dev->flags & IFF_UP &&
6428            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6429                struct netdev_notifier_change_info change_info;
6430
6431                change_info.flags_changed = changes;
6432                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6433                                              &change_info.info);
6434        }
6435}
6436
6437/**
6438 *      dev_change_flags - change device settings
6439 *      @dev: device
6440 *      @flags: device state flags
6441 *
6442 *      Change settings on device based state flags. The flags are
6443 *      in the userspace exported format.
6444 */
6445int dev_change_flags(struct net_device *dev, unsigned int flags)
6446{
6447        int ret;
6448        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6449
6450        ret = __dev_change_flags(dev, flags);
6451        if (ret < 0)
6452                return ret;
6453
6454        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6455        __dev_notify_flags(dev, old_flags, changes);
6456        return ret;
6457}
6458EXPORT_SYMBOL(dev_change_flags);
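/*
 * Illustrative sketch (not part of dev.c): toggling IFF_UP through
 * dev_change_flags(), roughly what the SIOCSIFFLAGS ioctl path ends up
 * doing.  RTNL must be held around the read-modify-write; the helper name
 * is an assumption for the example.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_set_admin_up(struct net_device *dev, bool up)
{
        unsigned int flags;
        int err;

        rtnl_lock();
        flags = dev_get_flags(dev);
        if (up)
                flags |= IFF_UP;
        else
                flags &= ~IFF_UP;
        err = dev_change_flags(dev, flags);
        rtnl_unlock();
        return err;
}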
6459
6460static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6461{
6462        const struct net_device_ops *ops = dev->netdev_ops;
6463
6464        if (ops->ndo_change_mtu)
6465                return ops->ndo_change_mtu(dev, new_mtu);
6466
6467        dev->mtu = new_mtu;
6468        return 0;
6469}
6470
6471/**
6472 *      dev_set_mtu - Change maximum transfer unit
6473 *      @dev: device
6474 *      @new_mtu: new transfer unit
6475 *
6476 *      Change the maximum transfer size of the network device.
6477 */
6478int dev_set_mtu(struct net_device *dev, int new_mtu)
6479{
6480        int err, orig_mtu;
6481
6482        if (new_mtu == dev->mtu)
6483                return 0;
6484
6485        /*      MTU must not be negative.        */
6486        if (new_mtu < 0)
6487                return -EINVAL;
6488
6489        if (!netif_device_present(dev))
6490                return -ENODEV;
6491
6492        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6493        err = notifier_to_errno(err);
6494        if (err)
6495                return err;
6496
6497        orig_mtu = dev->mtu;
6498        err = __dev_set_mtu(dev, new_mtu);
6499
6500        if (!err) {
6501                err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6502                err = notifier_to_errno(err);
6503                if (err) {
6504                        /* setting mtu back and notifying everyone again,
6505                         * so that they have a chance to revert changes.
6506                         */
6507                        __dev_set_mtu(dev, orig_mtu);
6508                        call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6509                }
6510        }
6511        return err;
6512}
6513EXPORT_SYMBOL(dev_set_mtu);
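/*
 * Illustrative sketch (not part of dev.c): a stacked driver propagating an
 * MTU change down to its lower device.  dev_set_mtu() issues the PRE/POST
 * notifiers and rolls the value back if a notifier refuses it, so the
 * caller only needs to hold RTNL.  The function name is an assumption.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_sync_lower_mtu(struct net_device *upper,
                                  struct net_device *lower)
{
        ASSERT_RTNL();
        return dev_set_mtu(lower, upper->mtu);
}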
6514
6515/**
6516 *      dev_set_group - Change group this device belongs to
6517 *      @dev: device
6518 *      @new_group: group this device should belong to
6519 */
6520void dev_set_group(struct net_device *dev, int new_group)
6521{
6522        dev->group = new_group;
6523}
6524EXPORT_SYMBOL(dev_set_group);
6525
6526/**
6527 *      dev_set_mac_address - Change Media Access Control Address
6528 *      @dev: device
6529 *      @sa: new address
6530 *
6531 *      Change the hardware (MAC) address of the device
6532 */
6533int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6534{
6535        const struct net_device_ops *ops = dev->netdev_ops;
6536        int err;
6537
6538        if (!ops->ndo_set_mac_address)
6539                return -EOPNOTSUPP;
6540        if (sa->sa_family != dev->type)
6541                return -EINVAL;
6542        if (!netif_device_present(dev))
6543                return -ENODEV;
6544        err = ops->ndo_set_mac_address(dev, sa);
6545        if (err)
6546                return err;
6547        dev->addr_assign_type = NET_ADDR_SET;
6548        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6549        add_device_randomness(dev->dev_addr, dev->addr_len);
6550        return 0;
6551}
6552EXPORT_SYMBOL(dev_set_mac_address);
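/*
 * Illustrative sketch (not part of dev.c): programming a new hardware
 * address from kernel code, much like the SIOCSIFHWADDR path.  sa_family
 * must match dev->type and RTNL must be held; the helper name and the
 * caller-supplied @mac buffer are assumptions for the example.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/string.h>

static int example_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, mac, dev->addr_len);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}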
6553
6554/**
6555 *      dev_change_carrier - Change device carrier
6556 *      @dev: device
6557 *      @new_carrier: new value
6558 *
6559 *      Change device carrier
6560 */
6561int dev_change_carrier(struct net_device *dev, bool new_carrier)
6562{
6563        const struct net_device_ops *ops = dev->netdev_ops;
6564
6565        if (!ops->ndo_change_carrier)
6566                return -EOPNOTSUPP;
6567        if (!netif_device_present(dev))
6568                return -ENODEV;
6569        return ops->ndo_change_carrier(dev, new_carrier);
6570}
6571EXPORT_SYMBOL(dev_change_carrier);
6572
6573/**
6574 *      dev_get_phys_port_id - Get device physical port ID
6575 *      @dev: device
6576 *      @ppid: port ID
6577 *
6578 *      Get device physical port ID
6579 */
6580int dev_get_phys_port_id(struct net_device *dev,
6581                         struct netdev_phys_item_id *ppid)
6582{
6583        const struct net_device_ops *ops = dev->netdev_ops;
6584
6585        if (!ops->ndo_get_phys_port_id)
6586                return -EOPNOTSUPP;
6587        return ops->ndo_get_phys_port_id(dev, ppid);
6588}
6589EXPORT_SYMBOL(dev_get_phys_port_id);
6590
6591/**
6592 *      dev_get_phys_port_name - Get device physical port name
6593 *      @dev: device
6594 *      @name: port name
6595 *      @len: limit of bytes to copy to name
6596 *
6597 *      Get device physical port name
6598 */
6599int dev_get_phys_port_name(struct net_device *dev,
6600                           char *name, size_t len)
6601{
6602        const struct net_device_ops *ops = dev->netdev_ops;
6603
6604        if (!ops->ndo_get_phys_port_name)
6605                return -EOPNOTSUPP;
6606        return ops->ndo_get_phys_port_name(dev, name, len);
6607}
6608EXPORT_SYMBOL(dev_get_phys_port_name);
6609
6610/**
6611 *      dev_change_proto_down - update protocol port state information
6612 *      @dev: device
6613 *      @proto_down: new value
6614 *
6615 *      This info can be used by switch drivers to set the phys state of the
6616 *      port.
6617 */
6618int dev_change_proto_down(struct net_device *dev, bool proto_down)
6619{
6620        const struct net_device_ops *ops = dev->netdev_ops;
6621
6622        if (!ops->ndo_change_proto_down)
6623                return -EOPNOTSUPP;
6624        if (!netif_device_present(dev))
6625                return -ENODEV;
6626        return ops->ndo_change_proto_down(dev, proto_down);
6627}
6628EXPORT_SYMBOL(dev_change_proto_down);
6629
6630/**
6631 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6632 *      @dev: device
6633 *      @fd: new program fd or negative value to clear
6634 *
6635 *      Set or clear a bpf program for a device
6636 */
6637int dev_change_xdp_fd(struct net_device *dev, int fd)
6638{
6639        const struct net_device_ops *ops = dev->netdev_ops;
6640        struct bpf_prog *prog = NULL;
6641        struct netdev_xdp xdp = {};
6642        int err;
6643
6644        if (!ops->ndo_xdp)
6645                return -EOPNOTSUPP;
6646        if (fd >= 0) {
6647                prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6648                if (IS_ERR(prog))
6649                        return PTR_ERR(prog);
6650        }
6651
6652        xdp.command = XDP_SETUP_PROG;
6653        xdp.prog = prog;
6654        err = ops->ndo_xdp(dev, &xdp);
6655        if (err < 0 && prog)
6656                bpf_prog_put(prog);
6657
6658        return err;
6659}
6660EXPORT_SYMBOL(dev_change_xdp_fd);
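/*
 * Illustrative sketch (not part of dev.c): how a configuration path (e.g.
 * the rtnetlink IFLA_XDP handling) can hand a program file descriptor to
 * the driver.  A negative fd detaches any program currently installed.
 * The wrapper name is an assumption for the example.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_install_xdp(struct net_device *dev, int prog_fd)
{
        ASSERT_RTNL();          /* ndo_xdp reconfigures the rx path */
        return dev_change_xdp_fd(dev, prog_fd);
}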
6661
6662/**
6663 *      dev_new_index   -       allocate an ifindex
6664 *      @net: the applicable net namespace
6665 *
6666 *      Returns a suitable unique value for a new device interface
6667 *      number.  The caller must hold the rtnl semaphore or the
6668 *      dev_base_lock to be sure it remains unique.
6669 */
6670static int dev_new_index(struct net *net)
6671{
6672        int ifindex = net->ifindex;
6673        for (;;) {
6674                if (++ifindex <= 0)
6675                        ifindex = 1;
6676                if (!__dev_get_by_index(net, ifindex))
6677                        return net->ifindex = ifindex;
6678        }
6679}
6680
6681/* Delayed registration/unregisteration */
6682static LIST_HEAD(net_todo_list);
6683DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6684
6685static void net_set_todo(struct net_device *dev)
6686{
6687        list_add_tail(&dev->todo_list, &net_todo_list);
6688        dev_net(dev)->dev_unreg_count++;
6689}
6690
6691static void rollback_registered_many(struct list_head *head)
6692{
6693        struct net_device *dev, *tmp;
6694        LIST_HEAD(close_head);
6695
6696        BUG_ON(dev_boot_phase);
6697        ASSERT_RTNL();
6698
6699        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6700                /* Some devices reach this point without ever having been
6701                 * registered, as part of initialization unwind. Remove
6702                 * those devices and proceed with the remaining ones.
6703                 */
6704                if (dev->reg_state == NETREG_UNINITIALIZED) {
6705                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6706                                 dev->name, dev);
6707
6708                        WARN_ON(1);
6709                        list_del(&dev->unreg_list);
6710                        continue;
6711                }
6712                dev->dismantle = true;
6713                BUG_ON(dev->reg_state != NETREG_REGISTERED);
6714        }
6715
6716        /* If device is running, close it first. */
6717        list_for_each_entry(dev, head, unreg_list)
6718                list_add_tail(&dev->close_list, &close_head);
6719        dev_close_many(&close_head, true);
6720
6721        list_for_each_entry(dev, head, unreg_list) {
6722                /* And unlink it from device chain. */
6723                unlist_netdevice(dev);
6724
6725                dev->reg_state = NETREG_UNREGISTERING;
6726                on_each_cpu(flush_backlog, dev, 1);
6727        }
6728
6729        synchronize_net();
6730
6731        list_for_each_entry(dev, head, unreg_list) {
6732                struct sk_buff *skb = NULL;
6733
6734                /* Shutdown queueing discipline. */
6735                dev_shutdown(dev);
6736
6737
6738                /* Notify protocols, that we are about to destroy
6739                   this device. They should clean all the things.
6740                */
6741                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6742
6743                if (!dev->rtnl_link_ops ||
6744                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6745                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6746                                                     GFP_KERNEL);
6747
6748                /*
6749                 *      Flush the unicast and multicast chains
6750                 */
6751                dev_uc_flush(dev);
6752                dev_mc_flush(dev);
6753
6754                if (dev->netdev_ops->ndo_uninit)
6755                        dev->netdev_ops->ndo_uninit(dev);
6756
6757                if (skb)
6758                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6759
6760                /* The notifier chain MUST have detached all upper devices from us. */
6761                WARN_ON(netdev_has_any_upper_dev(dev));
6762
6763                /* Remove entries from kobject tree */
6764                netdev_unregister_kobject(dev);
6765#ifdef CONFIG_XPS
6766                /* Remove XPS queueing entries */
6767                netif_reset_xps_queues_gt(dev, 0);
6768#endif
6769        }
6770
6771        synchronize_net();
6772
6773        list_for_each_entry(dev, head, unreg_list)
6774                dev_put(dev);
6775}
6776
6777static void rollback_registered(struct net_device *dev)
6778{
6779        LIST_HEAD(single);
6780
6781        list_add(&dev->unreg_list, &single);
6782        rollback_registered_many(&single);
6783        list_del(&single);
6784}
6785
6786static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6787        struct net_device *upper, netdev_features_t features)
6788{
6789        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6790        netdev_features_t feature;
6791        int feature_bit;
6792
6793        for_each_netdev_feature(&upper_disables, feature_bit) {
6794                feature = __NETIF_F_BIT(feature_bit);
6795                if (!(upper->wanted_features & feature)
6796                    && (features & feature)) {
6797                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6798                                   &feature, upper->name);
6799                        features &= ~feature;
6800                }
6801        }
6802
6803        return features;
6804}
6805
6806static void netdev_sync_lower_features(struct net_device *upper,
6807        struct net_device *lower, netdev_features_t features)
6808{
6809        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6810        netdev_features_t feature;
6811        int feature_bit;
6812
6813        for_each_netdev_feature(&upper_disables, feature_bit) {
6814                feature = __NETIF_F_BIT(feature_bit);
6815                if (!(features & feature) && (lower->features & feature)) {
6816                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6817                                   &feature, lower->name);
6818                        lower->wanted_features &= ~feature;
6819                        netdev_update_features(lower);
6820
6821                        if (unlikely(lower->features & feature))
6822                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6823                                            &feature, lower->name);
6824                }
6825        }
6826}
6827
6828static netdev_features_t netdev_fix_features(struct net_device *dev,
6829        netdev_features_t features)
6830{
6831        /* Fix illegal checksum combinations */
6832        if ((features & NETIF_F_HW_CSUM) &&
6833            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6834                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6835                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6836        }
6837
6838        /* TSO requires that SG is present as well. */
6839        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6840                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6841                features &= ~NETIF_F_ALL_TSO;
6842        }
6843
6844        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6845                                        !(features & NETIF_F_IP_CSUM)) {
6846                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6847                features &= ~NETIF_F_TSO;
6848                features &= ~NETIF_F_TSO_ECN;
6849        }
6850
6851        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6852                                         !(features & NETIF_F_IPV6_CSUM)) {
6853                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6854                features &= ~NETIF_F_TSO6;
6855        }
6856
6857        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6858        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6859                features &= ~NETIF_F_TSO_MANGLEID;
6860
6861        /* TSO ECN requires that TSO is present as well. */
6862        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6863                features &= ~NETIF_F_TSO_ECN;
6864
6865        /* Software GSO depends on SG. */
6866        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6867                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6868                features &= ~NETIF_F_GSO;
6869        }
6870
6871        /* UFO needs SG and checksumming */
6872        if (features & NETIF_F_UFO) {
6873                /* maybe split UFO into V4 and V6? */
6874                if (!(features & NETIF_F_HW_CSUM) &&
6875                    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6876                     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6877                        netdev_dbg(dev,
6878                                "Dropping NETIF_F_UFO since no checksum offload features.\n");
6879                        features &= ~NETIF_F_UFO;
6880                }
6881
6882                if (!(features & NETIF_F_SG)) {
6883                        netdev_dbg(dev,
6884                                "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6885                        features &= ~NETIF_F_UFO;
6886                }
6887        }
6888
6889        /* GSO partial features require GSO partial be set */
6890        if ((features & dev->gso_partial_features) &&
6891            !(features & NETIF_F_GSO_PARTIAL)) {
6892                netdev_dbg(dev,
6893                           "Dropping partially supported GSO features since no GSO partial.\n");
6894                features &= ~dev->gso_partial_features;
6895        }
6896
6897#ifdef CONFIG_NET_RX_BUSY_POLL
6898        if (dev->netdev_ops->ndo_busy_poll)
6899                features |= NETIF_F_BUSY_POLL;
6900        else
6901#endif
6902                features &= ~NETIF_F_BUSY_POLL;
6903
6904        return features;
6905}
6906
6907int __netdev_update_features(struct net_device *dev)
6908{
6909        struct net_device *upper, *lower;
6910        netdev_features_t features;
6911        struct list_head *iter;
6912        int err = -1;
6913
6914        ASSERT_RTNL();
6915
6916        features = netdev_get_wanted_features(dev);
6917
6918        if (dev->netdev_ops->ndo_fix_features)
6919                features = dev->netdev_ops->ndo_fix_features(dev, features);
6920
6921        /* driver might be less strict about feature dependencies */
6922        features = netdev_fix_features(dev, features);
6923
6924        /* some features can't be enabled if they're off on an upper device */
6925        netdev_for_each_upper_dev_rcu(dev, upper, iter)
6926                features = netdev_sync_upper_features(dev, upper, features);
6927
6928        if (dev->features == features)
6929                goto sync_lower;
6930
6931        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6932                &dev->features, &features);
6933
6934        if (dev->netdev_ops->ndo_set_features)
6935                err = dev->netdev_ops->ndo_set_features(dev, features);
6936        else
6937                err = 0;
6938
6939        if (unlikely(err < 0)) {
6940                netdev_err(dev,
6941                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
6942                        err, &features, &dev->features);
6943                /* return non-0 since some features might have changed and
6944                 * it's better to fire a spurious notification than miss it
6945                 */
6946                return -1;
6947        }
6948
6949sync_lower:
6950        /* some features must be disabled on lower devices when disabled
6951         * on an upper device (think: bonding master or bridge)
6952         */
6953        netdev_for_each_lower_dev(dev, lower, iter)
6954                netdev_sync_lower_features(dev, lower, features);
6955
6956        if (!err)
6957                dev->features = features;
6958
6959        return err < 0 ? 0 : 1;
6960}
6961
6962/**
6963 *      netdev_update_features - recalculate device features
6964 *      @dev: the device to check
6965 *
6966 *      Recalculate dev->features set and send notifications if it
6967 *      has changed. Should be called after driver or hardware dependent
6968 *      conditions might have changed that influence the features.
6969 */
6970void netdev_update_features(struct net_device *dev)
6971{
6972        if (__netdev_update_features(dev))
6973                netdev_features_change(dev);
6974}
6975EXPORT_SYMBOL(netdev_update_features);
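/*
 * Illustrative sketch (not part of dev.c): a driver reacting to a
 * hardware/firmware event by withdrawing a feature it can no longer
 * service and letting the core recompute dev->features.  The trigger and
 * the function name are assumptions for the example.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_drop_tso_support(struct net_device *dev)
{
        rtnl_lock();
        dev->hw_features &= ~NETIF_F_ALL_TSO;
        netdev_update_features(dev);    /* re-runs the fixup/sync logic above */
        rtnl_unlock();
}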
6976
6977/**
6978 *      netdev_change_features - recalculate device features
6979 *      @dev: the device to check
6980 *
6981 *      Recalculate dev->features set and send notifications even
6982 *      if they have not changed. Should be called instead of
6983 *      netdev_update_features() if also dev->vlan_features might
6984 *      have changed to allow the changes to be propagated to stacked
6985 *      VLAN devices.
6986 */
6987void netdev_change_features(struct net_device *dev)
6988{
6989        __netdev_update_features(dev);
6990        netdev_features_change(dev);
6991}
6992EXPORT_SYMBOL(netdev_change_features);
6993
6994/**
6995 *      netif_stacked_transfer_operstate -      transfer operstate
6996 *      @rootdev: the root or lower level device to transfer state from
6997 *      @dev: the device to transfer operstate to
6998 *
6999 *      Transfer operational state from root to device. This is normally
7000 *      called when a stacking relationship exists between the root
7001 *      device and the device (a leaf device).
7002 */
7003void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7004                                        struct net_device *dev)
7005{
7006        if (rootdev->operstate == IF_OPER_DORMANT)
7007                netif_dormant_on(dev);
7008        else
7009                netif_dormant_off(dev);
7010
7011        if (netif_carrier_ok(rootdev)) {
7012                if (!netif_carrier_ok(dev))
7013                        netif_carrier_on(dev);
7014        } else {
7015                if (netif_carrier_ok(dev))
7016                        netif_carrier_off(dev);
7017        }
7018}
7019EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7020
7021#ifdef CONFIG_SYSFS
7022static int netif_alloc_rx_queues(struct net_device *dev)
7023{
7024        unsigned int i, count = dev->num_rx_queues;
7025        struct netdev_rx_queue *rx;
7026        size_t sz = count * sizeof(*rx);
7027
7028        BUG_ON(count < 1);
7029
7030        rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7031        if (!rx) {
7032                rx = vzalloc(sz);
7033                if (!rx)
7034                        return -ENOMEM;
7035        }
7036        dev->_rx = rx;
7037
7038        for (i = 0; i < count; i++)
7039                rx[i].dev = dev;
7040        return 0;
7041}
7042#endif
7043
7044static void netdev_init_one_queue(struct net_device *dev,
7045                                  struct netdev_queue *queue, void *_unused)
7046{
7047        /* Initialize queue lock */
7048        spin_lock_init(&queue->_xmit_lock);
7049        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7050        queue->xmit_lock_owner = -1;
7051        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7052        queue->dev = dev;
7053#ifdef CONFIG_BQL
7054        dql_init(&queue->dql, HZ);
7055#endif
7056}
7057
7058static void netif_free_tx_queues(struct net_device *dev)
7059{
7060        kvfree(dev->_tx);
7061}
7062
7063static int netif_alloc_netdev_queues(struct net_device *dev)
7064{
7065        unsigned int count = dev->num_tx_queues;
7066        struct netdev_queue *tx;
7067        size_t sz = count * sizeof(*tx);
7068
7069        if (count < 1 || count > 0xffff)
7070                return -EINVAL;
7071
7072        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7073        if (!tx) {
7074                tx = vzalloc(sz);
7075                if (!tx)
7076                        return -ENOMEM;
7077        }
7078        dev->_tx = tx;
7079
7080        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7081        spin_lock_init(&dev->tx_global_lock);
7082
7083        return 0;
7084}
7085
7086void netif_tx_stop_all_queues(struct net_device *dev)
7087{
7088        unsigned int i;
7089
7090        for (i = 0; i < dev->num_tx_queues; i++) {
7091                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7092                netif_tx_stop_queue(txq);
7093        }
7094}
7095EXPORT_SYMBOL(netif_tx_stop_all_queues);
7096
7097/**
7098 *      register_netdevice      - register a network device
7099 *      @dev: device to register
7100 *
7101 *      Take a completed network device structure and add it to the kernel
7102 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7103 *      chain. 0 is returned on success. A negative errno code is returned
7104 *      on a failure to set up the device, or if the name is a duplicate.
7105 *
7106 *      Callers must hold the rtnl semaphore. You may want
7107 *      register_netdev() instead of this.
7108 *
7109 *      BUGS:
7110 *      The locking appears insufficient to guarantee two parallel registers
7111 *      will not get the same name.
7112 */
7113
7114int register_netdevice(struct net_device *dev)
7115{
7116        int ret;
7117        struct net *net = dev_net(dev);
7118
7119        BUG_ON(dev_boot_phase);
7120        ASSERT_RTNL();
7121
7122        might_sleep();
7123
7124        /* When net_device structures are persistent, this will be fatal. */
7125        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7126        BUG_ON(!net);
7127
7128        spin_lock_init(&dev->addr_list_lock);
7129        netdev_set_addr_lockdep_class(dev);
7130
7131        ret = dev_get_valid_name(net, dev, dev->name);
7132        if (ret < 0)
7133                goto out;
7134
7135        /* Init, if this function is available */
7136        if (dev->netdev_ops->ndo_init) {
7137                ret = dev->netdev_ops->ndo_init(dev);
7138                if (ret) {
7139                        if (ret > 0)
7140                                ret = -EIO;
7141                        goto out;
7142                }
7143        }
7144
7145        if (((dev->hw_features | dev->features) &
7146             NETIF_F_HW_VLAN_CTAG_FILTER) &&
7147            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7148             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7149                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7150                ret = -EINVAL;
7151                goto err_uninit;
7152        }
7153
7154        ret = -EBUSY;
7155        if (!dev->ifindex)
7156                dev->ifindex = dev_new_index(net);
7157        else if (__dev_get_by_index(net, dev->ifindex))
7158                goto err_uninit;
7159
7160        /* Transfer changeable features to wanted_features and enable
7161         * software offloads (GSO and GRO).
7162         */
7163        dev->hw_features |= NETIF_F_SOFT_FEATURES;
7164        dev->features |= NETIF_F_SOFT_FEATURES;
7165        dev->wanted_features = dev->features & dev->hw_features;
7166
7167        if (!(dev->flags & IFF_LOOPBACK))
7168                dev->hw_features |= NETIF_F_NOCACHE_COPY;
7169
7170        /* If IPv4 TCP segmentation offload is supported we should also
7171         * allow the device to enable segmenting the frame with the option
7172         * of ignoring a static IP ID value.  This doesn't enable the
7173         * feature itself but allows the user to enable it later.
7174         */
7175        if (dev->hw_features & NETIF_F_TSO)
7176                dev->hw_features |= NETIF_F_TSO_MANGLEID;
7177        if (dev->vlan_features & NETIF_F_TSO)
7178                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7179        if (dev->mpls_features & NETIF_F_TSO)
7180                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7181        if (dev->hw_enc_features & NETIF_F_TSO)
7182                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7183
7184        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7185         */
7186        dev->vlan_features |= NETIF_F_HIGHDMA;
7187
7188        /* Make NETIF_F_SG inheritable to tunnel devices.
7189         */
7190        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7191
7192        /* Make NETIF_F_SG inheritable to MPLS.
7193         */
7194        dev->mpls_features |= NETIF_F_SG;
7195
7196        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7197        ret = notifier_to_errno(ret);
7198        if (ret)
7199                goto err_uninit;
7200
7201        ret = netdev_register_kobject(dev);
7202        if (ret)
7203                goto err_uninit;
7204        dev->reg_state = NETREG_REGISTERED;
7205
7206        __netdev_update_features(dev);
7207
7208        /*
7209         *      Default initial state at registration is that the
7210         *      device is present.
7211         */
7212
7213        set_bit(__LINK_STATE_PRESENT, &dev->state);
7214
7215        linkwatch_init_dev(dev);
7216
7217        dev_init_scheduler(dev);
7218        dev_hold(dev);
7219        list_netdevice(dev);
7220        add_device_randomness(dev->dev_addr, dev->addr_len);
7221
7222        /* If the device has a permanent device address, the driver should
7223         * set dev_addr and also addr_assign_type should be set to
7224         * NET_ADDR_PERM (default value).
7225         */
7226        if (dev->addr_assign_type == NET_ADDR_PERM)
7227                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7228
7229        /* Notify protocols, that a new device appeared. */
7230        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7231        ret = notifier_to_errno(ret);
7232        if (ret) {
7233                rollback_registered(dev);
7234                dev->reg_state = NETREG_UNREGISTERED;
7235        }
7236        /*
7237         *      Prevent userspace races by waiting until the network
7238         *      device is fully setup before sending notifications.
7239         */
7240        if (!dev->rtnl_link_ops ||
7241            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7242                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7243
7244out:
7245        return ret;
7246
7247err_uninit:
7248        if (dev->netdev_ops->ndo_uninit)
7249                dev->netdev_ops->ndo_uninit(dev);
7250        goto out;
7251}
7252EXPORT_SYMBOL(register_netdevice);
7253
7254/**
7255 *      init_dummy_netdev       - init a dummy network device for NAPI
7256 *      @dev: device to init
7257 *
7258 *      This takes a network device structure and initializes the minimum
7259 *      number of fields so it can be used to schedule NAPI polls without
7260 *      registering a full blown interface. This is to be used by drivers
7261 *      that need to tie several hardware interfaces to a single NAPI
7262 *      poll scheduler due to HW limitations.
7263 */
7264int init_dummy_netdev(struct net_device *dev)
7265{
7266        /* Clear everything. Note we don't initialize spinlocks
7267         * as they aren't supposed to be taken by any of the
7268         * NAPI code and this dummy netdev is supposed to be
7269         * only ever used for NAPI polls
7270         */
7271        memset(dev, 0, sizeof(struct net_device));
7272
7273        /* make sure we BUG if trying to hit standard
7274         * register/unregister code path
7275         */
7276        dev->reg_state = NETREG_DUMMY;
7277
7278        /* NAPI wants this */
7279        INIT_LIST_HEAD(&dev->napi_list);
7280
7281        /* a dummy interface is started by default */
7282        set_bit(__LINK_STATE_PRESENT, &dev->state);
7283        set_bit(__LINK_STATE_START, &dev->state);
7284
7285        /* Note : We don't allocate pcpu_refcnt for dummy devices,
7286         * because users of this 'device' don't need to change
7287         * its refcount.
7288         */
7289
7290        return 0;
7291}
7292EXPORT_SYMBOL_GPL(init_dummy_netdev);
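/*
 * Illustrative sketch (not part of dev.c): a driver with several hardware
 * channels funnelling them into NAPI through one dummy netdev, which is
 * the use case this helper exists for.  The structure layout, poll routine
 * and weight are assumptions for the example.
 */
#include <linux/netdevice.h>

struct example_hw {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        /* ... process up to @budget packets here ... */
        napi_complete(napi);
        return 0;
}

static void example_hw_init(struct example_hw *hw)
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, example_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&hw->napi);
}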
7293
7294
7295/**
7296 *      register_netdev - register a network device
7297 *      @dev: device to register
7298 *
7299 *      Take a completed network device structure and add it to the kernel
7300 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7301 *      chain. 0 is returned on success. A negative errno code is returned
7302 *      on a failure to set up the device, or if the name is a duplicate.
7303 *
7304 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7305 *      and expands the device name if you passed a format string to
7306 *      alloc_netdev.
7307 */
7308int register_netdev(struct net_device *dev)
7309{
7310        int err;
7311
7312        rtnl_lock();
7313        err = register_netdevice(dev);
7314        rtnl_unlock();
7315        return err;
7316}
7317EXPORT_SYMBOL(register_netdev);
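/*
 * Illustrative sketch (not part of dev.c): the canonical probe sequence a
 * driver builds on top of the registration API above.  The ops table, the
 * private structure and the probe function are assumptions; only the
 * alloc/register/free calls are the real API.
 */
#include <linux/device.h>
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct example_priv {
        void __iomem *regs;
};

static const struct net_device_ops example_netdev_ops = {
        /* .ndo_open, .ndo_stop, .ndo_start_xmit, ... */
};

static int example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        dev->netdev_ops = &example_netdev_ops;
        SET_NETDEV_DEV(dev, parent);
        eth_hw_addr_random(dev);

        err = register_netdev(dev);     /* takes RTNL internally */
        if (err)
                free_netdev(dev);       /* never registered, free directly */
        return err;
}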
7318
7319int netdev_refcnt_read(const struct net_device *dev)
7320{
7321        int i, refcnt = 0;
7322
7323        for_each_possible_cpu(i)
7324                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7325        return refcnt;
7326}
7327EXPORT_SYMBOL(netdev_refcnt_read);
7328
7329/**
7330 * netdev_wait_allrefs - wait until all references are gone.
7331 * @dev: target net_device
7332 *
7333 * This is called when unregistering network devices.
7334 *
7335 * Any protocol or device that holds a reference should register
7336 * for netdevice notification, and clean up and put back the
7337 * reference if they receive an UNREGISTER event.
7338 * We can get stuck here if buggy protocols don't correctly
7339 * call dev_put.
7340 */
7341static void netdev_wait_allrefs(struct net_device *dev)
7342{
7343        unsigned long rebroadcast_time, warning_time;
7344        int refcnt;
7345
7346        linkwatch_forget_dev(dev);
7347
7348        rebroadcast_time = warning_time = jiffies;
7349        refcnt = netdev_refcnt_read(dev);
7350
7351        while (refcnt != 0) {
7352                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7353                        rtnl_lock();
7354
7355                        /* Rebroadcast unregister notification */
7356                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7357
7358                        __rtnl_unlock();
7359                        rcu_barrier();
7360                        rtnl_lock();
7361
7362                        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7363                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7364                                     &dev->state)) {
7365                                /* We must not have linkwatch events
7366                                 * pending on unregister. If this
7367                                 * happens, we simply run the queue
7368                                 * unscheduled, resulting in a noop
7369                                 * for this device.
7370                                 */
7371                                linkwatch_run_queue();
7372                        }
7373
7374                        __rtnl_unlock();
7375
7376                        rebroadcast_time = jiffies;
7377                }
7378
7379                msleep(250);
7380
7381                refcnt = netdev_refcnt_read(dev);
7382
7383                if (time_after(jiffies, warning_time + 10 * HZ)) {
7384                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7385                                 dev->name, refcnt);
7386                        warning_time = jiffies;
7387                }
7388        }
7389}
7390
7391/* The sequence is:
7392 *
7393 *      rtnl_lock();
7394 *      ...
7395 *      register_netdevice(x1);
7396 *      register_netdevice(x2);
7397 *      ...
7398 *      unregister_netdevice(y1);
7399 *      unregister_netdevice(y2);
7400 *      ...
7401 *      rtnl_unlock();
7402 *      free_netdev(y1);
7403 *      free_netdev(y2);
7404 *
7405 * We are invoked by rtnl_unlock().
7406 * This allows us to deal with problems:
7407 * 1) We can delete sysfs objects which invoke hotplug
7408 *    without deadlocking with linkwatch via keventd.
7409 * 2) Since we run with the RTNL semaphore not held, we can sleep
7410 *    safely in order to wait for the netdev refcnt to drop to zero.
7411 *
7412 * We must not return until all unregister events added during
7413 * the interval the lock was held have been completed.
7414 */
7415void netdev_run_todo(void)
7416{
7417        struct list_head list;
7418
7419        /* Snapshot list, allow later requests */
7420        list_replace_init(&net_todo_list, &list);
7421
7422        __rtnl_unlock();
7423
7424
7425        /* Wait for rcu callbacks to finish before next phase */
7426        if (!list_empty(&list))
7427                rcu_barrier();
7428
7429        while (!list_empty(&list)) {
7430                struct net_device *dev
7431                        = list_first_entry(&list, struct net_device, todo_list);
7432                list_del(&dev->todo_list);
7433
7434                rtnl_lock();
7435                call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7436                __rtnl_unlock();
7437
7438                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7439                        pr_err("network todo '%s' but state %d\n",
7440                               dev->name, dev->reg_state);
7441                        dump_stack();
7442                        continue;
7443                }
7444
7445                dev->reg_state = NETREG_UNREGISTERED;
7446
7447                netdev_wait_allrefs(dev);
7448
7449                /* paranoia */
7450                BUG_ON(netdev_refcnt_read(dev));
7451                BUG_ON(!list_empty(&dev->ptype_all));
7452                BUG_ON(!list_empty(&dev->ptype_specific));
7453                WARN_ON(rcu_access_pointer(dev->ip_ptr));
7454                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7455                WARN_ON(dev->dn_ptr);
7456
7457                if (dev->destructor)
7458                        dev->destructor(dev);
7459
7460                /* Report a network device has been unregistered */
7461                rtnl_lock();
7462                dev_net(dev)->dev_unreg_count--;
7463                __rtnl_unlock();
7464                wake_up(&netdev_unregistering_wq);
7465
7466                /* Free network device */
7467                kobject_put(&dev->dev.kobj);
7468        }
7469}
7470
7471/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7472 * all the same fields in the same order as net_device_stats, with only
7473 * the type differing, but rtnl_link_stats64 may have additional fields
7474 * at the end for newer counters.
7475 */
7476void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7477                             const struct net_device_stats *netdev_stats)
7478{
7479#if BITS_PER_LONG == 64
7480        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7481        memcpy(stats64, netdev_stats, sizeof(*stats64));
7482        /* zero out counters that only exist in rtnl_link_stats64 */
7483        memset((char *)stats64 + sizeof(*netdev_stats), 0,
7484               sizeof(*stats64) - sizeof(*netdev_stats));
7485#else
7486        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7487        const unsigned long *src = (const unsigned long *)netdev_stats;
7488        u64 *dst = (u64 *)stats64;
7489
7490        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7491        for (i = 0; i < n; i++)
7492                dst[i] = src[i];
7493        /* zero out counters that only exist in rtnl_link_stats64 */
7494        memset((char *)stats64 + n * sizeof(u64), 0,
7495               sizeof(*stats64) - n * sizeof(u64));
7496#endif
7497}
7498EXPORT_SYMBOL(netdev_stats_to_stats64);
7499
7500/**
7501 *      dev_get_stats   - get network device statistics
7502 *      @dev: device to get statistics from
7503 *      @storage: place to store stats
7504 *
7505 *      Get network statistics from device. Return @storage.
7506 *      The device driver may provide its own method by setting
7507 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7508 *      otherwise the internal statistics structure is used.
7509 */
7510struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7511                                        struct rtnl_link_stats64 *storage)
7512{
7513        const struct net_device_ops *ops = dev->netdev_ops;
7514
7515        if (ops->ndo_get_stats64) {
7516                memset(storage, 0, sizeof(*storage));
7517                ops->ndo_get_stats64(dev, storage);
7518        } else if (ops->ndo_get_stats) {
7519                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7520        } else {
7521                netdev_stats_to_stats64(storage, &dev->stats);
7522        }
7523        storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7524        storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7525        storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7526        return storage;
7527}
7528EXPORT_SYMBOL(dev_get_stats);
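/*
 * Illustrative sketch (not part of dev.c): sampling a device's counters
 * from kernel code.  dev_get_stats() fills the caller-provided buffer no
 * matter which of the three driver methods is available; the helper name
 * and the log message are assumptions for the example.
 */
#include <linux/if_link.h>
#include <linux/netdevice.h>

static void example_log_byte_counters(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        netdev_info(dev, "rx %llu bytes, tx %llu bytes\n",
                    stats.rx_bytes, stats.tx_bytes);
}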
7529
7530struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7531{
7532        struct netdev_queue *queue = dev_ingress_queue(dev);
7533
7534#ifdef CONFIG_NET_CLS_ACT
7535        if (queue)
7536                return queue;
7537        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7538        if (!queue)
7539                return NULL;
7540        netdev_init_one_queue(dev, queue, NULL);
7541        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7542        queue->qdisc_sleeping = &noop_qdisc;
7543        rcu_assign_pointer(dev->ingress_queue, queue);
7544#endif
7545        return queue;
7546}
7547
7548static const struct ethtool_ops default_ethtool_ops;
7549
7550void netdev_set_default_ethtool_ops(struct net_device *dev,
7551                                    const struct ethtool_ops *ops)
7552{
7553        if (dev->ethtool_ops == &default_ethtool_ops)
7554                dev->ethtool_ops = ops;
7555}
7556EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7557
7558void netdev_freemem(struct net_device *dev)
7559{
7560        char *addr = (char *)dev - dev->padded;
7561
7562        kvfree(addr);
7563}
7564
7565/**
7566 *      alloc_netdev_mqs - allocate network device
7567 *      @sizeof_priv:           size of private data to allocate space for
7568 *      @name:                  device name format string
7569 *      @name_assign_type:      origin of device name
7570 *      @setup:                 callback to initialize device
7571 *      @txqs:                  the number of TX subqueues to allocate
7572 *      @rxqs:                  the number of RX subqueues to allocate
7573 *
7574 *      Allocates a struct net_device with private data area for driver use
7575 *      and performs basic initialization.  Also allocates subqueue structs
7576 *      for each queue on the device.
7577 */
7578struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7579                unsigned char name_assign_type,
7580                void (*setup)(struct net_device *),
7581                unsigned int txqs, unsigned int rxqs)
7582{
7583        struct net_device *dev;
7584        size_t alloc_size;
7585        struct net_device *p;
7586
7587        BUG_ON(strlen(name) >= sizeof(dev->name));
7588
7589        if (txqs < 1) {
7590                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7591                return NULL;
7592        }
7593
7594#ifdef CONFIG_SYSFS
7595        if (rxqs < 1) {
7596                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7597                return NULL;
7598        }
7599#endif
7600
7601        alloc_size = sizeof(struct net_device);
7602        if (sizeof_priv) {
7603                /* ensure 32-byte alignment of private area */
7604                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7605                alloc_size += sizeof_priv;
7606        }
7607        /* ensure 32-byte alignment of whole construct */
7608        alloc_size += NETDEV_ALIGN - 1;
7609
7610        p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7611        if (!p)
7612                p = vzalloc(alloc_size);
7613        if (!p)
7614                return NULL;
7615
7616        dev = PTR_ALIGN(p, NETDEV_ALIGN);
7617        dev->padded = (char *)dev - (char *)p;
7618
7619        dev->pcpu_refcnt = alloc_percpu(int);
7620        if (!dev->pcpu_refcnt)
7621                goto free_dev;
7622
7623        if (dev_addr_init(dev))
7624                goto free_pcpu;
7625
7626        dev_mc_init(dev);
7627        dev_uc_init(dev);
7628
7629        dev_net_set(dev, &init_net);
7630
7631        dev->gso_max_size = GSO_MAX_SIZE;
7632        dev->gso_max_segs = GSO_MAX_SEGS;
7633
7634        INIT_LIST_HEAD(&dev->napi_list);
7635        INIT_LIST_HEAD(&dev->unreg_list);
7636        INIT_LIST_HEAD(&dev->close_list);
7637        INIT_LIST_HEAD(&dev->link_watch_list);
7638        INIT_LIST_HEAD(&dev->adj_list.upper);
7639        INIT_LIST_HEAD(&dev->adj_list.lower);
7640        INIT_LIST_HEAD(&dev->all_adj_list.upper);
7641        INIT_LIST_HEAD(&dev->all_adj_list.lower);
7642        INIT_LIST_HEAD(&dev->ptype_all);
7643        INIT_LIST_HEAD(&dev->ptype_specific);
7644        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7645        setup(dev);
7646
7647        if (!dev->tx_queue_len) {
7648                dev->priv_flags |= IFF_NO_QUEUE;
7649                dev->tx_queue_len = 1;
7650        }
7651
7652        dev->num_tx_queues = txqs;
7653        dev->real_num_tx_queues = txqs;
7654        if (netif_alloc_netdev_queues(dev))
7655                goto free_all;
7656
7657#ifdef CONFIG_SYSFS
7658        dev->num_rx_queues = rxqs;
7659        dev->real_num_rx_queues = rxqs;
7660        if (netif_alloc_rx_queues(dev))
7661                goto free_all;
7662#endif
7663
7664        strcpy(dev->name, name);
7665        dev->name_assign_type = name_assign_type;
7666        dev->group = INIT_NETDEV_GROUP;
7667        if (!dev->ethtool_ops)
7668                dev->ethtool_ops = &default_ethtool_ops;
7669
7670        nf_hook_ingress_init(dev);
7671
7672        return dev;
7673
7674free_all:
7675        free_netdev(dev);
7676        return NULL;
7677
7678free_pcpu:
7679        free_percpu(dev->pcpu_refcnt);
7680free_dev:
7681        netdev_freemem(dev);
7682        return NULL;
7683}
7684EXPORT_SYMBOL(alloc_netdev_mqs);
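/*
 * Illustrative sketch (not part of dev.c): allocating a multiqueue device
 * with a private area and a custom setup() callback.  The queue counts,
 * the "ex%d" name template and the setup routine are assumptions for the
 * example.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct example_priv {
        int id;
};

static void example_setup(struct net_device *dev)
{
        ether_setup(dev);               /* sane Ethernet defaults */
        dev->flags |= IFF_NOARP;
}

static struct net_device *example_create(void)
{
        return alloc_netdev_mqs(sizeof(struct example_priv), "ex%d",
                                NET_NAME_UNKNOWN, example_setup, 8, 8);
}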
7685
7686/**
7687 *      free_netdev - free network device
7688 *      @dev: device
7689 *
7690 *      This function does the last stage of destroying an allocated device
7691 *      interface. The reference to the device object is released.
7692 *      If this is the last reference then it will be freed.
7693 *      Must be called in process context.
7694 */
7695void free_netdev(struct net_device *dev)
7696{
7697        struct napi_struct *p, *n;
7698
7699        might_sleep();
7700        netif_free_tx_queues(dev);
7701#ifdef CONFIG_SYSFS
7702        kvfree(dev->_rx);
7703#endif
7704
7705        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7706
7707        /* Flush device addresses */
7708        dev_addr_flush(dev);
7709
7710        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7711                netif_napi_del(p);
7712
7713        free_percpu(dev->pcpu_refcnt);
7714        dev->pcpu_refcnt = NULL;
7715
7716        /*  Compatibility with error handling in drivers */
7717        if (dev->reg_state == NETREG_UNINITIALIZED) {
7718                netdev_freemem(dev);
7719                return;
7720        }
7721
7722        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7723        dev->reg_state = NETREG_RELEASED;
7724
7725        /* will free via device release */
7726        put_device(&dev->dev);
7727}
7728EXPORT_SYMBOL(free_netdev);
7729
7730/**
7731 *      synchronize_net -  Synchronize with packet receive processing
7732 *
7733 *      Wait for packets currently being received to be done.
7734 *      Does not block later packets from starting.
7735 */
7736void synchronize_net(void)
7737{
7738        might_sleep();
7739        if (rtnl_is_locked())
7740                synchronize_rcu_expedited();
7741        else
7742                synchronize_rcu();
7743}
7744EXPORT_SYMBOL(synchronize_net);
7745
7746/**
7747 *      unregister_netdevice_queue - remove device from the kernel
7748 *      @dev: device
7749 *      @head: list
7750 *
7751 *      This function shuts down a device interface and removes it
7752 *      from the kernel tables.
7753 *      If head not NULL, device is queued to be unregistered later.
7754 *
7755 *      Callers must hold the rtnl semaphore.  You may want
7756 *      unregister_netdev() instead of this.
7757 */
7758
7759void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7760{
7761        ASSERT_RTNL();
7762
7763        if (head) {
7764                list_move_tail(&dev->unreg_list, head);
7765        } else {
7766                rollback_registered(dev);
7767                /* Finish processing unregister after unlock */
7768                net_set_todo(dev);
7769        }
7770}
7771EXPORT_SYMBOL(unregister_netdevice_queue);
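/* Editor's sketch (not in the original source): with head == NULL this is
 * what the unregister_netdevice() inline expands to; with a list head,
 * devices are merely queued and unregister_netdevice_many() below does the
 * actual work in one batch.  dev1 and dev2 are illustrative assumptions.
 *
 *	LIST_HEAD(kill_list);
 *
 *	ASSERT_RTNL();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */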
7772
7773/**
7774 *      unregister_netdevice_many - unregister many devices
7775 *      @head: list of devices
7776 *
7777 *  Note: As most callers use a stack-allocated list_head,
7778 *  we force a list_del() to make sure the stack won't be corrupted later.
7779 */
7780void unregister_netdevice_many(struct list_head *head)
7781{
7782        struct net_device *dev;
7783
7784        if (!list_empty(head)) {
7785                rollback_registered_many(head);
7786                list_for_each_entry(dev, head, unreg_list)
7787                        net_set_todo(dev);
7788                list_del(head);
7789        }
7790}
7791EXPORT_SYMBOL(unregister_netdevice_many);
7792
7793/**
7794 *      unregister_netdev - remove device from the kernel
7795 *      @dev: device
7796 *
7797 *      This function shuts down a device interface and removes it
7798 *      from the kernel tables.
7799 *
7800 *      This is just a wrapper for unregister_netdevice that takes
7801 *      the rtnl semaphore.  In general you want to use this and not
7802 *      unregister_netdevice.
7803 */
7804void unregister_netdev(struct net_device *dev)
7805{
7806        rtnl_lock();
7807        unregister_netdevice(dev);
7808        rtnl_unlock();
7809}
7810EXPORT_SYMBOL(unregister_netdev);
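/* Editor's sketch (not in the original source): the usual driver remove
 * path pairs unregister_netdev() with free_netdev(); foo_remove() and the
 * drvdata handling are illustrative assumptions.
 *
 *	static void foo_remove(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		unregister_netdev(dev);
 *		free_netdev(dev);
 *	}
 */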
7811
7812/**
7813 *      dev_change_net_namespace - move device to a different network namespace
7814 *      @dev: device
7815 *      @net: network namespace
7816 *      @pat: If not NULL, a name pattern to try if the current device name
7817 *            is already taken in the destination network namespace.
7818 *
7819 *      This function shuts down a device interface and moves it
7820 *      to a new network namespace. On success 0 is returned, on
7821 *      a failure a negative errno code is returned.
7822 *
7823 *      Callers must hold the rtnl semaphore.
7824 */
7825
7826int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7827{
7828        int err;
7829
7830        ASSERT_RTNL();
7831
7832        /* Don't allow namespace local devices to be moved. */
7833        err = -EINVAL;
7834        if (dev->features & NETIF_F_NETNS_LOCAL)
7835                goto out;
7836
7837        /* Ensure the device has been registered */
7838        if (dev->reg_state != NETREG_REGISTERED)
7839                goto out;
7840
7841        /* Get out if there is nothing to do */
7842        err = 0;
7843        if (net_eq(dev_net(dev), net))
7844                goto out;
7845
7846        /* Pick the destination device name, and ensure
7847         * we can use it in the destination network namespace.
7848         */
7849        err = -EEXIST;
7850        if (__dev_get_by_name(net, dev->name)) {
7851                /* We get here if we can't use the current device name */
7852                if (!pat)
7853                        goto out;
7854                if (dev_get_valid_name(net, dev, pat) < 0)
7855                        goto out;
7856        }
7857
7858        /*
7859         * And now a mini version of register_netdevice and unregister_netdevice.
7860         */
7861
7862        /* If device is running close it first. */
7863        dev_close(dev);
7864
7865        /* And unlink it from device chain */
7866        err = -ENODEV;
7867        unlist_netdevice(dev);
7868
7869        synchronize_net();
7870
7871        /* Shutdown queueing discipline. */
7872        dev_shutdown(dev);
7873
7874        /* Notify protocols that we are about to destroy
7875           this device. They should clean up all of their state.
7876
7877           Note that dev->reg_state stays at NETREG_REGISTERED.
7878           This is intentional: this way 8021q and macvlan know
7879           the device is just moving and can keep their slaves up.
7880        */
7881        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7882        rcu_barrier();
7883        call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7884        rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7885
7886        /*
7887         *      Flush the unicast and multicast chains
7888         */
7889        dev_uc_flush(dev);
7890        dev_mc_flush(dev);
7891
7892        /* Send a netdev-removed uevent to the old namespace */
7893        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7894        netdev_adjacent_del_links(dev);
7895
7896        /* Actually switch the network namespace */
7897        dev_net_set(dev, net);
7898
7899        /* If there is an ifindex conflict assign a new one */
7900        if (__dev_get_by_index(net, dev->ifindex))
7901                dev->ifindex = dev_new_index(net);
7902
7903        /* Send a netdev-add uevent to the new namespace */
7904        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7905        netdev_adjacent_add_links(dev);
7906
7907        /* Fixup kobjects */
7908        err = device_rename(&dev->dev, dev->name);
7909        WARN_ON(err);
7910
7911        /* Add the device back in the hashes */
7912        list_netdevice(dev);
7913
7914        /* Notify protocols, that a new device appeared. */
7915        call_netdevice_notifiers(NETDEV_REGISTER, dev);
7916
7917        /*
7918         *      Prevent userspace races by waiting until the network
7919         *      device is fully setup before sending notifications.
7920         */
7921        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7922
7923        synchronize_net();
7924        err = 0;
7925out:
7926        return err;
7927}
7928EXPORT_SYMBOL_GPL(dev_change_net_namespace);
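/* Editor's sketch (not in the original source): callers hold the rtnl
 * semaphore and may supply a "%d" pattern as a fallback name, similar in
 * spirit to the "dev%d"-based fallback used by default_device_exit() below.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */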
7929
7930static int dev_cpu_callback(struct notifier_block *nfb,
7931                            unsigned long action,
7932                            void *ocpu)
7933{
7934        struct sk_buff **list_skb;
7935        struct sk_buff *skb;
7936        unsigned int cpu, oldcpu = (unsigned long)ocpu;
7937        struct softnet_data *sd, *oldsd;
7938
7939        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7940                return NOTIFY_OK;
7941
7942        local_irq_disable();
7943        cpu = smp_processor_id();
7944        sd = &per_cpu(softnet_data, cpu);
7945        oldsd = &per_cpu(softnet_data, oldcpu);
7946
7947        /* Find end of our completion_queue. */
7948        list_skb = &sd->completion_queue;
7949        while (*list_skb)
7950                list_skb = &(*list_skb)->next;
7951        /* Append completion queue from offline CPU. */
7952        *list_skb = oldsd->completion_queue;
7953        oldsd->completion_queue = NULL;
7954
7955        /* Append output queue from offline CPU. */
7956        if (oldsd->output_queue) {
7957                *sd->output_queue_tailp = oldsd->output_queue;
7958                sd->output_queue_tailp = oldsd->output_queue_tailp;
7959                oldsd->output_queue = NULL;
7960                oldsd->output_queue_tailp = &oldsd->output_queue;
7961        }
7962        /* Append NAPI poll list from offline CPU, with one exception:
7963         * process_backlog() must be called by the CPU owning the percpu backlog.
7964         * We properly handle process_queue & input_pkt_queue later.
7965         */
7966        while (!list_empty(&oldsd->poll_list)) {
7967                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7968                                                            struct napi_struct,
7969                                                            poll_list);
7970
7971                list_del_init(&napi->poll_list);
7972                if (napi->poll == process_backlog)
7973                        napi->state = 0;
7974                else
7975                        ____napi_schedule(sd, napi);
7976        }
7977
7978        raise_softirq_irqoff(NET_TX_SOFTIRQ);
7979        local_irq_enable();
7980
7981        /* Process offline CPU's input_pkt_queue */
7982        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7983                netif_rx_ni(skb);
7984                input_queue_head_incr(oldsd);
7985        }
7986        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7987                netif_rx_ni(skb);
7988                input_queue_head_incr(oldsd);
7989        }
7990
7991        return NOTIFY_OK;
7992}
7993
7994
7995/**
7996 *      netdev_increment_features - increment feature set by one
7997 *      @all: current feature set
7998 *      @one: new feature set
7999 *      @mask: mask feature set
8000 *
8001 *      Computes a new feature set after adding a device with feature set
8002 *      @one to the master device with current feature set @all.  Will not
8003 *      enable anything that is off in @mask. Returns the new feature set.
8004 */
8005netdev_features_t netdev_increment_features(netdev_features_t all,
8006        netdev_features_t one, netdev_features_t mask)
8007{
8008        if (mask & NETIF_F_HW_CSUM)
8009                mask |= NETIF_F_CSUM_MASK;
8010        mask |= NETIF_F_VLAN_CHALLENGED;
8011
8012        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8013        all &= one | ~NETIF_F_ALL_FOR_ALL;
8014
8015        /* If one device supports hw checksumming, set for all. */
8016        if (all & NETIF_F_HW_CSUM)
8017                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8018
8019        return all;
8020}
8021EXPORT_SYMBOL(netdev_increment_features);
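/* Editor's sketch (not in the original source): an aggregating device such
 * as a bond or bridge typically starts from a mask and folds in each lower
 * device's features.  'all', 'mask', 'lower' and the iteration below are
 * illustrative assumptions, not any specific driver's actual code.
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *	netdev_features_t all = mask;
 *
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		all = netdev_increment_features(all, lower->features, mask);
 */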
8022
8023static struct hlist_head * __net_init netdev_create_hash(void)
8024{
8025        int i;
8026        struct hlist_head *hash;
8027
8028        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8029        if (hash != NULL)
8030                for (i = 0; i < NETDEV_HASHENTRIES; i++)
8031                        INIT_HLIST_HEAD(&hash[i]);
8032
8033        return hash;
8034}
8035
8036/* Initialize per network namespace state */
8037static int __net_init netdev_init(struct net *net)
8038{
8039        if (net != &init_net)
8040                INIT_LIST_HEAD(&net->dev_base_head);
8041
8042        net->dev_name_head = netdev_create_hash();
8043        if (net->dev_name_head == NULL)
8044                goto err_name;
8045
8046        net->dev_index_head = netdev_create_hash();
8047        if (net->dev_index_head == NULL)
8048                goto err_idx;
8049
8050        return 0;
8051
8052err_idx:
8053        kfree(net->dev_name_head);
8054err_name:
8055        return -ENOMEM;
8056}
8057
8058/**
8059 *      netdev_drivername - network driver for the device
8060 *      @dev: network device
8061 *
8062 *      Determine network driver for device.
8063 */
8064const char *netdev_drivername(const struct net_device *dev)
8065{
8066        const struct device_driver *driver;
8067        const struct device *parent;
8068        const char *empty = "";
8069
8070        parent = dev->dev.parent;
8071        if (!parent)
8072                return empty;
8073
8074        driver = parent->driver;
8075        if (driver && driver->name)
8076                return driver->name;
8077        return empty;
8078}
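/* Editor's sketch (not in the original source): the returned string is
 * intended for diagnostics, e.g. a watchdog-style message; the queue
 * index 'i' is an illustrative assumption.
 *
 *	WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
 *		  dev->name, netdev_drivername(dev), i);
 */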
8079
8080static void __netdev_printk(const char *level, const struct net_device *dev,
8081                            struct va_format *vaf)
8082{
8083        if (dev && dev->dev.parent) {
8084                dev_printk_emit(level[1] - '0',
8085                                dev->dev.parent,
8086                                "%s %s %s%s: %pV",
8087                                dev_driver_string(dev->dev.parent),
8088                                dev_name(dev->dev.parent),
8089                                netdev_name(dev), netdev_reg_state(dev),
8090                                vaf);
8091        } else if (dev) {
8092                printk("%s%s%s: %pV",
8093                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8094        } else {
8095                printk("%s(NULL net_device): %pV", level, vaf);
8096        }
8097}
8098
8099void netdev_printk(const char *level, const struct net_device *dev,
8100                   const char *format, ...)
8101{
8102        struct va_format vaf;
8103        va_list args;
8104
8105        va_start(args, format);
8106
8107        vaf.fmt = format;
8108        vaf.va = &args;
8109
8110        __netdev_printk(level, dev, &vaf);
8111
8112        va_end(args);
8113}
8114EXPORT_SYMBOL(netdev_printk);
8115
8116#define define_netdev_printk_level(func, level)                 \
8117void func(const struct net_device *dev, const char *fmt, ...)   \
8118{                                                               \
8119        struct va_format vaf;                                   \
8120        va_list args;                                           \
8121                                                                \
8122        va_start(args, fmt);                                    \
8123                                                                \
8124        vaf.fmt = fmt;                                          \
8125        vaf.va = &args;                                         \
8126                                                                \
8127        __netdev_printk(level, dev, &vaf);                      \
8128                                                                \
8129        va_end(args);                                           \
8130}                                                               \
8131EXPORT_SYMBOL(func);
8132
8133define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8134define_netdev_printk_level(netdev_alert, KERN_ALERT);
8135define_netdev_printk_level(netdev_crit, KERN_CRIT);
8136define_netdev_printk_level(netdev_err, KERN_ERR);
8137define_netdev_printk_level(netdev_warn, KERN_WARNING);
8138define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8139define_netdev_printk_level(netdev_info, KERN_INFO);
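/* Editor's sketch (not in the original source): drivers normally call the
 * level-specific helpers generated above (or the netdev_dbg() macro) rather
 * than netdev_printk() directly; 'speed' and 'full_duplex' are illustrative
 * assumptions.
 *
 *	netdev_info(dev, "link up, %u Mbps %s duplex\n",
 *		    speed, full_duplex ? "full" : "half");
 *	netdev_err(dev, "failed to restart the MAC\n");
 */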
8140
8141static void __net_exit netdev_exit(struct net *net)
8142{
8143        kfree(net->dev_name_head);
8144        kfree(net->dev_index_head);
8145}
8146
8147static struct pernet_operations __net_initdata netdev_net_ops = {
8148        .init = netdev_init,
8149        .exit = netdev_exit,
8150};
8151
8152static void __net_exit default_device_exit(struct net *net)
8153{
8154        struct net_device *dev, *aux;
8155        /*
8156         * Push all migratable network devices back to the
8157         * initial network namespace
8158         */
8159        rtnl_lock();
8160        for_each_netdev_safe(net, dev, aux) {
8161                int err;
8162                char fb_name[IFNAMSIZ];
8163
8164                /* Ignore unmovable devices (e.g. loopback) */
8165                if (dev->features & NETIF_F_NETNS_LOCAL)
8166                        continue;
8167
8168                /* Leave virtual devices for the generic cleanup */
8169                if (dev->rtnl_link_ops)
8170                        continue;
8171
8172                /* Push remaining network devices to init_net */
8173                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8174                err = dev_change_net_namespace(dev, &init_net, fb_name);
8175                if (err) {
8176                        pr_emerg("%s: failed to move %s to init_net: %d\n",
8177                                 __func__, dev->name, err);
8178                        BUG();
8179                }
8180        }
8181        rtnl_unlock();
8182}
8183
8184static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8185{
8186        /* Return with the rtnl_lock held when there are no network
8187         * devices unregistering in any network namespace in net_list.
8188         */
8189        struct net *net;
8190        bool unregistering;
8191        DEFINE_WAIT_FUNC(wait, woken_wake_function);
8192
8193        add_wait_queue(&netdev_unregistering_wq, &wait);
8194        for (;;) {
8195                unregistering = false;
8196                rtnl_lock();
8197                list_for_each_entry(net, net_list, exit_list) {
8198                        if (net->dev_unreg_count > 0) {
8199                                unregistering = true;
8200                                break;
8201                        }
8202                }
8203                if (!unregistering)
8204                        break;
8205                __rtnl_unlock();
8206
8207                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8208        }
8209        remove_wait_queue(&netdev_unregistering_wq, &wait);
8210}
8211
8212static void __net_exit default_device_exit_batch(struct list_head *net_list)
8213{
8214        /* At exit all network devices must be removed from a network
8215         * namespace.  Do this in the reverse order of registration.
8216         * Do this across as many network namespaces as possible to
8217         * improve batching efficiency.
8218         */
8219        struct net_device *dev;
8220        struct net *net;
8221        LIST_HEAD(dev_kill_list);
8222
8223        /* To prevent network device cleanup code from dereferencing
8224         * loopback devices or network devices that have been freed,
8225         * wait here for all pending unregistrations to complete
8226         * before unregistering the loopback device and allowing the
8227         * network namespace to be freed.
8228         *
8229         * The netdev todo list containing all network device
8230         * unregistrations that happen in default_device_exit_batch
8231         * will be processed in the rtnl_unlock() at the end of
8232         * default_device_exit_batch.
8233         */
8234        rtnl_lock_unregistering(net_list);
8235        list_for_each_entry(net, net_list, exit_list) {
8236                for_each_netdev_reverse(net, dev) {
8237                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8238                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8239                        else
8240                                unregister_netdevice_queue(dev, &dev_kill_list);
8241                }
8242        }
8243        unregister_netdevice_many(&dev_kill_list);
8244        rtnl_unlock();
8245}
8246
8247static struct pernet_operations __net_initdata default_device_ops = {
8248        .exit = default_device_exit,
8249        .exit_batch = default_device_exit_batch,
8250};
8251
8252/*
8253 *      Initialize the DEV module. At boot time this walks the device list and
8254 *      unhooks any devices that fail to initialise (normally hardware not
8255 *      present) and leaves us with a valid list of present and active devices.
8256 *
8257 */
8258
8259/*
8260 *       This is called single-threaded during boot, so there is no need
8261 *       to take the rtnl semaphore.
8262 */
8263static int __init net_dev_init(void)
8264{
8265        int i, rc = -ENOMEM;
8266
8267        BUG_ON(!dev_boot_phase);
8268
8269        if (dev_proc_init())
8270                goto out;
8271
8272        if (netdev_kobject_init())
8273                goto out;
8274
8275        INIT_LIST_HEAD(&ptype_all);
8276        for (i = 0; i < PTYPE_HASH_SIZE; i++)
8277                INIT_LIST_HEAD(&ptype_base[i]);
8278
8279        INIT_LIST_HEAD(&offload_base);
8280
8281        if (register_pernet_subsys(&netdev_net_ops))
8282                goto out;
8283
8284        /*
8285         *      Initialise the packet receive queues.
8286         */
8287
8288        for_each_possible_cpu(i) {
8289                struct softnet_data *sd = &per_cpu(softnet_data, i);
8290
8291                skb_queue_head_init(&sd->input_pkt_queue);
8292                skb_queue_head_init(&sd->process_queue);
8293                INIT_LIST_HEAD(&sd->poll_list);
8294                sd->output_queue_tailp = &sd->output_queue;
8295#ifdef CONFIG_RPS
8296                sd->csd.func = rps_trigger_softirq;
8297                sd->csd.info = sd;
8298                sd->cpu = i;
8299#endif
8300
8301                sd->backlog.poll = process_backlog;
8302                sd->backlog.weight = weight_p;
8303        }
8304
8305        dev_boot_phase = 0;
8306
8307        /* The loopback device is special: if any other network device
8308         * is present in a network namespace, the loopback device must
8309         * be present too.  Since we now dynamically allocate and free
8310         * the loopback device, ensure this invariant is maintained by
8311         * keeping the loopback device as the first device on the
8312         * list of network devices.  This ensures the loopback device
8313         * is the first device that appears and the last network device
8314         * that disappears.
8315         */
8316        if (register_pernet_device(&loopback_net_ops))
8317                goto out;
8318
8319        if (register_pernet_device(&default_device_ops))
8320                goto out;
8321
8322        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8323        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8324
8325        hotcpu_notifier(dev_cpu_callback, 0);
8326        dst_subsys_init();
8327        rc = 0;
8328out:
8329        return rc;
8330}
8331
8332subsys_initcall(net_dev_init);
8333