LXR linux/net/core/dev.c

   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      NET3    Protocol independent device support routines.
   4 *
   5 *      Derived from the non IP parts of dev.c 1.0.19
   6 *              Authors:        Ross Biro
   7 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9 *
  10 *      Additional Authors:
  11 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13 *              David Hinds <dahinds@users.sourceforge.net>
  14 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15 *              Adam Sulmicki <adam@cfar.umd.edu>
  16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17 *
  18 *      Changes:
  19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20 *                                      to 2 if register_netdev gets called
  21 *                                      before net_dev_init & also removed a
  22 *                                      few lines of code in the process.
  23 *              Alan Cox        :       device private ioctl copies fields back.
  24 *              Alan Cox        :       Transmit queue code does relevant
  25 *                                      stunts to keep the queue safe.
  26 *              Alan Cox        :       Fixed double lock.
  27 *              Alan Cox        :       Fixed promisc NULL pointer trap
  28 *              ????????        :       Support the full private ioctl range
  29 *              Alan Cox        :       Moved ioctl permission check into
  30 *                                      drivers
  31 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32 *              Alan Cox        :       100 backlog just doesn't cut it when
  33 *                                      you start doing multicast video 8)
  34 *              Alan Cox        :       Rewrote net_bh and list manager.
  35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36 *              Alan Cox        :       Took out transmit every packet pass
  37 *                                      Saved a few bytes in the ioctl handler
  38 *              Alan Cox        :       Network driver sets packet type before
  39 *                                      calling netif_rx. Saves a function
  40 *                                      call a packet.
  41 *              Alan Cox        :       Hashed net_bh()
  42 *              Richard Kooijman:       Timestamp fixes.
  43 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44 *              Alan Cox        :       Device lock protection.
  45 *              Alan Cox        :       Fixed nasty side effect of device close
  46 *                                      changes.
  47 *              Rudi Cilibrasi  :       Pass the right thing to
  48 *                                      set_mac_address()
  49 *              Dave Miller     :       32bit quantity for the device lock to
  50 *                                      make it work out on a Sparc.
  51 *              Bjorn Ekwall    :       Added KERNELD hack.
  52 *              Alan Cox        :       Cleaned up the backlog initialise.
  53 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54 *                                      1 device.
  55 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56 *                                      is no device open function.
  57 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59 *              Cyrus Durgin    :       Cleaned for KMOD
  60 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61 *                                      A network device unload needs to purge
  62 *                                      the backlog queue.
  63 *      Paul Rusty Russell      :       SIOCSIFNAME
  64 *              Pekka Riikonen  :       Netdev boot-time settings code
  65 *              Andrew Morton   :       Make unregister_netdevice wait
  66 *                                      indefinitely on dev->refcnt
  67 *              J Hadi Salim    :       - Backlog queue sampling
  68 *                                      - netif_rx() feedback
  69 */
  70
  71#include <linux/uaccess.h>
  72#include <linux/bitops.h>
  73#include <linux/capability.h>
  74#include <linux/cpu.h>
  75#include <linux/types.h>
  76#include <linux/kernel.h>
  77#include <linux/hash.h>
  78#include <linux/slab.h>
  79#include <linux/sched.h>
  80#include <linux/sched/mm.h>
  81#include <linux/mutex.h>
  82#include <linux/string.h>
  83#include <linux/mm.h>
  84#include <linux/socket.h>
  85#include <linux/sockios.h>
  86#include <linux/errno.h>
  87#include <linux/interrupt.h>
  88#include <linux/if_ether.h>
  89#include <linux/netdevice.h>
  90#include <linux/etherdevice.h>
  91#include <linux/ethtool.h>
  92#include <linux/skbuff.h>
  93#include <linux/bpf.h>
  94#include <linux/bpf_trace.h>
  95#include <net/net_namespace.h>
  96#include <net/sock.h>
  97#include <net/busy_poll.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/stat.h>
 100#include <net/dst.h>
 101#include <net/dst_metadata.h>
 102#include <net/pkt_sched.h>
 103#include <net/pkt_cls.h>
 104#include <net/checksum.h>
 105#include <net/xfrm.h>
 106#include <linux/highmem.h>
 107#include <linux/init.h>
 108#include <linux/module.h>
 109#include <linux/netpoll.h>
 110#include <linux/rcupdate.h>
 111#include <linux/delay.h>
 112#include <net/iw_handler.h>
 113#include <asm/current.h>
 114#include <linux/audit.h>
 115#include <linux/dmaengine.h>
 116#include <linux/err.h>
 117#include <linux/ctype.h>
 118#include <linux/if_arp.h>
 119#include <linux/if_vlan.h>
 120#include <linux/ip.h>
 121#include <net/ip.h>
 122#include <net/mpls.h>
 123#include <linux/ipv6.h>
 124#include <linux/in.h>
 125#include <linux/jhash.h>
 126#include <linux/random.h>
 127#include <trace/events/napi.h>
 128#include <trace/events/net.h>
 129#include <trace/events/skb.h>
 130#include <linux/inetdevice.h>
 131#include <linux/cpu_rmap.h>
 132#include <linux/static_key.h>
 133#include <linux/hashtable.h>
 134#include <linux/vmalloc.h>
 135#include <linux/if_macvlan.h>
 136#include <linux/errqueue.h>
 137#include <linux/hrtimer.h>
 138#include <linux/netfilter_ingress.h>
 139#include <linux/crash_dump.h>
 140#include <linux/sctp.h>
 141#include <net/udp_tunnel.h>
 142#include <linux/net_namespace.h>
 143#include <linux/indirect_call_wrapper.h>
 144#include <net/devlink.h>
 145
 146#include "net-sysfs.h"
 147
 148#define MAX_GRO_SKBS 8
 149
 150/* This should be increased if a protocol with a bigger head is added. */
 151#define GRO_MAX_HEAD (MAX_HEADER + 128)
 152
 153static DEFINE_SPINLOCK(ptype_lock);
 154static DEFINE_SPINLOCK(offload_lock);
 155struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 156struct list_head ptype_all __read_mostly;       /* Taps */
 157static struct list_head offload_base __read_mostly;
 158
 159static int netif_rx_internal(struct sk_buff *skb);
 160static int call_netdevice_notifiers_info(unsigned long val,
 161                                         struct netdev_notifier_info *info);
 162static int call_netdevice_notifiers_extack(unsigned long val,
 163                                           struct net_device *dev,
 164                                           struct netlink_ext_ack *extack);
 165static struct napi_struct *napi_by_id(unsigned int napi_id);
 166
 167/*
 168 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 169 * semaphore.
 170 *
 171 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 172 *
 173 * Writers must hold the rtnl semaphore while they loop through the
 174 * dev_base_head list, and hold dev_base_lock for writing when they do the
 175 * actual updates.  This allows pure readers to access the list even
 176 * while a writer is preparing to update it.
 177 *
 178 * To put it another way, dev_base_lock is held for writing only to
 179 * protect against pure readers; the rtnl semaphore provides the
 180 * protection against other writers.
 181 *
 182 * See, for example usages, register_netdevice() and
 183 * unregister_netdevice(), which must be called with the rtnl
 184 * semaphore held.
 185 */
 186DEFINE_RWLOCK(dev_base_lock);
 187EXPORT_SYMBOL(dev_base_lock);
 188
 189static DEFINE_MUTEX(ifalias_mutex);
 190
 191/* protects napi_hash addition/deletion and napi_gen_id */
 192static DEFINE_SPINLOCK(napi_hash_lock);
 193
 194static unsigned int napi_gen_id = NR_CPUS;
 195static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 196
 197static seqcount_t devnet_rename_seq;
 198
 199static inline void dev_base_seq_inc(struct net *net)
 200{
 201        while (++net->dev_base_seq == 0)
 202                ;
 203}
 204
 205static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 206{
 207        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 208
 209        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 210}
 211
 212static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 213{
 214        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 215}
 216
 217static inline void rps_lock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220        spin_lock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224static inline void rps_unlock(struct softnet_data *sd)
 225{
 226#ifdef CONFIG_RPS
 227        spin_unlock(&sd->input_pkt_queue.lock);
 228#endif
 229}
 230
 231static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 232                                                       const char *name)
 233{
 234        struct netdev_name_node *name_node;
 235
 236        name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
 237        if (!name_node)
 238                return NULL;
 239        INIT_HLIST_NODE(&name_node->hlist);
 240        name_node->dev = dev;
 241        name_node->name = name;
 242        return name_node;
 243}
 244
 245static struct netdev_name_node *
 246netdev_name_node_head_alloc(struct net_device *dev)
 247{
 248        struct netdev_name_node *name_node;
 249
 250        name_node = netdev_name_node_alloc(dev, dev->name);
 251        if (!name_node)
 252                return NULL;
 253        INIT_LIST_HEAD(&name_node->list);
 254        return name_node;
 255}
 256
 257static void netdev_name_node_free(struct netdev_name_node *name_node)
 258{
 259        kfree(name_node);
 260}
 261
 262static void netdev_name_node_add(struct net *net,
 263                                 struct netdev_name_node *name_node)
 264{
 265        hlist_add_head_rcu(&name_node->hlist,
 266                           dev_name_hash(net, name_node->name));
 267}
 268
 269static void netdev_name_node_del(struct netdev_name_node *name_node)
 270{
 271        hlist_del_rcu(&name_node->hlist);
 272}
 273
 274static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
 275                                                        const char *name)
 276{
 277        struct hlist_head *head = dev_name_hash(net, name);
 278        struct netdev_name_node *name_node;
 279
 280        hlist_for_each_entry(name_node, head, hlist)
 281                if (!strcmp(name_node->name, name))
 282                        return name_node;
 283        return NULL;
 284}
 285
 286static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 287                                                            const char *name)
 288{
 289        struct hlist_head *head = dev_name_hash(net, name);
 290        struct netdev_name_node *name_node;
 291
 292        hlist_for_each_entry_rcu(name_node, head, hlist)
 293                if (!strcmp(name_node->name, name))
 294                        return name_node;
 295        return NULL;
 296}
 297
 298int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 299{
 300        struct netdev_name_node *name_node;
 301        struct net *net = dev_net(dev);
 302
 303        name_node = netdev_name_node_lookup(net, name);
 304        if (name_node)
 305                return -EEXIST;
 306        name_node = netdev_name_node_alloc(dev, name);
 307        if (!name_node)
 308                return -ENOMEM;
 309        netdev_name_node_add(net, name_node);
 310        /* The node that holds dev->name acts as a head of per-device list. */
 311        list_add_tail(&name_node->list, &dev->name_node->list);
 312
 313        return 0;
 314}
 315EXPORT_SYMBOL(netdev_name_node_alt_create);
 316
 317static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 318{
 319        list_del(&name_node->list);
 320        netdev_name_node_del(name_node);
 321        kfree(name_node->name);
 322        netdev_name_node_free(name_node);
 323}
 324
 325int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 326{
 327        struct netdev_name_node *name_node;
 328        struct net *net = dev_net(dev);
 329
 330        name_node = netdev_name_node_lookup(net, name);
 331        if (!name_node)
 332                return -ENOENT;
 333        /* lookup might have found our primary name or a name belonging
 334         * to another device.
 335         */
 336        if (name_node == dev->name_node || name_node->dev != dev)
 337                return -EINVAL;
 338
 339        __netdev_name_node_alt_destroy(name_node);
 340
 341        return 0;
 342}
 343EXPORT_SYMBOL(netdev_name_node_alt_destroy);
 344
 345static void netdev_name_node_alt_flush(struct net_device *dev)
 346{
 347        struct netdev_name_node *name_node, *tmp;
 348
 349        list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 350                __netdev_name_node_alt_destroy(name_node);
 351}
 352
 353/* Device list insertion */
 354static void list_netdevice(struct net_device *dev)
 355{
 356        struct net *net = dev_net(dev);
 357
 358        ASSERT_RTNL();
 359
 360        write_lock_bh(&dev_base_lock);
 361        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 362        netdev_name_node_add(net, dev->name_node);
 363        hlist_add_head_rcu(&dev->index_hlist,
 364                           dev_index_hash(net, dev->ifindex));
 365        write_unlock_bh(&dev_base_lock);
 366
 367        dev_base_seq_inc(net);
 368}
 369
 370/* Device list removal
 371 * caller must respect a RCU grace period before freeing/reusing dev
 372 */
 373static void unlist_netdevice(struct net_device *dev)
 374{
 375        ASSERT_RTNL();
 376
 377        /* Unlink dev from the device chain */
 378        write_lock_bh(&dev_base_lock);
 379        list_del_rcu(&dev->dev_list);
 380        netdev_name_node_del(dev->name_node);
 381        hlist_del_rcu(&dev->index_hlist);
 382        write_unlock_bh(&dev_base_lock);
 383
 384        dev_base_seq_inc(dev_net(dev));
 385}
 386
 387/*
 388 *      Our notifier list
 389 */
 390
 391static RAW_NOTIFIER_HEAD(netdev_chain);
 392
 393/*
 394 *      Device drivers call our routines to queue packets here. We empty the
 395 *      queue in the local softnet handler.
 396 */
 397
 398DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 399EXPORT_PER_CPU_SYMBOL(softnet_data);
 400
 401/*******************************************************************************
 402 *
 403 *              Protocol management and registration routines
 404 *
 405 *******************************************************************************/
 406
 407
 408/*
 409 *      Add a protocol ID to the list. Now that the input handler is
 410 *      smarter we can dispense with all the messy stuff that used to be
 411 *      here.
 412 *
 413 *      BEWARE!!! Protocol handlers, mangling input packets,
 414 *      MUST BE last in hash buckets and checking protocol handlers
 415 *      MUST start from promiscuous ptype_all chain in net_bh.
 416 *      It is true now, do not change it.
 417 *      Explanation follows: if protocol handler, mangling packet, will
 418 *      be the first on list, it is not able to sense, that packet
 419 *      is cloned and should be copied-on-write, so that it will
 420 *      change it and subsequent readers will get broken packet.
 421 *                                                      --ANK (980803)
 422 */
 423
 424static inline struct list_head *ptype_head(const struct packet_type *pt)
 425{
 426        if (pt->type == htons(ETH_P_ALL))
 427                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 428        else
 429                return pt->dev ? &pt->dev->ptype_specific :
 430                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 431}
 432
 433/**
 434 *      dev_add_pack - add packet handler
 435 *      @pt: packet type declaration
 436 *
 437 *      Add a protocol handler to the networking stack. The passed &packet_type
 438 *      is linked into kernel lists and may not be freed until it has been
 439 *      removed from the kernel lists.
 440 *
 441 *      This call does not sleep therefore it can not
 442 *      guarantee all CPU's that are in middle of receiving packets
 443 *      will see the new packet type (until the next received packet).
 444 */
 445
 446void dev_add_pack(struct packet_type *pt)
 447{
 448        struct list_head *head = ptype_head(pt);
 449
 450        spin_lock(&ptype_lock);
 451        list_add_rcu(&pt->list, head);
 452        spin_unlock(&ptype_lock);
 453}
 454EXPORT_SYMBOL(dev_add_pack);
 455
 456/**
 457 *      __dev_remove_pack        - remove packet handler
 458 *      @pt: packet type declaration
 459 *
 460 *      Remove a protocol handler that was previously added to the kernel
 461 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462 *      from the kernel lists and can be freed or reused once this function
 463 *      returns.
 464 *
 465 *      The packet type might still be in use by receivers
 466 *      and must not be freed until after all the CPU's have gone
 467 *      through a quiescent state.
 468 */
 469void __dev_remove_pack(struct packet_type *pt)
 470{
 471        struct list_head *head = ptype_head(pt);
 472        struct packet_type *pt1;
 473
 474        spin_lock(&ptype_lock);
 475
 476        list_for_each_entry(pt1, head, list) {
 477                if (pt == pt1) {
 478                        list_del_rcu(&pt->list);
 479                        goto out;
 480                }
 481        }
 482
 483        pr_warn("dev_remove_pack: %p not found\n", pt);
 484out:
 485        spin_unlock(&ptype_lock);
 486}
 487EXPORT_SYMBOL(__dev_remove_pack);
 488
 489/**
 490 *      dev_remove_pack  - remove packet handler
 491 *      @pt: packet type declaration
 492 *
 493 *      Remove a protocol handler that was previously added to the kernel
 494 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 495 *      from the kernel lists and can be freed or reused once this function
 496 *      returns.
 497 *
 498 *      This call sleeps to guarantee that no CPU is looking at the packet
 499 *      type after return.
 500 */
 501void dev_remove_pack(struct packet_type *pt)
 502{
 503        __dev_remove_pack(pt);
 504
 505        synchronize_net();
 506}
 507EXPORT_SYMBOL(dev_remove_pack);
 508
 509
 510/**
 511 *      dev_add_offload - register offload handlers
 512 *      @po: protocol offload declaration
 513 *
 514 *      Add protocol offload handlers to the networking stack. The passed
 515 *      &proto_offload is linked into kernel lists and may not be freed until
 516 *      it has been removed from the kernel lists.
 517 *
 518 *      This call does not sleep therefore it can not
 519 *      guarantee all CPU's that are in middle of receiving packets
 520 *      will see the new offload handlers (until the next received packet).
 521 */
 522void dev_add_offload(struct packet_offload *po)
 523{
 524        struct packet_offload *elem;
 525
 526        spin_lock(&offload_lock);
 527        list_for_each_entry(elem, &offload_base, list) {
 528                if (po->priority < elem->priority)
 529                        break;
 530        }
 531        list_add_rcu(&po->list, elem->list.prev);
 532        spin_unlock(&offload_lock);
 533}
 534EXPORT_SYMBOL(dev_add_offload);
 535
 536/**
 537 *      __dev_remove_offload     - remove offload handler
 538 *      @po: packet offload declaration
 539 *
 540 *      Remove a protocol offload handler that was previously added to the
 541 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 542 *      is removed from the kernel lists and can be freed or reused once this
 543 *      function returns.
 544 *
 545 *      The packet type might still be in use by receivers
 546 *      and must not be freed until after all the CPU's have gone
 547 *      through a quiescent state.
 548 */
 549static void __dev_remove_offload(struct packet_offload *po)
 550{
 551        struct list_head *head = &offload_base;
 552        struct packet_offload *po1;
 553
 554        spin_lock(&offload_lock);
 555
 556        list_for_each_entry(po1, head, list) {
 557                if (po == po1) {
 558                        list_del_rcu(&po->list);
 559                        goto out;
 560                }
 561        }
 562
 563        pr_warn("dev_remove_offload: %p not found\n", po);
 564out:
 565        spin_unlock(&offload_lock);
 566}
 567
 568/**
 569 *      dev_remove_offload       - remove packet offload handler
 570 *      @po: packet offload declaration
 571 *
 572 *      Remove a packet offload handler that was previously added to the kernel
 573 *      offload handlers by dev_add_offload(). The passed &offload_type is
 574 *      removed from the kernel lists and can be freed or reused once this
 575 *      function returns.
 576 *
 577 *      This call sleeps to guarantee that no CPU is looking at the packet
 578 *      type after return.
 579 */
 580void dev_remove_offload(struct packet_offload *po)
 581{
 582        __dev_remove_offload(po);
 583
 584        synchronize_net();
 585}
 586EXPORT_SYMBOL(dev_remove_offload);
 587
 588/******************************************************************************
 589 *
 590 *                    Device Boot-time Settings Routines
 591 *
 592 ******************************************************************************/
 593
 594/* Boot time configuration table */
 595static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 596
 597/**
 598 *      netdev_boot_setup_add   - add new setup entry
 599 *      @name: name of the device
 600 *      @map: configured settings for the device
 601 *
 602 *      Adds new setup entry to the dev_boot_setup list.  The function
 603 *      returns 0 on error and 1 on success.  This is a generic routine to
 604 *      all netdevices.
 605 */
 606static int netdev_boot_setup_add(char *name, struct ifmap *map)
 607{
 608        struct netdev_boot_setup *s;
 609        int i;
 610
 611        s = dev_boot_setup;
 612        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 613                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 614                        memset(s[i].name, 0, sizeof(s[i].name));
 615                        strlcpy(s[i].name, name, IFNAMSIZ);
 616                        memcpy(&s[i].map, map, sizeof(s[i].map));
 617                        break;
 618                }
 619        }
 620
 621        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 622}
 623
 624/**
 625 * netdev_boot_setup_check      - check boot time settings
 626 * @dev: the netdevice
 627 *
 628 * Check boot time settings for the device.
 629 * The found settings are set for the device to be used
 630 * later in the device probing.
 631 * Returns 0 if no settings found, 1 if they are.
 632 */
 633int netdev_boot_setup_check(struct net_device *dev)
 634{
 635        struct netdev_boot_setup *s = dev_boot_setup;
 636        int i;
 637
 638        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 639                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 640                    !strcmp(dev->name, s[i].name)) {
 641                        dev->irq = s[i].map.irq;
 642                        dev->base_addr = s[i].map.base_addr;
 643                        dev->mem_start = s[i].map.mem_start;
 644                        dev->mem_end = s[i].map.mem_end;
 645                        return 1;
 646                }
 647        }
 648        return 0;
 649}
 650EXPORT_SYMBOL(netdev_boot_setup_check);
 651
 652
 653/**
 654 * netdev_boot_base     - get address from boot time settings
 655 * @prefix: prefix for network device
 656 * @unit: id for network device
 657 *
 658 * Check boot time settings for the base address of device.
 659 * The found settings are set for the device to be used
 660 * later in the device probing.
 661 * Returns 0 if no settings found.
 662 */
 663unsigned long netdev_boot_base(const char *prefix, int unit)
 664{
 665        const struct netdev_boot_setup *s = dev_boot_setup;
 666        char name[IFNAMSIZ];
 667        int i;
 668
 669        sprintf(name, "%s%d", prefix, unit);
 670
 671        /*
 672         * If device already registered then return base of 1
 673         * to indicate not to probe for this interface
 674         */
 675        if (__dev_get_by_name(&init_net, name))
 676                return 1;
 677
 678        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 679                if (!strcmp(name, s[i].name))
 680                        return s[i].map.base_addr;
 681        return 0;
 682}
 683
 684/*
 685 * Saves at boot time configured settings for any netdevice.
 686 */
 687int __init netdev_boot_setup(char *str)
 688{
 689        int ints[5];
 690        struct ifmap map;
 691
 692        str = get_options(str, ARRAY_SIZE(ints), ints);
 693        if (!str || !*str)
 694                return 0;
 695
 696        /* Save settings */
 697        memset(&map, 0, sizeof(map));
 698        if (ints[0] > 0)
 699                map.irq = ints[1];
 700        if (ints[0] > 1)
 701                map.base_addr = ints[2];
 702        if (ints[0] > 2)
 703                map.mem_start = ints[3];
 704        if (ints[0] > 3)
 705                map.mem_end = ints[4];
 706
 707        /* Add new entry to the list */
 708        return netdev_boot_setup_add(str, &map);
 709}
 710
 711__setup("netdev=", netdev_boot_setup);
 712
 713/*******************************************************************************
 714 *
 715 *                          Device Interface Subroutines
 716 *
 717 *******************************************************************************/
 718
 719/**
 720 *      dev_get_iflink  - get 'iflink' value of a interface
 721 *      @dev: targeted interface
 722 *
 723 *      Indicates the ifindex the interface is linked to.
 724 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 725 */
 726
 727int dev_get_iflink(const struct net_device *dev)
 728{
 729        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 730                return dev->netdev_ops->ndo_get_iflink(dev);
 731
 732        return dev->ifindex;
 733}
 734EXPORT_SYMBOL(dev_get_iflink);
 735
 736/**
 737 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 738 *      @dev: targeted interface
 739 *      @skb: The packet.
 740 *
 741 *      For better visibility of tunnel traffic OVS needs to retrieve
 742 *      egress tunnel information for a packet. Following API allows
 743 *      user to get this info.
 744 */
 745int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 746{
 747        struct ip_tunnel_info *info;
 748
 749        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 750                return -EINVAL;
 751
 752        info = skb_tunnel_info_unclone(skb);
 753        if (!info)
 754                return -ENOMEM;
 755        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 756                return -EINVAL;
 757
 758        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 759}
 760EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 761
 762/**
 763 *      __dev_get_by_name       - find a device by its name
 764 *      @net: the applicable net namespace
 765 *      @name: name to find
 766 *
 767 *      Find an interface by name. Must be called under RTNL semaphore
 768 *      or @dev_base_lock. If the name is found a pointer to the device
 769 *      is returned. If the name is not found then %NULL is returned. The
 770 *      reference counters are not incremented so the caller must be
 771 *      careful with locks.
 772 */
 773
 774struct net_device *__dev_get_by_name(struct net *net, const char *name)
 775{
 776        struct netdev_name_node *node_name;
 777
 778        node_name = netdev_name_node_lookup(net, name);
 779        return node_name ? node_name->dev : NULL;
 780}
 781EXPORT_SYMBOL(__dev_get_by_name);
 782
 783/**
 784 * dev_get_by_name_rcu  - find a device by its name
 785 * @net: the applicable net namespace
 786 * @name: name to find
 787 *
 788 * Find an interface by name.
 789 * If the name is found a pointer to the device is returned.
 790 * If the name is not found then %NULL is returned.
 791 * The reference counters are not incremented so the caller must be
 792 * careful with locks. The caller must hold RCU lock.
 793 */
 794
 795struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 796{
 797        struct netdev_name_node *node_name;
 798
 799        node_name = netdev_name_node_lookup_rcu(net, name);
 800        return node_name ? node_name->dev : NULL;
 801}
 802EXPORT_SYMBOL(dev_get_by_name_rcu);
 803
 804/**
 805 *      dev_get_by_name         - find a device by its name
 806 *      @net: the applicable net namespace
 807 *      @name: name to find
 808 *
 809 *      Find an interface by name. This can be called from any
 810 *      context and does its own locking. The returned handle has
 811 *      the usage count incremented and the caller must use dev_put() to
 812 *      release it when it is no longer needed. %NULL is returned if no
 813 *      matching device is found.
 814 */
 815
 816struct net_device *dev_get_by_name(struct net *net, const char *name)
 817{
 818        struct net_device *dev;
 819
 820        rcu_read_lock();
 821        dev = dev_get_by_name_rcu(net, name);
 822        if (dev)
 823                dev_hold(dev);
 824        rcu_read_unlock();
 825        return dev;
 826}
 827EXPORT_SYMBOL(dev_get_by_name);
 828
 829/**
 830 *      __dev_get_by_index - find a device by its ifindex
 831 *      @net: the applicable net namespace
 832 *      @ifindex: index of device
 833 *
 834 *      Search for an interface by index. Returns %NULL if the device
 835 *      is not found or a pointer to the device. The device has not
 836 *      had its reference counter increased so the caller must be careful
 837 *      about locking. The caller must hold either the RTNL semaphore
 838 *      or @dev_base_lock.
 839 */
 840
 841struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 842{
 843        struct net_device *dev;
 844        struct hlist_head *head = dev_index_hash(net, ifindex);
 845
 846        hlist_for_each_entry(dev, head, index_hlist)
 847                if (dev->ifindex == ifindex)
 848                        return dev;
 849
 850        return NULL;
 851}
 852EXPORT_SYMBOL(__dev_get_by_index);
 853
 854/**
 855 *      dev_get_by_index_rcu - find a device by its ifindex
 856 *      @net: the applicable net namespace
 857 *      @ifindex: index of device
 858 *
 859 *      Search for an interface by index. Returns %NULL if the device
 860 *      is not found or a pointer to the device. The device has not
 861 *      had its reference counter increased so the caller must be careful
 862 *      about locking. The caller must hold RCU lock.
 863 */
 864
 865struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 866{
 867        struct net_device *dev;
 868        struct hlist_head *head = dev_index_hash(net, ifindex);
 869
 870        hlist_for_each_entry_rcu(dev, head, index_hlist)
 871                if (dev->ifindex == ifindex)
 872                        return dev;
 873
 874        return NULL;
 875}
 876EXPORT_SYMBOL(dev_get_by_index_rcu);
 877
 878
 879/**
 880 *      dev_get_by_index - find a device by its ifindex
 881 *      @net: the applicable net namespace
 882 *      @ifindex: index of device
 883 *
 884 *      Search for an interface by index. Returns NULL if the device
 885 *      is not found or a pointer to the device. The device returned has
 886 *      had a reference added and the pointer is safe until the user calls
 887 *      dev_put to indicate they have finished with it.
 888 */
 889
 890struct net_device *dev_get_by_index(struct net *net, int ifindex)
 891{
 892        struct net_device *dev;
 893
 894        rcu_read_lock();
 895        dev = dev_get_by_index_rcu(net, ifindex);
 896        if (dev)
 897                dev_hold(dev);
 898        rcu_read_unlock();
 899        return dev;
 900}
 901EXPORT_SYMBOL(dev_get_by_index);
 902
 903/**
 904 *      dev_get_by_napi_id - find a device by napi_id
 905 *      @napi_id: ID of the NAPI struct
 906 *
 907 *      Search for an interface by NAPI ID. Returns %NULL if the device
 908 *      is not found or a pointer to the device. The device has not had
 909 *      its reference counter increased so the caller must be careful
 910 *      about locking. The caller must hold RCU lock.
 911 */
 912
 913struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 914{
 915        struct napi_struct *napi;
 916
 917        WARN_ON_ONCE(!rcu_read_lock_held());
 918
 919        if (napi_id < MIN_NAPI_ID)
 920                return NULL;
 921
 922        napi = napi_by_id(napi_id);
 923
 924        return napi ? napi->dev : NULL;
 925}
 926EXPORT_SYMBOL(dev_get_by_napi_id);
 927
 928/**
 929 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 930 *      @net: network namespace
 931 *      @name: a pointer to the buffer where the name will be stored.
 932 *      @ifindex: the ifindex of the interface to get the name from.
 933 *
 934 *      The use of raw_seqcount_begin() and cond_resched() before
 935 *      retrying is required as we want to give the writers a chance
 936 *      to complete when CONFIG_PREEMPTION is not set.
 937 */
 938int netdev_get_name(struct net *net, char *name, int ifindex)
 939{
 940        struct net_device *dev;
 941        unsigned int seq;
 942
 943retry:
 944        seq = raw_seqcount_begin(&devnet_rename_seq);
 945        rcu_read_lock();
 946        dev = dev_get_by_index_rcu(net, ifindex);
 947        if (!dev) {
 948                rcu_read_unlock();
 949                return -ENODEV;
 950        }
 951
 952        strcpy(name, dev->name);
 953        rcu_read_unlock();
 954        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 955                cond_resched();
 956                goto retry;
 957        }
 958
 959        return 0;
 960}
 961
 962/**
 963 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 964 *      @net: the applicable net namespace
 965 *      @type: media type of device
 966 *      @ha: hardware address
 967 *
 968 *      Search for an interface by MAC address. Returns NULL if the device
 969 *      is not found or a pointer to the device.
 970 *      The caller must hold RCU or RTNL.
 971 *      The returned device has not had its ref count increased
 972 *      and the caller must therefore be careful about locking
 973 *
 974 */
 975
 976struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 977                                       const char *ha)
 978{
 979        struct net_device *dev;
 980
 981        for_each_netdev_rcu(net, dev)
 982                if (dev->type == type &&
 983                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 984                        return dev;
 985
 986        return NULL;
 987}
 988EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 989
 990struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 991{
 992        struct net_device *dev;
 993
 994        ASSERT_RTNL();
 995        for_each_netdev(net, dev)
 996                if (dev->type == type)
 997                        return dev;
 998
 999        return NULL;
1000}

1001EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1002
1003struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1004{
1005        struct net_device *dev, *ret = NULL;
1006
1007        rcu_read_lock();
1008        for_each_netdev_rcu(net, dev)
1009                if (dev->type == type) {
1010                        dev_hold(dev);
1011                        ret = dev;
1012                        break;
1013                }
1014        rcu_read_unlock();
1015        return ret;
1016}
1017EXPORT_SYMBOL(dev_getfirstbyhwtype);
1018
1019/**
1020 *      __dev_get_by_flags - find any device with given flags
1021 *      @net: the applicable net namespace
1022 *      @if_flags: IFF_* values
1023 *      @mask: bitmask of bits in if_flags to check
1024 *
1025 *      Search for any interface with the given flags. Returns NULL if a device
1026 *      is not found or a pointer to the device. Must be called inside
1027 *      rtnl_lock(), and result refcount is unchanged.
1028 */
1029
1030struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1031                                      unsigned short mask)
1032{
1033        struct net_device *dev, *ret;
1034
1035        ASSERT_RTNL();
1036
1037        ret = NULL;
1038        for_each_netdev(net, dev) {
1039                if (((dev->flags ^ if_flags) & mask) == 0) {
1040                        ret = dev;
1041                        break;
1042                }
1043        }
1044        return ret;
1045}
1046EXPORT_SYMBOL(__dev_get_by_flags);
1047
1048/**
1049 *      dev_valid_name - check if name is okay for network device
1050 *      @name: name string
1051 *
1052 *      Network device names need to be valid file names to
1053 *      to allow sysfs to work.  We also disallow any kind of
1054 *      whitespace.
1055 */
1056bool dev_valid_name(const char *name)
1057{
1058        if (*name == '\0')
1059                return false;
1060        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1061                return false;
1062        if (!strcmp(name, ".") || !strcmp(name, ".."))
1063                return false;
1064
1065        while (*name) {
1066                if (*name == '/' || *name == ':' || isspace(*name))
1067                        return false;
1068                name++;
1069        }
1070        return true;
1071}
1072EXPORT_SYMBOL(dev_valid_name);
1073
1074/**
1075 *      __dev_alloc_name - allocate a name for a device
1076 *      @net: network namespace to allocate the device name in
1077 *      @name: name format string
1078 *      @buf:  scratch buffer and result name string
1079 *
1080 *      Passed a format string - eg "lt%d" it will try and find a suitable
1081 *      id. It scans list of devices to build up a free map, then chooses
1082 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *      while allocating the name and adding the device in order to avoid
1084 *      duplicates.
1085 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *      Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1090{
1091        int i = 0;
1092        const char *p;
1093        const int max_netdevices = 8*PAGE_SIZE;
1094        unsigned long *inuse;
1095        struct net_device *d;
1096
1097        if (!dev_valid_name(name))
1098                return -EINVAL;
1099
1100        p = strchr(name, '%');
1101        if (p) {
1102                /*
1103                 * Verify the string as this thing may have come from
1104                 * the user.  There must be either one "%d" and no other "%"
1105                 * characters.
1106                 */
1107                if (p[1] != 'd' || strchr(p + 2, '%'))
1108                        return -EINVAL;
1109
1110                /* Use one page as a bit array of possible slots */
1111                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1112                if (!inuse)
1113                        return -ENOMEM;
1114
1115                for_each_netdev(net, d) {
1116                        if (!sscanf(d->name, name, &i))
1117                                continue;
1118                        if (i < 0 || i >= max_netdevices)
1119                                continue;
1120
1121                        /*  avoid cases where sscanf is not exact inverse of printf */
1122                        snprintf(buf, IFNAMSIZ, name, i);
1123                        if (!strncmp(buf, d->name, IFNAMSIZ))
1124                                set_bit(i, inuse);
1125                }
1126
1127                i = find_first_zero_bit(inuse, max_netdevices);
1128                free_page((unsigned long) inuse);
1129        }
1130
1131        snprintf(buf, IFNAMSIZ, name, i);
1132        if (!__dev_get_by_name(net, buf))
1133                return i;
1134
1135        /* It is possible to run out of possible slots
1136         * when the name is long and there isn't enough space left
1137         * for the digits, or if all bits are used.
1138         */
1139        return -ENFILE;
1140}
1141
1142static int dev_alloc_name_ns(struct net *net,
1143                             struct net_device *dev,
1144                             const char *name)
1145{
1146        char buf[IFNAMSIZ];
1147        int ret;
1148
1149        BUG_ON(!net);
1150        ret = __dev_alloc_name(net, name, buf);
1151        if (ret >= 0)
1152                strlcpy(dev->name, buf, IFNAMSIZ);
1153        return ret;
1154}
1155
1156/**
1157 *      dev_alloc_name - allocate a name for a device
1158 *      @dev: device
1159 *      @name: name format string
1160 *
1161 *      Passed a format string - eg "lt%d" it will try and find a suitable
1162 *      id. It scans list of devices to build up a free map, then chooses
1163 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1164 *      while allocating the name and adding the device in order to avoid
1165 *      duplicates.
1166 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1167 *      Returns the number of the unit assigned or a negative errno code.
1168 */
1169
1170int dev_alloc_name(struct net_device *dev, const char *name)
1171{
1172        return dev_alloc_name_ns(dev_net(dev), dev, name);
1173}
1174EXPORT_SYMBOL(dev_alloc_name);
1175
1176static int dev_get_valid_name(struct net *net, struct net_device *dev,
1177                              const char *name)
1178{
1179        BUG_ON(!net);
1180
1181        if (!dev_valid_name(name))
1182                return -EINVAL;
1183
1184        if (strchr(name, '%'))
1185                return dev_alloc_name_ns(net, dev, name);
1186        else if (__dev_get_by_name(net, name))
1187                return -EEXIST;
1188        else if (dev->name != name)
1189                strlcpy(dev->name, name, IFNAMSIZ);
1190
1191        return 0;
1192}
1193
1194/**
1195 *      dev_change_name - change name of a device
1196 *      @dev: device
1197 *      @newname: name (or format string) must be at least IFNAMSIZ
1198 *
1199 *      Change name of a device, can pass format strings "eth%d".
1200 *      for wildcarding.
1201 */
1202int dev_change_name(struct net_device *dev, const char *newname)
1203{
1204        unsigned char old_assign_type;
1205        char oldname[IFNAMSIZ];
1206        int err = 0;
1207        int ret;
1208        struct net *net;
1209
1210        ASSERT_RTNL();
1211        BUG_ON(!dev_net(dev));
1212
1213        net = dev_net(dev);
1214
1215        /* Some auto-enslaved devices e.g. failover slaves are
1216         * special, as userspace might rename the device after
1217         * the interface had been brought up and running since
1218         * the point kernel initiated auto-enslavement. Allow
1219         * live name change even when these slave devices are
1220         * up and running.
1221         *
1222         * Typically, users of these auto-enslaving devices
1223         * don't actually care about slave name change, as
1224         * they are supposed to operate on master interface
1225         * directly.
1226         */
1227        if (dev->flags & IFF_UP &&
1228            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1229                return -EBUSY;
1230
1231        write_seqcount_begin(&devnet_rename_seq);
1232
1233        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1234                write_seqcount_end(&devnet_rename_seq);
1235                return 0;
1236        }
1237
1238        memcpy(oldname, dev->name, IFNAMSIZ);
1239
1240        err = dev_get_valid_name(net, dev, newname);
1241        if (err < 0) {
1242                write_seqcount_end(&devnet_rename_seq);
1243                return err;
1244        }
1245
1246        if (oldname[0] && !strchr(oldname, '%'))
1247                netdev_info(dev, "renamed from %s\n", oldname);
1248
1249        old_assign_type = dev->name_assign_type;
1250        dev->name_assign_type = NET_NAME_RENAMED;
1251
1252rollback:
1253        ret = device_rename(&dev->dev, dev->name);
1254        if (ret) {
1255                memcpy(dev->name, oldname, IFNAMSIZ);
1256                dev->name_assign_type = old_assign_type;
1257                write_seqcount_end(&devnet_rename_seq);
1258                return ret;
1259        }
1260
1261        write_seqcount_end(&devnet_rename_seq);
1262
1263        netdev_adjacent_rename_links(dev, oldname);
1264
1265        write_lock_bh(&dev_base_lock);
1266        netdev_name_node_del(dev->name_node);
1267        write_unlock_bh(&dev_base_lock);
1268
1269        synchronize_rcu();
1270
1271        write_lock_bh(&dev_base_lock);
1272        netdev_name_node_add(net, dev->name_node);
1273        write_unlock_bh(&dev_base_lock);
1274
1275        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1276        ret = notifier_to_errno(ret);
1277
1278        if (ret) {
1279                /* err >= 0 after dev_alloc_name() or stores the first errno */
1280                if (err >= 0) {
1281                        err = ret;
1282                        write_seqcount_begin(&devnet_rename_seq);
1283                        memcpy(dev->name, oldname, IFNAMSIZ);
1284                        memcpy(oldname, newname, IFNAMSIZ);
1285                        dev->name_assign_type = old_assign_type;
1286                        old_assign_type = NET_NAME_RENAMED;
1287                        goto rollback;
1288                } else {
1289                        pr_err("%s: name change rollback failed: %d\n",
1290                               dev->name, ret);
1291                }
1292        }
1293
1294        return err;
1295}
1296
1297/**
1298 *      dev_set_alias - change ifalias of a device
1299 *      @dev: device
1300 *      @alias: name up to IFALIASZ
1301 *      @len: limit of bytes to copy from info
1302 *
1303 *      Set ifalias for a device,
1304 */
1305int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1306{
1307        struct dev_ifalias *new_alias = NULL;
1308
1309        if (len >= IFALIASZ)
1310                return -EINVAL;
1311
1312        if (len) {
1313                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1314                if (!new_alias)
1315                        return -ENOMEM;
1316
1317                memcpy(new_alias->ifalias, alias, len);
1318                new_alias->ifalias[len] = 0;
1319        }
1320
1321        mutex_lock(&ifalias_mutex);
1322        new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1323                                        mutex_is_locked(&ifalias_mutex));
1324        mutex_unlock(&ifalias_mutex);
1325
1326        if (new_alias)
1327                kfree_rcu(new_alias, rcuhead);
1328
1329        return len;
1330}
1331EXPORT_SYMBOL(dev_set_alias);
1332
1333/**
1334 *      dev_get_alias - get ifalias of a device
1335 *      @dev: device
1336 *      @name: buffer to store name of ifalias
1337 *      @len: size of buffer
1338 *
1339 *      get ifalias for a device.  Caller must make sure dev cannot go
1340 *      away,  e.g. rcu read lock or own a reference count to device.
1341 */
1342int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1343{
1344        const struct dev_ifalias *alias;
1345        int ret = 0;
1346
1347        rcu_read_lock();
1348        alias = rcu_dereference(dev->ifalias);
1349        if (alias)
1350                ret = snprintf(name, len, "%s", alias->ifalias);
1351        rcu_read_unlock();
1352
1353        return ret;
1354}
1355
1356/**
1357 *      netdev_features_change - device changes features
1358 *      @dev: device to cause notification
1359 *
1360 *      Called to indicate a device has changed features.
1361 */
1362void netdev_features_change(struct net_device *dev)
1363{
1364        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1365}
1366EXPORT_SYMBOL(netdev_features_change);
1367
1368/**
1369 *      netdev_state_change - device changes state
1370 *      @dev: device to cause notification
1371 *
1372 *      Called to indicate a device has changed state. This function calls
1373 *      the notifier chains for netdev_chain and sends a NEWLINK message
1374 *      to the routing socket.
1375 */
1376void netdev_state_change(struct net_device *dev)
1377{
1378        if (dev->flags & IFF_UP) {
1379                struct netdev_notifier_change_info change_info = {
1380                        .info.dev = dev,
1381                };
1382
1383                call_netdevice_notifiers_info(NETDEV_CHANGE,
1384                                              &change_info.info);
1385                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1386        }
1387}
1388EXPORT_SYMBOL(netdev_state_change);
1389
1390/**
1391 * netdev_notify_peers - notify network peers about existence of @dev
1392 * @dev: network device
1393 *
1394 * Generate traffic such that interested network peers are aware of
1395 * @dev, such as by generating a gratuitous ARP. This may be used when
1396 * a device wants to inform the rest of the network about some sort of
1397 * reconfiguration such as a failover event or virtual machine
1398 * migration.
1399 */
1400void netdev_notify_peers(struct net_device *dev)
1401{
1402        rtnl_lock();
1403        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1404        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1405        rtnl_unlock();
1406}
1407EXPORT_SYMBOL(netdev_notify_peers);
1408
1409static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1410{
1411        const struct net_device_ops *ops = dev->netdev_ops;
1412        int ret;
1413
1414        ASSERT_RTNL();
1415
1416        if (!netif_device_present(dev))
1417                return -ENODEV;
1418
1419        /* Block netpoll from trying to do any rx path servicing.
1420         * If we don't do this there is a chance ndo_poll_controller
1421         * or ndo_poll may be running while we open the device
1422         */
1423        netpoll_poll_disable(dev);
1424
1425        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1426        ret = notifier_to_errno(ret);
1427        if (ret)
1428                return ret;
1429
1430        set_bit(__LINK_STATE_START, &dev->state);
1431
1432        if (ops->ndo_validate_addr)
1433                ret = ops->ndo_validate_addr(dev);
1434
1435        if (!ret && ops->ndo_open)
1436                ret = ops->ndo_open(dev);
1437
1438        netpoll_poll_enable(dev);
1439
1440        if (ret)
1441                clear_bit(__LINK_STATE_START, &dev->state);
1442        else {
1443                dev->flags |= IFF_UP;
1444                dev_set_rx_mode(dev);
1445                dev_activate(dev);
1446                add_device_randomness(dev->dev_addr, dev->addr_len);
1447        }
1448
1449        return ret;
1450}
1451
1452/**
1453 *      dev_open        - prepare an interface for use.
1454 *      @dev: device to open
1455 *      @extack: netlink extended ack
1456 *
1457 *      Takes a device from down to up state. The device's private open
1458 *      function is invoked and then the multicast lists are loaded. Finally
1459 *      the device is moved into the up state and a %NETDEV_UP message is
1460 *      sent to the netdev notifier chain.
1461 *
1462 *      Calling this function on an active interface is a nop. On a failure
1463 *      a negative errno code is returned.
1464 */
1465int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1466{
1467        int ret;
1468
1469        if (dev->flags & IFF_UP)
1470                return 0;
1471
1472        ret = __dev_open(dev, extack);
1473        if (ret < 0)
1474                return ret;
1475
1476        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1477        call_netdevice_notifiers(NETDEV_UP, dev);
1478
1479        return ret;
1480}
1481EXPORT_SYMBOL(dev_open);
1482
1483static void __dev_close_many(struct list_head *head)
1484{
1485        struct net_device *dev;
1486
1487        ASSERT_RTNL();
1488        might_sleep();
1489
1490        list_for_each_entry(dev, head, close_list) {
1491                /* Temporarily disable netpoll until the interface is down */
1492                netpoll_poll_disable(dev);
1493
1494                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1495
1496                clear_bit(__LINK_STATE_START, &dev->state);
1497
1498                /* Synchronize to scheduled poll. We cannot touch poll list, it
1499                 * can be even on different cpu. So just clear netif_running().
1500                 *
1501                 * dev->stop() will invoke napi_disable() on all of it's
1502                 * napi_struct instances on this device.
1503                 */
1504                smp_mb__after_atomic(); /* Commit netif_running(). */
1505        }
1506
1507        dev_deactivate_many(head);
1508
1509        list_for_each_entry(dev, head, close_list) {
1510                const struct net_device_ops *ops = dev->netdev_ops;
1511
1512                /*
1513                 *      Call the device specific close. This cannot fail.
1514                 *      Only if device is UP
1515                 *
1516                 *      We allow it to be called even after a DETACH hot-plug
1517                 *      event.
1518                 */
1519                if (ops->ndo_stop)
1520                        ops->ndo_stop(dev);
1521
1522                dev->flags &= ~IFF_UP;
1523                netpoll_poll_enable(dev);
1524        }
1525}
1526
1527static void __dev_close(struct net_device *dev)
1528{
1529        LIST_HEAD(single);
1530
1531        list_add(&dev->close_list, &single);
1532        __dev_close_many(&single);
1533        list_del(&single);
1534}
1535
1536void dev_close_many(struct list_head *head, bool unlink)
1537{
1538        struct net_device *dev, *tmp;
1539
1540        /* Remove the devices that don't need to be closed */
1541        list_for_each_entry_safe(dev, tmp, head, close_list)
1542                if (!(dev->flags & IFF_UP))
1543                        list_del_init(&dev->close_list);
1544
1545        __dev_close_many(head);
1546
1547        list_for_each_entry_safe(dev, tmp, head, close_list) {
1548                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1549                call_netdevice_notifiers(NETDEV_DOWN, dev);
1550                if (unlink)
1551                        list_del_init(&dev->close_list);
1552        }
1553}
1554EXPORT_SYMBOL(dev_close_many);
1555
1556/**
1557 *      dev_close - shutdown an interface.
1558 *      @dev: device to shutdown
1559 *
1560 *      This function moves an active device into down state. A
1561 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1562 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1563 *      chain.
1564 */
1565void dev_close(struct net_device *dev)
1566{
1567        if (dev->flags & IFF_UP) {
1568                LIST_HEAD(single);
1569
1570                list_add(&dev->close_list, &single);
1571                dev_close_many(&single, true);
1572                list_del(&single);
1573        }
1574}
1575EXPORT_SYMBOL(dev_close);
1576
1577
1578/**
1579 *      dev_disable_lro - disable Large Receive Offload on a device
1580 *      @dev: device
1581 *
1582 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1583 *      called under RTNL.  This is needed if received packets may be
1584 *      forwarded to another interface.
1585 */
1586void dev_disable_lro(struct net_device *dev)
1587{
1588        struct net_device *lower_dev;
1589        struct list_head *iter;
1590
1591        dev->wanted_features &= ~NETIF_F_LRO;
1592        netdev_update_features(dev);
1593
1594        if (unlikely(dev->features & NETIF_F_LRO))
1595                netdev_WARN(dev, "failed to disable LRO!\n");
1596
1597        netdev_for_each_lower_dev(dev, lower_dev, iter)
1598                dev_disable_lro(lower_dev);
1599}
1600EXPORT_SYMBOL(dev_disable_lro);
1601
1602/**
1603 *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1604 *      @dev: device
1605 *
1606 *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1607 *      called under RTNL.  This is needed if Generic XDP is installed on
1608 *      the device.
1609 */
1610static void dev_disable_gro_hw(struct net_device *dev)
1611{
1612        dev->wanted_features &= ~NETIF_F_GRO_HW;
1613        netdev_update_features(dev);
1614
1615        if (unlikely(dev->features & NETIF_F_GRO_HW))
1616                netdev_WARN(dev, "failed to disable GRO_HW!\n");
1617}
1618
1619const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1620{
1621#define N(val)                                          \
1622        case NETDEV_##val:                              \
1623                return "NETDEV_" __stringify(val);
1624        switch (cmd) {
1625        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1626        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1627        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1628        N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1629        N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1630        N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1631        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1632        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1633        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1634        N(PRE_CHANGEADDR)
1635        }
1636#undef N
1637        return "UNKNOWN_NETDEV_EVENT";
1638}
1639EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1640
1641static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1642                                   struct net_device *dev)
1643{
1644        struct netdev_notifier_info info = {
1645                .dev = dev,
1646        };
1647
1648        return nb->notifier_call(nb, val, &info);
1649}
1650
1651static int call_netdevice_register_notifiers(struct notifier_block *nb,
1652                                             struct net_device *dev)
1653{
1654        int err;
1655
1656        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1657        err = notifier_to_errno(err);
1658        if (err)
1659                return err;
1660
1661        if (!(dev->flags & IFF_UP))
1662                return 0;
1663
1664        call_netdevice_notifier(nb, NETDEV_UP, dev);
1665        return 0;
1666}
1667
1668static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1669                                                struct net_device *dev)
1670{
1671        if (dev->flags & IFF_UP) {
1672                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1673                                        dev);
1674                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1675        }
1676        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1677}
1678
1679static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1680                                                 struct net *net)
1681{
1682        struct net_device *dev;
1683        int err;
1684
1685        for_each_netdev(net, dev) {
1686                err = call_netdevice_register_notifiers(nb, dev);
1687                if (err)
1688                        goto rollback;
1689        }
1690        return 0;
1691
1692rollback:
1693        for_each_netdev_continue_reverse(net, dev)
1694                call_netdevice_unregister_notifiers(nb, dev);
1695        return err;
1696}
1697
1698static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1699                                                    struct net *net)
1700{
1701        struct net_device *dev;
1702
1703        for_each_netdev(net, dev)
1704                call_netdevice_unregister_notifiers(nb, dev);
1705}
1706
1707static int dev_boot_phase = 1;
1708
1709/**
1710 * register_netdevice_notifier - register a network notifier block
1711 * @nb: notifier
1712 *
1713 * Register a notifier to be called when network device events occur.
1714 * The notifier passed is linked into the kernel structures and must
1715 * not be reused until it has been unregistered. A negative errno code
1716 * is returned on a failure.
1717 *
1718 * When registered all registration and up events are replayed
1719 * to the new notifier to allow device to have a race free
1720 * view of the network device list.
1721 */
1722
1723int register_netdevice_notifier(struct notifier_block *nb)
1724{
1725        struct net *net;
1726        int err;
1727
1728        /* Close race with setup_net() and cleanup_net() */
1729        down_write(&pernet_ops_rwsem);
1730        rtnl_lock();
1731        err = raw_notifier_chain_register(&netdev_chain, nb);
1732        if (err)
1733                goto unlock;
1734        if (dev_boot_phase)
1735                goto unlock;
1736        for_each_net(net) {
1737                err = call_netdevice_register_net_notifiers(nb, net);
1738                if (err)
1739                        goto rollback;
1740        }
1741
1742unlock:
1743        rtnl_unlock();
1744        up_write(&pernet_ops_rwsem);
1745        return err;
1746
1747rollback:
1748        for_each_net_continue_reverse(net)
1749                call_netdevice_unregister_net_notifiers(nb, net);
1750
1751        raw_notifier_chain_unregister(&netdev_chain, nb);
1752        goto unlock;
1753}
1754EXPORT_SYMBOL(register_netdevice_notifier);
1755
1756/**
1757 * unregister_netdevice_notifier - unregister a network notifier block
1758 * @nb: notifier
1759 *
1760 * Unregister a notifier previously registered by
1761 * register_netdevice_notifier(). The notifier is unlinked into the
1762 * kernel structures and may then be reused. A negative errno code
1763 * is returned on a failure.
1764 *
1765 * After unregistering unregister and down device events are synthesized
1766 * for all devices on the device list to the removed notifier to remove
1767 * the need for special case cleanup code.
1768 */
1769
1770int unregister_netdevice_notifier(struct notifier_block *nb)
1771{
1772        struct net *net;
1773        int err;
1774
1775        /* Close race with setup_net() and cleanup_net() */
1776        down_write(&pernet_ops_rwsem);
1777        rtnl_lock();
1778        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1779        if (err)
1780                goto unlock;
1781
1782        for_each_net(net)
1783                call_netdevice_unregister_net_notifiers(nb, net);
1784
1785unlock:
1786        rtnl_unlock();
1787        up_write(&pernet_ops_rwsem);
1788        return err;
1789}
1790EXPORT_SYMBOL(unregister_netdevice_notifier);
1791
1792static int __register_netdevice_notifier_net(struct net *net,
1793                                             struct notifier_block *nb,
1794                                             bool ignore_call_fail)
1795{
1796        int err;
1797
1798        err = raw_notifier_chain_register(&net->netdev_chain, nb);
1799        if (err)
1800                return err;
1801        if (dev_boot_phase)
1802                return 0;
1803
1804        err = call_netdevice_register_net_notifiers(nb, net);
1805        if (err && !ignore_call_fail)
1806                goto chain_unregister;
1807
1808        return 0;
1809
1810chain_unregister:
1811        raw_notifier_chain_unregister(&net->netdev_chain, nb);
1812        return err;
1813}
1814
1815static int __unregister_netdevice_notifier_net(struct net *net,
1816                                               struct notifier_block *nb)
1817{
1818        int err;
1819
1820        err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1821        if (err)
1822                return err;
1823
1824        call_netdevice_unregister_net_notifiers(nb, net);
1825        return 0;
1826}
1827
1828/**
1829 * register_netdevice_notifier_net - register a per-netns network notifier block
1830 * @net: network namespace
1831 * @nb: notifier
1832 *
1833 * Register a notifier to be called when network device events occur.
1834 * The notifier passed is linked into the kernel structures and must
1835 * not be reused until it has been unregistered. A negative errno code
1836 * is returned on a failure.
1837 *
1838 * When registered all registration and up events are replayed
1839 * to the new notifier to allow device to have a race free
1840 * view of the network device list.
1841 */
1842
1843int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1844{
1845        int err;
1846
1847        rtnl_lock();
1848        err = __register_netdevice_notifier_net(net, nb, false);
1849        rtnl_unlock();
1850        return err;
1851}
1852EXPORT_SYMBOL(register_netdevice_notifier_net);
1853
1854/**
1855 * unregister_netdevice_notifier_net - unregister a per-netns
1856 *                                     network notifier block
1857 * @net: network namespace
1858 * @nb: notifier
1859 *
1860 * Unregister a notifier previously registered by
1861 * register_netdevice_notifier(). The notifier is unlinked into the
1862 * kernel structures and may then be reused. A negative errno code
1863 * is returned on a failure.
1864 *
1865 * After unregistering unregister and down device events are synthesized
1866 * for all devices on the device list to the removed notifier to remove
1867 * the need for special case cleanup code.
1868 */
1869
1870int unregister_netdevice_notifier_net(struct net *net,
1871                                      struct notifier_block *nb)
1872{
1873        int err;
1874
1875        rtnl_lock();
1876        err = __unregister_netdevice_notifier_net(net, nb);
1877        rtnl_unlock();
1878        return err;
1879}
1880EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1881
1882int register_netdevice_notifier_dev_net(struct net_device *dev,
1883                                        struct notifier_block *nb,
1884                                        struct netdev_net_notifier *nn)
1885{
1886        int err;
1887
1888        rtnl_lock();
1889        err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1890        if (!err) {
1891                nn->nb = nb;
1892                list_add(&nn->list, &dev->net_notifier_list);
1893        }
1894        rtnl_unlock();
1895        return err;
1896}
1897EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1898
1899int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1900                                          struct notifier_block *nb,
1901                                          struct netdev_net_notifier *nn)
1902{
1903        int err;
1904
1905        rtnl_lock();
1906        list_del(&nn->list);
1907        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1908        rtnl_unlock();
1909        return err;
1910}
1911EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1912
1913static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1914                                             struct net *net)
1915{
1916        struct netdev_net_notifier *nn;
1917
1918        list_for_each_entry(nn, &dev->net_notifier_list, list) {
1919                __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
1920                __register_netdevice_notifier_net(net, nn->nb, true);
1921        }
1922}
1923
1924/**
1925 *      call_netdevice_notifiers_info - call all network notifier blocks
1926 *      @val: value passed unmodified to notifier function
1927 *      @info: notifier information data
1928 *
1929 *      Call all network notifier blocks.  Parameters and return value
1930 *      are as for raw_notifier_call_chain().
1931 */
1932
1933static int call_netdevice_notifiers_info(unsigned long val,
1934                                         struct netdev_notifier_info *info)
1935{
1936        struct net *net = dev_net(info->dev);
1937        int ret;
1938
1939        ASSERT_RTNL();
1940
1941        /* Run per-netns notifier block chain first, then run the global one.
1942         * Hopefully, one day, the global one is going to be removed after
1943         * all notifier block registrators get converted to be per-netns.
1944         */
1945        ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1946        if (ret & NOTIFY_STOP_MASK)
1947                return ret;
1948        return raw_notifier_call_chain(&netdev_chain, val, info);
1949}
1950
1951static int call_netdevice_notifiers_extack(unsigned long val,
1952                                           struct net_device *dev,
1953                                           struct netlink_ext_ack *extack)
1954{
1955        struct netdev_notifier_info info = {
1956                .dev = dev,
1957                .extack = extack,
1958        };
1959
1960        return call_netdevice_notifiers_info(val, &info);
1961}
1962
1963/**
1964 *      call_netdevice_notifiers - call all network notifier blocks
1965 *      @val: value passed unmodified to notifier function
1966 *      @dev: net_device pointer passed unmodified to notifier function
1967 *
1968 *      Call all network notifier blocks.  Parameters and return value
1969 *      are as for raw_notifier_call_chain().
1970 */
1971
1972int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1973{
1974        return call_netdevice_notifiers_extack(val, dev, NULL);
1975}
1976EXPORT_SYMBOL(call_netdevice_notifiers);
1977
1978/**
1979 *      call_netdevice_notifiers_mtu - call all network notifier blocks
1980 *      @val: value passed unmodified to notifier function
1981 *      @dev: net_device pointer passed unmodified to notifier function
1982 *      @arg: additional u32 argument passed to the notifier function
1983 *
1984 *      Call all network notifier blocks.  Parameters and return value
1985 *      are as for raw_notifier_call_chain().
1986 */
1987static int call_netdevice_notifiers_mtu(unsigned long val,
1988                                        struct net_device *dev, u32 arg)
1989{
1990        struct netdev_notifier_info_ext info = {
1991                .info.dev = dev,
1992                .ext.mtu = arg,
1993        };
1994
1995        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1996
1997        return call_netdevice_notifiers_info(val, &info.info);
1998}
1999
2000#ifdef CONFIG_NET_INGRESS

2001static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2002
2003void net_inc_ingress_queue(void)
2004{
2005        static_branch_inc(&ingress_needed_key);
2006}
2007EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2008
2009void net_dec_ingress_queue(void)
2010{
2011        static_branch_dec(&ingress_needed_key);
2012}
2013EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2014#endif
2015
2016#ifdef CONFIG_NET_EGRESS
2017static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2018
2019void net_inc_egress_queue(void)
2020{
2021        static_branch_inc(&egress_needed_key);
2022}
2023EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2024
2025void net_dec_egress_queue(void)
2026{
2027        static_branch_dec(&egress_needed_key);
2028}
2029EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2030#endif
2031
2032static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2033#ifdef CONFIG_JUMP_LABEL
2034static atomic_t netstamp_needed_deferred;
2035static atomic_t netstamp_wanted;
2036static void netstamp_clear(struct work_struct *work)
2037{
2038        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2039        int wanted;
2040
2041        wanted = atomic_add_return(deferred, &netstamp_wanted);
2042        if (wanted > 0)
2043                static_branch_enable(&netstamp_needed_key);
2044        else
2045                static_branch_disable(&netstamp_needed_key);
2046}
2047static DECLARE_WORK(netstamp_work, netstamp_clear);
2048#endif
2049
2050void net_enable_timestamp(void)
2051{
2052#ifdef CONFIG_JUMP_LABEL
2053        int wanted;
2054
2055        while (1) {
2056                wanted = atomic_read(&netstamp_wanted);
2057                if (wanted <= 0)
2058                        break;
2059                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2060                        return;
2061        }
2062        atomic_inc(&netstamp_needed_deferred);
2063        schedule_work(&netstamp_work);
2064#else
2065        static_branch_inc(&netstamp_needed_key);
2066#endif
2067}
2068EXPORT_SYMBOL(net_enable_timestamp);
2069
2070void net_disable_timestamp(void)
2071{
2072#ifdef CONFIG_JUMP_LABEL
2073        int wanted;
2074
2075        while (1) {
2076                wanted = atomic_read(&netstamp_wanted);
2077                if (wanted <= 1)
2078                        break;
2079                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2080                        return;
2081        }
2082        atomic_dec(&netstamp_needed_deferred);
2083        schedule_work(&netstamp_work);
2084#else
2085        static_branch_dec(&netstamp_needed_key);
2086#endif
2087}
2088EXPORT_SYMBOL(net_disable_timestamp);
2089
2090static inline void net_timestamp_set(struct sk_buff *skb)
2091{
2092        skb->tstamp = 0;
2093        if (static_branch_unlikely(&netstamp_needed_key))
2094                __net_timestamp(skb);
2095}
2096
2097#define net_timestamp_check(COND, SKB)                          \
2098        if (static_branch_unlikely(&netstamp_needed_key)) {     \
2099                if ((COND) && !(SKB)->tstamp)                   \
2100                        __net_timestamp(SKB);                   \
2101        }                                                       \
2102
2103bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2104{
2105        unsigned int len;
2106
2107        if (!(dev->flags & IFF_UP))
2108                return false;
2109
2110        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2111        if (skb->len <= len)
2112                return true;
2113
2114        /* if TSO is enabled, we don't care about the length as the packet
2115         * could be forwarded without being segmented before
2116         */
2117        if (skb_is_gso(skb))
2118                return true;
2119
2120        return false;
2121}
2122EXPORT_SYMBOL_GPL(is_skb_forwardable);
2123
2124int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2125{
2126        int ret = ____dev_forward_skb(dev, skb);
2127
2128        if (likely(!ret)) {
2129                skb->protocol = eth_type_trans(skb, dev);
2130                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2131        }
2132
2133        return ret;
2134}
2135EXPORT_SYMBOL_GPL(__dev_forward_skb);
2136
2137/**
2138 * dev_forward_skb - loopback an skb to another netif
2139 *
2140 * @dev: destination network device
2141 * @skb: buffer to forward
2142 *
2143 * return values:
2144 *      NET_RX_SUCCESS  (no congestion)
2145 *      NET_RX_DROP     (packet was dropped, but freed)
2146 *
2147 * dev_forward_skb can be used for injecting an skb from the
2148 * start_xmit function of one device into the receive queue
2149 * of another device.
2150 *
2151 * The receiving device may be in another namespace, so
2152 * we have to clear all information in the skb that could
2153 * impact namespace isolation.
2154 */
2155int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2156{
2157        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2158}
2159EXPORT_SYMBOL_GPL(dev_forward_skb);
2160
2161static inline int deliver_skb(struct sk_buff *skb,
2162                              struct packet_type *pt_prev,
2163                              struct net_device *orig_dev)
2164{
2165        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2166                return -ENOMEM;
2167        refcount_inc(&skb->users);
2168        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2169}
2170
2171static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2172                                          struct packet_type **pt,
2173                                          struct net_device *orig_dev,
2174                                          __be16 type,
2175                                          struct list_head *ptype_list)
2176{
2177        struct packet_type *ptype, *pt_prev = *pt;
2178
2179        list_for_each_entry_rcu(ptype, ptype_list, list) {
2180                if (ptype->type != type)
2181                        continue;
2182                if (pt_prev)
2183                        deliver_skb(skb, pt_prev, orig_dev);
2184                pt_prev = ptype;
2185        }
2186        *pt = pt_prev;
2187}
2188
2189static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2190{
2191        if (!ptype->af_packet_priv || !skb->sk)
2192                return false;
2193
2194        if (ptype->id_match)
2195                return ptype->id_match(ptype, skb->sk);
2196        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2197                return true;
2198
2199        return false;
2200}
2201
2202/**
2203 * dev_nit_active - return true if any network interface taps are in use
2204 *
2205 * @dev: network device to check for the presence of taps
2206 */
2207bool dev_nit_active(struct net_device *dev)
2208{
2209        return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2210}
2211EXPORT_SYMBOL_GPL(dev_nit_active);
2212
2213/*
2214 *      Support routine. Sends outgoing frames to any network
2215 *      taps currently in use.
2216 */
2217
2218void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2219{
2220        struct packet_type *ptype;
2221        struct sk_buff *skb2 = NULL;
2222        struct packet_type *pt_prev = NULL;
2223        struct list_head *ptype_list = &ptype_all;
2224
2225        rcu_read_lock();
2226again:
2227        list_for_each_entry_rcu(ptype, ptype_list, list) {
2228                if (ptype->ignore_outgoing)
2229                        continue;
2230
2231                /* Never send packets back to the socket
2232                 * they originated from - MvS (miquels@drinkel.ow.org)
2233                 */
2234                if (skb_loop_sk(ptype, skb))
2235                        continue;
2236
2237                if (pt_prev) {
2238                        deliver_skb(skb2, pt_prev, skb->dev);
2239                        pt_prev = ptype;
2240                        continue;
2241                }
2242
2243                /* need to clone skb, done only once */
2244                skb2 = skb_clone(skb, GFP_ATOMIC);
2245                if (!skb2)
2246                        goto out_unlock;
2247
2248                net_timestamp_set(skb2);
2249
2250                /* skb->nh should be correctly
2251                 * set by sender, so that the second statement is
2252                 * just protection against buggy protocols.
2253                 */
2254                skb_reset_mac_header(skb2);
2255
2256                if (skb_network_header(skb2) < skb2->data ||
2257                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2258                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2259                                             ntohs(skb2->protocol),
2260                                             dev->name);
2261                        skb_reset_network_header(skb2);
2262                }
2263
2264                skb2->transport_header = skb2->network_header;
2265                skb2->pkt_type = PACKET_OUTGOING;
2266                pt_prev = ptype;
2267        }
2268
2269        if (ptype_list == &ptype_all) {
2270                ptype_list = &dev->ptype_all;
2271                goto again;
2272        }
2273out_unlock:
2274        if (pt_prev) {
2275                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2276                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2277                else
2278                        kfree_skb(skb2);
2279        }
2280        rcu_read_unlock();
2281}
2282EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2283
2284/**
2285 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2286 * @dev: Network device
2287 * @txq: number of queues available
2288 *
2289 * If real_num_tx_queues is changed the tc mappings may no longer be
2290 * valid. To resolve this verify the tc mapping remains valid and if
2291 * not NULL the mapping. With no priorities mapping to this
2292 * offset/count pair it will no longer be used. In the worst case TC0
2293 * is invalid nothing can be done so disable priority mappings. If is
2294 * expected that drivers will fix this mapping if they can before
2295 * calling netif_set_real_num_tx_queues.
2296 */
2297static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2298{
2299        int i;
2300        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2301
2302        /* If TC0 is invalidated disable TC mapping */
2303        if (tc->offset + tc->count > txq) {
2304                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2305                dev->num_tc = 0;
2306                return;
2307        }
2308
2309        /* Invalidated prio to tc mappings set to TC0 */
2310        for (i = 1; i < TC_BITMASK + 1; i++) {
2311                int q = netdev_get_prio_tc_map(dev, i);
2312
2313                tc = &dev->tc_to_txq[q];
2314                if (tc->offset + tc->count > txq) {
2315                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2316                                i, q);
2317                        netdev_set_prio_tc_map(dev, i, 0);
2318                }
2319        }
2320}
2321
2322int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2323{
2324        if (dev->num_tc) {
2325                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2326                int i;
2327
2328                /* walk through the TCs and see if it falls into any of them */
2329                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2330                        if ((txq - tc->offset) < tc->count)
2331                                return i;
2332                }
2333
2334                /* didn't find it, just return -1 to indicate no match */
2335                return -1;
2336        }
2337
2338        return 0;
2339}
2340EXPORT_SYMBOL(netdev_txq_to_tc);
2341
2342#ifdef CONFIG_XPS
2343struct static_key xps_needed __read_mostly;
2344EXPORT_SYMBOL(xps_needed);
2345struct static_key xps_rxqs_needed __read_mostly;
2346EXPORT_SYMBOL(xps_rxqs_needed);
2347static DEFINE_MUTEX(xps_map_mutex);
2348#define xmap_dereference(P)             \
2349        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2350
2351static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2352                             int tci, u16 index)
2353{
2354        struct xps_map *map = NULL;
2355        int pos;
2356
2357        if (dev_maps)
2358                map = xmap_dereference(dev_maps->attr_map[tci]);
2359        if (!map)
2360                return false;
2361
2362        for (pos = map->len; pos--;) {
2363                if (map->queues[pos] != index)
2364                        continue;
2365
2366                if (map->len > 1) {
2367                        map->queues[pos] = map->queues[--map->len];
2368                        break;
2369                }
2370
2371                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2372                kfree_rcu(map, rcu);
2373                return false;
2374        }
2375
2376        return true;
2377}
2378
2379static bool remove_xps_queue_cpu(struct net_device *dev,
2380                                 struct xps_dev_maps *dev_maps,
2381                                 int cpu, u16 offset, u16 count)
2382{
2383        int num_tc = dev->num_tc ? : 1;
2384        bool active = false;
2385        int tci;
2386
2387        for (tci = cpu * num_tc; num_tc--; tci++) {
2388                int i, j;
2389
2390                for (i = count, j = offset; i--; j++) {
2391                        if (!remove_xps_queue(dev_maps, tci, j))
2392                                break;
2393                }
2394
2395                active |= i < 0;
2396        }
2397
2398        return active;
2399}
2400
2401static void reset_xps_maps(struct net_device *dev,
2402                           struct xps_dev_maps *dev_maps,
2403                           bool is_rxqs_map)
2404{
2405        if (is_rxqs_map) {
2406                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2407                RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2408        } else {
2409                RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2410        }
2411        static_key_slow_dec_cpuslocked(&xps_needed);
2412        kfree_rcu(dev_maps, rcu);
2413}
2414
2415static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2416                           struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2417                           u16 offset, u16 count, bool is_rxqs_map)
2418{
2419        bool active = false;
2420        int i, j;
2421
2422        for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2423             j < nr_ids;)
2424                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2425                                               count);
2426        if (!active)
2427                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2428
2429        if (!is_rxqs_map) {
2430                for (i = offset + (count - 1); count--; i--) {
2431                        netdev_queue_numa_node_write(
2432                                netdev_get_tx_queue(dev, i),
2433                                NUMA_NO_NODE);
2434                }
2435        }
2436}
2437
2438static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2439                                   u16 count)
2440{
2441        const unsigned long *possible_mask = NULL;
2442        struct xps_dev_maps *dev_maps;
2443        unsigned int nr_ids;
2444
2445        if (!static_key_false(&xps_needed))
2446                return;
2447
2448        cpus_read_lock();
2449        mutex_lock(&xps_map_mutex);
2450
2451        if (static_key_false(&xps_rxqs_needed)) {
2452                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2453                if (dev_maps) {
2454                        nr_ids = dev->num_rx_queues;
2455                        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2456                                       offset, count, true);
2457                }
2458        }
2459
2460        dev_maps = xmap_dereference(dev->xps_cpus_map);
2461        if (!dev_maps)
2462                goto out_no_maps;
2463
2464        if (num_possible_cpus() > 1)
2465                possible_mask = cpumask_bits(cpu_possible_mask);
2466        nr_ids = nr_cpu_ids;
2467        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2468                       false);
2469
2470out_no_maps:
2471        mutex_unlock(&xps_map_mutex);
2472        cpus_read_unlock();
2473}
2474
2475static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2476{
2477        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2478}
2479
2480static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2481                                      u16 index, bool is_rxqs_map)
2482{
2483        struct xps_map *new_map;
2484        int alloc_len = XPS_MIN_MAP_ALLOC;
2485        int i, pos;
2486
2487        for (pos = 0; map && pos < map->len; pos++) {
2488                if (map->queues[pos] != index)
2489                        continue;
2490                return map;
2491        }
2492
2493        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2494        if (map) {
2495                if (pos < map->alloc_len)
2496                        return map;
2497
2498                alloc_len = map->alloc_len * 2;
2499        }
2500
2501        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2502         *  map
2503         */
2504        if (is_rxqs_map)
2505                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2506        else
2507                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2508                                       cpu_to_node(attr_index));
2509        if (!new_map)
2510                return NULL;
2511
2512        for (i = 0; i < pos; i++)
2513                new_map->queues[i] = map->queues[i];
2514        new_map->alloc_len = alloc_len;
2515        new_map->len = pos;
2516
2517        return new_map;
2518}
2519
2520/* Must be called under cpus_read_lock */
2521int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2522                          u16 index, bool is_rxqs_map)
2523{
2524        const unsigned long *online_mask = NULL, *possible_mask = NULL;
2525        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2526        int i, j, tci, numa_node_id = -2;
2527        int maps_sz, num_tc = 1, tc = 0;
2528        struct xps_map *map, *new_map;
2529        bool active = false;
2530        unsigned int nr_ids;
2531
2532        if (dev->num_tc) {
2533                /* Do not allow XPS on subordinate device directly */
2534                num_tc = dev->num_tc;
2535                if (num_tc < 0)
2536                        return -EINVAL;
2537
2538                /* If queue belongs to subordinate dev use its map */
2539                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2540
2541                tc = netdev_txq_to_tc(dev, index);
2542                if (tc < 0)
2543                        return -EINVAL;
2544        }
2545
2546        mutex_lock(&xps_map_mutex);
2547        if (is_rxqs_map) {
2548                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2549                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2550                nr_ids = dev->num_rx_queues;
2551        } else {
2552                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2553                if (num_possible_cpus() > 1) {
2554                        online_mask = cpumask_bits(cpu_online_mask);
2555                        possible_mask = cpumask_bits(cpu_possible_mask);
2556                }
2557                dev_maps = xmap_dereference(dev->xps_cpus_map);
2558                nr_ids = nr_cpu_ids;
2559        }
2560
2561        if (maps_sz < L1_CACHE_BYTES)
2562                maps_sz = L1_CACHE_BYTES;
2563
2564        /* allocate memory for queue storage */
2565        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2566             j < nr_ids;) {
2567                if (!new_dev_maps)
2568                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2569                if (!new_dev_maps) {
2570                        mutex_unlock(&xps_map_mutex);
2571                        return -ENOMEM;
2572                }
2573
2574                tci = j * num_tc + tc;
2575                map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2576                                 NULL;
2577
2578                map = expand_xps_map(map, j, index, is_rxqs_map);
2579                if (!map)
2580                        goto error;
2581
2582                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2583        }
2584
2585        if (!new_dev_maps)
2586                goto out_no_new_maps;
2587
2588        if (!dev_maps) {
2589                /* Increment static keys at most once per type */
2590                static_key_slow_inc_cpuslocked(&xps_needed);
2591                if (is_rxqs_map)
2592                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2593        }
2594
2595        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2596             j < nr_ids;) {
2597                /* copy maps belonging to foreign traffic classes */
2598                for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2599                        /* fill in the new device map from the old device map */
2600                        map = xmap_dereference(dev_maps->attr_map[tci]);
2601                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2602                }
2603
2604                /* We need to explicitly update tci as prevous loop
2605                 * could break out early if dev_maps is NULL.
2606                 */
2607                tci = j * num_tc + tc;
2608
2609                if (netif_attr_test_mask(j, mask, nr_ids) &&
2610                    netif_attr_test_online(j, online_mask, nr_ids)) {
2611                        /* add tx-queue to CPU/rx-queue maps */
2612                        int pos = 0;
2613
2614                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
2615                        while ((pos < map->len) && (map->queues[pos] != index))
2616                                pos++;
2617
2618                        if (pos == map->len)
2619                                map->queues[map->len++] = index;
2620#ifdef CONFIG_NUMA
2621                        if (!is_rxqs_map) {
2622                                if (numa_node_id == -2)
2623                                        numa_node_id = cpu_to_node(j);
2624                                else if (numa_node_id != cpu_to_node(j))
2625                                        numa_node_id = -1;
2626                        }
2627#endif
2628                } else if (dev_maps) {
2629                        /* fill in the new device map from the old device map */
2630                        map = xmap_dereference(dev_maps->attr_map[tci]);
2631                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2632                }
2633
2634                /* copy maps belonging to foreign traffic classes */
2635                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2636                        /* fill in the new device map from the old device map */
2637                        map = xmap_dereference(dev_maps->attr_map[tci]);
2638                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2639                }
2640        }
2641
2642        if (is_rxqs_map)
2643                rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2644        else
2645                rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2646
2647        /* Cleanup old maps */
2648        if (!dev_maps)
2649                goto out_no_old_maps;
2650
2651        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2652             j < nr_ids;) {
2653                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2654                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2655                        map = xmap_dereference(dev_maps->attr_map[tci]);
2656                        if (map && map != new_map)
2657                                kfree_rcu(map, rcu);
2658                }
2659        }
2660
2661        kfree_rcu(dev_maps, rcu);
2662
2663out_no_old_maps:
2664        dev_maps = new_dev_maps;
2665        active = true;
2666
2667out_no_new_maps:
2668        if (!is_rxqs_map) {
2669                /* update Tx queue numa node */
2670                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2671                                             (numa_node_id >= 0) ?
2672                                             numa_node_id : NUMA_NO_NODE);
2673        }
2674
2675        if (!dev_maps)
2676                goto out_no_maps;
2677
2678        /* removes tx-queue from unused CPUs/rx-queues */
2679        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2680             j < nr_ids;) {
2681                for (i = tc, tci = j * num_tc; i--; tci++)
2682                        active |= remove_xps_queue(dev_maps, tci, index);
2683                if (!netif_attr_test_mask(j, mask, nr_ids) ||
2684                    !netif_attr_test_online(j, online_mask, nr_ids))
2685                        active |= remove_xps_queue(dev_maps, tci, index);
2686                for (i = num_tc - tc, tci++; --i; tci++)
2687                        active |= remove_xps_queue(dev_maps, tci, index);
2688        }
2689
2690        /* free map if not active */
2691        if (!active)
2692                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2693
2694out_no_maps:
2695        mutex_unlock(&xps_map_mutex);
2696
2697        return 0;
2698error:
2699        /* remove any maps that we added */
2700        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2701             j < nr_ids;) {
2702                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2703                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2704                        map = dev_maps ?
2705                              xmap_dereference(dev_maps->attr_map[tci]) :
2706                              NULL;
2707                        if (new_map && new_map != map)
2708                                kfree(new_map);
2709                }
2710        }
2711
2712        mutex_unlock(&xps_map_mutex);
2713
2714        kfree(new_dev_maps);
2715        return -ENOMEM;
2716}
2717EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2718
2719int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2720                        u16 index)
2721{
2722        int ret;
2723
2724        cpus_read_lock();
2725        ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2726        cpus_read_unlock();
2727
2728        return ret;
2729}
2730EXPORT_SYMBOL(netif_set_xps_queue);
2731
2732#endif
2733static void netdev_unbind_all_sb_channels(struct net_device *dev)
2734{
2735        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2736
2737        /* Unbind any subordinate channels */
2738        while (txq-- != &dev->_tx[0]) {
2739                if (txq->sb_dev)
2740                        netdev_unbind_sb_channel(dev, txq->sb_dev);
2741        }
2742}
2743
2744void netdev_reset_tc(struct net_device *dev)
2745{
2746#ifdef CONFIG_XPS
2747        netif_reset_xps_queues_gt(dev, 0);
2748#endif
2749        netdev_unbind_all_sb_channels(dev);
2750
2751        /* Reset TC configuration of device */
2752        dev->num_tc = 0;
2753        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2754        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2755}
2756EXPORT_SYMBOL(netdev_reset_tc);
2757
2758int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2759{
2760        if (tc >= dev->num_tc)
2761                return -EINVAL;
2762
2763#ifdef CONFIG_XPS
2764        netif_reset_xps_queues(dev, offset, count);
2765#endif
2766        dev->tc_to_txq[tc].count = count;
2767        dev->tc_to_txq[tc].offset = offset;
2768        return 0;
2769}
2770EXPORT_SYMBOL(netdev_set_tc_queue);
2771
2772int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2773{
2774        if (num_tc > TC_MAX_QUEUE)
2775                return -EINVAL;
2776
2777#ifdef CONFIG_XPS
2778        netif_reset_xps_queues_gt(dev, 0);
2779#endif
2780        netdev_unbind_all_sb_channels(dev);
2781
2782        dev->num_tc = num_tc;
2783        return 0;
2784}
2785EXPORT_SYMBOL(netdev_set_num_tc);
2786
2787void netdev_unbind_sb_channel(struct net_device *dev,
2788                              struct net_device *sb_dev)
2789{
2790        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2791
2792#ifdef CONFIG_XPS
2793        netif_reset_xps_queues_gt(sb_dev, 0);
2794#endif
2795        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2796        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2797
2798        while (txq-- != &dev->_tx[0]) {
2799                if (txq->sb_dev == sb_dev)
2800                        txq->sb_dev = NULL;
2801        }
2802}
2803EXPORT_SYMBOL(netdev_unbind_sb_channel);
2804
2805int netdev_bind_sb_channel_queue(struct net_device *dev,
2806                                 struct net_device *sb_dev,
2807                                 u8 tc, u16 count, u16 offset)
2808{
2809        /* Make certain the sb_dev and dev are already configured */
2810        if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2811                return -EINVAL;
2812
2813        /* We cannot hand out queues we don't have */
2814        if ((offset + count) > dev->real_num_tx_queues)
2815                return -EINVAL;
2816
2817        /* Record the mapping */
2818        sb_dev->tc_to_txq[tc].count = count;
2819        sb_dev->tc_to_txq[tc].offset = offset;
2820
2821        /* Provide a way for Tx queue to find the tc_to_txq map or
2822         * XPS map for itself.
2823         */
2824        while (count--)
2825                netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2826
2827        return 0;
2828}
2829EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2830
2831int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2832{
2833        /* Do not use a multiqueue device to represent a subordinate channel */
2834        if (netif_is_multiqueue(dev))
2835                return -ENODEV;
2836
2837        /* We allow channels 1 - 32767 to be used for subordinate channels.
2838         * Channel 0 is meant to be "native" mode and used only to represent
2839         * the main root device. We allow writing 0 to reset the device back
2840         * to normal mode after being used as a subordinate channel.
2841         */
2842        if (channel > S16_MAX)
2843                return -EINVAL;
2844
2845        dev->num_tc = -channel;
2846
2847        return 0;
2848}
2849EXPORT_SYMBOL(netdev_set_sb_channel);
2850
2851/*
2852 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2853 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2854 */
2855int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2856{
2857        bool disabling;
2858        int rc;
2859
2860        disabling = txq < dev->real_num_tx_queues;
2861
2862        if (txq < 1 || txq > dev->num_tx_queues)
2863                return -EINVAL;
2864
2865        if (dev->reg_state == NETREG_REGISTERED ||
2866            dev->reg_state == NETREG_UNREGISTERING) {
2867                ASSERT_RTNL();
2868
2869                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2870                                                  txq);
2871                if (rc)
2872                        return rc;
2873
2874                if (dev->num_tc)
2875                        netif_setup_tc(dev, txq);
2876
2877                dev->real_num_tx_queues = txq;
2878
2879                if (disabling) {
2880                        synchronize_net();
2881                        qdisc_reset_all_tx_gt(dev, txq);
2882#ifdef CONFIG_XPS
2883                        netif_reset_xps_queues_gt(dev, txq);
2884#endif
2885                }
2886        } else {
2887                dev->real_num_tx_queues = txq;
2888        }
2889
2890        return 0;
2891}
2892EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2893
2894#ifdef CONFIG_SYSFS
2895/**
2896 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2897 *      @dev: Network device
2898 *      @rxq: Actual number of RX queues
2899 *
2900 *      This must be called either with the rtnl_lock held or before
2901 *      registration of the net device.  Returns 0 on success, or a
2902 *      negative error code.  If called before registration, it always
2903 *      succeeds.
2904 */
2905int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2906{
2907        int rc;
2908
2909        if (rxq < 1 || rxq > dev->num_rx_queues)
2910                return -EINVAL;
2911
2912        if (dev->reg_state == NETREG_REGISTERED) {
2913                ASSERT_RTNL();
2914
2915                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2916                                                  rxq);
2917                if (rc)
2918                        return rc;
2919        }
2920
2921        dev->real_num_rx_queues = rxq;
2922        return 0;
2923}
2924EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2925#endif
2926
2927/**
2928 * netif_get_num_default_rss_queues - default number of RSS queues
2929 *
2930 * This routine should set an upper limit on the number of RSS queues
2931 * used by default by multiqueue devices.
2932 */
2933int netif_get_num_default_rss_queues(void)
2934{
2935        return is_kdump_kernel() ?
2936                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2937}
2938EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2939
2940static void __netif_reschedule(struct Qdisc *q)
2941{
2942        struct softnet_data *sd;
2943        unsigned long flags;
2944
2945        local_irq_save(flags);
2946        sd = this_cpu_ptr(&softnet_data);
2947        q->next_sched = NULL;
2948        *sd->output_queue_tailp = q;
2949        sd->output_queue_tailp = &q->next_sched;
2950        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2951        local_irq_restore(flags);
2952}
2953
2954void __netif_schedule(struct Qdisc *q)
2955{
2956        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2957                __netif_reschedule(q);
2958}
2959EXPORT_SYMBOL(__netif_schedule);
2960
2961struct dev_kfree_skb_cb {
2962        enum skb_free_reason reason;
2963};
2964
2965static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2966{
2967        return (struct dev_kfree_skb_cb *)skb->cb;
2968}
2969
2970void netif_schedule_queue(struct netdev_queue *txq)
2971{
2972        rcu_read_lock();
2973        if (!netif_xmit_stopped(txq)) {
2974                struct Qdisc *q = rcu_dereference(txq->qdisc);
2975
2976                __netif_schedule(q);
2977        }
2978        rcu_read_unlock();
2979}
2980EXPORT_SYMBOL(netif_schedule_queue);
2981
2982void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2983{
2984        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2985                struct Qdisc *q;
2986
2987                rcu_read_lock();
2988                q = rcu_dereference(dev_queue->qdisc);
2989                __netif_schedule(q);
2990                rcu_read_unlock();
2991        }
2992}
2993EXPORT_SYMBOL(netif_tx_wake_queue);
2994
2995void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2996{
2997        unsigned long flags;
2998
2999        if (unlikely(!skb))
3000                return;

3001
3002        if (likely(refcount_read(&skb->users) == 1)) {
3003                smp_rmb();
3004                refcount_set(&skb->users, 0);
3005        } else if (likely(!refcount_dec_and_test(&skb->users))) {
3006                return;
3007        }
3008        get_kfree_skb_cb(skb)->reason = reason;
3009        local_irq_save(flags);
3010        skb->next = __this_cpu_read(softnet_data.completion_queue);
3011        __this_cpu_write(softnet_data.completion_queue, skb);
3012        raise_softirq_irqoff(NET_TX_SOFTIRQ);
3013        local_irq_restore(flags);
3014}
3015EXPORT_SYMBOL(__dev_kfree_skb_irq);
3016
3017void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3018{
3019        if (in_irq() || irqs_disabled())
3020                __dev_kfree_skb_irq(skb, reason);
3021        else
3022                dev_kfree_skb(skb);
3023}
3024EXPORT_SYMBOL(__dev_kfree_skb_any);
3025
3026
3027/**
3028 * netif_device_detach - mark device as removed
3029 * @dev: network device
3030 *
3031 * Mark device as removed from system and therefore no longer available.
3032 */
3033void netif_device_detach(struct net_device *dev)
3034{
3035        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3036            netif_running(dev)) {
3037                netif_tx_stop_all_queues(dev);
3038        }
3039}
3040EXPORT_SYMBOL(netif_device_detach);
3041
3042/**
3043 * netif_device_attach - mark device as attached
3044 * @dev: network device
3045 *
3046 * Mark device as attached from system and restart if needed.
3047 */
3048void netif_device_attach(struct net_device *dev)
3049{
3050        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3051            netif_running(dev)) {
3052                netif_tx_wake_all_queues(dev);
3053                __netdev_watchdog_up(dev);
3054        }
3055}
3056EXPORT_SYMBOL(netif_device_attach);
3057
3058/*
3059 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3060 * to be used as a distribution range.
3061 */
3062static u16 skb_tx_hash(const struct net_device *dev,
3063                       const struct net_device *sb_dev,
3064                       struct sk_buff *skb)
3065{
3066        u32 hash;
3067        u16 qoffset = 0;
3068        u16 qcount = dev->real_num_tx_queues;
3069
3070        if (dev->num_tc) {
3071                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3072
3073                qoffset = sb_dev->tc_to_txq[tc].offset;
3074                qcount = sb_dev->tc_to_txq[tc].count;
3075        }
3076
3077        if (skb_rx_queue_recorded(skb)) {
3078                hash = skb_get_rx_queue(skb);
3079                if (hash >= qoffset)
3080                        hash -= qoffset;
3081                while (unlikely(hash >= qcount))
3082                        hash -= qcount;
3083                return hash + qoffset;
3084        }
3085
3086        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3087}
3088
3089static void skb_warn_bad_offload(const struct sk_buff *skb)
3090{
3091        static const netdev_features_t null_features;
3092        struct net_device *dev = skb->dev;
3093        const char *name = "";
3094
3095        if (!net_ratelimit())
3096                return;
3097
3098        if (dev) {
3099                if (dev->dev.parent)
3100                        name = dev_driver_string(dev->dev.parent);
3101                else
3102                        name = netdev_name(dev);
3103        }
3104        skb_dump(KERN_WARNING, skb, false);
3105        WARN(1, "%s: caps=(%pNF, %pNF)\n",
3106             name, dev ? &dev->features : &null_features,
3107             skb->sk ? &skb->sk->sk_route_caps : &null_features);
3108}
3109
3110/*
3111 * Invalidate hardware checksum when packet is to be mangled, and
3112 * complete checksum manually on outgoing path.
3113 */
3114int skb_checksum_help(struct sk_buff *skb)
3115{
3116        __wsum csum;
3117        int ret = 0, offset;
3118
3119        if (skb->ip_summed == CHECKSUM_COMPLETE)
3120                goto out_set_summed;
3121
3122        if (unlikely(skb_shinfo(skb)->gso_size)) {
3123                skb_warn_bad_offload(skb);
3124                return -EINVAL;
3125        }
3126
3127        /* Before computing a checksum, we should make sure no frag could
3128         * be modified by an external entity : checksum could be wrong.
3129         */
3130        if (skb_has_shared_frag(skb)) {
3131                ret = __skb_linearize(skb);
3132                if (ret)
3133                        goto out;
3134        }
3135
3136        offset = skb_checksum_start_offset(skb);
3137        BUG_ON(offset >= skb_headlen(skb));
3138        csum = skb_checksum(skb, offset, skb->len - offset, 0);
3139
3140        offset += skb->csum_offset;
3141        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3142
3143        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3144        if (ret)
3145                goto out;
3146
3147        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3148out_set_summed:
3149        skb->ip_summed = CHECKSUM_NONE;
3150out:
3151        return ret;
3152}
3153EXPORT_SYMBOL(skb_checksum_help);
3154
3155int skb_crc32c_csum_help(struct sk_buff *skb)
3156{
3157        __le32 crc32c_csum;
3158        int ret = 0, offset, start;
3159
3160        if (skb->ip_summed != CHECKSUM_PARTIAL)
3161                goto out;
3162
3163        if (unlikely(skb_is_gso(skb)))
3164                goto out;
3165
3166        /* Before computing a checksum, we should make sure no frag could
3167         * be modified by an external entity : checksum could be wrong.
3168         */
3169        if (unlikely(skb_has_shared_frag(skb))) {
3170                ret = __skb_linearize(skb);
3171                if (ret)
3172                        goto out;
3173        }
3174        start = skb_checksum_start_offset(skb);
3175        offset = start + offsetof(struct sctphdr, checksum);
3176        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3177                ret = -EINVAL;
3178                goto out;
3179        }
3180
3181        ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3182        if (ret)
3183                goto out;
3184
3185        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3186                                                  skb->len - start, ~(__u32)0,
3187                                                  crc32c_csum_stub));
3188        *(__le32 *)(skb->data + offset) = crc32c_csum;
3189        skb->ip_summed = CHECKSUM_NONE;
3190        skb->csum_not_inet = 0;
3191out:
3192        return ret;
3193}
3194
3195__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3196{
3197        __be16 type = skb->protocol;
3198
3199        /* Tunnel gso handlers can set protocol to ethernet. */
3200        if (type == htons(ETH_P_TEB)) {
3201                struct ethhdr *eth;
3202
3203                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3204                        return 0;
3205
3206                eth = (struct ethhdr *)skb->data;
3207                type = eth->h_proto;
3208        }
3209
3210        return __vlan_get_protocol(skb, type, depth);
3211}
3212
3213/**
3214 *      skb_mac_gso_segment - mac layer segmentation handler.
3215 *      @skb: buffer to segment
3216 *      @features: features for the output path (see dev->features)
3217 */
3218struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3219                                    netdev_features_t features)
3220{
3221        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3222        struct packet_offload *ptype;
3223        int vlan_depth = skb->mac_len;
3224        __be16 type = skb_network_protocol(skb, &vlan_depth);
3225
3226        if (unlikely(!type))
3227                return ERR_PTR(-EINVAL);
3228
3229        __skb_pull(skb, vlan_depth);
3230
3231        rcu_read_lock();
3232        list_for_each_entry_rcu(ptype, &offload_base, list) {
3233                if (ptype->type == type && ptype->callbacks.gso_segment) {
3234                        segs = ptype->callbacks.gso_segment(skb, features);
3235                        break;
3236                }
3237        }
3238        rcu_read_unlock();
3239
3240        __skb_push(skb, skb->data - skb_mac_header(skb));
3241
3242        return segs;
3243}
3244EXPORT_SYMBOL(skb_mac_gso_segment);
3245
3246
3247/* openvswitch calls this on rx path, so we need a different check.
3248 */
3249static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3250{
3251        if (tx_path)
3252                return skb->ip_summed != CHECKSUM_PARTIAL &&
3253                       skb->ip_summed != CHECKSUM_UNNECESSARY;
3254
3255        return skb->ip_summed == CHECKSUM_NONE;
3256}
3257
3258/**
3259 *      __skb_gso_segment - Perform segmentation on skb.
3260 *      @skb: buffer to segment
3261 *      @features: features for the output path (see dev->features)
3262 *      @tx_path: whether it is called in TX path
3263 *
3264 *      This function segments the given skb and returns a list of segments.
3265 *
3266 *      It may return NULL if the skb requires no segmentation.  This is
3267 *      only possible when GSO is used for verifying header integrity.
3268 *
3269 *      Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3270 */
3271struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3272                                  netdev_features_t features, bool tx_path)
3273{
3274        struct sk_buff *segs;
3275
3276        if (unlikely(skb_needs_check(skb, tx_path))) {
3277                int err;
3278
3279                /* We're going to init ->check field in TCP or UDP header */
3280                err = skb_cow_head(skb, 0);
3281                if (err < 0)
3282                        return ERR_PTR(err);
3283        }
3284
3285        /* Only report GSO partial support if it will enable us to
3286         * support segmentation on this frame without needing additional
3287         * work.
3288         */
3289        if (features & NETIF_F_GSO_PARTIAL) {
3290                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3291                struct net_device *dev = skb->dev;
3292
3293                partial_features |= dev->features & dev->gso_partial_features;
3294                if (!skb_gso_ok(skb, features | partial_features))
3295                        features &= ~NETIF_F_GSO_PARTIAL;
3296        }
3297
3298        BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3299                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3300
3301        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3302        SKB_GSO_CB(skb)->encap_level = 0;
3303
3304        skb_reset_mac_header(skb);
3305        skb_reset_mac_len(skb);
3306
3307        segs = skb_mac_gso_segment(skb, features);
3308
3309        if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3310                skb_warn_bad_offload(skb);
3311
3312        return segs;
3313}
3314EXPORT_SYMBOL(__skb_gso_segment);
3315
3316/* Take action when hardware reception checksum errors are detected. */
3317#ifdef CONFIG_BUG
3318void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3319{
3320        if (net_ratelimit()) {
3321                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3322                skb_dump(KERN_ERR, skb, true);
3323                dump_stack();
3324        }
3325}
3326EXPORT_SYMBOL(netdev_rx_csum_fault);
3327#endif
3328
3329/* XXX: check that highmem exists at all on the given machine. */
3330static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3331{
3332#ifdef CONFIG_HIGHMEM
3333        int i;
3334
3335        if (!(dev->features & NETIF_F_HIGHDMA)) {
3336                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3337                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3338
3339                        if (PageHighMem(skb_frag_page(frag)))
3340                                return 1;
3341                }
3342        }
3343#endif
3344        return 0;
3345}
3346
3347/* If MPLS offload request, verify we are testing hardware MPLS features
3348 * instead of standard features for the netdev.
3349 */
3350#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3351static netdev_features_t net_mpls_features(struct sk_buff *skb,
3352                                           netdev_features_t features,
3353                                           __be16 type)
3354{
3355        if (eth_p_mpls(type))
3356                features &= skb->dev->mpls_features;
3357
3358        return features;
3359}
3360#else
3361static netdev_features_t net_mpls_features(struct sk_buff *skb,
3362                                           netdev_features_t features,
3363                                           __be16 type)
3364{
3365        return features;
3366}
3367#endif
3368
3369static netdev_features_t harmonize_features(struct sk_buff *skb,
3370        netdev_features_t features)
3371{
3372        int tmp;
3373        __be16 type;
3374
3375        type = skb_network_protocol(skb, &tmp);
3376        features = net_mpls_features(skb, features, type);
3377
3378        if (skb->ip_summed != CHECKSUM_NONE &&
3379            !can_checksum_protocol(features, type)) {
3380                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3381        }
3382        if (illegal_highdma(skb->dev, skb))
3383                features &= ~NETIF_F_SG;
3384
3385        return features;
3386}
3387
3388netdev_features_t passthru_features_check(struct sk_buff *skb,
3389                                          struct net_device *dev,
3390                                          netdev_features_t features)
3391{
3392        return features;
3393}
3394EXPORT_SYMBOL(passthru_features_check);
3395
3396static netdev_features_t dflt_features_check(struct sk_buff *skb,
3397                                             struct net_device *dev,
3398                                             netdev_features_t features)
3399{
3400        return vlan_features_check(skb, features);
3401}
3402
3403static netdev_features_t gso_features_check(const struct sk_buff *skb,
3404                                            struct net_device *dev,
3405                                            netdev_features_t features)
3406{
3407        u16 gso_segs = skb_shinfo(skb)->gso_segs;
3408
3409        if (gso_segs > dev->gso_max_segs)
3410                return features & ~NETIF_F_GSO_MASK;
3411
3412        /* Support for GSO partial features requires software
3413         * intervention before we can actually process the packets
3414         * so we need to strip support for any partial features now
3415         * and we can pull them back in after we have partially
3416         * segmented the frame.
3417         */
3418        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3419                features &= ~dev->gso_partial_features;
3420
3421        /* Make sure to clear the IPv4 ID mangling feature if the
3422         * IPv4 header has the potential to be fragmented.
3423         */
3424        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3425                struct iphdr *iph = skb->encapsulation ?
3426                                    inner_ip_hdr(skb) : ip_hdr(skb);
3427
3428                if (!(iph->frag_off & htons(IP_DF)))
3429                        features &= ~NETIF_F_TSO_MANGLEID;
3430        }
3431
3432        return features;
3433}
3434
3435netdev_features_t netif_skb_features(struct sk_buff *skb)
3436{
3437        struct net_device *dev = skb->dev;
3438        netdev_features_t features = dev->features;
3439
3440        if (skb_is_gso(skb))
3441                features = gso_features_check(skb, dev, features);
3442
3443        /* If encapsulation offload request, verify we are testing
3444         * hardware encapsulation features instead of standard
3445         * features for the netdev
3446         */
3447        if (skb->encapsulation)
3448                features &= dev->hw_enc_features;
3449
3450        if (skb_vlan_tagged(skb))
3451                features = netdev_intersect_features(features,
3452                                                     dev->vlan_features |
3453                                                     NETIF_F_HW_VLAN_CTAG_TX |
3454                                                     NETIF_F_HW_VLAN_STAG_TX);
3455
3456        if (dev->netdev_ops->ndo_features_check)
3457                features &= dev->netdev_ops->ndo_features_check(skb, dev,
3458                                                                features);
3459        else
3460                features &= dflt_features_check(skb, dev, features);
3461
3462        return harmonize_features(skb, features);
3463}
3464EXPORT_SYMBOL(netif_skb_features);
3465
3466static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3467                    struct netdev_queue *txq, bool more)
3468{
3469        unsigned int len;
3470        int rc;
3471
3472        if (dev_nit_active(dev))
3473                dev_queue_xmit_nit(skb, dev);
3474
3475        len = skb->len;
3476        trace_net_dev_start_xmit(skb, dev);
3477        rc = netdev_start_xmit(skb, dev, txq, more);
3478        trace_net_dev_xmit(skb, rc, dev, len);
3479
3480        return rc;
3481}
3482
3483struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3484                                    struct netdev_queue *txq, int *ret)
3485{
3486        struct sk_buff *skb = first;
3487        int rc = NETDEV_TX_OK;
3488
3489        while (skb) {
3490                struct sk_buff *next = skb->next;
3491
3492                skb_mark_not_on_list(skb);
3493                rc = xmit_one(skb, dev, txq, next != NULL);
3494                if (unlikely(!dev_xmit_complete(rc))) {
3495                        skb->next = next;
3496                        goto out;
3497                }
3498
3499                skb = next;
3500                if (netif_tx_queue_stopped(txq) && skb) {
3501                        rc = NETDEV_TX_BUSY;
3502                        break;
3503                }
3504        }
3505
3506out:
3507        *ret = rc;
3508        return skb;
3509}
3510
3511static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3512                                          netdev_features_t features)
3513{
3514        if (skb_vlan_tag_present(skb) &&
3515            !vlan_hw_offload_capable(features, skb->vlan_proto))
3516                skb = __vlan_hwaccel_push_inside(skb);
3517        return skb;
3518}
3519
3520int skb_csum_hwoffload_help(struct sk_buff *skb,
3521                            const netdev_features_t features)
3522{
3523        if (unlikely(skb->csum_not_inet))
3524                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3525                        skb_crc32c_csum_help(skb);
3526
3527        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3528}
3529EXPORT_SYMBOL(skb_csum_hwoffload_help);
3530
3531static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3532{
3533        netdev_features_t features;
3534
3535        features = netif_skb_features(skb);
3536        skb = validate_xmit_vlan(skb, features);
3537        if (unlikely(!skb))
3538                goto out_null;
3539
3540        skb = sk_validate_xmit_skb(skb, dev);
3541        if (unlikely(!skb))
3542                goto out_null;
3543
3544        if (netif_needs_gso(skb, features)) {
3545                struct sk_buff *segs;
3546
3547                segs = skb_gso_segment(skb, features);
3548                if (IS_ERR(segs)) {
3549                        goto out_kfree_skb;
3550                } else if (segs) {
3551                        consume_skb(skb);
3552                        skb = segs;
3553                }
3554        } else {
3555                if (skb_needs_linearize(skb, features) &&
3556                    __skb_linearize(skb))
3557                        goto out_kfree_skb;
3558
3559                /* If packet is not checksummed and device does not
3560                 * support checksumming for this protocol, complete
3561                 * checksumming here.
3562                 */
3563                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3564                        if (skb->encapsulation)
3565                                skb_set_inner_transport_header(skb,
3566                                                               skb_checksum_start_offset(skb));
3567                        else
3568                                skb_set_transport_header(skb,
3569                                                         skb_checksum_start_offset(skb));
3570                        if (skb_csum_hwoffload_help(skb, features))
3571                                goto out_kfree_skb;
3572                }
3573        }
3574
3575        skb = validate_xmit_xfrm(skb, features, again);
3576
3577        return skb;
3578
3579out_kfree_skb:
3580        kfree_skb(skb);
3581out_null:
3582        atomic_long_inc(&dev->tx_dropped);
3583        return NULL;
3584}
3585
3586struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3587{
3588        struct sk_buff *next, *head = NULL, *tail;
3589
3590        for (; skb != NULL; skb = next) {
3591                next = skb->next;
3592                skb_mark_not_on_list(skb);
3593
3594                /* in case skb wont be segmented, point to itself */
3595                skb->prev = skb;
3596
3597                skb = validate_xmit_skb(skb, dev, again);
3598                if (!skb)
3599                        continue;
3600
3601                if (!head)
3602                        head = skb;
3603                else
3604                        tail->next = skb;
3605                /* If skb was segmented, skb->prev points to
3606                 * the last segment. If not, it still contains skb.
3607                 */
3608                tail = skb->prev;
3609        }
3610        return head;
3611}
3612EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3613
3614static void qdisc_pkt_len_init(struct sk_buff *skb)
3615{
3616        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3617
3618        qdisc_skb_cb(skb)->pkt_len = skb->len;
3619
3620        /* To get more precise estimation of bytes sent on wire,
3621         * we add to pkt_len the headers size of all segments
3622         */
3623        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3624                unsigned int hdr_len;
3625                u16 gso_segs = shinfo->gso_segs;
3626
3627                /* mac layer + network layer */
3628                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3629
3630                /* + transport layer */
3631                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3632                        const struct tcphdr *th;
3633                        struct tcphdr _tcphdr;
3634
3635                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3636                                                sizeof(_tcphdr), &_tcphdr);
3637                        if (likely(th))
3638                                hdr_len += __tcp_hdrlen(th);
3639                } else {
3640                        struct udphdr _udphdr;
3641
3642                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3643                                               sizeof(_udphdr), &_udphdr))
3644                                hdr_len += sizeof(struct udphdr);
3645                }
3646
3647                if (shinfo->gso_type & SKB_GSO_DODGY)
3648                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3649                                                shinfo->gso_size);
3650
3651                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3652        }
3653}
3654
3655static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3656                                 struct net_device *dev,
3657                                 struct netdev_queue *txq)
3658{
3659        spinlock_t *root_lock = qdisc_lock(q);
3660        struct sk_buff *to_free = NULL;
3661        bool contended;
3662        int rc;
3663
3664        qdisc_calculate_pkt_len(skb, q);
3665
3666        if (q->flags & TCQ_F_NOLOCK) {
3667                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3668                qdisc_run(q);
3669
3670                if (unlikely(to_free))
3671                        kfree_skb_list(to_free);
3672                return rc;
3673        }
3674
3675        /*
3676         * Heuristic to force contended enqueues to serialize on a
3677         * separate lock before trying to get qdisc main lock.
3678         * This permits qdisc->running owner to get the lock more
3679         * often and dequeue packets faster.
3680         */
3681        contended = qdisc_is_running(q);
3682        if (unlikely(contended))
3683                spin_lock(&q->busylock);
3684
3685        spin_lock(root_lock);
3686        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3687                __qdisc_drop(skb, &to_free);
3688                rc = NET_XMIT_DROP;
3689        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3690                   qdisc_run_begin(q)) {
3691                /*
3692                 * This is a work-conserving queue; there are no old skbs
3693                 * waiting to be sent out; and the qdisc is not running -
3694                 * xmit the skb directly.
3695                 */
3696
3697                qdisc_bstats_update(q, skb);
3698
3699                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3700                        if (unlikely(contended)) {
3701                                spin_unlock(&q->busylock);
3702                                contended = false;
3703                        }
3704                        __qdisc_run(q);
3705                }
3706
3707                qdisc_run_end(q);
3708                rc = NET_XMIT_SUCCESS;
3709        } else {
3710                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3711                if (qdisc_run_begin(q)) {
3712                        if (unlikely(contended)) {
3713                                spin_unlock(&q->busylock);
3714                                contended = false;
3715                        }
3716                        __qdisc_run(q);
3717                        qdisc_run_end(q);
3718                }
3719        }
3720        spin_unlock(root_lock);
3721        if (unlikely(to_free))
3722                kfree_skb_list(to_free);
3723        if (unlikely(contended))
3724                spin_unlock(&q->busylock);
3725        return rc;
3726}
3727
3728#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3729static void skb_update_prio(struct sk_buff *skb)
3730{
3731        const struct netprio_map *map;
3732        const struct sock *sk;
3733        unsigned int prioidx;
3734
3735        if (skb->priority)
3736                return;
3737        map = rcu_dereference_bh(skb->dev->priomap);
3738        if (!map)
3739                return;
3740        sk = skb_to_full_sk(skb);
3741        if (!sk)
3742                return;
3743
3744        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3745
3746        if (prioidx < map->priomap_len)
3747                skb->priority = map->priomap[prioidx];
3748}
3749#else
3750#define skb_update_prio(skb)
3751#endif
3752
3753/**
3754 *      dev_loopback_xmit - loop back @skb
3755 *      @net: network namespace this loopback is happening in
3756 *      @sk:  sk needed to be a netfilter okfn
3757 *      @skb: buffer to transmit
3758 */
3759int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3760{
3761        skb_reset_mac_header(skb);
3762        __skb_pull(skb, skb_network_offset(skb));
3763        skb->pkt_type = PACKET_LOOPBACK;
3764        skb->ip_summed = CHECKSUM_UNNECESSARY;
3765        WARN_ON(!skb_dst(skb));
3766        skb_dst_force(skb);
3767        netif_rx_ni(skb);
3768        return 0;
3769}
3770EXPORT_SYMBOL(dev_loopback_xmit);
3771
3772#ifdef CONFIG_NET_EGRESS
3773static struct sk_buff *
3774sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3775{
3776        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3777        struct tcf_result cl_res;
3778
3779        if (!miniq)
3780                return skb;
3781
3782        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3783        mini_qdisc_bstats_cpu_update(miniq, skb);
3784
3785        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3786        case TC_ACT_OK:
3787        case TC_ACT_RECLASSIFY:
3788                skb->tc_index = TC_H_MIN(cl_res.classid);
3789                break;
3790        case TC_ACT_SHOT:
3791                mini_qdisc_qstats_cpu_drop(miniq);
3792                *ret = NET_XMIT_DROP;
3793                kfree_skb(skb);
3794                return NULL;
3795        case TC_ACT_STOLEN:
3796        case TC_ACT_QUEUED:
3797        case TC_ACT_TRAP:
3798                *ret = NET_XMIT_SUCCESS;
3799                consume_skb(skb);
3800                return NULL;
3801        case TC_ACT_REDIRECT:
3802                /* No need to push/pop skb's mac_header here on egress! */
3803                skb_do_redirect(skb);
3804                *ret = NET_XMIT_SUCCESS;
3805                return NULL;
3806        default:
3807                break;
3808        }
3809
3810        return skb;
3811}
3812#endif /* CONFIG_NET_EGRESS */
3813
3814#ifdef CONFIG_XPS
3815static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3816                               struct xps_dev_maps *dev_maps, unsigned int tci)
3817{
3818        struct xps_map *map;
3819        int queue_index = -1;
3820
3821        if (dev->num_tc) {
3822                tci *= dev->num_tc;
3823                tci += netdev_get_prio_tc_map(dev, skb->priority);
3824        }
3825
3826        map = rcu_dereference(dev_maps->attr_map[tci]);
3827        if (map) {
3828                if (map->len == 1)
3829                        queue_index = map->queues[0];
3830                else
3831                        queue_index = map->queues[reciprocal_scale(
3832                                                skb_get_hash(skb), map->len)];
3833                if (unlikely(queue_index >= dev->real_num_tx_queues))
3834                        queue_index = -1;
3835        }
3836        return queue_index;
3837}
3838#endif
3839
3840static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3841                         struct sk_buff *skb)
3842{
3843#ifdef CONFIG_XPS
3844        struct xps_dev_maps *dev_maps;
3845        struct sock *sk = skb->sk;
3846        int queue_index = -1;
3847
3848        if (!static_key_false(&xps_needed))
3849                return -1;
3850
3851        rcu_read_lock();
3852        if (!static_key_false(&xps_rxqs_needed))
3853                goto get_cpus_map;
3854
3855        dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3856        if (dev_maps) {
3857                int tci = sk_rx_queue_get(sk);
3858
3859                if (tci >= 0 && tci < dev->num_rx_queues)
3860                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3861                                                          tci);
3862        }
3863
3864get_cpus_map:
3865        if (queue_index < 0) {
3866                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3867                if (dev_maps) {
3868                        unsigned int tci = skb->sender_cpu - 1;
3869
3870                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3871                                                          tci);
3872                }
3873        }
3874        rcu_read_unlock();
3875
3876        return queue_index;
3877#else
3878        return -1;
3879#endif
3880}
3881
3882u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3883                     struct net_device *sb_dev)
3884{
3885        return 0;
3886}
3887EXPORT_SYMBOL(dev_pick_tx_zero);
3888
3889u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3890                       struct net_device *sb_dev)
3891{
3892        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3893}
3894EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3895
3896u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3897                     struct net_device *sb_dev)
3898{
3899        struct sock *sk = skb->sk;
3900        int queue_index = sk_tx_queue_get(sk);
3901
3902        sb_dev = sb_dev ? : dev;
3903
3904        if (queue_index < 0 || skb->ooo_okay ||
3905            queue_index >= dev->real_num_tx_queues) {
3906                int new_index = get_xps_queue(dev, sb_dev, skb);
3907
3908                if (new_index < 0)
3909                        new_index = skb_tx_hash(dev, sb_dev, skb);
3910
3911                if (queue_index != new_index && sk &&
3912                    sk_fullsock(sk) &&
3913                    rcu_access_pointer(sk->sk_dst_cache))
3914                        sk_tx_queue_set(sk, new_index);
3915
3916                queue_index = new_index;
3917        }
3918
3919        return queue_index;
3920}
3921EXPORT_SYMBOL(netdev_pick_tx);
3922
3923struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
3924                                         struct sk_buff *skb,
3925                                         struct net_device *sb_dev)
3926{
3927        int queue_index = 0;
3928
3929#ifdef CONFIG_XPS
3930        u32 sender_cpu = skb->sender_cpu - 1;
3931
3932        if (sender_cpu >= (u32)NR_CPUS)
3933                skb->sender_cpu = raw_smp_processor_id() + 1;
3934#endif
3935
3936        if (dev->real_num_tx_queues != 1) {
3937                const struct net_device_ops *ops = dev->netdev_ops;
3938
3939                if (ops->ndo_select_queue)
3940                        queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
3941                else
3942                        queue_index = netdev_pick_tx(dev, skb, sb_dev);
3943
3944                queue_index = netdev_cap_txqueue(dev, queue_index);
3945        }
3946
3947        skb_set_queue_mapping(skb, queue_index);
3948        return netdev_get_tx_queue(dev, queue_index);
3949}
3950
3951/**
3952 *      __dev_queue_xmit - transmit a buffer
3953 *      @skb: buffer to transmit
3954 *      @sb_dev: suboordinate device used for L2 forwarding offload
3955 *
3956 *      Queue a buffer for transmission to a network device. The caller must
3957 *      have set the device and priority and built the buffer before calling
3958 *      this function. The function can be called from an interrupt.
3959 *
3960 *      A negative errno code is returned on a failure. A success does not
3961 *      guarantee the frame will be transmitted as it may be dropped due
3962 *      to congestion or traffic shaping.
3963 *
3964 * -----------------------------------------------------------------------------------
3965 *      I notice this method can also return errors from the queue disciplines,
3966 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3967 *      be positive.
3968 *
3969 *      Regardless of the return value, the skb is consumed, so it is currently
3970 *      difficult to retry a send to this method.  (You can bump the ref count
3971 *      before sending to hold a reference for retry if you are careful.)
3972 *
3973 *      When calling this method, interrupts MUST be enabled.  This is because
3974 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3975 *          --BLG
3976 */
3977static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3978{
3979        struct net_device *dev = skb->dev;
3980        struct netdev_queue *txq;
3981        struct Qdisc *q;
3982        int rc = -ENOMEM;
3983        bool again = false;
3984
3985        skb_reset_mac_header(skb);
3986
3987        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3988                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3989
3990        /* Disable soft irqs for various locks below. Also
3991         * stops preemption for RCU.
3992         */
3993        rcu_read_lock_bh();
3994
3995        skb_update_prio(skb);
3996
3997        qdisc_pkt_len_init(skb);
3998#ifdef CONFIG_NET_CLS_ACT
3999        skb->tc_at_ingress = 0;
4000# ifdef CONFIG_NET_EGRESS

4001        if (static_branch_unlikely(&egress_needed_key)) {
4002                skb = sch_handle_egress(skb, &rc, dev);
4003                if (!skb)
4004                        goto out;
4005        }
4006# endif
4007#endif
4008        /* If device/qdisc don't need skb->dst, release it right now while
4009         * its hot in this cpu cache.
4010         */
4011        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4012                skb_dst_drop(skb);
4013        else
4014                skb_dst_force(skb);
4015
4016        txq = netdev_core_pick_tx(dev, skb, sb_dev);
4017        q = rcu_dereference_bh(txq->qdisc);
4018
4019        trace_net_dev_queue(skb);
4020        if (q->enqueue) {
4021                rc = __dev_xmit_skb(skb, q, dev, txq);
4022                goto out;
4023        }
4024
4025        /* The device has no queue. Common case for software devices:
4026         * loopback, all the sorts of tunnels...
4027
4028         * Really, it is unlikely that netif_tx_lock protection is necessary
4029         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
4030         * counters.)
4031         * However, it is possible, that they rely on protection
4032         * made by us here.
4033
4034         * Check this and shot the lock. It is not prone from deadlocks.
4035         *Either shot noqueue qdisc, it is even simpler 8)
4036         */
4037        if (dev->flags & IFF_UP) {
4038                int cpu = smp_processor_id(); /* ok because BHs are off */
4039
4040                if (txq->xmit_lock_owner != cpu) {
4041                        if (dev_xmit_recursion())
4042                                goto recursion_alert;
4043
4044                        skb = validate_xmit_skb(skb, dev, &again);
4045                        if (!skb)
4046                                goto out;
4047
4048                        HARD_TX_LOCK(dev, txq, cpu);
4049
4050                        if (!netif_xmit_stopped(txq)) {
4051                                dev_xmit_recursion_inc();
4052                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4053                                dev_xmit_recursion_dec();
4054                                if (dev_xmit_complete(rc)) {
4055                                        HARD_TX_UNLOCK(dev, txq);
4056                                        goto out;
4057                                }
4058                        }
4059                        HARD_TX_UNLOCK(dev, txq);
4060                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4061                                             dev->name);
4062                } else {
4063                        /* Recursion is detected! It is possible,
4064                         * unfortunately
4065                         */
4066recursion_alert:
4067                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4068                                             dev->name);
4069                }
4070        }
4071
4072        rc = -ENETDOWN;
4073        rcu_read_unlock_bh();
4074
4075        atomic_long_inc(&dev->tx_dropped);
4076        kfree_skb_list(skb);
4077        return rc;
4078out:
4079        rcu_read_unlock_bh();
4080        return rc;
4081}
4082
4083int dev_queue_xmit(struct sk_buff *skb)
4084{
4085        return __dev_queue_xmit(skb, NULL);
4086}
4087EXPORT_SYMBOL(dev_queue_xmit);
4088
4089int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4090{
4091        return __dev_queue_xmit(skb, sb_dev);
4092}
4093EXPORT_SYMBOL(dev_queue_xmit_accel);
4094
4095int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4096{
4097        struct net_device *dev = skb->dev;
4098        struct sk_buff *orig_skb = skb;
4099        struct netdev_queue *txq;
4100        int ret = NETDEV_TX_BUSY;
4101        bool again = false;
4102
4103        if (unlikely(!netif_running(dev) ||
4104                     !netif_carrier_ok(dev)))
4105                goto drop;
4106
4107        skb = validate_xmit_skb_list(skb, dev, &again);
4108        if (skb != orig_skb)
4109                goto drop;
4110
4111        skb_set_queue_mapping(skb, queue_id);
4112        txq = skb_get_tx_queue(dev, skb);
4113
4114        local_bh_disable();
4115
4116        HARD_TX_LOCK(dev, txq, smp_processor_id());
4117        if (!netif_xmit_frozen_or_drv_stopped(txq))
4118                ret = netdev_start_xmit(skb, dev, txq, false);
4119        HARD_TX_UNLOCK(dev, txq);
4120
4121        local_bh_enable();
4122
4123        if (!dev_xmit_complete(ret))
4124                kfree_skb(skb);
4125
4126        return ret;
4127drop:
4128        atomic_long_inc(&dev->tx_dropped);
4129        kfree_skb_list(skb);
4130        return NET_XMIT_DROP;
4131}
4132EXPORT_SYMBOL(dev_direct_xmit);
4133
4134/*************************************************************************
4135 *                      Receiver routines
4136 *************************************************************************/
4137
4138int netdev_max_backlog __read_mostly = 1000;
4139EXPORT_SYMBOL(netdev_max_backlog);
4140
4141int netdev_tstamp_prequeue __read_mostly = 1;
4142int netdev_budget __read_mostly = 300;
4143/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
4144unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4145int weight_p __read_mostly = 64;           /* old backlog weight */
4146int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4147int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4148int dev_rx_weight __read_mostly = 64;
4149int dev_tx_weight __read_mostly = 64;
4150/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4151int gro_normal_batch __read_mostly = 8;
4152
4153/* Called with irq disabled */
4154static inline void ____napi_schedule(struct softnet_data *sd,
4155                                     struct napi_struct *napi)
4156{
4157        list_add_tail(&napi->poll_list, &sd->poll_list);
4158        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4159}
4160
4161#ifdef CONFIG_RPS
4162
4163/* One global table that all flow-based protocols share. */
4164struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4165EXPORT_SYMBOL(rps_sock_flow_table);
4166u32 rps_cpu_mask __read_mostly;
4167EXPORT_SYMBOL(rps_cpu_mask);
4168
4169struct static_key_false rps_needed __read_mostly;
4170EXPORT_SYMBOL(rps_needed);
4171struct static_key_false rfs_needed __read_mostly;
4172EXPORT_SYMBOL(rfs_needed);
4173
4174static struct rps_dev_flow *
4175set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4176            struct rps_dev_flow *rflow, u16 next_cpu)
4177{
4178        if (next_cpu < nr_cpu_ids) {
4179#ifdef CONFIG_RFS_ACCEL
4180                struct netdev_rx_queue *rxqueue;
4181                struct rps_dev_flow_table *flow_table;
4182                struct rps_dev_flow *old_rflow;
4183                u32 flow_id;
4184                u16 rxq_index;
4185                int rc;
4186
4187                /* Should we steer this flow to a different hardware queue? */
4188                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4189                    !(dev->features & NETIF_F_NTUPLE))
4190                        goto out;
4191                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4192                if (rxq_index == skb_get_rx_queue(skb))
4193                        goto out;
4194
4195                rxqueue = dev->_rx + rxq_index;
4196                flow_table = rcu_dereference(rxqueue->rps_flow_table);
4197                if (!flow_table)
4198                        goto out;
4199                flow_id = skb_get_hash(skb) & flow_table->mask;
4200                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4201                                                        rxq_index, flow_id);
4202                if (rc < 0)
4203                        goto out;
4204                old_rflow = rflow;
4205                rflow = &flow_table->flows[flow_id];
4206                rflow->filter = rc;
4207                if (old_rflow->filter == rflow->filter)
4208                        old_rflow->filter = RPS_NO_FILTER;
4209        out:
4210#endif
4211                rflow->last_qtail =
4212                        per_cpu(softnet_data, next_cpu).input_queue_head;
4213        }
4214
4215        rflow->cpu = next_cpu;
4216        return rflow;
4217}
4218
4219/*
4220 * get_rps_cpu is called from netif_receive_skb and returns the target
4221 * CPU from the RPS map of the receiving queue for a given skb.
4222 * rcu_read_lock must be held on entry.
4223 */
4224static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4225                       struct rps_dev_flow **rflowp)
4226{
4227        const struct rps_sock_flow_table *sock_flow_table;
4228        struct netdev_rx_queue *rxqueue = dev->_rx;
4229        struct rps_dev_flow_table *flow_table;
4230        struct rps_map *map;
4231        int cpu = -1;
4232        u32 tcpu;
4233        u32 hash;
4234
4235        if (skb_rx_queue_recorded(skb)) {
4236                u16 index = skb_get_rx_queue(skb);
4237
4238                if (unlikely(index >= dev->real_num_rx_queues)) {
4239                        WARN_ONCE(dev->real_num_rx_queues > 1,
4240                                  "%s received packet on queue %u, but number "
4241                                  "of RX queues is %u\n",
4242                                  dev->name, index, dev->real_num_rx_queues);
4243                        goto done;
4244                }
4245                rxqueue += index;
4246        }
4247
4248        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4249
4250        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4251        map = rcu_dereference(rxqueue->rps_map);
4252        if (!flow_table && !map)
4253                goto done;
4254
4255        skb_reset_network_header(skb);
4256        hash = skb_get_hash(skb);
4257        if (!hash)
4258                goto done;
4259
4260        sock_flow_table = rcu_dereference(rps_sock_flow_table);
4261        if (flow_table && sock_flow_table) {
4262                struct rps_dev_flow *rflow;
4263                u32 next_cpu;
4264                u32 ident;
4265
4266                /* First check into global flow table if there is a match */
4267                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4268                if ((ident ^ hash) & ~rps_cpu_mask)
4269                        goto try_rps;
4270
4271                next_cpu = ident & rps_cpu_mask;
4272
4273                /* OK, now we know there is a match,
4274                 * we can look at the local (per receive queue) flow table
4275                 */
4276                rflow = &flow_table->flows[hash & flow_table->mask];
4277                tcpu = rflow->cpu;
4278
4279                /*
4280                 * If the desired CPU (where last recvmsg was done) is
4281                 * different from current CPU (one in the rx-queue flow
4282                 * table entry), switch if one of the following holds:
4283                 *   - Current CPU is unset (>= nr_cpu_ids).
4284                 *   - Current CPU is offline.
4285                 *   - The current CPU's queue tail has advanced beyond the
4286                 *     last packet that was enqueued using this table entry.
4287                 *     This guarantees that all previous packets for the flow
4288                 *     have been dequeued, thus preserving in order delivery.
4289                 */
4290                if (unlikely(tcpu != next_cpu) &&
4291                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4292                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4293                      rflow->last_qtail)) >= 0)) {
4294                        tcpu = next_cpu;
4295                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4296                }
4297
4298                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4299                        *rflowp = rflow;
4300                        cpu = tcpu;
4301                        goto done;
4302                }
4303        }
4304
4305try_rps:
4306
4307        if (map) {
4308                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4309                if (cpu_online(tcpu)) {
4310                        cpu = tcpu;
4311                        goto done;
4312                }
4313        }
4314
4315done:
4316        return cpu;
4317}
4318
4319#ifdef CONFIG_RFS_ACCEL
4320
4321/**
4322 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4323 * @dev: Device on which the filter was set
4324 * @rxq_index: RX queue index
4325 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4326 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4327 *
4328 * Drivers that implement ndo_rx_flow_steer() should periodically call
4329 * this function for each installed filter and remove the filters for
4330 * which it returns %true.
4331 */
4332bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4333                         u32 flow_id, u16 filter_id)
4334{
4335        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4336        struct rps_dev_flow_table *flow_table;
4337        struct rps_dev_flow *rflow;
4338        bool expire = true;
4339        unsigned int cpu;
4340
4341        rcu_read_lock();
4342        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4343        if (flow_table && flow_id <= flow_table->mask) {
4344                rflow = &flow_table->flows[flow_id];
4345                cpu = READ_ONCE(rflow->cpu);
4346                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4347                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4348                           rflow->last_qtail) <
4349                     (int)(10 * flow_table->mask)))
4350                        expire = false;
4351        }
4352        rcu_read_unlock();
4353        return expire;
4354}
4355EXPORT_SYMBOL(rps_may_expire_flow);
4356
4357#endif /* CONFIG_RFS_ACCEL */
4358
4359/* Called from hardirq (IPI) context */
4360static void rps_trigger_softirq(void *data)
4361{
4362        struct softnet_data *sd = data;
4363
4364        ____napi_schedule(sd, &sd->backlog);
4365        sd->received_rps++;
4366}
4367
4368#endif /* CONFIG_RPS */
4369
4370/*
4371 * Check if this softnet_data structure is another cpu one
4372 * If yes, queue it to our IPI list and return 1
4373 * If no, return 0
4374 */
4375static int rps_ipi_queued(struct softnet_data *sd)
4376{
4377#ifdef CONFIG_RPS
4378        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4379
4380        if (sd != mysd) {
4381                sd->rps_ipi_next = mysd->rps_ipi_list;
4382                mysd->rps_ipi_list = sd;
4383
4384                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4385                return 1;
4386        }
4387#endif /* CONFIG_RPS */
4388        return 0;
4389}
4390
4391#ifdef CONFIG_NET_FLOW_LIMIT
4392int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4393#endif
4394
4395static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4396{
4397#ifdef CONFIG_NET_FLOW_LIMIT
4398        struct sd_flow_limit *fl;
4399        struct softnet_data *sd;
4400        unsigned int old_flow, new_flow;
4401
4402        if (qlen < (netdev_max_backlog >> 1))
4403                return false;
4404
4405        sd = this_cpu_ptr(&softnet_data);
4406
4407        rcu_read_lock();
4408        fl = rcu_dereference(sd->flow_limit);
4409        if (fl) {
4410                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4411                old_flow = fl->history[fl->history_head];
4412                fl->history[fl->history_head] = new_flow;
4413
4414                fl->history_head++;
4415                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4416
4417                if (likely(fl->buckets[old_flow]))
4418                        fl->buckets[old_flow]--;
4419
4420                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4421                        fl->count++;
4422                        rcu_read_unlock();
4423                        return true;
4424                }
4425        }
4426        rcu_read_unlock();
4427#endif
4428        return false;
4429}
4430
4431/*
4432 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4433 * queue (may be a remote CPU queue).
4434 */
4435static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4436                              unsigned int *qtail)
4437{
4438        struct softnet_data *sd;
4439        unsigned long flags;
4440        unsigned int qlen;
4441
4442        sd = &per_cpu(softnet_data, cpu);
4443
4444        local_irq_save(flags);
4445
4446        rps_lock(sd);
4447        if (!netif_running(skb->dev))
4448                goto drop;
4449        qlen = skb_queue_len(&sd->input_pkt_queue);
4450        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4451                if (qlen) {
4452enqueue:
4453                        __skb_queue_tail(&sd->input_pkt_queue, skb);
4454                        input_queue_tail_incr_save(sd, qtail);
4455                        rps_unlock(sd);
4456                        local_irq_restore(flags);
4457                        return NET_RX_SUCCESS;
4458                }
4459
4460                /* Schedule NAPI for backlog device
4461                 * We can use non atomic operation since we own the queue lock
4462                 */
4463                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4464                        if (!rps_ipi_queued(sd))
4465                                ____napi_schedule(sd, &sd->backlog);
4466                }
4467                goto enqueue;
4468        }
4469
4470drop:
4471        sd->dropped++;
4472        rps_unlock(sd);
4473
4474        local_irq_restore(flags);
4475
4476        atomic_long_inc(&skb->dev->rx_dropped);
4477        kfree_skb(skb);
4478        return NET_RX_DROP;
4479}
4480
4481static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4482{
4483        struct net_device *dev = skb->dev;
4484        struct netdev_rx_queue *rxqueue;
4485
4486        rxqueue = dev->_rx;
4487
4488        if (skb_rx_queue_recorded(skb)) {
4489                u16 index = skb_get_rx_queue(skb);
4490
4491                if (unlikely(index >= dev->real_num_rx_queues)) {
4492                        WARN_ONCE(dev->real_num_rx_queues > 1,
4493                                  "%s received packet on queue %u, but number "
4494                                  "of RX queues is %u\n",
4495                                  dev->name, index, dev->real_num_rx_queues);
4496
4497                        return rxqueue; /* Return first rxqueue */
4498                }
4499                rxqueue += index;
4500        }
4501        return rxqueue;
4502}
4503
4504static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4505                                     struct xdp_buff *xdp,
4506                                     struct bpf_prog *xdp_prog)
4507{
4508        struct netdev_rx_queue *rxqueue;
4509        void *orig_data, *orig_data_end;
4510        u32 metalen, act = XDP_DROP;
4511        __be16 orig_eth_type;
4512        struct ethhdr *eth;
4513        bool orig_bcast;
4514        int hlen, off;
4515        u32 mac_len;
4516
4517        /* Reinjected packets coming from act_mirred or similar should
4518         * not get XDP generic processing.
4519         */
4520        if (skb_is_redirected(skb))
4521                return XDP_PASS;
4522
4523        /* XDP packets must be linear and must have sufficient headroom
4524         * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4525         * native XDP provides, thus we need to do it here as well.
4526         */
4527        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4528            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4529                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4530                int troom = skb->tail + skb->data_len - skb->end;
4531
4532                /* In case we have to go down the path and also linearize,
4533                 * then lets do the pskb_expand_head() work just once here.
4534                 */
4535                if (pskb_expand_head(skb,
4536                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4537                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4538                        goto do_drop;
4539                if (skb_linearize(skb))
4540                        goto do_drop;
4541        }
4542
4543        /* The XDP program wants to see the packet starting at the MAC
4544         * header.
4545         */
4546        mac_len = skb->data - skb_mac_header(skb);
4547        hlen = skb_headlen(skb) + mac_len;
4548        xdp->data = skb->data - mac_len;
4549        xdp->data_meta = xdp->data;
4550        xdp->data_end = xdp->data + hlen;
4551        xdp->data_hard_start = skb->data - skb_headroom(skb);
4552        orig_data_end = xdp->data_end;
4553        orig_data = xdp->data;
4554        eth = (struct ethhdr *)xdp->data;
4555        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4556        orig_eth_type = eth->h_proto;
4557
4558        rxqueue = netif_get_rxqueue(skb);
4559        xdp->rxq = &rxqueue->xdp_rxq;
4560
4561        act = bpf_prog_run_xdp(xdp_prog, xdp);
4562
4563        /* check if bpf_xdp_adjust_head was used */
4564        off = xdp->data - orig_data;
4565        if (off) {
4566                if (off > 0)
4567                        __skb_pull(skb, off);
4568                else if (off < 0)
4569                        __skb_push(skb, -off);
4570
4571                skb->mac_header += off;
4572                skb_reset_network_header(skb);
4573        }
4574
4575        /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4576         * pckt.
4577         */
4578        off = orig_data_end - xdp->data_end;
4579        if (off != 0) {
4580                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4581                skb->len -= off;
4582
4583        }
4584
4585        /* check if XDP changed eth hdr such SKB needs update */
4586        eth = (struct ethhdr *)xdp->data;
4587        if ((orig_eth_type != eth->h_proto) ||
4588            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4589                __skb_push(skb, ETH_HLEN);
4590                skb->protocol = eth_type_trans(skb, skb->dev);
4591        }
4592
4593        switch (act) {
4594        case XDP_REDIRECT:
4595        case XDP_TX:
4596                __skb_push(skb, mac_len);
4597                break;
4598        case XDP_PASS:
4599                metalen = xdp->data - xdp->data_meta;
4600                if (metalen)
4601                        skb_metadata_set(skb, metalen);
4602                break;
4603        default:
4604                bpf_warn_invalid_xdp_action(act);
4605                /* fall through */
4606        case XDP_ABORTED:
4607                trace_xdp_exception(skb->dev, xdp_prog, act);
4608                /* fall through */
4609        case XDP_DROP:
4610        do_drop:
4611                kfree_skb(skb);
4612                break;
4613        }
4614
4615        return act;
4616}
4617
4618/* When doing generic XDP we have to bypass the qdisc layer and the
4619 * network taps in order to match in-driver-XDP behavior.
4620 */
4621void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4622{
4623        struct net_device *dev = skb->dev;
4624        struct netdev_queue *txq;
4625        bool free_skb = true;
4626        int cpu, rc;
4627
4628        txq = netdev_core_pick_tx(dev, skb, NULL);
4629        cpu = smp_processor_id();
4630        HARD_TX_LOCK(dev, txq, cpu);
4631        if (!netif_xmit_stopped(txq)) {
4632                rc = netdev_start_xmit(skb, dev, txq, 0);
4633                if (dev_xmit_complete(rc))
4634                        free_skb = false;
4635        }
4636        HARD_TX_UNLOCK(dev, txq);
4637        if (free_skb) {
4638                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4639                kfree_skb(skb);
4640        }
4641}
4642
4643static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4644
4645int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4646{
4647        if (xdp_prog) {
4648                struct xdp_buff xdp;
4649                u32 act;
4650                int err;
4651
4652                act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4653                if (act != XDP_PASS) {
4654                        switch (act) {
4655                        case XDP_REDIRECT:
4656                                err = xdp_do_generic_redirect(skb->dev, skb,
4657                                                              &xdp, xdp_prog);
4658                                if (err)
4659                                        goto out_redir;
4660                                break;
4661                        case XDP_TX:
4662                                generic_xdp_tx(skb, xdp_prog);
4663                                break;
4664                        }
4665                        return XDP_DROP;
4666                }
4667        }
4668        return XDP_PASS;
4669out_redir:
4670        kfree_skb(skb);
4671        return XDP_DROP;
4672}
4673EXPORT_SYMBOL_GPL(do_xdp_generic);
4674
4675static int netif_rx_internal(struct sk_buff *skb)
4676{
4677        int ret;
4678
4679        net_timestamp_check(netdev_tstamp_prequeue, skb);
4680
4681        trace_netif_rx(skb);
4682
4683#ifdef CONFIG_RPS
4684        if (static_branch_unlikely(&rps_needed)) {
4685                struct rps_dev_flow voidflow, *rflow = &voidflow;
4686                int cpu;
4687
4688                preempt_disable();
4689                rcu_read_lock();
4690
4691                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4692                if (cpu < 0)
4693                        cpu = smp_processor_id();
4694
4695                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4696
4697                rcu_read_unlock();
4698                preempt_enable();
4699        } else
4700#endif
4701        {
4702                unsigned int qtail;
4703
4704                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4705                put_cpu();
4706        }
4707        return ret;
4708}
4709
4710/**
4711 *      netif_rx        -       post buffer to the network code
4712 *      @skb: buffer to post
4713 *
4714 *      This function receives a packet from a device driver and queues it for
4715 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4716 *      may be dropped during processing for congestion control or by the
4717 *      protocol layers.
4718 *
4719 *      return values:
4720 *      NET_RX_SUCCESS  (no congestion)
4721 *      NET_RX_DROP     (packet was dropped)
4722 *
4723 */
4724
4725int netif_rx(struct sk_buff *skb)
4726{
4727        int ret;
4728
4729        trace_netif_rx_entry(skb);
4730
4731        ret = netif_rx_internal(skb);
4732        trace_netif_rx_exit(ret);
4733
4734        return ret;
4735}
4736EXPORT_SYMBOL(netif_rx);
4737
4738int netif_rx_ni(struct sk_buff *skb)
4739{
4740        int err;
4741
4742        trace_netif_rx_ni_entry(skb);
4743
4744        preempt_disable();
4745        err = netif_rx_internal(skb);
4746        if (local_softirq_pending())
4747                do_softirq();
4748        preempt_enable();
4749        trace_netif_rx_ni_exit(err);
4750
4751        return err;
4752}
4753EXPORT_SYMBOL(netif_rx_ni);
4754
4755static __latent_entropy void net_tx_action(struct softirq_action *h)
4756{
4757        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4758
4759        if (sd->completion_queue) {
4760                struct sk_buff *clist;
4761
4762                local_irq_disable();
4763                clist = sd->completion_queue;
4764                sd->completion_queue = NULL;
4765                local_irq_enable();
4766
4767                while (clist) {
4768                        struct sk_buff *skb = clist;
4769
4770                        clist = clist->next;
4771
4772                        WARN_ON(refcount_read(&skb->users));
4773                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4774                                trace_consume_skb(skb);
4775                        else
4776                                trace_kfree_skb(skb, net_tx_action);
4777
4778                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4779                                __kfree_skb(skb);
4780                        else
4781                                __kfree_skb_defer(skb);
4782                }
4783
4784                __kfree_skb_flush();
4785        }
4786
4787        if (sd->output_queue) {
4788                struct Qdisc *head;
4789
4790                local_irq_disable();
4791                head = sd->output_queue;
4792                sd->output_queue = NULL;
4793                sd->output_queue_tailp = &sd->output_queue;
4794                local_irq_enable();
4795
4796                while (head) {
4797                        struct Qdisc *q = head;
4798                        spinlock_t *root_lock = NULL;
4799
4800                        head = head->next_sched;
4801
4802                        if (!(q->flags & TCQ_F_NOLOCK)) {
4803                                root_lock = qdisc_lock(q);
4804                                spin_lock(root_lock);
4805                        }
4806                        /* We need to make sure head->next_sched is read
4807                         * before clearing __QDISC_STATE_SCHED
4808                         */
4809                        smp_mb__before_atomic();
4810                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4811                        qdisc_run(q);
4812                        if (root_lock)
4813                                spin_unlock(root_lock);
4814                }
4815        }
4816
4817        xfrm_dev_backlog(sd);
4818}
4819
4820#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4821/* This hook is defined here for ATM LANE */
4822int (*br_fdb_test_addr_hook)(struct net_device *dev,
4823                             unsigned char *addr) __read_mostly;
4824EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4825#endif
4826
4827static inline struct sk_buff *
4828sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4829                   struct net_device *orig_dev)
4830{
4831#ifdef CONFIG_NET_CLS_ACT
4832        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4833        struct tcf_result cl_res;
4834
4835        /* If there's at least one ingress present somewhere (so
4836         * we get here via enabled static key), remaining devices
4837         * that are not configured with an ingress qdisc will bail
4838         * out here.
4839         */
4840        if (!miniq)
4841                return skb;
4842
4843        if (*pt_prev) {
4844                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4845                *pt_prev = NULL;
4846        }
4847
4848        qdisc_skb_cb(skb)->pkt_len = skb->len;
4849        skb->tc_at_ingress = 1;
4850        mini_qdisc_bstats_cpu_update(miniq, skb);
4851
4852        switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
4853                                     &cl_res, false)) {
4854        case TC_ACT_OK:
4855        case TC_ACT_RECLASSIFY:
4856                skb->tc_index = TC_H_MIN(cl_res.classid);
4857                break;
4858        case TC_ACT_SHOT:
4859                mini_qdisc_qstats_cpu_drop(miniq);
4860                kfree_skb(skb);
4861                return NULL;
4862        case TC_ACT_STOLEN:
4863        case TC_ACT_QUEUED:
4864        case TC_ACT_TRAP:
4865                consume_skb(skb);
4866                return NULL;
4867        case TC_ACT_REDIRECT:
4868                /* skb_mac_header check was done by cls/act_bpf, so
4869                 * we can safely push the L2 header back before
4870                 * redirecting to another netdev
4871                 */
4872                __skb_push(skb, skb->mac_len);
4873                skb_do_redirect(skb);
4874                return NULL;
4875        case TC_ACT_CONSUMED:
4876                return NULL;
4877        default:
4878                break;
4879        }
4880#endif /* CONFIG_NET_CLS_ACT */
4881        return skb;
4882}
4883
4884/**
4885 *      netdev_is_rx_handler_busy - check if receive handler is registered
4886 *      @dev: device to check
4887 *
4888 *      Check if a receive handler is already registered for a given device.
4889 *      Return true if there one.
4890 *
4891 *      The caller must hold the rtnl_mutex.
4892 */
4893bool netdev_is_rx_handler_busy(struct net_device *dev)
4894{
4895        ASSERT_RTNL();
4896        return dev && rtnl_dereference(dev->rx_handler);
4897}
4898EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4899
4900/**
4901 *      netdev_rx_handler_register - register receive handler
4902 *      @dev: device to register a handler for
4903 *      @rx_handler: receive handler to register
4904 *      @rx_handler_data: data pointer that is used by rx handler
4905 *
4906 *      Register a receive handler for a device. This handler will then be
4907 *      called from __netif_receive_skb. A negative errno code is returned
4908 *      on a failure.
4909 *
4910 *      The caller must hold the rtnl_mutex.
4911 *
4912 *      For a general description of rx_handler, see enum rx_handler_result.
4913 */
4914int netdev_rx_handler_register(struct net_device *dev,
4915                               rx_handler_func_t *rx_handler,
4916                               void *rx_handler_data)
4917{
4918        if (netdev_is_rx_handler_busy(dev))
4919                return -EBUSY;
4920
4921        if (dev->priv_flags & IFF_NO_RX_HANDLER)
4922                return -EINVAL;
4923
4924        /* Note: rx_handler_data must be set before rx_handler */
4925        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4926        rcu_assign_pointer(dev->rx_handler, rx_handler);
4927
4928        return 0;
4929}
4930EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4931
4932/**
4933 *      netdev_rx_handler_unregister - unregister receive handler
4934 *      @dev: device to unregister a handler from
4935 *
4936 *      Unregister a receive handler from a device.
4937 *
4938 *      The caller must hold the rtnl_mutex.
4939 */
4940void netdev_rx_handler_unregister(struct net_device *dev)
4941{
4942
4943        ASSERT_RTNL();
4944        RCU_INIT_POINTER(dev->rx_handler, NULL);
4945        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4946         * section has a guarantee to see a non NULL rx_handler_data
4947         * as well.
4948         */
4949        synchronize_net();
4950        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4951}
4952EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4953
4954/*
4955 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4956 * the special handling of PFMEMALLOC skbs.
4957 */
4958static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4959{
4960        switch (skb->protocol) {
4961        case htons(ETH_P_ARP):
4962        case htons(ETH_P_IP):
4963        case htons(ETH_P_IPV6):
4964        case htons(ETH_P_8021Q):
4965        case htons(ETH_P_8021AD):
4966                return true;
4967        default:
4968                return false;
4969        }
4970}
4971
4972static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4973                             int *ret, struct net_device *orig_dev)
4974{
4975        if (nf_hook_ingress_active(skb)) {
4976                int ingress_retval;
4977
4978                if (*pt_prev) {
4979                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4980                        *pt_prev = NULL;
4981                }
4982
4983                rcu_read_lock();
4984                ingress_retval = nf_hook_ingress(skb);
4985                rcu_read_unlock();
4986                return ingress_retval;
4987        }
4988        return 0;
4989}
4990
4991static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
4992                                    struct packet_type **ppt_prev)
4993{
4994        struct packet_type *ptype, *pt_prev;
4995        rx_handler_func_t *rx_handler;
4996        struct sk_buff *skb = *pskb;
4997        struct net_device *orig_dev;
4998        bool deliver_exact = false;
4999        int ret = NET_RX_DROP;
5000        __be16 type;

5001
5002        net_timestamp_check(!netdev_tstamp_prequeue, skb);
5003
5004        trace_netif_receive_skb(skb);
5005
5006        orig_dev = skb->dev;
5007
5008        skb_reset_network_header(skb);
5009        if (!skb_transport_header_was_set(skb))
5010                skb_reset_transport_header(skb);
5011        skb_reset_mac_len(skb);
5012
5013        pt_prev = NULL;
5014
5015another_round:
5016        skb->skb_iif = skb->dev->ifindex;
5017
5018        __this_cpu_inc(softnet_data.processed);
5019
5020        if (static_branch_unlikely(&generic_xdp_needed_key)) {
5021                int ret2;
5022
5023                preempt_disable();
5024                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5025                preempt_enable();
5026
5027                if (ret2 != XDP_PASS) {
5028                        ret = NET_RX_DROP;
5029                        goto out;
5030                }
5031                skb_reset_mac_len(skb);
5032        }
5033
5034        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5035            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5036                skb = skb_vlan_untag(skb);
5037                if (unlikely(!skb))
5038                        goto out;
5039        }
5040
5041        if (skb_skip_tc_classify(skb))
5042                goto skip_classify;
5043
5044        if (pfmemalloc)
5045                goto skip_taps;
5046
5047        list_for_each_entry_rcu(ptype, &ptype_all, list) {
5048                if (pt_prev)
5049                        ret = deliver_skb(skb, pt_prev, orig_dev);
5050                pt_prev = ptype;
5051        }
5052
5053        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5054                if (pt_prev)
5055                        ret = deliver_skb(skb, pt_prev, orig_dev);
5056                pt_prev = ptype;
5057        }
5058
5059skip_taps:
5060#ifdef CONFIG_NET_INGRESS
5061        if (static_branch_unlikely(&ingress_needed_key)) {
5062                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5063                if (!skb)
5064                        goto out;
5065
5066                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5067                        goto out;
5068        }
5069#endif
5070        skb_reset_redirect(skb);
5071skip_classify:
5072        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5073                goto drop;
5074
5075        if (skb_vlan_tag_present(skb)) {
5076                if (pt_prev) {
5077                        ret = deliver_skb(skb, pt_prev, orig_dev);
5078                        pt_prev = NULL;
5079                }
5080                if (vlan_do_receive(&skb))
5081                        goto another_round;
5082                else if (unlikely(!skb))
5083                        goto out;
5084        }
5085
5086        rx_handler = rcu_dereference(skb->dev->rx_handler);
5087        if (rx_handler) {
5088                if (pt_prev) {
5089                        ret = deliver_skb(skb, pt_prev, orig_dev);
5090                        pt_prev = NULL;
5091                }
5092                switch (rx_handler(&skb)) {
5093                case RX_HANDLER_CONSUMED:
5094                        ret = NET_RX_SUCCESS;
5095                        goto out;
5096                case RX_HANDLER_ANOTHER:
5097                        goto another_round;
5098                case RX_HANDLER_EXACT:
5099                        deliver_exact = true;
5100                case RX_HANDLER_PASS:
5101                        break;
5102                default:
5103                        BUG();
5104                }
5105        }
5106
5107        if (unlikely(skb_vlan_tag_present(skb))) {
5108check_vlan_id:
5109                if (skb_vlan_tag_get_id(skb)) {
5110                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
5111                         * find vlan device.
5112                         */
5113                        skb->pkt_type = PACKET_OTHERHOST;
5114                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5115                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5116                        /* Outer header is 802.1P with vlan 0, inner header is
5117                         * 802.1Q or 802.1AD and vlan_do_receive() above could
5118                         * not find vlan dev for vlan id 0.
5119                         */
5120                        __vlan_hwaccel_clear_tag(skb);
5121                        skb = skb_vlan_untag(skb);
5122                        if (unlikely(!skb))
5123                                goto out;
5124                        if (vlan_do_receive(&skb))
5125                                /* After stripping off 802.1P header with vlan 0
5126                                 * vlan dev is found for inner header.
5127                                 */
5128                                goto another_round;
5129                        else if (unlikely(!skb))
5130                                goto out;
5131                        else
5132                                /* We have stripped outer 802.1P vlan 0 header.
5133                                 * But could not find vlan dev.
5134                                 * check again for vlan id to set OTHERHOST.
5135                                 */
5136                                goto check_vlan_id;
5137                }
5138                /* Note: we might in the future use prio bits
5139                 * and set skb->priority like in vlan_do_receive()
5140                 * For the time being, just ignore Priority Code Point
5141                 */
5142                __vlan_hwaccel_clear_tag(skb);
5143        }
5144
5145        type = skb->protocol;
5146
5147        /* deliver only exact match when indicated */
5148        if (likely(!deliver_exact)) {
5149                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5150                                       &ptype_base[ntohs(type) &
5151                                                   PTYPE_HASH_MASK]);
5152        }
5153
5154        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5155                               &orig_dev->ptype_specific);
5156
5157        if (unlikely(skb->dev != orig_dev)) {
5158                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5159                                       &skb->dev->ptype_specific);
5160        }
5161
5162        if (pt_prev) {
5163                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5164                        goto drop;
5165                *ppt_prev = pt_prev;
5166        } else {
5167drop:
5168                if (!deliver_exact)
5169                        atomic_long_inc(&skb->dev->rx_dropped);
5170                else
5171                        atomic_long_inc(&skb->dev->rx_nohandler);
5172                kfree_skb(skb);
5173                /* Jamal, now you will not able to escape explaining
5174                 * me how you were going to use this. :-)
5175                 */
5176                ret = NET_RX_DROP;
5177        }
5178
5179out:
5180        /* The invariant here is that if *ppt_prev is not NULL
5181         * then skb should also be non-NULL.
5182         *
5183         * Apparently *ppt_prev assignment above holds this invariant due to
5184         * skb dereferencing near it.
5185         */
5186        *pskb = skb;
5187        return ret;
5188}
5189
5190static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5191{
5192        struct net_device *orig_dev = skb->dev;
5193        struct packet_type *pt_prev = NULL;
5194        int ret;
5195
5196        ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5197        if (pt_prev)
5198                ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5199                                         skb->dev, pt_prev, orig_dev);
5200        return ret;
5201}
5202
5203/**
5204 *      netif_receive_skb_core - special purpose version of netif_receive_skb
5205 *      @skb: buffer to process
5206 *
5207 *      More direct receive version of netif_receive_skb().  It should
5208 *      only be used by callers that have a need to skip RPS and Generic XDP.
5209 *      Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5210 *
5211 *      This function may only be called from softirq context and interrupts
5212 *      should be enabled.
5213 *
5214 *      Return values (usually ignored):
5215 *      NET_RX_SUCCESS: no congestion
5216 *      NET_RX_DROP: packet was dropped
5217 */
5218int netif_receive_skb_core(struct sk_buff *skb)
5219{
5220        int ret;
5221
5222        rcu_read_lock();
5223        ret = __netif_receive_skb_one_core(skb, false);
5224        rcu_read_unlock();
5225
5226        return ret;
5227}
5228EXPORT_SYMBOL(netif_receive_skb_core);
5229
5230static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5231                                                  struct packet_type *pt_prev,
5232                                                  struct net_device *orig_dev)
5233{
5234        struct sk_buff *skb, *next;
5235
5236        if (!pt_prev)
5237                return;
5238        if (list_empty(head))
5239                return;
5240        if (pt_prev->list_func != NULL)
5241                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5242                                   ip_list_rcv, head, pt_prev, orig_dev);
5243        else
5244                list_for_each_entry_safe(skb, next, head, list) {
5245                        skb_list_del_init(skb);
5246                        pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5247                }
5248}
5249
5250static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5251{
5252        /* Fast-path assumptions:
5253         * - There is no RX handler.
5254         * - Only one packet_type matches.
5255         * If either of these fails, we will end up doing some per-packet
5256         * processing in-line, then handling the 'last ptype' for the whole
5257         * sublist.  This can't cause out-of-order delivery to any single ptype,
5258         * because the 'last ptype' must be constant across the sublist, and all
5259         * other ptypes are handled per-packet.
5260         */
5261        /* Current (common) ptype of sublist */
5262        struct packet_type *pt_curr = NULL;
5263        /* Current (common) orig_dev of sublist */
5264        struct net_device *od_curr = NULL;
5265        struct list_head sublist;
5266        struct sk_buff *skb, *next;
5267
5268        INIT_LIST_HEAD(&sublist);
5269        list_for_each_entry_safe(skb, next, head, list) {
5270                struct net_device *orig_dev = skb->dev;
5271                struct packet_type *pt_prev = NULL;
5272
5273                skb_list_del_init(skb);
5274                __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5275                if (!pt_prev)
5276                        continue;
5277                if (pt_curr != pt_prev || od_curr != orig_dev) {
5278                        /* dispatch old sublist */
5279                        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5280                        /* start new sublist */
5281                        INIT_LIST_HEAD(&sublist);
5282                        pt_curr = pt_prev;
5283                        od_curr = orig_dev;
5284                }
5285                list_add_tail(&skb->list, &sublist);
5286        }
5287
5288        /* dispatch final sublist */
5289        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5290}
5291
5292static int __netif_receive_skb(struct sk_buff *skb)
5293{
5294        int ret;
5295
5296        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5297                unsigned int noreclaim_flag;
5298
5299                /*
5300                 * PFMEMALLOC skbs are special, they should
5301                 * - be delivered to SOCK_MEMALLOC sockets only
5302                 * - stay away from userspace
5303                 * - have bounded memory usage
5304                 *
5305                 * Use PF_MEMALLOC as this saves us from propagating the allocation
5306                 * context down to all allocation sites.
5307                 */
5308                noreclaim_flag = memalloc_noreclaim_save();
5309                ret = __netif_receive_skb_one_core(skb, true);
5310                memalloc_noreclaim_restore(noreclaim_flag);
5311        } else
5312                ret = __netif_receive_skb_one_core(skb, false);
5313
5314        return ret;
5315}
5316
5317static void __netif_receive_skb_list(struct list_head *head)
5318{
5319        unsigned long noreclaim_flag = 0;
5320        struct sk_buff *skb, *next;
5321        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5322
5323        list_for_each_entry_safe(skb, next, head, list) {
5324                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5325                        struct list_head sublist;
5326
5327                        /* Handle the previous sublist */
5328                        list_cut_before(&sublist, head, &skb->list);
5329                        if (!list_empty(&sublist))
5330                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
5331                        pfmemalloc = !pfmemalloc;
5332                        /* See comments in __netif_receive_skb */
5333                        if (pfmemalloc)
5334                                noreclaim_flag = memalloc_noreclaim_save();
5335                        else
5336                                memalloc_noreclaim_restore(noreclaim_flag);
5337                }
5338        }
5339        /* Handle the remaining sublist */
5340        if (!list_empty(head))
5341                __netif_receive_skb_list_core(head, pfmemalloc);
5342        /* Restore pflags */
5343        if (pfmemalloc)
5344                memalloc_noreclaim_restore(noreclaim_flag);
5345}
5346
5347static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5348{
5349        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5350        struct bpf_prog *new = xdp->prog;
5351        int ret = 0;
5352
5353        switch (xdp->command) {
5354        case XDP_SETUP_PROG:
5355                rcu_assign_pointer(dev->xdp_prog, new);
5356                if (old)
5357                        bpf_prog_put(old);
5358
5359                if (old && !new) {
5360                        static_branch_dec(&generic_xdp_needed_key);
5361                } else if (new && !old) {
5362                        static_branch_inc(&generic_xdp_needed_key);
5363                        dev_disable_lro(dev);
5364                        dev_disable_gro_hw(dev);
5365                }
5366                break;
5367
5368        case XDP_QUERY_PROG:
5369                xdp->prog_id = old ? old->aux->id : 0;
5370                break;
5371
5372        default:
5373                ret = -EINVAL;
5374                break;
5375        }
5376
5377        return ret;
5378}
5379
5380static int netif_receive_skb_internal(struct sk_buff *skb)
5381{
5382        int ret;
5383
5384        net_timestamp_check(netdev_tstamp_prequeue, skb);
5385
5386        if (skb_defer_rx_timestamp(skb))
5387                return NET_RX_SUCCESS;
5388
5389        rcu_read_lock();
5390#ifdef CONFIG_RPS
5391        if (static_branch_unlikely(&rps_needed)) {
5392                struct rps_dev_flow voidflow, *rflow = &voidflow;
5393                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5394
5395                if (cpu >= 0) {
5396                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5397                        rcu_read_unlock();
5398                        return ret;
5399                }
5400        }
5401#endif
5402        ret = __netif_receive_skb(skb);
5403        rcu_read_unlock();
5404        return ret;
5405}
5406
5407static void netif_receive_skb_list_internal(struct list_head *head)
5408{
5409        struct sk_buff *skb, *next;
5410        struct list_head sublist;
5411
5412        INIT_LIST_HEAD(&sublist);
5413        list_for_each_entry_safe(skb, next, head, list) {
5414                net_timestamp_check(netdev_tstamp_prequeue, skb);
5415                skb_list_del_init(skb);
5416                if (!skb_defer_rx_timestamp(skb))
5417                        list_add_tail(&skb->list, &sublist);
5418        }
5419        list_splice_init(&sublist, head);
5420
5421        rcu_read_lock();
5422#ifdef CONFIG_RPS
5423        if (static_branch_unlikely(&rps_needed)) {
5424                list_for_each_entry_safe(skb, next, head, list) {
5425                        struct rps_dev_flow voidflow, *rflow = &voidflow;
5426                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5427
5428                        if (cpu >= 0) {
5429                                /* Will be handled, remove from list */
5430                                skb_list_del_init(skb);
5431                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5432                        }
5433                }
5434        }
5435#endif
5436        __netif_receive_skb_list(head);
5437        rcu_read_unlock();
5438}
5439
5440/**
5441 *      netif_receive_skb - process receive buffer from network
5442 *      @skb: buffer to process
5443 *
5444 *      netif_receive_skb() is the main receive data processing function.
5445 *      It always succeeds. The buffer may be dropped during processing
5446 *      for congestion control or by the protocol layers.
5447 *
5448 *      This function may only be called from softirq context and interrupts
5449 *      should be enabled.
5450 *
5451 *      Return values (usually ignored):
5452 *      NET_RX_SUCCESS: no congestion
5453 *      NET_RX_DROP: packet was dropped
5454 */
5455int netif_receive_skb(struct sk_buff *skb)
5456{
5457        int ret;
5458
5459        trace_netif_receive_skb_entry(skb);
5460
5461        ret = netif_receive_skb_internal(skb);
5462        trace_netif_receive_skb_exit(ret);
5463
5464        return ret;
5465}
5466EXPORT_SYMBOL(netif_receive_skb);
5467
5468/**
5469 *      netif_receive_skb_list - process many receive buffers from network
5470 *      @head: list of skbs to process.
5471 *
5472 *      Since return value of netif_receive_skb() is normally ignored, and
5473 *      wouldn't be meaningful for a list, this function returns void.
5474 *
5475 *      This function may only be called from softirq context and interrupts
5476 *      should be enabled.
5477 */
5478void netif_receive_skb_list(struct list_head *head)
5479{
5480        struct sk_buff *skb;
5481
5482        if (list_empty(head))
5483                return;
5484        if (trace_netif_receive_skb_list_entry_enabled()) {
5485                list_for_each_entry(skb, head, list)
5486                        trace_netif_receive_skb_list_entry(skb);
5487        }
5488        netif_receive_skb_list_internal(head);
5489        trace_netif_receive_skb_list_exit(0);
5490}
5491EXPORT_SYMBOL(netif_receive_skb_list);
5492
5493DEFINE_PER_CPU(struct work_struct, flush_works);
5494
5495/* Network device is going away, flush any packets still pending */
5496static void flush_backlog(struct work_struct *work)
5497{
5498        struct sk_buff *skb, *tmp;
5499        struct softnet_data *sd;
5500
5501        local_bh_disable();
5502        sd = this_cpu_ptr(&softnet_data);
5503
5504        local_irq_disable();
5505        rps_lock(sd);
5506        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5507                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5508                        __skb_unlink(skb, &sd->input_pkt_queue);
5509                        kfree_skb(skb);
5510                        input_queue_head_incr(sd);
5511                }
5512        }
5513        rps_unlock(sd);
5514        local_irq_enable();
5515
5516        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5517                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5518                        __skb_unlink(skb, &sd->process_queue);
5519                        kfree_skb(skb);
5520                        input_queue_head_incr(sd);
5521                }
5522        }
5523        local_bh_enable();
5524}
5525
5526static void flush_all_backlogs(void)
5527{
5528        unsigned int cpu;
5529
5530        get_online_cpus();
5531
5532        for_each_online_cpu(cpu)
5533                queue_work_on(cpu, system_highpri_wq,
5534                              per_cpu_ptr(&flush_works, cpu));
5535
5536        for_each_online_cpu(cpu)
5537                flush_work(per_cpu_ptr(&flush_works, cpu));
5538
5539        put_online_cpus();
5540}
5541
5542/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5543static void gro_normal_list(struct napi_struct *napi)
5544{
5545        if (!napi->rx_count)
5546                return;
5547        netif_receive_skb_list_internal(&napi->rx_list);
5548        INIT_LIST_HEAD(&napi->rx_list);
5549        napi->rx_count = 0;
5550}
5551
5552/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5553 * pass the whole batch up to the stack.
5554 */
5555static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5556{
5557        list_add_tail(&skb->list, &napi->rx_list);
5558        if (++napi->rx_count >= gro_normal_batch)
5559                gro_normal_list(napi);
5560}
5561
5562INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5563INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5564static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5565{
5566        struct packet_offload *ptype;
5567        __be16 type = skb->protocol;
5568        struct list_head *head = &offload_base;
5569        int err = -ENOENT;
5570
5571        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5572
5573        if (NAPI_GRO_CB(skb)->count == 1) {
5574                skb_shinfo(skb)->gso_size = 0;
5575                goto out;
5576        }
5577
5578        rcu_read_lock();
5579        list_for_each_entry_rcu(ptype, head, list) {
5580                if (ptype->type != type || !ptype->callbacks.gro_complete)
5581                        continue;
5582
5583                err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5584                                         ipv6_gro_complete, inet_gro_complete,
5585                                         skb, 0);
5586                break;
5587        }
5588        rcu_read_unlock();
5589
5590        if (err) {
5591                WARN_ON(&ptype->list == head);
5592                kfree_skb(skb);
5593                return NET_RX_SUCCESS;
5594        }
5595
5596out:
5597        gro_normal_one(napi, skb);
5598        return NET_RX_SUCCESS;
5599}
5600
5601static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5602                                   bool flush_old)
5603{
5604        struct list_head *head = &napi->gro_hash[index].list;
5605        struct sk_buff *skb, *p;
5606
5607        list_for_each_entry_safe_reverse(skb, p, head, list) {
5608                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5609                        return;
5610                skb_list_del_init(skb);
5611                napi_gro_complete(napi, skb);
5612                napi->gro_hash[index].count--;
5613        }
5614
5615        if (!napi->gro_hash[index].count)
5616                __clear_bit(index, &napi->gro_bitmask);
5617}
5618
5619/* napi->gro_hash[].list contains packets ordered by age.
5620 * youngest packets at the head of it.
5621 * Complete skbs in reverse order to reduce latencies.
5622 */
5623void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5624{
5625        unsigned long bitmask = napi->gro_bitmask;
5626        unsigned int i, base = ~0U;
5627
5628        while ((i = ffs(bitmask)) != 0) {
5629                bitmask >>= i;
5630                base += i;
5631                __napi_gro_flush_chain(napi, base, flush_old);
5632        }
5633}
5634EXPORT_SYMBOL(napi_gro_flush);
5635
5636static struct list_head *gro_list_prepare(struct napi_struct *napi,
5637                                          struct sk_buff *skb)
5638{
5639        unsigned int maclen = skb->dev->hard_header_len;
5640        u32 hash = skb_get_hash_raw(skb);
5641        struct list_head *head;
5642        struct sk_buff *p;
5643
5644        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5645        list_for_each_entry(p, head, list) {
5646                unsigned long diffs;
5647
5648                NAPI_GRO_CB(p)->flush = 0;
5649
5650                if (hash != skb_get_hash_raw(p)) {
5651                        NAPI_GRO_CB(p)->same_flow = 0;
5652                        continue;
5653                }
5654
5655                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5656                diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5657                if (skb_vlan_tag_present(p))
5658                        diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5659                diffs |= skb_metadata_dst_cmp(p, skb);
5660                diffs |= skb_metadata_differs(p, skb);
5661                if (maclen == ETH_HLEN)
5662                        diffs |= compare_ether_header(skb_mac_header(p),
5663                                                      skb_mac_header(skb));
5664                else if (!diffs)
5665                        diffs = memcmp(skb_mac_header(p),
5666                                       skb_mac_header(skb),
5667                                       maclen);
5668                NAPI_GRO_CB(p)->same_flow = !diffs;
5669        }
5670
5671        return head;
5672}
5673
5674static void skb_gro_reset_offset(struct sk_buff *skb)
5675{
5676        const struct skb_shared_info *pinfo = skb_shinfo(skb);
5677        const skb_frag_t *frag0 = &pinfo->frags[0];
5678
5679        NAPI_GRO_CB(skb)->data_offset = 0;
5680        NAPI_GRO_CB(skb)->frag0 = NULL;
5681        NAPI_GRO_CB(skb)->frag0_len = 0;
5682
5683        if (!skb_headlen(skb) && pinfo->nr_frags &&
5684            !PageHighMem(skb_frag_page(frag0))) {
5685                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5686                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5687                                                    skb_frag_size(frag0),
5688                                                    skb->end - skb->tail);
5689        }
5690}
5691
5692static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5693{
5694        struct skb_shared_info *pinfo = skb_shinfo(skb);
5695
5696        BUG_ON(skb->end - skb->tail < grow);
5697
5698        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5699
5700        skb->data_len -= grow;
5701        skb->tail += grow;
5702
5703        skb_frag_off_add(&pinfo->frags[0], grow);
5704        skb_frag_size_sub(&pinfo->frags[0], grow);
5705
5706        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5707                skb_frag_unref(skb, 0);
5708                memmove(pinfo->frags, pinfo->frags + 1,
5709                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5710        }
5711}
5712
5713static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5714{
5715        struct sk_buff *oldest;
5716
5717        oldest = list_last_entry(head, struct sk_buff, list);
5718
5719        /* We are called with head length >= MAX_GRO_SKBS, so this is
5720         * impossible.
5721         */
5722        if (WARN_ON_ONCE(!oldest))
5723                return;
5724
5725        /* Do not adjust napi->gro_hash[].count, caller is adding a new
5726         * SKB to the chain.
5727         */
5728        skb_list_del_init(oldest);
5729        napi_gro_complete(napi, oldest);
5730}
5731
5732INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5733                                                           struct sk_buff *));
5734INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5735                                                           struct sk_buff *));
5736static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5737{
5738        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5739        struct list_head *head = &offload_base;
5740        struct packet_offload *ptype;
5741        __be16 type = skb->protocol;
5742        struct list_head *gro_head;
5743        struct sk_buff *pp = NULL;
5744        enum gro_result ret;
5745        int same_flow;
5746        int grow;
5747
5748        if (netif_elide_gro(skb->dev))
5749                goto normal;
5750
5751        gro_head = gro_list_prepare(napi, skb);
5752
5753        rcu_read_lock();
5754        list_for_each_entry_rcu(ptype, head, list) {
5755                if (ptype->type != type || !ptype->callbacks.gro_receive)
5756                        continue;
5757
5758                skb_set_network_header(skb, skb_gro_offset(skb));
5759                skb_reset_mac_len(skb);
5760                NAPI_GRO_CB(skb)->same_flow = 0;
5761                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5762                NAPI_GRO_CB(skb)->free = 0;
5763                NAPI_GRO_CB(skb)->encap_mark = 0;
5764                NAPI_GRO_CB(skb)->recursion_counter = 0;
5765                NAPI_GRO_CB(skb)->is_fou = 0;
5766                NAPI_GRO_CB(skb)->is_atomic = 1;
5767                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5768
5769                /* Setup for GRO checksum validation */
5770                switch (skb->ip_summed) {
5771                case CHECKSUM_COMPLETE:
5772                        NAPI_GRO_CB(skb)->csum = skb->csum;
5773                        NAPI_GRO_CB(skb)->csum_valid = 1;
5774                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5775                        break;
5776                case CHECKSUM_UNNECESSARY:
5777                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5778                        NAPI_GRO_CB(skb)->csum_valid = 0;
5779                        break;
5780                default:
5781                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5782                        NAPI_GRO_CB(skb)->csum_valid = 0;
5783                }
5784
5785                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5786                                        ipv6_gro_receive, inet_gro_receive,
5787                                        gro_head, skb);
5788                break;
5789        }
5790        rcu_read_unlock();
5791
5792        if (&ptype->list == head)
5793                goto normal;
5794
5795        if (PTR_ERR(pp) == -EINPROGRESS) {
5796                ret = GRO_CONSUMED;
5797                goto ok;
5798        }
5799
5800        same_flow = NAPI_GRO_CB(skb)->same_flow;
5801        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5802
5803        if (pp) {
5804                skb_list_del_init(pp);
5805                napi_gro_complete(napi, pp);
5806                napi->gro_hash[hash].count--;
5807        }
5808
5809        if (same_flow)
5810                goto ok;
5811
5812        if (NAPI_GRO_CB(skb)->flush)
5813                goto normal;
5814
5815        if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5816                gro_flush_oldest(napi, gro_head);
5817        } else {
5818                napi->gro_hash[hash].count++;
5819        }
5820        NAPI_GRO_CB(skb)->count = 1;
5821        NAPI_GRO_CB(skb)->age = jiffies;
5822        NAPI_GRO_CB(skb)->last = skb;
5823        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5824        list_add(&skb->list, gro_head);
5825        ret = GRO_HELD;
5826
5827pull:
5828        grow = skb_gro_offset(skb) - skb_headlen(skb);
5829        if (grow > 0)
5830                gro_pull_from_frag0(skb, grow);
5831ok:
5832        if (napi->gro_hash[hash].count) {
5833                if (!test_bit(hash, &napi->gro_bitmask))
5834                        __set_bit(hash, &napi->gro_bitmask);
5835        } else if (test_bit(hash, &napi->gro_bitmask)) {
5836                __clear_bit(hash, &napi->gro_bitmask);
5837        }
5838
5839        return ret;
5840
5841normal:
5842        ret = GRO_NORMAL;
5843        goto pull;
5844}
5845
5846struct packet_offload *gro_find_receive_by_type(__be16 type)
5847{
5848        struct list_head *offload_head = &offload_base;
5849        struct packet_offload *ptype;
5850
5851        list_for_each_entry_rcu(ptype, offload_head, list) {
5852                if (ptype->type != type || !ptype->callbacks.gro_receive)
5853                        continue;
5854                return ptype;
5855        }
5856        return NULL;
5857}
5858EXPORT_SYMBOL(gro_find_receive_by_type);
5859
5860struct packet_offload *gro_find_complete_by_type(__be16 type)
5861{
5862        struct list_head *offload_head = &offload_base;
5863        struct packet_offload *ptype;
5864
5865        list_for_each_entry_rcu(ptype, offload_head, list) {
5866                if (ptype->type != type || !ptype->callbacks.gro_complete)
5867                        continue;
5868                return ptype;
5869        }
5870        return NULL;
5871}
5872EXPORT_SYMBOL(gro_find_complete_by_type);
5873
5874static void napi_skb_free_stolen_head(struct sk_buff *skb)
5875{
5876        skb_dst_drop(skb);
5877        skb_ext_put(skb);
5878        kmem_cache_free(skbuff_head_cache, skb);
5879}
5880
5881static gro_result_t napi_skb_finish(struct napi_struct *napi,
5882                                    struct sk_buff *skb,
5883                                    gro_result_t ret)
5884{
5885        switch (ret) {
5886        case GRO_NORMAL:
5887                gro_normal_one(napi, skb);
5888                break;
5889
5890        case GRO_DROP:
5891                kfree_skb(skb);
5892                break;
5893
5894        case GRO_MERGED_FREE:
5895                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5896                        napi_skb_free_stolen_head(skb);
5897                else
5898                        __kfree_skb(skb);
5899                break;
5900
5901        case GRO_HELD:
5902        case GRO_MERGED:
5903        case GRO_CONSUMED:
5904                break;
5905        }
5906
5907        return ret;
5908}
5909
5910gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5911{
5912        gro_result_t ret;
5913
5914        skb_mark_napi_id(skb, napi);
5915        trace_napi_gro_receive_entry(skb);
5916
5917        skb_gro_reset_offset(skb);
5918
5919        ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
5920        trace_napi_gro_receive_exit(ret);
5921
5922        return ret;
5923}
5924EXPORT_SYMBOL(napi_gro_receive);
5925
5926static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5927{
5928        if (unlikely(skb->pfmemalloc)) {
5929                consume_skb(skb);
5930                return;
5931        }
5932        __skb_pull(skb, skb_headlen(skb));
5933        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5934        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5935        __vlan_hwaccel_clear_tag(skb);
5936        skb->dev = napi->dev;
5937        skb->skb_iif = 0;
5938
5939        /* eth_type_trans() assumes pkt_type is PACKET_HOST */
5940        skb->pkt_type = PACKET_HOST;
5941
5942        skb->encapsulation = 0;
5943        skb_shinfo(skb)->gso_type = 0;
5944        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5945        skb_ext_reset(skb);
5946
5947        napi->skb = skb;
5948}
5949
5950struct sk_buff *napi_get_frags(struct napi_struct *napi)
5951{
5952        struct sk_buff *skb = napi->skb;
5953
5954        if (!skb) {
5955                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5956                if (skb) {
5957                        napi->skb = skb;
5958                        skb_mark_napi_id(skb, napi);
5959                }
5960        }
5961        return skb;
5962}
5963EXPORT_SYMBOL(napi_get_frags);
5964
5965static gro_result_t napi_frags_finish(struct napi_struct *napi,
5966                                      struct sk_buff *skb,
5967                                      gro_result_t ret)
5968{
5969        switch (ret) {
5970        case GRO_NORMAL:
5971        case GRO_HELD:
5972                __skb_push(skb, ETH_HLEN);
5973                skb->protocol = eth_type_trans(skb, skb->dev);
5974                if (ret == GRO_NORMAL)
5975                        gro_normal_one(napi, skb);
5976                break;
5977
5978        case GRO_DROP:
5979                napi_reuse_skb(napi, skb);
5980                break;
5981
5982        case GRO_MERGED_FREE:
5983                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5984                        napi_skb_free_stolen_head(skb);
5985                else
5986                        napi_reuse_skb(napi, skb);
5987                break;
5988
5989        case GRO_MERGED:
5990        case GRO_CONSUMED:
5991                break;
5992        }
5993
5994        return ret;
5995}
5996
5997/* Upper GRO stack assumes network header starts at gro_offset=0
5998 * Drivers could call both napi_gro_frags() and napi_gro_receive()
5999 * We copy ethernet header into skb->data to have a common layout.
6000 */

6001static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6002{
6003        struct sk_buff *skb = napi->skb;
6004        const struct ethhdr *eth;
6005        unsigned int hlen = sizeof(*eth);
6006
6007        napi->skb = NULL;
6008
6009        skb_reset_mac_header(skb);
6010        skb_gro_reset_offset(skb);
6011
6012        if (unlikely(skb_gro_header_hard(skb, hlen))) {
6013                eth = skb_gro_header_slow(skb, hlen, 0);
6014                if (unlikely(!eth)) {
6015                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6016                                             __func__, napi->dev->name);
6017                        napi_reuse_skb(napi, skb);
6018                        return NULL;
6019                }
6020        } else {
6021                eth = (const struct ethhdr *)skb->data;
6022                gro_pull_from_frag0(skb, hlen);
6023                NAPI_GRO_CB(skb)->frag0 += hlen;
6024                NAPI_GRO_CB(skb)->frag0_len -= hlen;
6025        }
6026        __skb_pull(skb, hlen);
6027
6028        /*
6029         * This works because the only protocols we care about don't require
6030         * special handling.
6031         * We'll fix it up properly in napi_frags_finish()
6032         */
6033        skb->protocol = eth->h_proto;
6034
6035        return skb;
6036}
6037
6038gro_result_t napi_gro_frags(struct napi_struct *napi)
6039{
6040        gro_result_t ret;
6041        struct sk_buff *skb = napi_frags_skb(napi);
6042
6043        if (!skb)
6044                return GRO_DROP;
6045
6046        trace_napi_gro_frags_entry(skb);
6047
6048        ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6049        trace_napi_gro_frags_exit(ret);
6050
6051        return ret;
6052}
6053EXPORT_SYMBOL(napi_gro_frags);
6054
6055/* Compute the checksum from gro_offset and return the folded value
6056 * after adding in any pseudo checksum.
6057 */
6058__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6059{
6060        __wsum wsum;
6061        __sum16 sum;
6062
6063        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6064
6065        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6066        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6067        /* See comments in __skb_checksum_complete(). */
6068        if (likely(!sum)) {
6069                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6070                    !skb->csum_complete_sw)
6071                        netdev_rx_csum_fault(skb->dev, skb);
6072        }
6073
6074        NAPI_GRO_CB(skb)->csum = wsum;
6075        NAPI_GRO_CB(skb)->csum_valid = 1;
6076
6077        return sum;
6078}
6079EXPORT_SYMBOL(__skb_gro_checksum_complete);
6080
6081static void net_rps_send_ipi(struct softnet_data *remsd)
6082{
6083#ifdef CONFIG_RPS
6084        while (remsd) {
6085                struct softnet_data *next = remsd->rps_ipi_next;
6086
6087                if (cpu_online(remsd->cpu))
6088                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
6089                remsd = next;
6090        }
6091#endif
6092}
6093
6094/*
6095 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
6096 * Note: called with local irq disabled, but exits with local irq enabled.
6097 */
6098static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6099{
6100#ifdef CONFIG_RPS
6101        struct softnet_data *remsd = sd->rps_ipi_list;
6102
6103        if (remsd) {
6104                sd->rps_ipi_list = NULL;
6105
6106                local_irq_enable();
6107
6108                /* Send pending IPI's to kick RPS processing on remote cpus. */
6109                net_rps_send_ipi(remsd);
6110        } else
6111#endif
6112                local_irq_enable();
6113}
6114
6115static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6116{
6117#ifdef CONFIG_RPS
6118        return sd->rps_ipi_list != NULL;
6119#else
6120        return false;
6121#endif
6122}
6123
6124static int process_backlog(struct napi_struct *napi, int quota)
6125{
6126        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6127        bool again = true;
6128        int work = 0;
6129
6130        /* Check if we have pending ipi, its better to send them now,
6131         * not waiting net_rx_action() end.
6132         */
6133        if (sd_has_rps_ipi_waiting(sd)) {
6134                local_irq_disable();
6135                net_rps_action_and_irq_enable(sd);
6136        }
6137
6138        napi->weight = dev_rx_weight;
6139        while (again) {
6140                struct sk_buff *skb;
6141
6142                while ((skb = __skb_dequeue(&sd->process_queue))) {
6143                        rcu_read_lock();
6144                        __netif_receive_skb(skb);
6145                        rcu_read_unlock();
6146                        input_queue_head_incr(sd);
6147                        if (++work >= quota)
6148                                return work;
6149
6150                }
6151
6152                local_irq_disable();
6153                rps_lock(sd);
6154                if (skb_queue_empty(&sd->input_pkt_queue)) {
6155                        /*
6156                         * Inline a custom version of __napi_complete().
6157                         * only current cpu owns and manipulates this napi,
6158                         * and NAPI_STATE_SCHED is the only possible flag set
6159                         * on backlog.
6160                         * We can use a plain write instead of clear_bit(),
6161                         * and we dont need an smp_mb() memory barrier.
6162                         */
6163                        napi->state = 0;
6164                        again = false;
6165                } else {
6166                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
6167                                                   &sd->process_queue);
6168                }
6169                rps_unlock(sd);
6170                local_irq_enable();
6171        }
6172
6173        return work;
6174}
6175
6176/**
6177 * __napi_schedule - schedule for receive
6178 * @n: entry to schedule
6179 *
6180 * The entry's receive function will be scheduled to run.
6181 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6182 */
6183void __napi_schedule(struct napi_struct *n)
6184{
6185        unsigned long flags;
6186
6187        local_irq_save(flags);
6188        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6189        local_irq_restore(flags);
6190}
6191EXPORT_SYMBOL(__napi_schedule);
6192
6193/**
6194 *      napi_schedule_prep - check if napi can be scheduled
6195 *      @n: napi context
6196 *
6197 * Test if NAPI routine is already running, and if not mark
6198 * it as running.  This is used as a condition variable
6199 * insure only one NAPI poll instance runs.  We also make
6200 * sure there is no pending NAPI disable.
6201 */
6202bool napi_schedule_prep(struct napi_struct *n)
6203{
6204        unsigned long val, new;
6205
6206        do {
6207                val = READ_ONCE(n->state);
6208                if (unlikely(val & NAPIF_STATE_DISABLE))
6209                        return false;
6210                new = val | NAPIF_STATE_SCHED;
6211
6212                /* Sets STATE_MISSED bit if STATE_SCHED was already set
6213                 * This was suggested by Alexander Duyck, as compiler
6214                 * emits better code than :
6215                 * if (val & NAPIF_STATE_SCHED)
6216                 *     new |= NAPIF_STATE_MISSED;
6217                 */
6218                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6219                                                   NAPIF_STATE_MISSED;
6220        } while (cmpxchg(&n->state, val, new) != val);
6221
6222        return !(val & NAPIF_STATE_SCHED);
6223}
6224EXPORT_SYMBOL(napi_schedule_prep);
6225
6226/**
6227 * __napi_schedule_irqoff - schedule for receive
6228 * @n: entry to schedule
6229 *
6230 * Variant of __napi_schedule() assuming hard irqs are masked
6231 */
6232void __napi_schedule_irqoff(struct napi_struct *n)
6233{
6234        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6235}
6236EXPORT_SYMBOL(__napi_schedule_irqoff);
6237
6238bool napi_complete_done(struct napi_struct *n, int work_done)
6239{
6240        unsigned long flags, val, new;
6241
6242        /*
6243         * 1) Don't let napi dequeue from the cpu poll list
6244         *    just in case its running on a different cpu.
6245         * 2) If we are busy polling, do nothing here, we have
6246         *    the guarantee we will be called later.
6247         */
6248        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6249                                 NAPIF_STATE_IN_BUSY_POLL)))
6250                return false;
6251
6252        if (n->gro_bitmask) {
6253                unsigned long timeout = 0;
6254
6255                if (work_done)
6256                        timeout = n->dev->gro_flush_timeout;
6257
6258                /* When the NAPI instance uses a timeout and keeps postponing
6259                 * it, we need to bound somehow the time packets are kept in
6260                 * the GRO layer
6261                 */
6262                napi_gro_flush(n, !!timeout);
6263                if (timeout)
6264                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
6265                                      HRTIMER_MODE_REL_PINNED);
6266        }
6267
6268        gro_normal_list(n);
6269
6270        if (unlikely(!list_empty(&n->poll_list))) {
6271                /* If n->poll_list is not empty, we need to mask irqs */
6272                local_irq_save(flags);
6273                list_del_init(&n->poll_list);
6274                local_irq_restore(flags);
6275        }
6276
6277        do {
6278                val = READ_ONCE(n->state);
6279
6280                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6281
6282                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6283
6284                /* If STATE_MISSED was set, leave STATE_SCHED set,
6285                 * because we will call napi->poll() one more time.
6286                 * This C code was suggested by Alexander Duyck to help gcc.
6287                 */
6288                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6289                                                    NAPIF_STATE_SCHED;
6290        } while (cmpxchg(&n->state, val, new) != val);
6291
6292        if (unlikely(val & NAPIF_STATE_MISSED)) {
6293                __napi_schedule(n);
6294                return false;
6295        }
6296
6297        return true;
6298}
6299EXPORT_SYMBOL(napi_complete_done);
6300
6301/* must be called under rcu_read_lock(), as we dont take a reference */
6302static struct napi_struct *napi_by_id(unsigned int napi_id)
6303{
6304        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6305        struct napi_struct *napi;
6306
6307        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6308                if (napi->napi_id == napi_id)
6309                        return napi;
6310
6311        return NULL;
6312}
6313
6314#if defined(CONFIG_NET_RX_BUSY_POLL)
6315
6316#define BUSY_POLL_BUDGET 8
6317
6318static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6319{
6320        int rc;
6321
6322        /* Busy polling means there is a high chance device driver hard irq
6323         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6324         * set in napi_schedule_prep().
6325         * Since we are about to call napi->poll() once more, we can safely
6326         * clear NAPI_STATE_MISSED.
6327         *
6328         * Note: x86 could use a single "lock and ..." instruction
6329         * to perform these two clear_bit()
6330         */
6331        clear_bit(NAPI_STATE_MISSED, &napi->state);
6332        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6333
6334        local_bh_disable();
6335
6336        /* All we really want here is to re-enable device interrupts.
6337         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6338         */
6339        rc = napi->poll(napi, BUSY_POLL_BUDGET);
6340        /* We can't gro_normal_list() here, because napi->poll() might have
6341         * rearmed the napi (napi_complete_done()) in which case it could
6342         * already be running on another CPU.
6343         */
6344        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6345        netpoll_poll_unlock(have_poll_lock);
6346        if (rc == BUSY_POLL_BUDGET) {
6347                /* As the whole budget was spent, we still own the napi so can
6348                 * safely handle the rx_list.
6349                 */
6350                gro_normal_list(napi);
6351                __napi_schedule(napi);
6352        }
6353        local_bh_enable();
6354}
6355
6356void napi_busy_loop(unsigned int napi_id,
6357                    bool (*loop_end)(void *, unsigned long),
6358                    void *loop_end_arg)
6359{
6360        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6361        int (*napi_poll)(struct napi_struct *napi, int budget);
6362        void *have_poll_lock = NULL;
6363        struct napi_struct *napi;
6364
6365restart:
6366        napi_poll = NULL;
6367
6368        rcu_read_lock();
6369
6370        napi = napi_by_id(napi_id);
6371        if (!napi)
6372                goto out;
6373
6374        preempt_disable();
6375        for (;;) {
6376                int work = 0;
6377
6378                local_bh_disable();
6379                if (!napi_poll) {
6380                        unsigned long val = READ_ONCE(napi->state);
6381
6382                        /* If multiple threads are competing for this napi,
6383                         * we avoid dirtying napi->state as much as we can.
6384                         */
6385                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6386                                   NAPIF_STATE_IN_BUSY_POLL))
6387                                goto count;
6388                        if (cmpxchg(&napi->state, val,
6389                                    val | NAPIF_STATE_IN_BUSY_POLL |
6390                                          NAPIF_STATE_SCHED) != val)
6391                                goto count;
6392                        have_poll_lock = netpoll_poll_lock(napi);
6393                        napi_poll = napi->poll;
6394                }
6395                work = napi_poll(napi, BUSY_POLL_BUDGET);
6396                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6397                gro_normal_list(napi);
6398count:
6399                if (work > 0)
6400                        __NET_ADD_STATS(dev_net(napi->dev),
6401                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
6402                local_bh_enable();
6403
6404                if (!loop_end || loop_end(loop_end_arg, start_time))
6405                        break;
6406
6407                if (unlikely(need_resched())) {
6408                        if (napi_poll)
6409                                busy_poll_stop(napi, have_poll_lock);
6410                        preempt_enable();
6411                        rcu_read_unlock();
6412                        cond_resched();
6413                        if (loop_end(loop_end_arg, start_time))
6414                                return;
6415                        goto restart;
6416                }
6417                cpu_relax();
6418        }
6419        if (napi_poll)
6420                busy_poll_stop(napi, have_poll_lock);
6421        preempt_enable();
6422out:
6423        rcu_read_unlock();
6424}
6425EXPORT_SYMBOL(napi_busy_loop);
6426
6427#endif /* CONFIG_NET_RX_BUSY_POLL */
6428
6429static void napi_hash_add(struct napi_struct *napi)
6430{
6431        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6432            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6433                return;
6434
6435        spin_lock(&napi_hash_lock);
6436
6437        /* 0..NR_CPUS range is reserved for sender_cpu use */
6438        do {
6439                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6440                        napi_gen_id = MIN_NAPI_ID;
6441        } while (napi_by_id(napi_gen_id));
6442        napi->napi_id = napi_gen_id;
6443
6444        hlist_add_head_rcu(&napi->napi_hash_node,
6445                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6446
6447        spin_unlock(&napi_hash_lock);
6448}
6449
6450/* Warning : caller is responsible to make sure rcu grace period
6451 * is respected before freeing memory containing @napi
6452 */
6453bool napi_hash_del(struct napi_struct *napi)
6454{
6455        bool rcu_sync_needed = false;
6456
6457        spin_lock(&napi_hash_lock);
6458
6459        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6460                rcu_sync_needed = true;
6461                hlist_del_rcu(&napi->napi_hash_node);
6462        }
6463        spin_unlock(&napi_hash_lock);
6464        return rcu_sync_needed;
6465}
6466EXPORT_SYMBOL_GPL(napi_hash_del);
6467
6468static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6469{
6470        struct napi_struct *napi;
6471
6472        napi = container_of(timer, struct napi_struct, timer);
6473
6474        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6475         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6476         */
6477        if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6478            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6479                __napi_schedule_irqoff(napi);
6480
6481        return HRTIMER_NORESTART;
6482}
6483
6484static void init_gro_hash(struct napi_struct *napi)
6485{
6486        int i;
6487
6488        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6489                INIT_LIST_HEAD(&napi->gro_hash[i].list);
6490                napi->gro_hash[i].count = 0;
6491        }
6492        napi->gro_bitmask = 0;
6493}
6494
6495void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6496                    int (*poll)(struct napi_struct *, int), int weight)
6497{
6498        INIT_LIST_HEAD(&napi->poll_list);
6499        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6500        napi->timer.function = napi_watchdog;
6501        init_gro_hash(napi);
6502        napi->skb = NULL;
6503        INIT_LIST_HEAD(&napi->rx_list);
6504        napi->rx_count = 0;
6505        napi->poll = poll;
6506        if (weight > NAPI_POLL_WEIGHT)
6507                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6508                                weight);
6509        napi->weight = weight;
6510        list_add(&napi->dev_list, &dev->napi_list);
6511        napi->dev = dev;
6512#ifdef CONFIG_NETPOLL
6513        napi->poll_owner = -1;
6514#endif
6515        set_bit(NAPI_STATE_SCHED, &napi->state);
6516        napi_hash_add(napi);
6517}
6518EXPORT_SYMBOL(netif_napi_add);
6519
6520void napi_disable(struct napi_struct *n)
6521{
6522        might_sleep();
6523        set_bit(NAPI_STATE_DISABLE, &n->state);
6524
6525        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6526                msleep(1);
6527        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6528                msleep(1);
6529
6530        hrtimer_cancel(&n->timer);
6531
6532        clear_bit(NAPI_STATE_DISABLE, &n->state);
6533}
6534EXPORT_SYMBOL(napi_disable);
6535
6536static void flush_gro_hash(struct napi_struct *napi)
6537{
6538        int i;
6539
6540        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6541                struct sk_buff *skb, *n;
6542
6543                list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6544                        kfree_skb(skb);
6545                napi->gro_hash[i].count = 0;
6546        }
6547}
6548
6549/* Must be called in process context */
6550void netif_napi_del(struct napi_struct *napi)
6551{
6552        might_sleep();
6553        if (napi_hash_del(napi))
6554                synchronize_net();
6555        list_del_init(&napi->dev_list);
6556        napi_free_frags(napi);
6557
6558        flush_gro_hash(napi);
6559        napi->gro_bitmask = 0;
6560}
6561EXPORT_SYMBOL(netif_napi_del);
6562
6563static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6564{
6565        void *have;
6566        int work, weight;
6567
6568        list_del_init(&n->poll_list);
6569
6570        have = netpoll_poll_lock(n);
6571
6572        weight = n->weight;
6573
6574        /* This NAPI_STATE_SCHED test is for avoiding a race
6575         * with netpoll's poll_napi().  Only the entity which
6576         * obtains the lock and sees NAPI_STATE_SCHED set will
6577         * actually make the ->poll() call.  Therefore we avoid
6578         * accidentally calling ->poll() when NAPI is not scheduled.
6579         */
6580        work = 0;
6581        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6582                work = n->poll(n, weight);
6583                trace_napi_poll(n, work, weight);
6584        }
6585
6586        WARN_ON_ONCE(work > weight);
6587
6588        if (likely(work < weight))
6589                goto out_unlock;
6590
6591        /* Drivers must not modify the NAPI state if they
6592         * consume the entire weight.  In such cases this code
6593         * still "owns" the NAPI instance and therefore can
6594         * move the instance around on the list at-will.
6595         */
6596        if (unlikely(napi_disable_pending(n))) {
6597                napi_complete(n);
6598                goto out_unlock;
6599        }
6600
6601        if (n->gro_bitmask) {
6602                /* flush too old packets
6603                 * If HZ < 1000, flush all packets.
6604                 */
6605                napi_gro_flush(n, HZ >= 1000);
6606        }
6607
6608        gro_normal_list(n);
6609
6610        /* Some drivers may have called napi_schedule
6611         * prior to exhausting their budget.
6612         */
6613        if (unlikely(!list_empty(&n->poll_list))) {
6614                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6615                             n->dev ? n->dev->name : "backlog");
6616                goto out_unlock;
6617        }
6618
6619        list_add_tail(&n->poll_list, repoll);
6620
6621out_unlock:
6622        netpoll_poll_unlock(have);
6623
6624        return work;
6625}
6626
6627static __latent_entropy void net_rx_action(struct softirq_action *h)
6628{
6629        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6630        unsigned long time_limit = jiffies +
6631                usecs_to_jiffies(netdev_budget_usecs);
6632        int budget = netdev_budget;
6633        LIST_HEAD(list);
6634        LIST_HEAD(repoll);
6635
6636        local_irq_disable();
6637        list_splice_init(&sd->poll_list, &list);
6638        local_irq_enable();
6639
6640        for (;;) {
6641                struct napi_struct *n;
6642
6643                if (list_empty(&list)) {
6644                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6645                                goto out;
6646                        break;
6647                }
6648
6649                n = list_first_entry(&list, struct napi_struct, poll_list);
6650                budget -= napi_poll(n, &repoll);
6651
6652                /* If softirq window is exhausted then punt.
6653                 * Allow this to run for 2 jiffies since which will allow
6654                 * an average latency of 1.5/HZ.
6655                 */
6656                if (unlikely(budget <= 0 ||
6657                             time_after_eq(jiffies, time_limit))) {
6658                        sd->time_squeeze++;
6659                        break;
6660                }
6661        }
6662
6663        local_irq_disable();
6664
6665        list_splice_tail_init(&sd->poll_list, &list);
6666        list_splice_tail(&repoll, &list);
6667        list_splice(&list, &sd->poll_list);
6668        if (!list_empty(&sd->poll_list))
6669                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6670
6671        net_rps_action_and_irq_enable(sd);
6672out:
6673        __kfree_skb_flush();
6674}
6675
6676struct netdev_adjacent {
6677        struct net_device *dev;
6678
6679        /* upper master flag, there can only be one master device per list */
6680        bool master;
6681
6682        /* lookup ignore flag */
6683        bool ignore;
6684
6685        /* counter for the number of times this device was added to us */
6686        u16 ref_nr;
6687
6688        /* private field for the users */
6689        void *private;
6690
6691        struct list_head list;
6692        struct rcu_head rcu;
6693};
6694
6695static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6696                                                 struct list_head *adj_list)
6697{
6698        struct netdev_adjacent *adj;
6699
6700        list_for_each_entry(adj, adj_list, list) {
6701                if (adj->dev == adj_dev)
6702                        return adj;
6703        }
6704        return NULL;
6705}
6706
6707static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6708{
6709        struct net_device *dev = data;
6710
6711        return upper_dev == dev;
6712}
6713
6714/**
6715 * netdev_has_upper_dev - Check if device is linked to an upper device
6716 * @dev: device
6717 * @upper_dev: upper device to check
6718 *
6719 * Find out if a device is linked to specified upper device and return true
6720 * in case it is. Note that this checks only immediate upper device,
6721 * not through a complete stack of devices. The caller must hold the RTNL lock.
6722 */
6723bool netdev_has_upper_dev(struct net_device *dev,
6724                          struct net_device *upper_dev)
6725{
6726        ASSERT_RTNL();
6727
6728        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6729                                             upper_dev);
6730}
6731EXPORT_SYMBOL(netdev_has_upper_dev);
6732
6733/**
6734 * netdev_has_upper_dev_all - Check if device is linked to an upper device
6735 * @dev: device
6736 * @upper_dev: upper device to check
6737 *
6738 * Find out if a device is linked to specified upper device and return true
6739 * in case it is. Note that this checks the entire upper device chain.
6740 * The caller must hold rcu lock.
6741 */
6742
6743bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6744                                  struct net_device *upper_dev)
6745{
6746        return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6747                                               upper_dev);
6748}
6749EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6750
6751/**
6752 * netdev_has_any_upper_dev - Check if device is linked to some device
6753 * @dev: device
6754 *
6755 * Find out if a device is linked to an upper device and return true in case
6756 * it is. The caller must hold the RTNL lock.
6757 */
6758bool netdev_has_any_upper_dev(struct net_device *dev)
6759{
6760        ASSERT_RTNL();
6761
6762        return !list_empty(&dev->adj_list.upper);
6763}
6764EXPORT_SYMBOL(netdev_has_any_upper_dev);
6765
6766/**
6767 * netdev_master_upper_dev_get - Get master upper device
6768 * @dev: device
6769 *
6770 * Find a master upper device and return pointer to it or NULL in case
6771 * it's not there. The caller must hold the RTNL lock.
6772 */
6773struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6774{
6775        struct netdev_adjacent *upper;
6776
6777        ASSERT_RTNL();
6778
6779        if (list_empty(&dev->adj_list.upper))
6780                return NULL;
6781
6782        upper = list_first_entry(&dev->adj_list.upper,
6783                                 struct netdev_adjacent, list);
6784        if (likely(upper->master))
6785                return upper->dev;
6786        return NULL;
6787}
6788EXPORT_SYMBOL(netdev_master_upper_dev_get);
6789
6790static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6791{
6792        struct netdev_adjacent *upper;
6793
6794        ASSERT_RTNL();
6795
6796        if (list_empty(&dev->adj_list.upper))
6797                return NULL;
6798
6799        upper = list_first_entry(&dev->adj_list.upper,
6800                                 struct netdev_adjacent, list);
6801        if (likely(upper->master) && !upper->ignore)
6802                return upper->dev;
6803        return NULL;
6804}
6805
6806/**
6807 * netdev_has_any_lower_dev - Check if device is linked to some device
6808 * @dev: device
6809 *
6810 * Find out if a device is linked to a lower device and return true in case
6811 * it is. The caller must hold the RTNL lock.
6812 */
6813static bool netdev_has_any_lower_dev(struct net_device *dev)
6814{
6815        ASSERT_RTNL();
6816
6817        return !list_empty(&dev->adj_list.lower);
6818}
6819
6820void *netdev_adjacent_get_private(struct list_head *adj_list)
6821{
6822        struct netdev_adjacent *adj;
6823
6824        adj = list_entry(adj_list, struct netdev_adjacent, list);
6825
6826        return adj->private;
6827}
6828EXPORT_SYMBOL(netdev_adjacent_get_private);
6829
6830/**
6831 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6832 * @dev: device
6833 * @iter: list_head ** of the current position
6834 *
6835 * Gets the next device from the dev's upper list, starting from iter
6836 * position. The caller must hold RCU read lock.
6837 */
6838struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6839                                                 struct list_head **iter)
6840{
6841        struct netdev_adjacent *upper;
6842
6843        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6844
6845        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6846
6847        if (&upper->list == &dev->adj_list.upper)
6848                return NULL;
6849
6850        *iter = &upper->list;
6851
6852        return upper->dev;
6853}
6854EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6855
6856static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6857                                                  struct list_head **iter,
6858                                                  bool *ignore)
6859{
6860        struct netdev_adjacent *upper;
6861
6862        upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6863
6864        if (&upper->list == &dev->adj_list.upper)
6865                return NULL;
6866
6867        *iter = &upper->list;
6868        *ignore = upper->ignore;
6869
6870        return upper->dev;
6871}
6872
6873static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6874                                                    struct list_head **iter)
6875{
6876        struct netdev_adjacent *upper;
6877
6878        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6879
6880        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6881
6882        if (&upper->list == &dev->adj_list.upper)
6883                return NULL;
6884
6885        *iter = &upper->list;
6886
6887        return upper->dev;
6888}
6889
6890static int __netdev_walk_all_upper_dev(struct net_device *dev,
6891                                       int (*fn)(struct net_device *dev,
6892                                                 void *data),
6893                                       void *data)
6894{
6895        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6896        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6897        int ret, cur = 0;
6898        bool ignore;
6899
6900        now = dev;
6901        iter = &dev->adj_list.upper;
6902
6903        while (1) {
6904                if (now != dev) {
6905                        ret = fn(now, data);
6906                        if (ret)
6907                                return ret;
6908                }
6909
6910                next = NULL;
6911                while (1) {
6912                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
6913                        if (!udev)
6914                                break;
6915                        if (ignore)
6916                                continue;
6917
6918                        next = udev;
6919                        niter = &udev->adj_list.upper;
6920                        dev_stack[cur] = now;
6921                        iter_stack[cur++] = iter;
6922                        break;
6923                }
6924
6925                if (!next) {
6926                        if (!cur)
6927                                return 0;
6928                        next = dev_stack[--cur];
6929                        niter = iter_stack[cur];
6930                }
6931
6932                now = next;
6933                iter = niter;
6934        }
6935
6936        return 0;
6937}
6938
6939int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6940                                  int (*fn)(struct net_device *dev,
6941                                            void *data),
6942                                  void *data)
6943{
6944        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6945        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6946        int ret, cur = 0;
6947
6948        now = dev;
6949        iter = &dev->adj_list.upper;
6950
6951        while (1) {
6952                if (now != dev) {
6953                        ret = fn(now, data);
6954                        if (ret)
6955                                return ret;
6956                }
6957
6958                next = NULL;
6959                while (1) {
6960                        udev = netdev_next_upper_dev_rcu(now, &iter);
6961                        if (!udev)
6962                                break;
6963
6964                        next = udev;
6965                        niter = &udev->adj_list.upper;
6966                        dev_stack[cur] = now;
6967                        iter_stack[cur++] = iter;
6968                        break;
6969                }
6970
6971                if (!next) {
6972                        if (!cur)
6973                                return 0;
6974                        next = dev_stack[--cur];
6975                        niter = iter_stack[cur];
6976                }
6977
6978                now = next;
6979                iter = niter;
6980        }
6981
6982        return 0;
6983}
6984EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6985
6986static bool __netdev_has_upper_dev(struct net_device *dev,
6987                                   struct net_device *upper_dev)
6988{
6989        ASSERT_RTNL();
6990
6991        return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
6992                                           upper_dev);
6993}
6994
6995/**
6996 * netdev_lower_get_next_private - Get the next ->private from the
6997 *                                 lower neighbour list
6998 * @dev: device
6999 * @iter: list_head ** of the current position
7000 *

7001 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7002 * list, starting from iter position. The caller must hold either hold the
7003 * RTNL lock or its own locking that guarantees that the neighbour lower
7004 * list will remain unchanged.
7005 */
7006void *netdev_lower_get_next_private(struct net_device *dev,
7007                                    struct list_head **iter)
7008{
7009        struct netdev_adjacent *lower;
7010
7011        lower = list_entry(*iter, struct netdev_adjacent, list);
7012
7013        if (&lower->list == &dev->adj_list.lower)
7014                return NULL;
7015
7016        *iter = lower->list.next;
7017
7018        return lower->private;
7019}
7020EXPORT_SYMBOL(netdev_lower_get_next_private);
7021
7022/**
7023 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7024 *                                     lower neighbour list, RCU
7025 *                                     variant
7026 * @dev: device
7027 * @iter: list_head ** of the current position
7028 *
7029 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7030 * list, starting from iter position. The caller must hold RCU read lock.
7031 */
7032void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7033                                        struct list_head **iter)
7034{
7035        struct netdev_adjacent *lower;
7036
7037        WARN_ON_ONCE(!rcu_read_lock_held());
7038
7039        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7040
7041        if (&lower->list == &dev->adj_list.lower)
7042                return NULL;
7043
7044        *iter = &lower->list;
7045
7046        return lower->private;
7047}
7048EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7049
7050/**
7051 * netdev_lower_get_next - Get the next device from the lower neighbour
7052 *                         list
7053 * @dev: device
7054 * @iter: list_head ** of the current position
7055 *
7056 * Gets the next netdev_adjacent from the dev's lower neighbour
7057 * list, starting from iter position. The caller must hold RTNL lock or
7058 * its own locking that guarantees that the neighbour lower
7059 * list will remain unchanged.
7060 */
7061void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7062{
7063        struct netdev_adjacent *lower;
7064
7065        lower = list_entry(*iter, struct netdev_adjacent, list);
7066
7067        if (&lower->list == &dev->adj_list.lower)
7068                return NULL;
7069
7070        *iter = lower->list.next;
7071
7072        return lower->dev;
7073}
7074EXPORT_SYMBOL(netdev_lower_get_next);
7075
7076static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7077                                                struct list_head **iter)
7078{
7079        struct netdev_adjacent *lower;
7080
7081        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7082
7083        if (&lower->list == &dev->adj_list.lower)
7084                return NULL;
7085
7086        *iter = &lower->list;
7087
7088        return lower->dev;
7089}
7090
7091static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7092                                                  struct list_head **iter,
7093                                                  bool *ignore)
7094{
7095        struct netdev_adjacent *lower;
7096
7097        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7098
7099        if (&lower->list == &dev->adj_list.lower)
7100                return NULL;
7101
7102        *iter = &lower->list;
7103        *ignore = lower->ignore;
7104
7105        return lower->dev;
7106}
7107
7108int netdev_walk_all_lower_dev(struct net_device *dev,
7109                              int (*fn)(struct net_device *dev,
7110                                        void *data),
7111                              void *data)
7112{
7113        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7114        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7115        int ret, cur = 0;
7116
7117        now = dev;
7118        iter = &dev->adj_list.lower;
7119
7120        while (1) {
7121                if (now != dev) {
7122                        ret = fn(now, data);
7123                        if (ret)
7124                                return ret;
7125                }
7126
7127                next = NULL;
7128                while (1) {
7129                        ldev = netdev_next_lower_dev(now, &iter);
7130                        if (!ldev)
7131                                break;
7132
7133                        next = ldev;
7134                        niter = &ldev->adj_list.lower;
7135                        dev_stack[cur] = now;
7136                        iter_stack[cur++] = iter;
7137                        break;
7138                }
7139
7140                if (!next) {
7141                        if (!cur)
7142                                return 0;
7143                        next = dev_stack[--cur];
7144                        niter = iter_stack[cur];
7145                }
7146
7147                now = next;
7148                iter = niter;
7149        }
7150
7151        return 0;
7152}
7153EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7154
7155static int __netdev_walk_all_lower_dev(struct net_device *dev,
7156                                       int (*fn)(struct net_device *dev,
7157                                                 void *data),
7158                                       void *data)
7159{
7160        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7161        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7162        int ret, cur = 0;
7163        bool ignore;
7164
7165        now = dev;
7166        iter = &dev->adj_list.lower;
7167
7168        while (1) {
7169                if (now != dev) {
7170                        ret = fn(now, data);
7171                        if (ret)
7172                                return ret;
7173                }
7174
7175                next = NULL;
7176                while (1) {
7177                        ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7178                        if (!ldev)
7179                                break;
7180                        if (ignore)
7181                                continue;
7182
7183                        next = ldev;
7184                        niter = &ldev->adj_list.lower;
7185                        dev_stack[cur] = now;
7186                        iter_stack[cur++] = iter;
7187                        break;
7188                }
7189
7190                if (!next) {
7191                        if (!cur)
7192                                return 0;
7193                        next = dev_stack[--cur];
7194                        niter = iter_stack[cur];
7195                }
7196
7197                now = next;
7198                iter = niter;
7199        }
7200
7201        return 0;
7202}
7203
7204struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7205                                             struct list_head **iter)
7206{
7207        struct netdev_adjacent *lower;
7208
7209        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7210        if (&lower->list == &dev->adj_list.lower)
7211                return NULL;
7212
7213        *iter = &lower->list;
7214
7215        return lower->dev;
7216}
7217EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7218
7219static u8 __netdev_upper_depth(struct net_device *dev)
7220{
7221        struct net_device *udev;
7222        struct list_head *iter;
7223        u8 max_depth = 0;
7224        bool ignore;
7225
7226        for (iter = &dev->adj_list.upper,
7227             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7228             udev;
7229             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7230                if (ignore)
7231                        continue;
7232                if (max_depth < udev->upper_level)
7233                        max_depth = udev->upper_level;
7234        }
7235
7236        return max_depth;
7237}
7238
7239static u8 __netdev_lower_depth(struct net_device *dev)
7240{
7241        struct net_device *ldev;
7242        struct list_head *iter;
7243        u8 max_depth = 0;
7244        bool ignore;
7245
7246        for (iter = &dev->adj_list.lower,
7247             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7248             ldev;
7249             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7250                if (ignore)
7251                        continue;
7252                if (max_depth < ldev->lower_level)
7253                        max_depth = ldev->lower_level;
7254        }
7255
7256        return max_depth;
7257}
7258
7259static int __netdev_update_upper_level(struct net_device *dev, void *data)
7260{
7261        dev->upper_level = __netdev_upper_depth(dev) + 1;
7262        return 0;
7263}
7264
7265static int __netdev_update_lower_level(struct net_device *dev, void *data)
7266{
7267        dev->lower_level = __netdev_lower_depth(dev) + 1;
7268        return 0;
7269}
7270
7271int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7272                                  int (*fn)(struct net_device *dev,
7273                                            void *data),
7274                                  void *data)
7275{
7276        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7277        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7278        int ret, cur = 0;
7279
7280        now = dev;
7281        iter = &dev->adj_list.lower;
7282
7283        while (1) {
7284                if (now != dev) {
7285                        ret = fn(now, data);
7286                        if (ret)
7287                                return ret;
7288                }
7289
7290                next = NULL;
7291                while (1) {
7292                        ldev = netdev_next_lower_dev_rcu(now, &iter);
7293                        if (!ldev)
7294                                break;
7295
7296                        next = ldev;
7297                        niter = &ldev->adj_list.lower;
7298                        dev_stack[cur] = now;
7299                        iter_stack[cur++] = iter;
7300                        break;
7301                }
7302
7303                if (!next) {
7304                        if (!cur)
7305                                return 0;
7306                        next = dev_stack[--cur];
7307                        niter = iter_stack[cur];
7308                }
7309
7310                now = next;
7311                iter = niter;
7312        }
7313
7314        return 0;
7315}
7316EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7317
7318/**
7319 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7320 *                                     lower neighbour list, RCU
7321 *                                     variant
7322 * @dev: device
7323 *
7324 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7325 * list. The caller must hold RCU read lock.
7326 */
7327void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7328{
7329        struct netdev_adjacent *lower;
7330
7331        lower = list_first_or_null_rcu(&dev->adj_list.lower,
7332                        struct netdev_adjacent, list);
7333        if (lower)
7334                return lower->private;
7335        return NULL;
7336}
7337EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7338
7339/**
7340 * netdev_master_upper_dev_get_rcu - Get master upper device
7341 * @dev: device
7342 *
7343 * Find a master upper device and return pointer to it or NULL in case
7344 * it's not there. The caller must hold the RCU read lock.
7345 */
7346struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7347{
7348        struct netdev_adjacent *upper;
7349
7350        upper = list_first_or_null_rcu(&dev->adj_list.upper,
7351                                       struct netdev_adjacent, list);
7352        if (upper && likely(upper->master))
7353                return upper->dev;
7354        return NULL;
7355}
7356EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7357
7358static int netdev_adjacent_sysfs_add(struct net_device *dev,
7359                              struct net_device *adj_dev,
7360                              struct list_head *dev_list)
7361{
7362        char linkname[IFNAMSIZ+7];
7363
7364        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7365                "upper_%s" : "lower_%s", adj_dev->name);
7366        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7367                                 linkname);
7368}
7369static void netdev_adjacent_sysfs_del(struct net_device *dev,
7370                               char *name,
7371                               struct list_head *dev_list)
7372{
7373        char linkname[IFNAMSIZ+7];
7374
7375        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7376                "upper_%s" : "lower_%s", name);
7377        sysfs_remove_link(&(dev->dev.kobj), linkname);
7378}
7379
7380static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7381                                                 struct net_device *adj_dev,
7382                                                 struct list_head *dev_list)
7383{
7384        return (dev_list == &dev->adj_list.upper ||
7385                dev_list == &dev->adj_list.lower) &&
7386                net_eq(dev_net(dev), dev_net(adj_dev));
7387}
7388
7389static int __netdev_adjacent_dev_insert(struct net_device *dev,
7390                                        struct net_device *adj_dev,
7391                                        struct list_head *dev_list,
7392                                        void *private, bool master)
7393{
7394        struct netdev_adjacent *adj;
7395        int ret;
7396
7397        adj = __netdev_find_adj(adj_dev, dev_list);
7398
7399        if (adj) {
7400                adj->ref_nr += 1;
7401                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7402                         dev->name, adj_dev->name, adj->ref_nr);
7403
7404                return 0;
7405        }
7406
7407        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7408        if (!adj)
7409                return -ENOMEM;
7410
7411        adj->dev = adj_dev;
7412        adj->master = master;
7413        adj->ref_nr = 1;
7414        adj->private = private;
7415        adj->ignore = false;
7416        dev_hold(adj_dev);
7417
7418        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7419                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7420
7421        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7422                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7423                if (ret)
7424                        goto free_adj;
7425        }
7426
7427        /* Ensure that master link is always the first item in list. */
7428        if (master) {
7429                ret = sysfs_create_link(&(dev->dev.kobj),
7430                                        &(adj_dev->dev.kobj), "master");
7431                if (ret)
7432                        goto remove_symlinks;
7433
7434                list_add_rcu(&adj->list, dev_list);
7435        } else {
7436                list_add_tail_rcu(&adj->list, dev_list);
7437        }
7438
7439        return 0;
7440
7441remove_symlinks:
7442        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7443                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7444free_adj:
7445        kfree(adj);
7446        dev_put(adj_dev);
7447
7448        return ret;
7449}
7450
7451static void __netdev_adjacent_dev_remove(struct net_device *dev,
7452                                         struct net_device *adj_dev,
7453                                         u16 ref_nr,
7454                                         struct list_head *dev_list)
7455{
7456        struct netdev_adjacent *adj;
7457
7458        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7459                 dev->name, adj_dev->name, ref_nr);
7460
7461        adj = __netdev_find_adj(adj_dev, dev_list);
7462
7463        if (!adj) {
7464                pr_err("Adjacency does not exist for device %s from %s\n",
7465                       dev->name, adj_dev->name);
7466                WARN_ON(1);
7467                return;
7468        }
7469
7470        if (adj->ref_nr > ref_nr) {
7471                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7472                         dev->name, adj_dev->name, ref_nr,
7473                         adj->ref_nr - ref_nr);
7474                adj->ref_nr -= ref_nr;
7475                return;
7476        }
7477
7478        if (adj->master)
7479                sysfs_remove_link(&(dev->dev.kobj), "master");
7480
7481        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7482                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7483
7484        list_del_rcu(&adj->list);
7485        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7486                 adj_dev->name, dev->name, adj_dev->name);
7487        dev_put(adj_dev);
7488        kfree_rcu(adj, rcu);
7489}
7490
7491static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7492                                            struct net_device *upper_dev,
7493                                            struct list_head *up_list,
7494                                            struct list_head *down_list,
7495                                            void *private, bool master)
7496{
7497        int ret;
7498
7499        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7500                                           private, master);
7501        if (ret)
7502                return ret;
7503
7504        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7505                                           private, false);
7506        if (ret) {
7507                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7508                return ret;
7509        }
7510
7511        return 0;
7512}
7513
7514static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7515                                               struct net_device *upper_dev,
7516                                               u16 ref_nr,
7517                                               struct list_head *up_list,
7518                                               struct list_head *down_list)
7519{
7520        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7521        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7522}
7523
7524static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7525                                                struct net_device *upper_dev,
7526                                                void *private, bool master)
7527{
7528        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7529                                                &dev->adj_list.upper,
7530                                                &upper_dev->adj_list.lower,
7531                                                private, master);
7532}
7533
7534static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7535                                                   struct net_device *upper_dev)
7536{
7537        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7538                                           &dev->adj_list.upper,
7539                                           &upper_dev->adj_list.lower);
7540}
7541
7542static int __netdev_upper_dev_link(struct net_device *dev,
7543                                   struct net_device *upper_dev, bool master,
7544                                   void *upper_priv, void *upper_info,
7545                                   struct netlink_ext_ack *extack)
7546{
7547        struct netdev_notifier_changeupper_info changeupper_info = {
7548                .info = {
7549                        .dev = dev,
7550                        .extack = extack,
7551                },
7552                .upper_dev = upper_dev,
7553                .master = master,
7554                .linking = true,
7555                .upper_info = upper_info,
7556        };
7557        struct net_device *master_dev;
7558        int ret = 0;
7559
7560        ASSERT_RTNL();
7561
7562        if (dev == upper_dev)
7563                return -EBUSY;
7564
7565        /* To prevent loops, check if dev is not upper device to upper_dev. */
7566        if (__netdev_has_upper_dev(upper_dev, dev))
7567                return -EBUSY;
7568
7569        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7570                return -EMLINK;
7571
7572        if (!master) {
7573                if (__netdev_has_upper_dev(dev, upper_dev))
7574                        return -EEXIST;
7575        } else {
7576                master_dev = __netdev_master_upper_dev_get(dev);
7577                if (master_dev)
7578                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
7579        }
7580
7581        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7582                                            &changeupper_info.info);
7583        ret = notifier_to_errno(ret);
7584        if (ret)
7585                return ret;
7586
7587        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7588                                                   master);
7589        if (ret)
7590                return ret;
7591
7592        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7593                                            &changeupper_info.info);
7594        ret = notifier_to_errno(ret);
7595        if (ret)
7596                goto rollback;
7597
7598        __netdev_update_upper_level(dev, NULL);
7599        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7600
7601        __netdev_update_lower_level(upper_dev, NULL);
7602        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7603                                    NULL);
7604
7605        return 0;
7606
7607rollback:
7608        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7609
7610        return ret;
7611}
7612
7613/**
7614 * netdev_upper_dev_link - Add a link to the upper device
7615 * @dev: device
7616 * @upper_dev: new upper device
7617 * @extack: netlink extended ack
7618 *
7619 * Adds a link to device which is upper to this one. The caller must hold
7620 * the RTNL lock. On a failure a negative errno code is returned.
7621 * On success the reference counts are adjusted and the function
7622 * returns zero.
7623 */
7624int netdev_upper_dev_link(struct net_device *dev,
7625                          struct net_device *upper_dev,
7626                          struct netlink_ext_ack *extack)
7627{
7628        return __netdev_upper_dev_link(dev, upper_dev, false,
7629                                       NULL, NULL, extack);
7630}
7631EXPORT_SYMBOL(netdev_upper_dev_link);
7632
7633/**
7634 * netdev_master_upper_dev_link - Add a master link to the upper device
7635 * @dev: device
7636 * @upper_dev: new upper device
7637 * @upper_priv: upper device private
7638 * @upper_info: upper info to be passed down via notifier
7639 * @extack: netlink extended ack
7640 *
7641 * Adds a link to device which is upper to this one. In this case, only
7642 * one master upper device can be linked, although other non-master devices
7643 * might be linked as well. The caller must hold the RTNL lock.
7644 * On a failure a negative errno code is returned. On success the reference
7645 * counts are adjusted and the function returns zero.
7646 */
7647int netdev_master_upper_dev_link(struct net_device *dev,
7648                                 struct net_device *upper_dev,
7649                                 void *upper_priv, void *upper_info,
7650                                 struct netlink_ext_ack *extack)
7651{
7652        return __netdev_upper_dev_link(dev, upper_dev, true,
7653                                       upper_priv, upper_info, extack);
7654}
7655EXPORT_SYMBOL(netdev_master_upper_dev_link);
7656
7657/**
7658 * netdev_upper_dev_unlink - Removes a link to upper device
7659 * @dev: device
7660 * @upper_dev: new upper device
7661 *
7662 * Removes a link to device which is upper to this one. The caller must hold
7663 * the RTNL lock.
7664 */
7665void netdev_upper_dev_unlink(struct net_device *dev,
7666                             struct net_device *upper_dev)
7667{
7668        struct netdev_notifier_changeupper_info changeupper_info = {
7669                .info = {
7670                        .dev = dev,
7671                },
7672                .upper_dev = upper_dev,
7673                .linking = false,
7674        };
7675
7676        ASSERT_RTNL();
7677
7678        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7679
7680        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7681                                      &changeupper_info.info);
7682
7683        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7684
7685        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7686                                      &changeupper_info.info);
7687
7688        __netdev_update_upper_level(dev, NULL);
7689        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7690
7691        __netdev_update_lower_level(upper_dev, NULL);
7692        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7693                                    NULL);
7694}
7695EXPORT_SYMBOL(netdev_upper_dev_unlink);
7696
7697static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7698                                      struct net_device *lower_dev,
7699                                      bool val)
7700{
7701        struct netdev_adjacent *adj;
7702
7703        adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7704        if (adj)
7705                adj->ignore = val;
7706
7707        adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7708        if (adj)
7709                adj->ignore = val;
7710}
7711
7712static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7713                                        struct net_device *lower_dev)
7714{
7715        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7716}
7717
7718static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7719                                       struct net_device *lower_dev)
7720{
7721        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7722}
7723
7724int netdev_adjacent_change_prepare(struct net_device *old_dev,
7725                                   struct net_device *new_dev,
7726                                   struct net_device *dev,
7727                                   struct netlink_ext_ack *extack)
7728{
7729        int err;
7730
7731        if (!new_dev)
7732                return 0;
7733
7734        if (old_dev && new_dev != old_dev)
7735                netdev_adjacent_dev_disable(dev, old_dev);
7736
7737        err = netdev_upper_dev_link(new_dev, dev, extack);
7738        if (err) {
7739                if (old_dev && new_dev != old_dev)
7740                        netdev_adjacent_dev_enable(dev, old_dev);
7741                return err;
7742        }
7743
7744        return 0;
7745}
7746EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7747
7748void netdev_adjacent_change_commit(struct net_device *old_dev,
7749                                   struct net_device *new_dev,
7750                                   struct net_device *dev)
7751{
7752        if (!new_dev || !old_dev)
7753                return;
7754
7755        if (new_dev == old_dev)
7756                return;
7757
7758        netdev_adjacent_dev_enable(dev, old_dev);
7759        netdev_upper_dev_unlink(old_dev, dev);
7760}
7761EXPORT_SYMBOL(netdev_adjacent_change_commit);
7762
7763void netdev_adjacent_change_abort(struct net_device *old_dev,
7764                                  struct net_device *new_dev,
7765                                  struct net_device *dev)
7766{
7767        if (!new_dev)
7768                return;
7769
7770        if (old_dev && new_dev != old_dev)
7771                netdev_adjacent_dev_enable(dev, old_dev);
7772
7773        netdev_upper_dev_unlink(new_dev, dev);
7774}
7775EXPORT_SYMBOL(netdev_adjacent_change_abort);
7776
7777/**
7778 * netdev_bonding_info_change - Dispatch event about slave change
7779 * @dev: device
7780 * @bonding_info: info to dispatch
7781 *
7782 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7783 * The caller must hold the RTNL lock.
7784 */
7785void netdev_bonding_info_change(struct net_device *dev,
7786                                struct netdev_bonding_info *bonding_info)
7787{
7788        struct netdev_notifier_bonding_info info = {
7789                .info.dev = dev,
7790        };
7791
7792        memcpy(&info.bonding_info, bonding_info,
7793               sizeof(struct netdev_bonding_info));
7794        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7795                                      &info.info);
7796}
7797EXPORT_SYMBOL(netdev_bonding_info_change);
7798
7799static void netdev_adjacent_add_links(struct net_device *dev)
7800{
7801        struct netdev_adjacent *iter;
7802
7803        struct net *net = dev_net(dev);
7804
7805        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7806                if (!net_eq(net, dev_net(iter->dev)))
7807                        continue;
7808                netdev_adjacent_sysfs_add(iter->dev, dev,
7809                                          &iter->dev->adj_list.lower);
7810                netdev_adjacent_sysfs_add(dev, iter->dev,
7811                                          &dev->adj_list.upper);
7812        }
7813
7814        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7815                if (!net_eq(net, dev_net(iter->dev)))
7816                        continue;
7817                netdev_adjacent_sysfs_add(iter->dev, dev,
7818                                          &iter->dev->adj_list.upper);
7819                netdev_adjacent_sysfs_add(dev, iter->dev,
7820                                          &dev->adj_list.lower);
7821        }
7822}
7823
7824static void netdev_adjacent_del_links(struct net_device *dev)
7825{
7826        struct netdev_adjacent *iter;
7827
7828        struct net *net = dev_net(dev);
7829
7830        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7831                if (!net_eq(net, dev_net(iter->dev)))
7832                        continue;
7833                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7834                                          &iter->dev->adj_list.lower);
7835                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7836                                          &dev->adj_list.upper);
7837        }
7838
7839        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7840                if (!net_eq(net, dev_net(iter->dev)))
7841                        continue;
7842                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7843                                          &iter->dev->adj_list.upper);
7844                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7845                                          &dev->adj_list.lower);
7846        }
7847}
7848
7849void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7850{
7851        struct netdev_adjacent *iter;
7852
7853        struct net *net = dev_net(dev);
7854
7855        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7856                if (!net_eq(net, dev_net(iter->dev)))
7857                        continue;
7858                netdev_adjacent_sysfs_del(iter->dev, oldname,
7859                                          &iter->dev->adj_list.lower);
7860                netdev_adjacent_sysfs_add(iter->dev, dev,
7861                                          &iter->dev->adj_list.lower);
7862        }
7863
7864        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7865                if (!net_eq(net, dev_net(iter->dev)))
7866                        continue;
7867                netdev_adjacent_sysfs_del(iter->dev, oldname,
7868                                          &iter->dev->adj_list.upper);
7869                netdev_adjacent_sysfs_add(iter->dev, dev,
7870                                          &iter->dev->adj_list.upper);
7871        }
7872}
7873
7874void *netdev_lower_dev_get_private(struct net_device *dev,
7875                                   struct net_device *lower_dev)
7876{
7877        struct netdev_adjacent *lower;
7878
7879        if (!lower_dev)
7880                return NULL;
7881        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
7882        if (!lower)
7883                return NULL;
7884
7885        return lower->private;
7886}
7887EXPORT_SYMBOL(netdev_lower_dev_get_private);
7888
7889
7890/**
7891 * netdev_lower_change - Dispatch event about lower device state change
7892 * @lower_dev: device
7893 * @lower_state_info: state to dispatch
7894 *
7895 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
7896 * The caller must hold the RTNL lock.
7897 */
7898void netdev_lower_state_changed(struct net_device *lower_dev,
7899                                void *lower_state_info)
7900{
7901        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
7902                .info.dev = lower_dev,
7903        };
7904
7905        ASSERT_RTNL();
7906        changelowerstate_info.lower_state_info = lower_state_info;
7907        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
7908                                      &changelowerstate_info.info);
7909}
7910EXPORT_SYMBOL(netdev_lower_state_changed);
7911
7912static void dev_change_rx_flags(struct net_device *dev, int flags)
7913{
7914        const struct net_device_ops *ops = dev->netdev_ops;
7915
7916        if (ops->ndo_change_rx_flags)
7917                ops->ndo_change_rx_flags(dev, flags);
7918}
7919
7920static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
7921{
7922        unsigned int old_flags = dev->flags;
7923        kuid_t uid;
7924        kgid_t gid;
7925
7926        ASSERT_RTNL();
7927
7928        dev->flags |= IFF_PROMISC;
7929        dev->promiscuity += inc;
7930        if (dev->promiscuity == 0) {
7931                /*
7932                 * Avoid overflow.
7933                 * If inc causes overflow, untouch promisc and return error.
7934                 */
7935                if (inc < 0)
7936                        dev->flags &= ~IFF_PROMISC;
7937                else {
7938                        dev->promiscuity -= inc;
7939                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
7940                                dev->name);
7941                        return -EOVERFLOW;
7942                }
7943        }
7944        if (dev->flags != old_flags) {
7945                pr_info("device %s %s promiscuous mode\n",
7946                        dev->name,
7947                        dev->flags & IFF_PROMISC ? "entered" : "left");
7948                if (audit_enabled) {
7949                        current_uid_gid(&uid, &gid);
7950                        audit_log(audit_context(), GFP_ATOMIC,
7951                                  AUDIT_ANOM_PROMISCUOUS,
7952                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
7953                                  dev->name, (dev->flags & IFF_PROMISC),
7954                                  (old_flags & IFF_PROMISC),
7955                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
7956                                  from_kuid(&init_user_ns, uid),
7957                                  from_kgid(&init_user_ns, gid),
7958                                  audit_get_sessionid(current));
7959                }
7960
7961                dev_change_rx_flags(dev, IFF_PROMISC);
7962        }
7963        if (notify)
7964                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
7965        return 0;
7966}
7967
7968/**
7969 *      dev_set_promiscuity     - update promiscuity count on a device
7970 *      @dev: device
7971 *      @inc: modifier
7972 *
7973 *      Add or remove promiscuity from a device. While the count in the device
7974 *      remains above zero the interface remains promiscuous. Once it hits zero
7975 *      the device reverts back to normal filtering operation. A negative inc
7976 *      value is used to drop promiscuity on the device.
7977 *      Return 0 if successful or a negative errno code on error.
7978 */
7979int dev_set_promiscuity(struct net_device *dev, int inc)
7980{
7981        unsigned int old_flags = dev->flags;
7982        int err;
7983
7984        err = __dev_set_promiscuity(dev, inc, true);
7985        if (err < 0)
7986                return err;
7987        if (dev->flags != old_flags)
7988                dev_set_rx_mode(dev);
7989        return err;
7990}
7991EXPORT_SYMBOL(dev_set_promiscuity);
7992
7993static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
7994{
7995        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
7996
7997        ASSERT_RTNL();
7998
7999        dev->flags |= IFF_ALLMULTI;
8000        dev->allmulti += inc;

8001        if (dev->allmulti == 0) {
8002                /*
8003                 * Avoid overflow.
8004                 * If inc causes overflow, untouch allmulti and return error.
8005                 */
8006                if (inc < 0)
8007                        dev->flags &= ~IFF_ALLMULTI;
8008                else {
8009                        dev->allmulti -= inc;
8010                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8011                                dev->name);
8012                        return -EOVERFLOW;
8013                }
8014        }
8015        if (dev->flags ^ old_flags) {
8016                dev_change_rx_flags(dev, IFF_ALLMULTI);
8017                dev_set_rx_mode(dev);
8018                if (notify)
8019                        __dev_notify_flags(dev, old_flags,
8020                                           dev->gflags ^ old_gflags);
8021        }
8022        return 0;
8023}
8024
8025/**
8026 *      dev_set_allmulti        - update allmulti count on a device
8027 *      @dev: device
8028 *      @inc: modifier
8029 *
8030 *      Add or remove reception of all multicast frames to a device. While the
8031 *      count in the device remains above zero the interface remains listening
8032 *      to all interfaces. Once it hits zero the device reverts back to normal
8033 *      filtering operation. A negative @inc value is used to drop the counter
8034 *      when releasing a resource needing all multicasts.
8035 *      Return 0 if successful or a negative errno code on error.
8036 */
8037
8038int dev_set_allmulti(struct net_device *dev, int inc)
8039{
8040        return __dev_set_allmulti(dev, inc, true);
8041}
8042EXPORT_SYMBOL(dev_set_allmulti);
8043
8044/*
8045 *      Upload unicast and multicast address lists to device and
8046 *      configure RX filtering. When the device doesn't support unicast
8047 *      filtering it is put in promiscuous mode while unicast addresses
8048 *      are present.
8049 */
8050void __dev_set_rx_mode(struct net_device *dev)
8051{
8052        const struct net_device_ops *ops = dev->netdev_ops;
8053
8054        /* dev_open will call this function so the list will stay sane. */
8055        if (!(dev->flags&IFF_UP))
8056                return;
8057
8058        if (!netif_device_present(dev))
8059                return;
8060
8061        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8062                /* Unicast addresses changes may only happen under the rtnl,
8063                 * therefore calling __dev_set_promiscuity here is safe.
8064                 */
8065                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8066                        __dev_set_promiscuity(dev, 1, false);
8067                        dev->uc_promisc = true;
8068                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8069                        __dev_set_promiscuity(dev, -1, false);
8070                        dev->uc_promisc = false;
8071                }
8072        }
8073
8074        if (ops->ndo_set_rx_mode)
8075                ops->ndo_set_rx_mode(dev);
8076}
8077
8078void dev_set_rx_mode(struct net_device *dev)
8079{
8080        netif_addr_lock_bh(dev);
8081        __dev_set_rx_mode(dev);
8082        netif_addr_unlock_bh(dev);
8083}
8084
8085/**
8086 *      dev_get_flags - get flags reported to userspace
8087 *      @dev: device
8088 *
8089 *      Get the combination of flag bits exported through APIs to userspace.
8090 */
8091unsigned int dev_get_flags(const struct net_device *dev)
8092{
8093        unsigned int flags;
8094
8095        flags = (dev->flags & ~(IFF_PROMISC |
8096                                IFF_ALLMULTI |
8097                                IFF_RUNNING |
8098                                IFF_LOWER_UP |
8099                                IFF_DORMANT)) |
8100                (dev->gflags & (IFF_PROMISC |
8101                                IFF_ALLMULTI));
8102
8103        if (netif_running(dev)) {
8104                if (netif_oper_up(dev))
8105                        flags |= IFF_RUNNING;
8106                if (netif_carrier_ok(dev))
8107                        flags |= IFF_LOWER_UP;
8108                if (netif_dormant(dev))
8109                        flags |= IFF_DORMANT;
8110        }
8111
8112        return flags;
8113}
8114EXPORT_SYMBOL(dev_get_flags);
8115
8116int __dev_change_flags(struct net_device *dev, unsigned int flags,
8117                       struct netlink_ext_ack *extack)
8118{
8119        unsigned int old_flags = dev->flags;
8120        int ret;
8121
8122        ASSERT_RTNL();
8123
8124        /*
8125         *      Set the flags on our device.
8126         */
8127
8128        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8129                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8130                               IFF_AUTOMEDIA)) |
8131                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8132                                    IFF_ALLMULTI));
8133
8134        /*
8135         *      Load in the correct multicast list now the flags have changed.
8136         */
8137
8138        if ((old_flags ^ flags) & IFF_MULTICAST)
8139                dev_change_rx_flags(dev, IFF_MULTICAST);
8140
8141        dev_set_rx_mode(dev);
8142
8143        /*
8144         *      Have we downed the interface. We handle IFF_UP ourselves
8145         *      according to user attempts to set it, rather than blindly
8146         *      setting it.
8147         */
8148
8149        ret = 0;
8150        if ((old_flags ^ flags) & IFF_UP) {
8151                if (old_flags & IFF_UP)
8152                        __dev_close(dev);
8153                else
8154                        ret = __dev_open(dev, extack);
8155        }
8156
8157        if ((flags ^ dev->gflags) & IFF_PROMISC) {
8158                int inc = (flags & IFF_PROMISC) ? 1 : -1;
8159                unsigned int old_flags = dev->flags;
8160
8161                dev->gflags ^= IFF_PROMISC;
8162
8163                if (__dev_set_promiscuity(dev, inc, false) >= 0)
8164                        if (dev->flags != old_flags)
8165                                dev_set_rx_mode(dev);
8166        }
8167
8168        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8169         * is important. Some (broken) drivers set IFF_PROMISC, when
8170         * IFF_ALLMULTI is requested not asking us and not reporting.
8171         */
8172        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8173                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8174
8175                dev->gflags ^= IFF_ALLMULTI;
8176                __dev_set_allmulti(dev, inc, false);
8177        }
8178
8179        return ret;
8180}
8181
8182void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8183                        unsigned int gchanges)
8184{
8185        unsigned int changes = dev->flags ^ old_flags;
8186
8187        if (gchanges)
8188                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8189
8190        if (changes & IFF_UP) {
8191                if (dev->flags & IFF_UP)
8192                        call_netdevice_notifiers(NETDEV_UP, dev);
8193                else
8194                        call_netdevice_notifiers(NETDEV_DOWN, dev);
8195        }
8196
8197        if (dev->flags & IFF_UP &&
8198            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8199                struct netdev_notifier_change_info change_info = {
8200                        .info = {
8201                                .dev = dev,
8202                        },
8203                        .flags_changed = changes,
8204                };
8205
8206                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8207        }
8208}
8209
8210/**
8211 *      dev_change_flags - change device settings
8212 *      @dev: device
8213 *      @flags: device state flags
8214 *      @extack: netlink extended ack
8215 *
8216 *      Change settings on device based state flags. The flags are
8217 *      in the userspace exported format.
8218 */
8219int dev_change_flags(struct net_device *dev, unsigned int flags,
8220                     struct netlink_ext_ack *extack)
8221{
8222        int ret;
8223        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8224
8225        ret = __dev_change_flags(dev, flags, extack);
8226        if (ret < 0)
8227                return ret;
8228
8229        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8230        __dev_notify_flags(dev, old_flags, changes);
8231        return ret;
8232}
8233EXPORT_SYMBOL(dev_change_flags);
8234
8235int __dev_set_mtu(struct net_device *dev, int new_mtu)
8236{
8237        const struct net_device_ops *ops = dev->netdev_ops;
8238
8239        if (ops->ndo_change_mtu)
8240                return ops->ndo_change_mtu(dev, new_mtu);
8241
8242        /* Pairs with all the lockless reads of dev->mtu in the stack */
8243        WRITE_ONCE(dev->mtu, new_mtu);
8244        return 0;
8245}
8246EXPORT_SYMBOL(__dev_set_mtu);
8247
8248int dev_validate_mtu(struct net_device *dev, int new_mtu,
8249                     struct netlink_ext_ack *extack)
8250{
8251        /* MTU must be positive, and in range */
8252        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8253                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8254                return -EINVAL;
8255        }
8256
8257        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8258                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8259                return -EINVAL;
8260        }
8261        return 0;
8262}
8263
8264/**
8265 *      dev_set_mtu_ext - Change maximum transfer unit
8266 *      @dev: device
8267 *      @new_mtu: new transfer unit
8268 *      @extack: netlink extended ack
8269 *
8270 *      Change the maximum transfer size of the network device.
8271 */
8272int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8273                    struct netlink_ext_ack *extack)
8274{
8275        int err, orig_mtu;
8276
8277        if (new_mtu == dev->mtu)
8278                return 0;
8279
8280        err = dev_validate_mtu(dev, new_mtu, extack);
8281        if (err)
8282                return err;
8283
8284        if (!netif_device_present(dev))
8285                return -ENODEV;
8286
8287        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8288        err = notifier_to_errno(err);
8289        if (err)
8290                return err;
8291
8292        orig_mtu = dev->mtu;
8293        err = __dev_set_mtu(dev, new_mtu);
8294
8295        if (!err) {
8296                err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8297                                                   orig_mtu);
8298                err = notifier_to_errno(err);
8299                if (err) {
8300                        /* setting mtu back and notifying everyone again,
8301                         * so that they have a chance to revert changes.
8302                         */
8303                        __dev_set_mtu(dev, orig_mtu);
8304                        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8305                                                     new_mtu);
8306                }
8307        }
8308        return err;
8309}
8310
8311int dev_set_mtu(struct net_device *dev, int new_mtu)
8312{
8313        struct netlink_ext_ack extack;
8314        int err;
8315
8316        memset(&extack, 0, sizeof(extack));
8317        err = dev_set_mtu_ext(dev, new_mtu, &extack);
8318        if (err && extack._msg)
8319                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8320        return err;
8321}
8322EXPORT_SYMBOL(dev_set_mtu);
8323
8324/**
8325 *      dev_change_tx_queue_len - Change TX queue length of a netdevice
8326 *      @dev: device
8327 *      @new_len: new tx queue length
8328 */
8329int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8330{
8331        unsigned int orig_len = dev->tx_queue_len;
8332        int res;
8333
8334        if (new_len != (unsigned int)new_len)
8335                return -ERANGE;
8336
8337        if (new_len != orig_len) {
8338                dev->tx_queue_len = new_len;
8339                res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8340                res = notifier_to_errno(res);
8341                if (res)
8342                        goto err_rollback;
8343                res = dev_qdisc_change_tx_queue_len(dev);
8344                if (res)
8345                        goto err_rollback;
8346        }
8347
8348        return 0;
8349
8350err_rollback:
8351        netdev_err(dev, "refused to change device tx_queue_len\n");
8352        dev->tx_queue_len = orig_len;
8353        return res;
8354}
8355
8356/**
8357 *      dev_set_group - Change group this device belongs to
8358 *      @dev: device
8359 *      @new_group: group this device should belong to
8360 */
8361void dev_set_group(struct net_device *dev, int new_group)
8362{
8363        dev->group = new_group;
8364}
8365EXPORT_SYMBOL(dev_set_group);
8366
8367/**
8368 *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8369 *      @dev: device
8370 *      @addr: new address
8371 *      @extack: netlink extended ack
8372 */
8373int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8374                              struct netlink_ext_ack *extack)
8375{
8376        struct netdev_notifier_pre_changeaddr_info info = {
8377                .info.dev = dev,
8378                .info.extack = extack,
8379                .dev_addr = addr,
8380        };
8381        int rc;
8382
8383        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8384        return notifier_to_errno(rc);
8385}
8386EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8387
8388/**
8389 *      dev_set_mac_address - Change Media Access Control Address
8390 *      @dev: device
8391 *      @sa: new address
8392 *      @extack: netlink extended ack
8393 *
8394 *      Change the hardware (MAC) address of the device
8395 */
8396int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8397                        struct netlink_ext_ack *extack)
8398{
8399        const struct net_device_ops *ops = dev->netdev_ops;
8400        int err;
8401
8402        if (!ops->ndo_set_mac_address)
8403                return -EOPNOTSUPP;
8404        if (sa->sa_family != dev->type)
8405                return -EINVAL;
8406        if (!netif_device_present(dev))
8407                return -ENODEV;
8408        err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8409        if (err)
8410                return err;
8411        err = ops->ndo_set_mac_address(dev, sa);
8412        if (err)
8413                return err;
8414        dev->addr_assign_type = NET_ADDR_SET;
8415        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8416        add_device_randomness(dev->dev_addr, dev->addr_len);
8417        return 0;
8418}
8419EXPORT_SYMBOL(dev_set_mac_address);
8420
8421/**
8422 *      dev_change_carrier - Change device carrier
8423 *      @dev: device
8424 *      @new_carrier: new value
8425 *
8426 *      Change device carrier
8427 */
8428int dev_change_carrier(struct net_device *dev, bool new_carrier)
8429{
8430        const struct net_device_ops *ops = dev->netdev_ops;
8431
8432        if (!ops->ndo_change_carrier)
8433                return -EOPNOTSUPP;
8434        if (!netif_device_present(dev))
8435                return -ENODEV;
8436        return ops->ndo_change_carrier(dev, new_carrier);
8437}
8438EXPORT_SYMBOL(dev_change_carrier);
8439
8440/**
8441 *      dev_get_phys_port_id - Get device physical port ID
8442 *      @dev: device
8443 *      @ppid: port ID
8444 *
8445 *      Get device physical port ID
8446 */
8447int dev_get_phys_port_id(struct net_device *dev,
8448                         struct netdev_phys_item_id *ppid)
8449{
8450        const struct net_device_ops *ops = dev->netdev_ops;
8451
8452        if (!ops->ndo_get_phys_port_id)
8453                return -EOPNOTSUPP;
8454        return ops->ndo_get_phys_port_id(dev, ppid);
8455}
8456EXPORT_SYMBOL(dev_get_phys_port_id);
8457
8458/**
8459 *      dev_get_phys_port_name - Get device physical port name
8460 *      @dev: device
8461 *      @name: port name
8462 *      @len: limit of bytes to copy to name
8463 *
8464 *      Get device physical port name
8465 */
8466int dev_get_phys_port_name(struct net_device *dev,
8467                           char *name, size_t len)
8468{
8469        const struct net_device_ops *ops = dev->netdev_ops;
8470        int err;
8471
8472        if (ops->ndo_get_phys_port_name) {
8473                err = ops->ndo_get_phys_port_name(dev, name, len);
8474                if (err != -EOPNOTSUPP)
8475                        return err;
8476        }
8477        return devlink_compat_phys_port_name_get(dev, name, len);
8478}
8479EXPORT_SYMBOL(dev_get_phys_port_name);
8480
8481/**
8482 *      dev_get_port_parent_id - Get the device's port parent identifier
8483 *      @dev: network device
8484 *      @ppid: pointer to a storage for the port's parent identifier
8485 *      @recurse: allow/disallow recursion to lower devices
8486 *
8487 *      Get the devices's port parent identifier
8488 */
8489int dev_get_port_parent_id(struct net_device *dev,
8490                           struct netdev_phys_item_id *ppid,
8491                           bool recurse)
8492{
8493        const struct net_device_ops *ops = dev->netdev_ops;
8494        struct netdev_phys_item_id first = { };
8495        struct net_device *lower_dev;
8496        struct list_head *iter;
8497        int err;
8498
8499        if (ops->ndo_get_port_parent_id) {
8500                err = ops->ndo_get_port_parent_id(dev, ppid);
8501                if (err != -EOPNOTSUPP)
8502                        return err;
8503        }
8504
8505        err = devlink_compat_switch_id_get(dev, ppid);
8506        if (!err || err != -EOPNOTSUPP)
8507                return err;
8508
8509        if (!recurse)
8510                return -EOPNOTSUPP;
8511
8512        netdev_for_each_lower_dev(dev, lower_dev, iter) {
8513                err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8514                if (err)
8515                        break;
8516                if (!first.id_len)
8517                        first = *ppid;
8518                else if (memcmp(&first, ppid, sizeof(*ppid)))
8519                        return -ENODATA;
8520        }
8521
8522        return err;
8523}
8524EXPORT_SYMBOL(dev_get_port_parent_id);
8525
8526/**
8527 *      netdev_port_same_parent_id - Indicate if two network devices have
8528 *      the same port parent identifier
8529 *      @a: first network device
8530 *      @b: second network device
8531 */
8532bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8533{
8534        struct netdev_phys_item_id a_id = { };
8535        struct netdev_phys_item_id b_id = { };
8536
8537        if (dev_get_port_parent_id(a, &a_id, true) ||
8538            dev_get_port_parent_id(b, &b_id, true))
8539                return false;
8540
8541        return netdev_phys_item_id_same(&a_id, &b_id);
8542}
8543EXPORT_SYMBOL(netdev_port_same_parent_id);
8544
8545/**
8546 *      dev_change_proto_down - update protocol port state information
8547 *      @dev: device
8548 *      @proto_down: new value
8549 *
8550 *      This info can be used by switch drivers to set the phys state of the
8551 *      port.
8552 */
8553int dev_change_proto_down(struct net_device *dev, bool proto_down)
8554{
8555        const struct net_device_ops *ops = dev->netdev_ops;
8556
8557        if (!ops->ndo_change_proto_down)
8558                return -EOPNOTSUPP;
8559        if (!netif_device_present(dev))
8560                return -ENODEV;
8561        return ops->ndo_change_proto_down(dev, proto_down);
8562}
8563EXPORT_SYMBOL(dev_change_proto_down);
8564
8565/**
8566 *      dev_change_proto_down_generic - generic implementation for
8567 *      ndo_change_proto_down that sets carrier according to
8568 *      proto_down.
8569 *
8570 *      @dev: device
8571 *      @proto_down: new value
8572 */
8573int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8574{
8575        if (proto_down)
8576                netif_carrier_off(dev);
8577        else
8578                netif_carrier_on(dev);
8579        dev->proto_down = proto_down;
8580        return 0;
8581}
8582EXPORT_SYMBOL(dev_change_proto_down_generic);
8583
8584u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8585                    enum bpf_netdev_command cmd)
8586{
8587        struct netdev_bpf xdp;
8588
8589        if (!bpf_op)
8590                return 0;
8591
8592        memset(&xdp, 0, sizeof(xdp));
8593        xdp.command = cmd;
8594
8595        /* Query must always succeed. */
8596        WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
8597
8598        return xdp.prog_id;
8599}
8600
8601static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8602                           struct netlink_ext_ack *extack, u32 flags,
8603                           struct bpf_prog *prog)
8604{
8605        bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
8606        struct bpf_prog *prev_prog = NULL;
8607        struct netdev_bpf xdp;
8608        int err;
8609
8610        if (non_hw) {
8611                prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
8612                                                           XDP_QUERY_PROG));
8613                if (IS_ERR(prev_prog))
8614                        prev_prog = NULL;
8615        }
8616
8617        memset(&xdp, 0, sizeof(xdp));
8618        if (flags & XDP_FLAGS_HW_MODE)
8619                xdp.command = XDP_SETUP_PROG_HW;
8620        else
8621                xdp.command = XDP_SETUP_PROG;
8622        xdp.extack = extack;
8623        xdp.flags = flags;
8624        xdp.prog = prog;
8625
8626        err = bpf_op(dev, &xdp);
8627        if (!err && non_hw)
8628                bpf_prog_change_xdp(prev_prog, prog);
8629
8630        if (prev_prog)
8631                bpf_prog_put(prev_prog);
8632
8633        return err;
8634}
8635
8636static void dev_xdp_uninstall(struct net_device *dev)
8637{
8638        struct netdev_bpf xdp;
8639        bpf_op_t ndo_bpf;
8640
8641        /* Remove generic XDP */
8642        WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
8643
8644        /* Remove from the driver */
8645        ndo_bpf = dev->netdev_ops->ndo_bpf;
8646        if (!ndo_bpf)
8647                return;
8648
8649        memset(&xdp, 0, sizeof(xdp));
8650        xdp.command = XDP_QUERY_PROG;
8651        WARN_ON(ndo_bpf(dev, &xdp));
8652        if (xdp.prog_id)
8653                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8654                                        NULL));
8655
8656        /* Remove HW offload */
8657        memset(&xdp, 0, sizeof(xdp));
8658        xdp.command = XDP_QUERY_PROG_HW;
8659        if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8660                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8661                                        NULL));
8662}
8663
8664/**
8665 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
8666 *      @dev: device
8667 *      @extack: netlink extended ack
8668 *      @fd: new program fd or negative value to clear
8669 *      @expected_fd: old program fd that userspace expects to replace or clear
8670 *      @flags: xdp-related flags
8671 *
8672 *      Set or clear a bpf program for a device
8673 */
8674int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8675                      int fd, int expected_fd, u32 flags)
8676{
8677        const struct net_device_ops *ops = dev->netdev_ops;
8678        enum bpf_netdev_command query;
8679        u32 prog_id, expected_id = 0;
8680        bpf_op_t bpf_op, bpf_chk;
8681        struct bpf_prog *prog;
8682        bool offload;
8683        int err;
8684
8685        ASSERT_RTNL();
8686
8687        offload = flags & XDP_FLAGS_HW_MODE;
8688        query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8689
8690        bpf_op = bpf_chk = ops->ndo_bpf;
8691        if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
8692                NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
8693                return -EOPNOTSUPP;
8694        }
8695        if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8696                bpf_op = generic_xdp_install;
8697        if (bpf_op == bpf_chk)
8698                bpf_chk = generic_xdp_install;
8699
8700        prog_id = __dev_xdp_query(dev, bpf_op, query);
8701        if (flags & XDP_FLAGS_REPLACE) {
8702                if (expected_fd >= 0) {
8703                        prog = bpf_prog_get_type_dev(expected_fd,
8704                                                     BPF_PROG_TYPE_XDP,
8705                                                     bpf_op == ops->ndo_bpf);
8706                        if (IS_ERR(prog))
8707                                return PTR_ERR(prog);
8708                        expected_id = prog->aux->id;
8709                        bpf_prog_put(prog);
8710                }
8711
8712                if (prog_id != expected_id) {
8713                        NL_SET_ERR_MSG(extack, "Active program does not match expected");
8714                        return -EEXIST;
8715                }
8716        }
8717        if (fd >= 0) {
8718                if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
8719                        NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
8720                        return -EEXIST;
8721                }
8722
8723                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
8724                        NL_SET_ERR_MSG(extack, "XDP program already attached");
8725                        return -EBUSY;
8726                }
8727
8728                prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8729                                             bpf_op == ops->ndo_bpf);
8730                if (IS_ERR(prog))
8731                        return PTR_ERR(prog);
8732
8733                if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
8734                        NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8735                        bpf_prog_put(prog);
8736                        return -EINVAL;
8737                }
8738
8739                /* prog->aux->id may be 0 for orphaned device-bound progs */
8740                if (prog->aux->id && prog->aux->id == prog_id) {
8741                        bpf_prog_put(prog);
8742                        return 0;
8743                }
8744        } else {
8745                if (!prog_id)
8746                        return 0;
8747                prog = NULL;
8748        }
8749
8750        err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8751        if (err < 0 && prog)
8752                bpf_prog_put(prog);
8753
8754        return err;
8755}
8756
8757/**
8758 *      dev_new_index   -       allocate an ifindex
8759 *      @net: the applicable net namespace
8760 *
8761 *      Returns a suitable unique value for a new device interface
8762 *      number.  The caller must hold the rtnl semaphore or the
8763 *      dev_base_lock to be sure it remains unique.
8764 */
8765static int dev_new_index(struct net *net)
8766{
8767        int ifindex = net->ifindex;
8768
8769        for (;;) {
8770                if (++ifindex <= 0)
8771                        ifindex = 1;
8772                if (!__dev_get_by_index(net, ifindex))
8773                        return net->ifindex = ifindex;
8774        }
8775}
8776
8777/* Delayed registration/unregisteration */
8778static LIST_HEAD(net_todo_list);
8779DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
8780
8781static void net_set_todo(struct net_device *dev)
8782{
8783        list_add_tail(&dev->todo_list, &net_todo_list);
8784        dev_net(dev)->dev_unreg_count++;
8785}
8786
8787static void rollback_registered_many(struct list_head *head)
8788{
8789        struct net_device *dev, *tmp;
8790        LIST_HEAD(close_head);
8791
8792        BUG_ON(dev_boot_phase);
8793        ASSERT_RTNL();
8794
8795        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8796                /* Some devices call without registering
8797                 * for initialization unwind. Remove those
8798                 * devices and proceed with the remaining.
8799                 */
8800                if (dev->reg_state == NETREG_UNINITIALIZED) {
8801                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8802                                 dev->name, dev);
8803
8804                        WARN_ON(1);
8805                        list_del(&dev->unreg_list);
8806                        continue;
8807                }
8808                dev->dismantle = true;
8809                BUG_ON(dev->reg_state != NETREG_REGISTERED);
8810        }
8811
8812        /* If device is running, close it first. */
8813        list_for_each_entry(dev, head, unreg_list)
8814                list_add_tail(&dev->close_list, &close_head);
8815        dev_close_many(&close_head, true);
8816
8817        list_for_each_entry(dev, head, unreg_list) {
8818                /* And unlink it from device chain. */
8819                unlist_netdevice(dev);
8820
8821                dev->reg_state = NETREG_UNREGISTERING;
8822        }
8823        flush_all_backlogs();
8824
8825        synchronize_net();
8826
8827        list_for_each_entry(dev, head, unreg_list) {
8828                struct sk_buff *skb = NULL;
8829
8830                /* Shutdown queueing discipline. */
8831                dev_shutdown(dev);
8832
8833                dev_xdp_uninstall(dev);
8834
8835                /* Notify protocols, that we are about to destroy
8836                 * this device. They should clean all the things.
8837                 */
8838                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8839
8840                if (!dev->rtnl_link_ops ||
8841                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8842                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8843                                                     GFP_KERNEL, NULL, 0);
8844
8845                /*
8846                 *      Flush the unicast and multicast chains
8847                 */
8848                dev_uc_flush(dev);
8849                dev_mc_flush(dev);
8850
8851                netdev_name_node_alt_flush(dev);
8852                netdev_name_node_free(dev->name_node);
8853
8854                if (dev->netdev_ops->ndo_uninit)
8855                        dev->netdev_ops->ndo_uninit(dev);
8856
8857                if (skb)
8858                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8859
8860                /* Notifier chain MUST detach us all upper devices. */
8861                WARN_ON(netdev_has_any_upper_dev(dev));
8862                WARN_ON(netdev_has_any_lower_dev(dev));
8863
8864                /* Remove entries from kobject tree */
8865                netdev_unregister_kobject(dev);
8866#ifdef CONFIG_XPS
8867                /* Remove XPS queueing entries */
8868                netif_reset_xps_queues_gt(dev, 0);
8869#endif
8870        }
8871
8872        synchronize_net();
8873
8874        list_for_each_entry(dev, head, unreg_list)
8875                dev_put(dev);
8876}
8877
8878static void rollback_registered(struct net_device *dev)
8879{
8880        LIST_HEAD(single);
8881
8882        list_add(&dev->unreg_list, &single);
8883        rollback_registered_many(&single);
8884        list_del(&single);
8885}
8886
8887static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
8888        struct net_device *upper, netdev_features_t features)
8889{
8890        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8891        netdev_features_t feature;
8892        int feature_bit;
8893
8894        for_each_netdev_feature(upper_disables, feature_bit) {
8895                feature = __NETIF_F_BIT(feature_bit);
8896                if (!(upper->wanted_features & feature)
8897                    && (features & feature)) {
8898                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
8899                                   &feature, upper->name);
8900                        features &= ~feature;
8901                }
8902        }
8903
8904        return features;
8905}
8906
8907static void netdev_sync_lower_features(struct net_device *upper,
8908        struct net_device *lower, netdev_features_t features)
8909{
8910        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8911        netdev_features_t feature;
8912        int feature_bit;
8913
8914        for_each_netdev_feature(upper_disables, feature_bit) {
8915                feature = __NETIF_F_BIT(feature_bit);
8916                if (!(features & feature) && (lower->features & feature)) {
8917                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
8918                                   &feature, lower->name);
8919                        lower->wanted_features &= ~feature;
8920                        __netdev_update_features(lower);
8921
8922                        if (unlikely(lower->features & feature))
8923                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
8924                                            &feature, lower->name);
8925                        else
8926                                netdev_features_change(lower);
8927                }
8928        }
8929}
8930
8931static netdev_features_t netdev_fix_features(struct net_device *dev,
8932        netdev_features_t features)
8933{
8934        /* Fix illegal checksum combinations */
8935        if ((features & NETIF_F_HW_CSUM) &&
8936            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
8937                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
8938                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
8939        }
8940
8941        /* TSO requires that SG is present as well. */
8942        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
8943                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
8944                features &= ~NETIF_F_ALL_TSO;
8945        }
8946
8947        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
8948                                        !(features & NETIF_F_IP_CSUM)) {
8949                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
8950                features &= ~NETIF_F_TSO;
8951                features &= ~NETIF_F_TSO_ECN;
8952        }
8953
8954        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
8955                                         !(features & NETIF_F_IPV6_CSUM)) {
8956                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
8957                features &= ~NETIF_F_TSO6;
8958        }
8959
8960        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
8961        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
8962                features &= ~NETIF_F_TSO_MANGLEID;
8963
8964        /* TSO ECN requires that TSO is present as well. */
8965        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
8966                features &= ~NETIF_F_TSO_ECN;
8967
8968        /* Software GSO depends on SG. */
8969        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
8970                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
8971                features &= ~NETIF_F_GSO;
8972        }
8973
8974        /* GSO partial features require GSO partial be set */
8975        if ((features & dev->gso_partial_features) &&
8976            !(features & NETIF_F_GSO_PARTIAL)) {
8977                netdev_dbg(dev,
8978                           "Dropping partially supported GSO features since no GSO partial.\n");
8979                features &= ~dev->gso_partial_features;
8980        }
8981
8982        if (!(features & NETIF_F_RXCSUM)) {
8983                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
8984                 * successfully merged by hardware must also have the
8985                 * checksum verified by hardware.  If the user does not
8986                 * want to enable RXCSUM, logically, we should disable GRO_HW.
8987                 */
8988                if (features & NETIF_F_GRO_HW) {
8989                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
8990                        features &= ~NETIF_F_GRO_HW;
8991                }
8992        }
8993
8994        /* LRO/HW-GRO features cannot be combined with RX-FCS */
8995        if (features & NETIF_F_RXFCS) {
8996                if (features & NETIF_F_LRO) {
8997                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
8998                        features &= ~NETIF_F_LRO;
8999                }
9000

9001                if (features & NETIF_F_GRO_HW) {
9002                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9003                        features &= ~NETIF_F_GRO_HW;
9004                }
9005        }
9006
9007        return features;
9008}
9009
9010int __netdev_update_features(struct net_device *dev)
9011{
9012        struct net_device *upper, *lower;
9013        netdev_features_t features;
9014        struct list_head *iter;
9015        int err = -1;
9016
9017        ASSERT_RTNL();
9018
9019        features = netdev_get_wanted_features(dev);
9020
9021        if (dev->netdev_ops->ndo_fix_features)
9022                features = dev->netdev_ops->ndo_fix_features(dev, features);
9023
9024        /* driver might be less strict about feature dependencies */
9025        features = netdev_fix_features(dev, features);
9026
9027        /* some features can't be enabled if they're off an an upper device */
9028        netdev_for_each_upper_dev_rcu(dev, upper, iter)
9029                features = netdev_sync_upper_features(dev, upper, features);
9030
9031        if (dev->features == features)
9032                goto sync_lower;
9033
9034        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9035                &dev->features, &features);
9036
9037        if (dev->netdev_ops->ndo_set_features)
9038                err = dev->netdev_ops->ndo_set_features(dev, features);
9039        else
9040                err = 0;
9041
9042        if (unlikely(err < 0)) {
9043                netdev_err(dev,
9044                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
9045                        err, &features, &dev->features);
9046                /* return non-0 since some features might have changed and
9047                 * it's better to fire a spurious notification than miss it
9048                 */
9049                return -1;
9050        }
9051
9052sync_lower:
9053        /* some features must be disabled on lower devices when disabled
9054         * on an upper device (think: bonding master or bridge)
9055         */
9056        netdev_for_each_lower_dev(dev, lower, iter)
9057                netdev_sync_lower_features(dev, lower, features);
9058
9059        if (!err) {
9060                netdev_features_t diff = features ^ dev->features;
9061
9062                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9063                        /* udp_tunnel_{get,drop}_rx_info both need
9064                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9065                         * device, or they won't do anything.
9066                         * Thus we need to update dev->features
9067                         * *before* calling udp_tunnel_get_rx_info,
9068                         * but *after* calling udp_tunnel_drop_rx_info.
9069                         */
9070                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9071                                dev->features = features;
9072                                udp_tunnel_get_rx_info(dev);
9073                        } else {
9074                                udp_tunnel_drop_rx_info(dev);
9075                        }
9076                }
9077
9078                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9079                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9080                                dev->features = features;
9081                                err |= vlan_get_rx_ctag_filter_info(dev);
9082                        } else {
9083                                vlan_drop_rx_ctag_filter_info(dev);
9084                        }
9085                }
9086
9087                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9088                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9089                                dev->features = features;
9090                                err |= vlan_get_rx_stag_filter_info(dev);
9091                        } else {
9092                                vlan_drop_rx_stag_filter_info(dev);
9093                        }
9094                }
9095
9096                dev->features = features;
9097        }
9098
9099        return err < 0 ? 0 : 1;
9100}
9101
9102/**
9103 *      netdev_update_features - recalculate device features
9104 *      @dev: the device to check
9105 *
9106 *      Recalculate dev->features set and send notifications if it
9107 *      has changed. Should be called after driver or hardware dependent
9108 *      conditions might have changed that influence the features.
9109 */
9110void netdev_update_features(struct net_device *dev)
9111{
9112        if (__netdev_update_features(dev))
9113                netdev_features_change(dev);
9114}
9115EXPORT_SYMBOL(netdev_update_features);
9116
9117/**
9118 *      netdev_change_features - recalculate device features
9119 *      @dev: the device to check
9120 *
9121 *      Recalculate dev->features set and send notifications even
9122 *      if they have not changed. Should be called instead of
9123 *      netdev_update_features() if also dev->vlan_features might
9124 *      have changed to allow the changes to be propagated to stacked
9125 *      VLAN devices.
9126 */
9127void netdev_change_features(struct net_device *dev)
9128{
9129        __netdev_update_features(dev);
9130        netdev_features_change(dev);
9131}
9132EXPORT_SYMBOL(netdev_change_features);
9133
9134/**
9135 *      netif_stacked_transfer_operstate -      transfer operstate
9136 *      @rootdev: the root or lower level device to transfer state from
9137 *      @dev: the device to transfer operstate to
9138 *
9139 *      Transfer operational state from root to device. This is normally
9140 *      called when a stacking relationship exists between the root
9141 *      device and the device(a leaf device).
9142 */
9143void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9144                                        struct net_device *dev)
9145{
9146        if (rootdev->operstate == IF_OPER_DORMANT)
9147                netif_dormant_on(dev);
9148        else
9149                netif_dormant_off(dev);
9150
9151        if (netif_carrier_ok(rootdev))
9152                netif_carrier_on(dev);
9153        else
9154                netif_carrier_off(dev);
9155}
9156EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9157
9158static int netif_alloc_rx_queues(struct net_device *dev)
9159{
9160        unsigned int i, count = dev->num_rx_queues;
9161        struct netdev_rx_queue *rx;
9162        size_t sz = count * sizeof(*rx);
9163        int err = 0;
9164
9165        BUG_ON(count < 1);
9166
9167        rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9168        if (!rx)
9169                return -ENOMEM;
9170
9171        dev->_rx = rx;
9172
9173        for (i = 0; i < count; i++) {
9174                rx[i].dev = dev;
9175
9176                /* XDP RX-queue setup */
9177                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
9178                if (err < 0)
9179                        goto err_rxq_info;
9180        }
9181        return 0;
9182
9183err_rxq_info:
9184        /* Rollback successful reg's and free other resources */
9185        while (i--)
9186                xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9187        kvfree(dev->_rx);
9188        dev->_rx = NULL;
9189        return err;
9190}
9191
9192static void netif_free_rx_queues(struct net_device *dev)
9193{
9194        unsigned int i, count = dev->num_rx_queues;
9195
9196        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9197        if (!dev->_rx)
9198                return;
9199
9200        for (i = 0; i < count; i++)
9201                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9202
9203        kvfree(dev->_rx);
9204}
9205
9206static void netdev_init_one_queue(struct net_device *dev,
9207                                  struct netdev_queue *queue, void *_unused)
9208{
9209        /* Initialize queue lock */
9210        spin_lock_init(&queue->_xmit_lock);
9211        lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
9212        queue->xmit_lock_owner = -1;
9213        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9214        queue->dev = dev;
9215#ifdef CONFIG_BQL
9216        dql_init(&queue->dql, HZ);
9217#endif
9218}
9219
9220static void netif_free_tx_queues(struct net_device *dev)
9221{
9222        kvfree(dev->_tx);
9223}
9224
9225static int netif_alloc_netdev_queues(struct net_device *dev)
9226{
9227        unsigned int count = dev->num_tx_queues;
9228        struct netdev_queue *tx;
9229        size_t sz = count * sizeof(*tx);
9230
9231        if (count < 1 || count > 0xffff)
9232                return -EINVAL;
9233
9234        tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9235        if (!tx)
9236                return -ENOMEM;
9237
9238        dev->_tx = tx;
9239
9240        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9241        spin_lock_init(&dev->tx_global_lock);
9242
9243        return 0;
9244}
9245
9246void netif_tx_stop_all_queues(struct net_device *dev)
9247{
9248        unsigned int i;
9249
9250        for (i = 0; i < dev->num_tx_queues; i++) {
9251                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9252
9253                netif_tx_stop_queue(txq);
9254        }
9255}
9256EXPORT_SYMBOL(netif_tx_stop_all_queues);
9257
9258static void netdev_register_lockdep_key(struct net_device *dev)
9259{
9260        lockdep_register_key(&dev->qdisc_tx_busylock_key);
9261        lockdep_register_key(&dev->qdisc_running_key);
9262        lockdep_register_key(&dev->qdisc_xmit_lock_key);
9263        lockdep_register_key(&dev->addr_list_lock_key);
9264}
9265
9266static void netdev_unregister_lockdep_key(struct net_device *dev)
9267{
9268        lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
9269        lockdep_unregister_key(&dev->qdisc_running_key);
9270        lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
9271        lockdep_unregister_key(&dev->addr_list_lock_key);
9272}
9273
9274void netdev_update_lockdep_key(struct net_device *dev)
9275{
9276        lockdep_unregister_key(&dev->addr_list_lock_key);
9277        lockdep_register_key(&dev->addr_list_lock_key);
9278
9279        lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
9280}
9281EXPORT_SYMBOL(netdev_update_lockdep_key);
9282
9283/**
9284 *      register_netdevice      - register a network device
9285 *      @dev: device to register
9286 *
9287 *      Take a completed network device structure and add it to the kernel
9288 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9289 *      chain. 0 is returned on success. A negative errno code is returned
9290 *      on a failure to set up the device, or if the name is a duplicate.
9291 *
9292 *      Callers must hold the rtnl semaphore. You may want
9293 *      register_netdev() instead of this.
9294 *
9295 *      BUGS:
9296 *      The locking appears insufficient to guarantee two parallel registers
9297 *      will not get the same name.
9298 */
9299
9300int register_netdevice(struct net_device *dev)
9301{
9302        int ret;
9303        struct net *net = dev_net(dev);
9304
9305        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9306                     NETDEV_FEATURE_COUNT);
9307        BUG_ON(dev_boot_phase);
9308        ASSERT_RTNL();
9309
9310        might_sleep();
9311
9312        /* When net_device's are persistent, this will be fatal. */
9313        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9314        BUG_ON(!net);
9315
9316        ret = ethtool_check_ops(dev->ethtool_ops);
9317        if (ret)
9318                return ret;
9319
9320        spin_lock_init(&dev->addr_list_lock);
9321        lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
9322
9323        ret = dev_get_valid_name(net, dev, dev->name);
9324        if (ret < 0)
9325                goto out;
9326
9327        ret = -ENOMEM;
9328        dev->name_node = netdev_name_node_head_alloc(dev);
9329        if (!dev->name_node)
9330                goto out;
9331
9332        /* Init, if this function is available */
9333        if (dev->netdev_ops->ndo_init) {
9334                ret = dev->netdev_ops->ndo_init(dev);
9335                if (ret) {
9336                        if (ret > 0)
9337                                ret = -EIO;
9338                        goto err_free_name;
9339                }
9340        }
9341
9342        if (((dev->hw_features | dev->features) &
9343             NETIF_F_HW_VLAN_CTAG_FILTER) &&
9344            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9345             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9346                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9347                ret = -EINVAL;
9348                goto err_uninit;
9349        }
9350
9351        ret = -EBUSY;
9352        if (!dev->ifindex)
9353                dev->ifindex = dev_new_index(net);
9354        else if (__dev_get_by_index(net, dev->ifindex))
9355                goto err_uninit;
9356
9357        /* Transfer changeable features to wanted_features and enable
9358         * software offloads (GSO and GRO).
9359         */
9360        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9361        dev->features |= NETIF_F_SOFT_FEATURES;
9362
9363        if (dev->netdev_ops->ndo_udp_tunnel_add) {
9364                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9365                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9366        }
9367
9368        dev->wanted_features = dev->features & dev->hw_features;
9369
9370        if (!(dev->flags & IFF_LOOPBACK))
9371                dev->hw_features |= NETIF_F_NOCACHE_COPY;
9372
9373        /* If IPv4 TCP segmentation offload is supported we should also
9374         * allow the device to enable segmenting the frame with the option
9375         * of ignoring a static IP ID value.  This doesn't enable the
9376         * feature itself but allows the user to enable it later.
9377         */
9378        if (dev->hw_features & NETIF_F_TSO)
9379                dev->hw_features |= NETIF_F_TSO_MANGLEID;
9380        if (dev->vlan_features & NETIF_F_TSO)
9381                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
9382        if (dev->mpls_features & NETIF_F_TSO)
9383                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
9384        if (dev->hw_enc_features & NETIF_F_TSO)
9385                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9386
9387        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9388         */
9389        dev->vlan_features |= NETIF_F_HIGHDMA;
9390
9391        /* Make NETIF_F_SG inheritable to tunnel devices.
9392         */
9393        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9394
9395        /* Make NETIF_F_SG inheritable to MPLS.
9396         */
9397        dev->mpls_features |= NETIF_F_SG;
9398
9399        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
9400        ret = notifier_to_errno(ret);
9401        if (ret)
9402                goto err_uninit;
9403
9404        ret = netdev_register_kobject(dev);
9405        if (ret) {
9406                dev->reg_state = NETREG_UNREGISTERED;
9407                goto err_uninit;
9408        }
9409        dev->reg_state = NETREG_REGISTERED;
9410
9411        __netdev_update_features(dev);
9412
9413        /*
9414         *      Default initial state at registry is that the
9415         *      device is present.
9416         */
9417
9418        set_bit(__LINK_STATE_PRESENT, &dev->state);
9419
9420        linkwatch_init_dev(dev);
9421
9422        dev_init_scheduler(dev);
9423        dev_hold(dev);
9424        list_netdevice(dev);
9425        add_device_randomness(dev->dev_addr, dev->addr_len);
9426
9427        /* If the device has permanent device address, driver should
9428         * set dev_addr and also addr_assign_type should be set to
9429         * NET_ADDR_PERM (default value).
9430         */
9431        if (dev->addr_assign_type == NET_ADDR_PERM)
9432                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
9433
9434        /* Notify protocols, that a new device appeared. */
9435        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9436        ret = notifier_to_errno(ret);
9437        if (ret) {
9438                rollback_registered(dev);
9439                rcu_barrier();
9440
9441                dev->reg_state = NETREG_UNREGISTERED;
9442        }
9443        /*
9444         *      Prevent userspace races by waiting until the network
9445         *      device is fully setup before sending notifications.
9446         */
9447        if (!dev->rtnl_link_ops ||
9448            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9449                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9450
9451out:
9452        return ret;
9453
9454err_uninit:
9455        if (dev->netdev_ops->ndo_uninit)
9456                dev->netdev_ops->ndo_uninit(dev);
9457        if (dev->priv_destructor)
9458                dev->priv_destructor(dev);
9459err_free_name:
9460        netdev_name_node_free(dev->name_node);
9461        goto out;
9462}
9463EXPORT_SYMBOL(register_netdevice);
9464
9465/**
9466 *      init_dummy_netdev       - init a dummy network device for NAPI
9467 *      @dev: device to init
9468 *
9469 *      This takes a network device structure and initialize the minimum
9470 *      amount of fields so it can be used to schedule NAPI polls without
9471 *      registering a full blown interface. This is to be used by drivers
9472 *      that need to tie several hardware interfaces to a single NAPI
9473 *      poll scheduler due to HW limitations.
9474 */
9475int init_dummy_netdev(struct net_device *dev)
9476{
9477        /* Clear everything. Note we don't initialize spinlocks
9478         * are they aren't supposed to be taken by any of the
9479         * NAPI code and this dummy netdev is supposed to be
9480         * only ever used for NAPI polls
9481         */
9482        memset(dev, 0, sizeof(struct net_device));
9483
9484        /* make sure we BUG if trying to hit standard
9485         * register/unregister code path
9486         */
9487        dev->reg_state = NETREG_DUMMY;
9488
9489        /* NAPI wants this */
9490        INIT_LIST_HEAD(&dev->napi_list);
9491
9492        /* a dummy interface is started by default */
9493        set_bit(__LINK_STATE_PRESENT, &dev->state);
9494        set_bit(__LINK_STATE_START, &dev->state);
9495
9496        /* napi_busy_loop stats accounting wants this */
9497        dev_net_set(dev, &init_net);
9498
9499        /* Note : We dont allocate pcpu_refcnt for dummy devices,
9500         * because users of this 'device' dont need to change
9501         * its refcount.
9502         */
9503
9504        return 0;
9505}
9506EXPORT_SYMBOL_GPL(init_dummy_netdev);
9507
9508
9509/**
9510 *      register_netdev - register a network device
9511 *      @dev: device to register
9512 *
9513 *      Take a completed network device structure and add it to the kernel
9514 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9515 *      chain. 0 is returned on success. A negative errno code is returned
9516 *      on a failure to set up the device, or if the name is a duplicate.
9517 *
9518 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
9519 *      and expands the device name if you passed a format string to
9520 *      alloc_netdev.
9521 */
9522int register_netdev(struct net_device *dev)
9523{
9524        int err;
9525
9526        if (rtnl_lock_killable())
9527                return -EINTR;
9528        err = register_netdevice(dev);
9529        rtnl_unlock();
9530        return err;
9531}
9532EXPORT_SYMBOL(register_netdev);
9533
9534int netdev_refcnt_read(const struct net_device *dev)
9535{
9536        int i, refcnt = 0;
9537
9538        for_each_possible_cpu(i)
9539                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
9540        return refcnt;
9541}
9542EXPORT_SYMBOL(netdev_refcnt_read);
9543
9544/**
9545 * netdev_wait_allrefs - wait until all references are gone.
9546 * @dev: target net_device
9547 *
9548 * This is called when unregistering network devices.
9549 *
9550 * Any protocol or device that holds a reference should register
9551 * for netdevice notification, and cleanup and put back the
9552 * reference if they receive an UNREGISTER event.
9553 * We can get stuck here if buggy protocols don't correctly
9554 * call dev_put.
9555 */
9556static void netdev_wait_allrefs(struct net_device *dev)
9557{
9558        unsigned long rebroadcast_time, warning_time;
9559        int refcnt;
9560
9561        linkwatch_forget_dev(dev);
9562
9563        rebroadcast_time = warning_time = jiffies;
9564        refcnt = netdev_refcnt_read(dev);
9565
9566        while (refcnt != 0) {
9567                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
9568                        rtnl_lock();
9569
9570                        /* Rebroadcast unregister notification */
9571                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9572
9573                        __rtnl_unlock();
9574                        rcu_barrier();
9575                        rtnl_lock();
9576
9577                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
9578                                     &dev->state)) {
9579                                /* We must not have linkwatch events
9580                                 * pending on unregister. If this
9581                                 * happens, we simply run the queue
9582                                 * unscheduled, resulting in a noop
9583                                 * for this device.
9584                                 */
9585                                linkwatch_run_queue();
9586                        }
9587
9588                        __rtnl_unlock();
9589
9590                        rebroadcast_time = jiffies;
9591                }
9592
9593                msleep(250);
9594
9595                refcnt = netdev_refcnt_read(dev);
9596
9597                if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
9598                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
9599                                 dev->name, refcnt);
9600                        warning_time = jiffies;
9601                }
9602        }
9603}
9604
9605/* The sequence is:
9606 *
9607 *      rtnl_lock();
9608 *      ...
9609 *      register_netdevice(x1);
9610 *      register_netdevice(x2);
9611 *      ...
9612 *      unregister_netdevice(y1);
9613 *      unregister_netdevice(y2);
9614 *      ...
9615 *      rtnl_unlock();
9616 *      free_netdev(y1);
9617 *      free_netdev(y2);
9618 *
9619 * We are invoked by rtnl_unlock().
9620 * This allows us to deal with problems:
9621 * 1) We can delete sysfs objects which invoke hotplug
9622 *    without deadlocking with linkwatch via keventd.
9623 * 2) Since we run with the RTNL semaphore not held, we can sleep
9624 *    safely in order to wait for the netdev refcnt to drop to zero.
9625 *
9626 * We must not return until all unregister events added during
9627 * the interval the lock was held have been completed.
9628 */
9629void netdev_run_todo(void)
9630{
9631        struct list_head list;
9632
9633        /* Snapshot list, allow later requests */
9634        list_replace_init(&net_todo_list, &list);
9635
9636        __rtnl_unlock();
9637
9638
9639        /* Wait for rcu callbacks to finish before next phase */
9640        if (!list_empty(&list))
9641                rcu_barrier();
9642
9643        while (!list_empty(&list)) {
9644                struct net_device *dev
9645                        = list_first_entry(&list, struct net_device, todo_list);
9646                list_del(&dev->todo_list);
9647
9648                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
9649                        pr_err("network todo '%s' but state %d\n",
9650                               dev->name, dev->reg_state);
9651                        dump_stack();
9652                        continue;
9653                }
9654
9655                dev->reg_state = NETREG_UNREGISTERED;
9656
9657                netdev_wait_allrefs(dev);
9658
9659                /* paranoia */
9660                BUG_ON(netdev_refcnt_read(dev));
9661                BUG_ON(!list_empty(&dev->ptype_all));
9662                BUG_ON(!list_empty(&dev->ptype_specific));
9663                WARN_ON(rcu_access_pointer(dev->ip_ptr));
9664                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
9665#if IS_ENABLED(CONFIG_DECNET)
9666                WARN_ON(dev->dn_ptr);
9667#endif
9668                if (dev->priv_destructor)
9669                        dev->priv_destructor(dev);
9670                if (dev->needs_free_netdev)
9671                        free_netdev(dev);
9672
9673                /* Report a network device has been unregistered */
9674                rtnl_lock();
9675                dev_net(dev)->dev_unreg_count--;
9676                __rtnl_unlock();
9677                wake_up(&netdev_unregistering_wq);
9678
9679                /* Free network device */
9680                kobject_put(&dev->dev.kobj);
9681        }
9682}
9683
9684/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
9685 * all the same fields in the same order as net_device_stats, with only
9686 * the type differing, but rtnl_link_stats64 may have additional fields
9687 * at the end for newer counters.
9688 */
9689void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
9690                             const struct net_device_stats *netdev_stats)
9691{
9692#if BITS_PER_LONG == 64
9693        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
9694        memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
9695        /* zero out counters that only exist in rtnl_link_stats64 */
9696        memset((char *)stats64 + sizeof(*netdev_stats), 0,
9697               sizeof(*stats64) - sizeof(*netdev_stats));
9698#else
9699        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
9700        const unsigned long *src = (const unsigned long *)netdev_stats;
9701        u64 *dst = (u64 *)stats64;
9702
9703        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
9704        for (i = 0; i < n; i++)
9705                dst[i] = src[i];
9706        /* zero out counters that only exist in rtnl_link_stats64 */
9707        memset((char *)stats64 + n * sizeof(u64), 0,
9708               sizeof(*stats64) - n * sizeof(u64));
9709#endif
9710}
9711EXPORT_SYMBOL(netdev_stats_to_stats64);
9712
9713/**
9714 *      dev_get_stats   - get network device statistics
9715 *      @dev: device to get statistics from
9716 *      @storage: place to store stats
9717 *
9718 *      Get network statistics from device. Return @storage.
9719 *      The device driver may provide its own method by setting
9720 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
9721 *      otherwise the internal statistics structure is used.
9722 */
9723struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
9724                                        struct rtnl_link_stats64 *storage)
9725{
9726        const struct net_device_ops *ops = dev->netdev_ops;
9727
9728        if (ops->ndo_get_stats64) {
9729                memset(storage, 0, sizeof(*storage));
9730                ops->ndo_get_stats64(dev, storage);
9731        } else if (ops->ndo_get_stats) {
9732                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
9733        } else {
9734                netdev_stats_to_stats64(storage, &dev->stats);
9735        }
9736        storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
9737        storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
9738        storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
9739        return storage;
9740}
9741EXPORT_SYMBOL(dev_get_stats);
9742
9743struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
9744{
9745        struct netdev_queue *queue = dev_ingress_queue(dev);
9746
9747#ifdef CONFIG_NET_CLS_ACT
9748        if (queue)
9749                return queue;
9750        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
9751        if (!queue)
9752                return NULL;
9753        netdev_init_one_queue(dev, queue, NULL);
9754        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
9755        queue->qdisc_sleeping = &noop_qdisc;
9756        rcu_assign_pointer(dev->ingress_queue, queue);
9757#endif
9758        return queue;
9759}
9760
9761static const struct ethtool_ops default_ethtool_ops;
9762
9763void netdev_set_default_ethtool_ops(struct net_device *dev,
9764                                    const struct ethtool_ops *ops)
9765{
9766        if (dev->ethtool_ops == &default_ethtool_ops)
9767                dev->ethtool_ops = ops;
9768}
9769EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
9770
9771void netdev_freemem(struct net_device *dev)
9772{
9773        char *addr = (char *)dev - dev->padded;
9774
9775        kvfree(addr);
9776}
9777
9778/**
9779 * alloc_netdev_mqs - allocate network device
9780 * @sizeof_priv: size of private data to allocate space for
9781 * @name: device name format string
9782 * @name_assign_type: origin of device name
9783 * @setup: callback to initialize device
9784 * @txqs: the number of TX subqueues to allocate
9785 * @rxqs: the number of RX subqueues to allocate
9786 *
9787 * Allocates a struct net_device with private data area for driver use
9788 * and performs basic initialization.  Also allocates subqueue structs
9789 * for each queue on the device.
9790 */
9791struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
9792                unsigned char name_assign_type,
9793                void (*setup)(struct net_device *),
9794                unsigned int txqs, unsigned int rxqs)
9795{
9796        struct net_device *dev;
9797        unsigned int alloc_size;
9798        struct net_device *p;
9799
9800        BUG_ON(strlen(name) >= sizeof(dev->name));
9801
9802        if (txqs < 1) {
9803                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
9804                return NULL;
9805        }
9806
9807        if (rxqs < 1) {
9808                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
9809                return NULL;
9810        }
9811
9812        alloc_size = sizeof(struct net_device);
9813        if (sizeof_priv) {
9814                /* ensure 32-byte alignment of private area */
9815                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
9816                alloc_size += sizeof_priv;
9817        }
9818        /* ensure 32-byte alignment of whole construct */
9819        alloc_size += NETDEV_ALIGN - 1;
9820
9821        p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9822        if (!p)
9823                return NULL;
9824
9825        dev = PTR_ALIGN(p, NETDEV_ALIGN);
9826        dev->padded = (char *)dev - (char *)p;
9827
9828        dev->pcpu_refcnt = alloc_percpu(int);
9829        if (!dev->pcpu_refcnt)
9830                goto free_dev;
9831
9832        if (dev_addr_init(dev))
9833                goto free_pcpu;
9834
9835        dev_mc_init(dev);
9836        dev_uc_init(dev);
9837
9838        dev_net_set(dev, &init_net);
9839
9840        netdev_register_lockdep_key(dev);
9841
9842        dev->gso_max_size = GSO_MAX_SIZE;
9843        dev->gso_max_segs = GSO_MAX_SEGS;
9844        dev->upper_level = 1;
9845        dev->lower_level = 1;
9846
9847        INIT_LIST_HEAD(&dev->napi_list);
9848        INIT_LIST_HEAD(&dev->unreg_list);
9849        INIT_LIST_HEAD(&dev->close_list);
9850        INIT_LIST_HEAD(&dev->link_watch_list);
9851        INIT_LIST_HEAD(&dev->adj_list.upper);
9852        INIT_LIST_HEAD(&dev->adj_list.lower);
9853        INIT_LIST_HEAD(&dev->ptype_all);
9854        INIT_LIST_HEAD(&dev->ptype_specific);
9855        INIT_LIST_HEAD(&dev->net_notifier_list);
9856#ifdef CONFIG_NET_SCHED
9857        hash_init(dev->qdisc_hash);
9858#endif
9859        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
9860        setup(dev);
9861
9862        if (!dev->tx_queue_len) {
9863                dev->priv_flags |= IFF_NO_QUEUE;
9864                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
9865        }
9866
9867        dev->num_tx_queues = txqs;
9868        dev->real_num_tx_queues = txqs;
9869        if (netif_alloc_netdev_queues(dev))
9870                goto free_all;
9871
9872        dev->num_rx_queues = rxqs;
9873        dev->real_num_rx_queues = rxqs;
9874        if (netif_alloc_rx_queues(dev))
9875                goto free_all;
9876
9877        strcpy(dev->name, name);
9878        dev->name_assign_type = name_assign_type;
9879        dev->group = INIT_NETDEV_GROUP;
9880        if (!dev->ethtool_ops)
9881                dev->ethtool_ops = &default_ethtool_ops;
9882
9883        nf_hook_ingress_init(dev);
9884
9885        return dev;
9886
9887free_all:
9888        free_netdev(dev);
9889        return NULL;
9890
9891free_pcpu:
9892        free_percpu(dev->pcpu_refcnt);
9893free_dev:
9894        netdev_freemem(dev);
9895        return NULL;
9896}
9897EXPORT_SYMBOL(alloc_netdev_mqs);
9898
9899/**
9900 * free_netdev - free network device
9901 * @dev: device
9902 *
9903 * This function does the last stage of destroying an allocated device
9904 * interface. The reference to the device object is released. If this
9905 * is the last reference then it will be freed.Must be called in process
9906 * context.
9907 */
9908void free_netdev(struct net_device *dev)
9909{
9910        struct napi_struct *p, *n;
9911
9912        might_sleep();
9913        netif_free_tx_queues(dev);
9914        netif_free_rx_queues(dev);
9915
9916        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
9917
9918        /* Flush device addresses */
9919        dev_addr_flush(dev);
9920
9921        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
9922                netif_napi_del(p);
9923
9924        free_percpu(dev->pcpu_refcnt);
9925        dev->pcpu_refcnt = NULL;
9926        free_percpu(dev->xdp_bulkq);
9927        dev->xdp_bulkq = NULL;
9928
9929        netdev_unregister_lockdep_key(dev);
9930
9931        /*  Compatibility with error handling in drivers */
9932        if (dev->reg_state == NETREG_UNINITIALIZED) {
9933                netdev_freemem(dev);
9934                return;
9935        }
9936
9937        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
9938        dev->reg_state = NETREG_RELEASED;
9939
9940        /* will free via device release */
9941        put_device(&dev->dev);
9942}
9943EXPORT_SYMBOL(free_netdev);
9944
9945/**
9946 *      synchronize_net -  Synchronize with packet receive processing
9947 *
9948 *      Wait for packets currently being received to be done.
9949 *      Does not block later packets from starting.
9950 */
9951void synchronize_net(void)
9952{
9953        might_sleep();
9954        if (rtnl_is_locked())
9955                synchronize_rcu_expedited();
9956        else
9957                synchronize_rcu();
9958}
9959EXPORT_SYMBOL(synchronize_net);
9960
9961/**
9962 *      unregister_netdevice_queue - remove device from the kernel
9963 *      @dev: device
9964 *      @head: list
9965 *
9966 *      This function shuts down a device interface and removes it
9967 *      from the kernel tables.
9968 *      If head not NULL, device is queued to be unregistered later.
9969 *
9970 *      Callers must hold the rtnl semaphore.  You may want
9971 *      unregister_netdev() instead of this.
9972 */
9973
9974void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
9975{
9976        ASSERT_RTNL();
9977
9978        if (head) {
9979                list_move_tail(&dev->unreg_list, head);
9980        } else {
9981                rollback_registered(dev);
9982                /* Finish processing unregister after unlock */
9983                net_set_todo(dev);
9984        }
9985}
9986EXPORT_SYMBOL(unregister_netdevice_queue);
9987
9988/**
9989 *      unregister_netdevice_many - unregister many devices
9990 *      @head: list of devices
9991 *
9992 *  Note: As most callers use a stack allocated list_head,
9993 *  we force a list_del() to make sure stack wont be corrupted later.
9994 */
9995void unregister_netdevice_many(struct list_head *head)
9996{
9997        struct net_device *dev;
9998
9999        if (!list_empty(head)) {
10000                rollback_registered_many(head);

10001                list_for_each_entry(dev, head, unreg_list)
10002                        net_set_todo(dev);
10003                list_del(head);
10004        }
10005}
10006EXPORT_SYMBOL(unregister_netdevice_many);
10007
10008/**
10009 *      unregister_netdev - remove device from the kernel
10010 *      @dev: device
10011 *
10012 *      This function shuts down a device interface and removes it
10013 *      from the kernel tables.
10014 *
10015 *      This is just a wrapper for unregister_netdevice that takes
10016 *      the rtnl semaphore.  In general you want to use this and not
10017 *      unregister_netdevice.
10018 */
10019void unregister_netdev(struct net_device *dev)
10020{
10021        rtnl_lock();
10022        unregister_netdevice(dev);
10023        rtnl_unlock();
10024}
10025EXPORT_SYMBOL(unregister_netdev);
10026
10027/**
10028 *      dev_change_net_namespace - move device to different nethost namespace
10029 *      @dev: device
10030 *      @net: network namespace
10031 *      @pat: If not NULL name pattern to try if the current device name
10032 *            is already taken in the destination network namespace.
10033 *
10034 *      This function shuts down a device interface and moves it
10035 *      to a new network namespace. On success 0 is returned, on
10036 *      a failure a netagive errno code is returned.
10037 *
10038 *      Callers must hold the rtnl semaphore.
10039 */
10040
10041int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10042{
10043        struct net *net_old = dev_net(dev);
10044        int err, new_nsid, new_ifindex;
10045
10046        ASSERT_RTNL();
10047
10048        /* Don't allow namespace local devices to be moved. */
10049        err = -EINVAL;
10050        if (dev->features & NETIF_F_NETNS_LOCAL)
10051                goto out;
10052
10053        /* Ensure the device has been registrered */
10054        if (dev->reg_state != NETREG_REGISTERED)
10055                goto out;
10056
10057        /* Get out if there is nothing todo */
10058        err = 0;
10059        if (net_eq(net_old, net))
10060                goto out;
10061
10062        /* Pick the destination device name, and ensure
10063         * we can use it in the destination network namespace.
10064         */
10065        err = -EEXIST;
10066        if (__dev_get_by_name(net, dev->name)) {
10067                /* We get here if we can't use the current device name */
10068                if (!pat)
10069                        goto out;
10070                err = dev_get_valid_name(net, dev, pat);
10071                if (err < 0)
10072                        goto out;
10073        }
10074
10075        /*
10076         * And now a mini version of register_netdevice unregister_netdevice.
10077         */
10078
10079        /* If device is running close it first. */
10080        dev_close(dev);
10081
10082        /* And unlink it from device chain */
10083        unlist_netdevice(dev);
10084
10085        synchronize_net();
10086
10087        /* Shutdown queueing discipline. */
10088        dev_shutdown(dev);
10089
10090        /* Notify protocols, that we are about to destroy
10091         * this device. They should clean all the things.
10092         *
10093         * Note that dev->reg_state stays at NETREG_REGISTERED.
10094         * This is wanted because this way 8021q and macvlan know
10095         * the device is just moving and can keep their slaves up.
10096         */
10097        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10098        rcu_barrier();
10099
10100        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10101        /* If there is an ifindex conflict assign a new one */
10102        if (__dev_get_by_index(net, dev->ifindex))
10103                new_ifindex = dev_new_index(net);
10104        else
10105                new_ifindex = dev->ifindex;
10106
10107        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10108                            new_ifindex);
10109
10110        /*
10111         *      Flush the unicast and multicast chains
10112         */
10113        dev_uc_flush(dev);
10114        dev_mc_flush(dev);
10115
10116        /* Send a netdev-removed uevent to the old namespace */
10117        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10118        netdev_adjacent_del_links(dev);
10119
10120        /* Move per-net netdevice notifiers that are following the netdevice */
10121        move_netdevice_notifiers_dev_net(dev, net);
10122
10123        /* Actually switch the network namespace */
10124        dev_net_set(dev, net);
10125        dev->ifindex = new_ifindex;
10126
10127        /* Send a netdev-add uevent to the new namespace */
10128        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10129        netdev_adjacent_add_links(dev);
10130
10131        /* Fixup kobjects */
10132        err = device_rename(&dev->dev, dev->name);
10133        WARN_ON(err);
10134
10135        /* Adapt owner in case owning user namespace of target network
10136         * namespace is different from the original one.
10137         */
10138        err = netdev_change_owner(dev, net_old, net);
10139        WARN_ON(err);
10140
10141        /* Add the device back in the hashes */
10142        list_netdevice(dev);
10143
10144        /* Notify protocols, that a new device appeared. */
10145        call_netdevice_notifiers(NETDEV_REGISTER, dev);
10146
10147        /*
10148         *      Prevent userspace races by waiting until the network
10149         *      device is fully setup before sending notifications.
10150         */
10151        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10152
10153        synchronize_net();
10154        err = 0;
10155out:
10156        return err;
10157}
10158EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10159
10160static int dev_cpu_dead(unsigned int oldcpu)
10161{
10162        struct sk_buff **list_skb;
10163        struct sk_buff *skb;
10164        unsigned int cpu;
10165        struct softnet_data *sd, *oldsd, *remsd = NULL;
10166
10167        local_irq_disable();
10168        cpu = smp_processor_id();
10169        sd = &per_cpu(softnet_data, cpu);
10170        oldsd = &per_cpu(softnet_data, oldcpu);
10171
10172        /* Find end of our completion_queue. */
10173        list_skb = &sd->completion_queue;
10174        while (*list_skb)
10175                list_skb = &(*list_skb)->next;
10176        /* Append completion queue from offline CPU. */
10177        *list_skb = oldsd->completion_queue;
10178        oldsd->completion_queue = NULL;
10179
10180        /* Append output queue from offline CPU. */
10181        if (oldsd->output_queue) {
10182                *sd->output_queue_tailp = oldsd->output_queue;
10183                sd->output_queue_tailp = oldsd->output_queue_tailp;
10184                oldsd->output_queue = NULL;
10185                oldsd->output_queue_tailp = &oldsd->output_queue;
10186        }
10187        /* Append NAPI poll list from offline CPU, with one exception :
10188         * process_backlog() must be called by cpu owning percpu backlog.
10189         * We properly handle process_queue & input_pkt_queue later.
10190         */
10191        while (!list_empty(&oldsd->poll_list)) {
10192                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10193                                                            struct napi_struct,
10194                                                            poll_list);
10195
10196                list_del_init(&napi->poll_list);
10197                if (napi->poll == process_backlog)
10198                        napi->state = 0;
10199                else
10200                        ____napi_schedule(sd, napi);
10201        }
10202
10203        raise_softirq_irqoff(NET_TX_SOFTIRQ);
10204        local_irq_enable();
10205
10206#ifdef CONFIG_RPS
10207        remsd = oldsd->rps_ipi_list;
10208        oldsd->rps_ipi_list = NULL;
10209#endif
10210        /* send out pending IPI's on offline CPU */
10211        net_rps_send_ipi(remsd);
10212
10213        /* Process offline CPU's input_pkt_queue */
10214        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10215                netif_rx_ni(skb);
10216                input_queue_head_incr(oldsd);
10217        }
10218        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10219                netif_rx_ni(skb);
10220                input_queue_head_incr(oldsd);
10221        }
10222
10223        return 0;
10224}
10225
10226/**
10227 *      netdev_increment_features - increment feature set by one
10228 *      @all: current feature set
10229 *      @one: new feature set
10230 *      @mask: mask feature set
10231 *
10232 *      Computes a new feature set after adding a device with feature set
10233 *      @one to the master device with current feature set @all.  Will not
10234 *      enable anything that is off in @mask. Returns the new feature set.
10235 */
10236netdev_features_t netdev_increment_features(netdev_features_t all,
10237        netdev_features_t one, netdev_features_t mask)
10238{
10239        if (mask & NETIF_F_HW_CSUM)
10240                mask |= NETIF_F_CSUM_MASK;
10241        mask |= NETIF_F_VLAN_CHALLENGED;
10242
10243        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10244        all &= one | ~NETIF_F_ALL_FOR_ALL;
10245
10246        /* If one device supports hw checksumming, set for all. */
10247        if (all & NETIF_F_HW_CSUM)
10248                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10249
10250        return all;
10251}
10252EXPORT_SYMBOL(netdev_increment_features);
10253
10254static struct hlist_head * __net_init netdev_create_hash(void)
10255{
10256        int i;
10257        struct hlist_head *hash;
10258
10259        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10260        if (hash != NULL)
10261                for (i = 0; i < NETDEV_HASHENTRIES; i++)
10262                        INIT_HLIST_HEAD(&hash[i]);
10263
10264        return hash;
10265}
10266
10267/* Initialize per network namespace state */
10268static int __net_init netdev_init(struct net *net)
10269{
10270        BUILD_BUG_ON(GRO_HASH_BUCKETS >
10271                     8 * sizeof_field(struct napi_struct, gro_bitmask));
10272
10273        if (net != &init_net)
10274                INIT_LIST_HEAD(&net->dev_base_head);
10275
10276        net->dev_name_head = netdev_create_hash();
10277        if (net->dev_name_head == NULL)
10278                goto err_name;
10279
10280        net->dev_index_head = netdev_create_hash();
10281        if (net->dev_index_head == NULL)
10282                goto err_idx;
10283
10284        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10285
10286        return 0;
10287
10288err_idx:
10289        kfree(net->dev_name_head);
10290err_name:
10291        return -ENOMEM;
10292}
10293
10294/**
10295 *      netdev_drivername - network driver for the device
10296 *      @dev: network device
10297 *
10298 *      Determine network driver for device.
10299 */
10300const char *netdev_drivername(const struct net_device *dev)
10301{
10302        const struct device_driver *driver;
10303        const struct device *parent;
10304        const char *empty = "";
10305
10306        parent = dev->dev.parent;
10307        if (!parent)
10308                return empty;
10309
10310        driver = parent->driver;
10311        if (driver && driver->name)
10312                return driver->name;
10313        return empty;
10314}
10315
10316static void __netdev_printk(const char *level, const struct net_device *dev,
10317                            struct va_format *vaf)
10318{
10319        if (dev && dev->dev.parent) {
10320                dev_printk_emit(level[1] - '0',
10321                                dev->dev.parent,
10322                                "%s %s %s%s: %pV",
10323                                dev_driver_string(dev->dev.parent),
10324                                dev_name(dev->dev.parent),
10325                                netdev_name(dev), netdev_reg_state(dev),
10326                                vaf);
10327        } else if (dev) {
10328                printk("%s%s%s: %pV",
10329                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10330        } else {
10331                printk("%s(NULL net_device): %pV", level, vaf);
10332        }
10333}
10334
10335void netdev_printk(const char *level, const struct net_device *dev,
10336                   const char *format, ...)
10337{
10338        struct va_format vaf;
10339        va_list args;
10340
10341        va_start(args, format);
10342
10343        vaf.fmt = format;
10344        vaf.va = &args;
10345
10346        __netdev_printk(level, dev, &vaf);
10347
10348        va_end(args);
10349}
10350EXPORT_SYMBOL(netdev_printk);
10351
10352#define define_netdev_printk_level(func, level)                 \
10353void func(const struct net_device *dev, const char *fmt, ...)   \
10354{                                                               \
10355        struct va_format vaf;                                   \
10356        va_list args;                                           \
10357                                                                \
10358        va_start(args, fmt);                                    \
10359                                                                \
10360        vaf.fmt = fmt;                                          \
10361        vaf.va = &args;                                         \
10362                                                                \
10363        __netdev_printk(level, dev, &vaf);                      \
10364                                                                \
10365        va_end(args);                                           \
10366}                                                               \
10367EXPORT_SYMBOL(func);
10368
10369define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10370define_netdev_printk_level(netdev_alert, KERN_ALERT);
10371define_netdev_printk_level(netdev_crit, KERN_CRIT);
10372define_netdev_printk_level(netdev_err, KERN_ERR);
10373define_netdev_printk_level(netdev_warn, KERN_WARNING);
10374define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10375define_netdev_printk_level(netdev_info, KERN_INFO);
10376
10377static void __net_exit netdev_exit(struct net *net)
10378{
10379        kfree(net->dev_name_head);
10380        kfree(net->dev_index_head);
10381        if (net != &init_net)
10382                WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10383}
10384
10385static struct pernet_operations __net_initdata netdev_net_ops = {
10386        .init = netdev_init,
10387        .exit = netdev_exit,
10388};
10389
10390static void __net_exit default_device_exit(struct net *net)
10391{
10392        struct net_device *dev, *aux;
10393        /*
10394         * Push all migratable network devices back to the
10395         * initial network namespace
10396         */
10397        rtnl_lock();
10398        for_each_netdev_safe(net, dev, aux) {
10399                int err;
10400                char fb_name[IFNAMSIZ];
10401
10402                /* Ignore unmoveable devices (i.e. loopback) */
10403                if (dev->features & NETIF_F_NETNS_LOCAL)
10404                        continue;
10405
10406                /* Leave virtual devices for the generic cleanup */
10407                if (dev->rtnl_link_ops)
10408                        continue;
10409
10410                /* Push remaining network devices to init_net */
10411                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10412                if (__dev_get_by_name(&init_net, fb_name))
10413                        snprintf(fb_name, IFNAMSIZ, "dev%%d");
10414                err = dev_change_net_namespace(dev, &init_net, fb_name);
10415                if (err) {
10416                        pr_emerg("%s: failed to move %s to init_net: %d\n",
10417                                 __func__, dev->name, err);
10418                        BUG();
10419                }
10420        }
10421        rtnl_unlock();
10422}
10423
10424static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10425{
10426        /* Return with the rtnl_lock held when there are no network
10427         * devices unregistering in any network namespace in net_list.
10428         */
10429        struct net *net;
10430        bool unregistering;
10431        DEFINE_WAIT_FUNC(wait, woken_wake_function);
10432
10433        add_wait_queue(&netdev_unregistering_wq, &wait);
10434        for (;;) {
10435                unregistering = false;
10436                rtnl_lock();
10437                list_for_each_entry(net, net_list, exit_list) {
10438                        if (net->dev_unreg_count > 0) {
10439                                unregistering = true;
10440                                break;
10441                        }
10442                }
10443                if (!unregistering)
10444                        break;
10445                __rtnl_unlock();
10446
10447                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10448        }
10449        remove_wait_queue(&netdev_unregistering_wq, &wait);
10450}
10451
10452static void __net_exit default_device_exit_batch(struct list_head *net_list)
10453{
10454        /* At exit all network devices most be removed from a network
10455         * namespace.  Do this in the reverse order of registration.
10456         * Do this across as many network namespaces as possible to
10457         * improve batching efficiency.
10458         */
10459        struct net_device *dev;
10460        struct net *net;
10461        LIST_HEAD(dev_kill_list);
10462
10463        /* To prevent network device cleanup code from dereferencing
10464         * loopback devices or network devices that have been freed
10465         * wait here for all pending unregistrations to complete,
10466         * before unregistring the loopback device and allowing the
10467         * network namespace be freed.
10468         *
10469         * The netdev todo list containing all network devices
10470         * unregistrations that happen in default_device_exit_batch
10471         * will run in the rtnl_unlock() at the end of
10472         * default_device_exit_batch.
10473         */
10474        rtnl_lock_unregistering(net_list);
10475        list_for_each_entry(net, net_list, exit_list) {
10476                for_each_netdev_reverse(net, dev) {
10477                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10478                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10479                        else
10480                                unregister_netdevice_queue(dev, &dev_kill_list);
10481                }
10482        }
10483        unregister_netdevice_many(&dev_kill_list);
10484        rtnl_unlock();
10485}
10486
10487static struct pernet_operations __net_initdata default_device_ops = {
10488        .exit = default_device_exit,
10489        .exit_batch = default_device_exit_batch,
10490};
10491
10492/*
10493 *      Initialize the DEV module. At boot time this walks the device list and
10494 *      unhooks any devices that fail to initialise (normally hardware not
10495 *      present) and leaves us with a valid list of present and active devices.
10496 *
10497 */
10498
10499/*
10500 *       This is called single threaded during boot, so no need
10501 *       to take the rtnl semaphore.
10502 */
10503static int __init net_dev_init(void)
10504{
10505        int i, rc = -ENOMEM;
10506
10507        BUG_ON(!dev_boot_phase);
10508
10509        if (dev_proc_init())
10510                goto out;
10511
10512        if (netdev_kobject_init())
10513                goto out;
10514
10515        INIT_LIST_HEAD(&ptype_all);
10516        for (i = 0; i < PTYPE_HASH_SIZE; i++)
10517                INIT_LIST_HEAD(&ptype_base[i]);
10518
10519        INIT_LIST_HEAD(&offload_base);
10520
10521        if (register_pernet_subsys(&netdev_net_ops))
10522                goto out;
10523
10524        /*
10525         *      Initialise the packet receive queues.
10526         */
10527
10528        for_each_possible_cpu(i) {
10529                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10530                struct softnet_data *sd = &per_cpu(softnet_data, i);
10531
10532                INIT_WORK(flush, flush_backlog);
10533
10534                skb_queue_head_init(&sd->input_pkt_queue);
10535                skb_queue_head_init(&sd->process_queue);
10536#ifdef CONFIG_XFRM_OFFLOAD
10537                skb_queue_head_init(&sd->xfrm_backlog);
10538#endif
10539                INIT_LIST_HEAD(&sd->poll_list);
10540                sd->output_queue_tailp = &sd->output_queue;
10541#ifdef CONFIG_RPS
10542                sd->csd.func = rps_trigger_softirq;
10543                sd->csd.info = sd;
10544                sd->cpu = i;
10545#endif
10546
10547                init_gro_hash(&sd->backlog);
10548                sd->backlog.poll = process_backlog;
10549                sd->backlog.weight = weight_p;
10550        }
10551
10552        dev_boot_phase = 0;
10553
10554        /* The loopback device is special if any other network devices
10555         * is present in a network namespace the loopback device must
10556         * be present. Since we now dynamically allocate and free the
10557         * loopback device ensure this invariant is maintained by
10558         * keeping the loopback device as the first device on the
10559         * list of network devices.  Ensuring the loopback devices
10560         * is the first device that appears and the last network device
10561         * that disappears.
10562         */
10563        if (register_pernet_device(&loopback_net_ops))
10564                goto out;
10565
10566        if (register_pernet_device(&default_device_ops))
10567                goto out;
10568
10569        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10570        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10571
10572        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10573                                       NULL, dev_cpu_dead);
10574        WARN_ON(rc < 0);
10575        rc = 0;
10576out:
10577        return rc;
10578}
10579
10580subsys_initcall(net_dev_init);
10581