LXR linux/net/core/dev.c

   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      NET3    Protocol independent device support routines.
   4 *
   5 *      Derived from the non IP parts of dev.c 1.0.19
   6 *              Authors:        Ross Biro
   7 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9 *
  10 *      Additional Authors:
  11 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13 *              David Hinds <dahinds@users.sourceforge.net>
  14 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15 *              Adam Sulmicki <adam@cfar.umd.edu>
  16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17 *
  18 *      Changes:
  19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20 *                                      to 2 if register_netdev gets called
  21 *                                      before net_dev_init & also removed a
  22 *                                      few lines of code in the process.
  23 *              Alan Cox        :       device private ioctl copies fields back.
  24 *              Alan Cox        :       Transmit queue code does relevant
  25 *                                      stunts to keep the queue safe.
  26 *              Alan Cox        :       Fixed double lock.
  27 *              Alan Cox        :       Fixed promisc NULL pointer trap
  28 *              ????????        :       Support the full private ioctl range
  29 *              Alan Cox        :       Moved ioctl permission check into
  30 *                                      drivers
  31 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32 *              Alan Cox        :       100 backlog just doesn't cut it when
  33 *                                      you start doing multicast video 8)
  34 *              Alan Cox        :       Rewrote net_bh and list manager.
  35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36 *              Alan Cox        :       Took out transmit every packet pass
  37 *                                      Saved a few bytes in the ioctl handler
  38 *              Alan Cox        :       Network driver sets packet type before
  39 *                                      calling netif_rx. Saves a function
  40 *                                      call a packet.
  41 *              Alan Cox        :       Hashed net_bh()
  42 *              Richard Kooijman:       Timestamp fixes.
  43 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44 *              Alan Cox        :       Device lock protection.
  45 *              Alan Cox        :       Fixed nasty side effect of device close
  46 *                                      changes.
  47 *              Rudi Cilibrasi  :       Pass the right thing to
  48 *                                      set_mac_address()
  49 *              Dave Miller     :       32bit quantity for the device lock to
  50 *                                      make it work out on a Sparc.
  51 *              Bjorn Ekwall    :       Added KERNELD hack.
  52 *              Alan Cox        :       Cleaned up the backlog initialise.
  53 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54 *                                      1 device.
  55 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56 *                                      is no device open function.
  57 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59 *              Cyrus Durgin    :       Cleaned for KMOD
  60 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61 *                                      A network device unload needs to purge
  62 *                                      the backlog queue.
  63 *      Paul Rusty Russell      :       SIOCSIFNAME
  64 *              Pekka Riikonen  :       Netdev boot-time settings code
  65 *              Andrew Morton   :       Make unregister_netdevice wait
  66 *                                      indefinitely on dev->refcnt
  67 *              J Hadi Salim    :       - Backlog queue sampling
  68 *                                      - netif_rx() feedback
  69 */
  70
  71#include <linux/uaccess.h>
  72#include <linux/bitops.h>
  73#include <linux/capability.h>
  74#include <linux/cpu.h>
  75#include <linux/types.h>
  76#include <linux/kernel.h>
  77#include <linux/hash.h>
  78#include <linux/slab.h>
  79#include <linux/sched.h>
  80#include <linux/sched/mm.h>
  81#include <linux/mutex.h>
  82#include <linux/string.h>
  83#include <linux/mm.h>
  84#include <linux/socket.h>
  85#include <linux/sockios.h>
  86#include <linux/errno.h>
  87#include <linux/interrupt.h>
  88#include <linux/if_ether.h>
  89#include <linux/netdevice.h>
  90#include <linux/etherdevice.h>
  91#include <linux/ethtool.h>
  92#include <linux/skbuff.h>
  93#include <linux/bpf.h>
  94#include <linux/bpf_trace.h>
  95#include <net/net_namespace.h>
  96#include <net/sock.h>
  97#include <net/busy_poll.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/stat.h>
 100#include <net/dst.h>
 101#include <net/dst_metadata.h>
 102#include <net/pkt_sched.h>
 103#include <net/pkt_cls.h>
 104#include <net/checksum.h>
 105#include <net/xfrm.h>
 106#include <linux/highmem.h>
 107#include <linux/init.h>
 108#include <linux/module.h>
 109#include <linux/netpoll.h>
 110#include <linux/rcupdate.h>
 111#include <linux/delay.h>
 112#include <net/iw_handler.h>
 113#include <asm/current.h>
 114#include <linux/audit.h>
 115#include <linux/dmaengine.h>
 116#include <linux/err.h>
 117#include <linux/ctype.h>
 118#include <linux/if_arp.h>
 119#include <linux/if_vlan.h>
 120#include <linux/ip.h>
 121#include <net/ip.h>
 122#include <net/mpls.h>
 123#include <linux/ipv6.h>
 124#include <linux/in.h>
 125#include <linux/jhash.h>
 126#include <linux/random.h>
 127#include <trace/events/napi.h>
 128#include <trace/events/net.h>
 129#include <trace/events/skb.h>
 130#include <linux/inetdevice.h>
 131#include <linux/cpu_rmap.h>
 132#include <linux/static_key.h>
 133#include <linux/hashtable.h>
 134#include <linux/vmalloc.h>
 135#include <linux/if_macvlan.h>
 136#include <linux/errqueue.h>
 137#include <linux/hrtimer.h>
 138#include <linux/netfilter_ingress.h>
 139#include <linux/crash_dump.h>
 140#include <linux/sctp.h>
 141#include <net/udp_tunnel.h>
 142#include <linux/net_namespace.h>
 143#include <linux/indirect_call_wrapper.h>
 144#include <net/devlink.h>
 145
 146#include "net-sysfs.h"
 147
 148#define MAX_GRO_SKBS 8
 149#define MAX_NEST_DEV 8
 150
 151/* This should be increased if a protocol with a bigger head is added. */
 152#define GRO_MAX_HEAD (MAX_HEADER + 128)
 153
 154static DEFINE_SPINLOCK(ptype_lock);
 155static DEFINE_SPINLOCK(offload_lock);
 156struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 157struct list_head ptype_all __read_mostly;       /* Taps */
 158static struct list_head offload_base __read_mostly;
 159
 160static int netif_rx_internal(struct sk_buff *skb);
 161static int call_netdevice_notifiers_info(unsigned long val,
 162                                         struct netdev_notifier_info *info);
 163static int call_netdevice_notifiers_extack(unsigned long val,
 164                                           struct net_device *dev,
 165                                           struct netlink_ext_ack *extack);
 166static struct napi_struct *napi_by_id(unsigned int napi_id);
 167
 168/*
 169 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 170 * semaphore.
 171 *
 172 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 173 *
 174 * Writers must hold the rtnl semaphore while they loop through the
 175 * dev_base_head list, and hold dev_base_lock for writing when they do the
 176 * actual updates.  This allows pure readers to access the list even
 177 * while a writer is preparing to update it.
 178 *
 179 * To put it another way, dev_base_lock is held for writing only to
 180 * protect against pure readers; the rtnl semaphore provides the
 181 * protection against other writers.
 182 *
 183 * See, for example usages, register_netdevice() and
 184 * unregister_netdevice(), which must be called with the rtnl
 185 * semaphore held.
 186 */
 187DEFINE_RWLOCK(dev_base_lock);
 188EXPORT_SYMBOL(dev_base_lock);
 189
 190static DEFINE_MUTEX(ifalias_mutex);
 191
 192/* protects napi_hash addition/deletion and napi_gen_id */
 193static DEFINE_SPINLOCK(napi_hash_lock);
 194
 195static unsigned int napi_gen_id = NR_CPUS;
 196static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 197
 198static seqcount_t devnet_rename_seq;
 199
 200static inline void dev_base_seq_inc(struct net *net)
 201{
 202        while (++net->dev_base_seq == 0)
 203                ;
 204}
 205
 206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 207{
 208        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 209
 210        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221        spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228        spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232/* Device list insertion */
 233static void list_netdevice(struct net_device *dev)
 234{
 235        struct net *net = dev_net(dev);
 236
 237        ASSERT_RTNL();
 238
 239        write_lock_bh(&dev_base_lock);
 240        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242        hlist_add_head_rcu(&dev->index_hlist,
 243                           dev_index_hash(net, dev->ifindex));
 244        write_unlock_bh(&dev_base_lock);
 245
 246        dev_base_seq_inc(net);
 247}
 248
 249/* Device list removal
 250 * caller must respect a RCU grace period before freeing/reusing dev
 251 */
 252static void unlist_netdevice(struct net_device *dev)
 253{
 254        ASSERT_RTNL();
 255
 256        /* Unlink dev from the device chain */
 257        write_lock_bh(&dev_base_lock);
 258        list_del_rcu(&dev->dev_list);
 259        hlist_del_rcu(&dev->name_hlist);
 260        hlist_del_rcu(&dev->index_hlist);
 261        write_unlock_bh(&dev_base_lock);
 262
 263        dev_base_seq_inc(dev_net(dev));
 264}
 265
 266/*
 267 *      Our notifier list
 268 */
 269
 270static RAW_NOTIFIER_HEAD(netdev_chain);
 271
 272/*
 273 *      Device drivers call our routines to queue packets here. We empty the
 274 *      queue in the local softnet handler.
 275 */
 276
 277DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 278EXPORT_PER_CPU_SYMBOL(softnet_data);
 279
 280/*******************************************************************************
 281 *
 282 *              Protocol management and registration routines
 283 *
 284 *******************************************************************************/
 285
 286
 287/*
 288 *      Add a protocol ID to the list. Now that the input handler is
 289 *      smarter we can dispense with all the messy stuff that used to be
 290 *      here.
 291 *
 292 *      BEWARE!!! Protocol handlers, mangling input packets,
 293 *      MUST BE last in hash buckets and checking protocol handlers
 294 *      MUST start from promiscuous ptype_all chain in net_bh.
 295 *      It is true now, do not change it.
 296 *      Explanation follows: if protocol handler, mangling packet, will
 297 *      be the first on list, it is not able to sense, that packet
 298 *      is cloned and should be copied-on-write, so that it will
 299 *      change it and subsequent readers will get broken packet.
 300 *                                                      --ANK (980803)
 301 */
 302
 303static inline struct list_head *ptype_head(const struct packet_type *pt)
 304{
 305        if (pt->type == htons(ETH_P_ALL))
 306                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 307        else
 308                return pt->dev ? &pt->dev->ptype_specific :
 309                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 310}
 311
 312/**
 313 *      dev_add_pack - add packet handler
 314 *      @pt: packet type declaration
 315 *
 316 *      Add a protocol handler to the networking stack. The passed &packet_type
 317 *      is linked into kernel lists and may not be freed until it has been
 318 *      removed from the kernel lists.
 319 *
 320 *      This call does not sleep therefore it can not
 321 *      guarantee all CPU's that are in middle of receiving packets
 322 *      will see the new packet type (until the next received packet).
 323 */
 324
 325void dev_add_pack(struct packet_type *pt)
 326{
 327        struct list_head *head = ptype_head(pt);
 328
 329        spin_lock(&ptype_lock);
 330        list_add_rcu(&pt->list, head);
 331        spin_unlock(&ptype_lock);
 332}
 333EXPORT_SYMBOL(dev_add_pack);
 334
 335/**
 336 *      __dev_remove_pack        - remove packet handler
 337 *      @pt: packet type declaration
 338 *
 339 *      Remove a protocol handler that was previously added to the kernel
 340 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 341 *      from the kernel lists and can be freed or reused once this function
 342 *      returns.
 343 *
 344 *      The packet type might still be in use by receivers
 345 *      and must not be freed until after all the CPU's have gone
 346 *      through a quiescent state.
 347 */
 348void __dev_remove_pack(struct packet_type *pt)
 349{
 350        struct list_head *head = ptype_head(pt);
 351        struct packet_type *pt1;
 352
 353        spin_lock(&ptype_lock);
 354
 355        list_for_each_entry(pt1, head, list) {
 356                if (pt == pt1) {
 357                        list_del_rcu(&pt->list);
 358                        goto out;
 359                }
 360        }
 361
 362        pr_warn("dev_remove_pack: %p not found\n", pt);
 363out:
 364        spin_unlock(&ptype_lock);
 365}
 366EXPORT_SYMBOL(__dev_remove_pack);
 367
 368/**
 369 *      dev_remove_pack  - remove packet handler
 370 *      @pt: packet type declaration
 371 *
 372 *      Remove a protocol handler that was previously added to the kernel
 373 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 374 *      from the kernel lists and can be freed or reused once this function
 375 *      returns.
 376 *
 377 *      This call sleeps to guarantee that no CPU is looking at the packet
 378 *      type after return.
 379 */
 380void dev_remove_pack(struct packet_type *pt)
 381{
 382        __dev_remove_pack(pt);
 383
 384        synchronize_net();
 385}
 386EXPORT_SYMBOL(dev_remove_pack);
 387
 388
 389/**
 390 *      dev_add_offload - register offload handlers
 391 *      @po: protocol offload declaration
 392 *
 393 *      Add protocol offload handlers to the networking stack. The passed
 394 *      &proto_offload is linked into kernel lists and may not be freed until
 395 *      it has been removed from the kernel lists.
 396 *
 397 *      This call does not sleep therefore it can not
 398 *      guarantee all CPU's that are in middle of receiving packets
 399 *      will see the new offload handlers (until the next received packet).
 400 */
 401void dev_add_offload(struct packet_offload *po)
 402{
 403        struct packet_offload *elem;
 404
 405        spin_lock(&offload_lock);
 406        list_for_each_entry(elem, &offload_base, list) {
 407                if (po->priority < elem->priority)
 408                        break;
 409        }
 410        list_add_rcu(&po->list, elem->list.prev);
 411        spin_unlock(&offload_lock);
 412}
 413EXPORT_SYMBOL(dev_add_offload);
 414
 415/**
 416 *      __dev_remove_offload     - remove offload handler
 417 *      @po: packet offload declaration
 418 *
 419 *      Remove a protocol offload handler that was previously added to the
 420 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 421 *      is removed from the kernel lists and can be freed or reused once this
 422 *      function returns.
 423 *
 424 *      The packet type might still be in use by receivers
 425 *      and must not be freed until after all the CPU's have gone
 426 *      through a quiescent state.
 427 */
 428static void __dev_remove_offload(struct packet_offload *po)
 429{
 430        struct list_head *head = &offload_base;
 431        struct packet_offload *po1;
 432
 433        spin_lock(&offload_lock);
 434
 435        list_for_each_entry(po1, head, list) {
 436                if (po == po1) {
 437                        list_del_rcu(&po->list);
 438                        goto out;
 439                }
 440        }
 441
 442        pr_warn("dev_remove_offload: %p not found\n", po);
 443out:
 444        spin_unlock(&offload_lock);
 445}
 446
 447/**
 448 *      dev_remove_offload       - remove packet offload handler
 449 *      @po: packet offload declaration
 450 *
 451 *      Remove a packet offload handler that was previously added to the kernel
 452 *      offload handlers by dev_add_offload(). The passed &offload_type is
 453 *      removed from the kernel lists and can be freed or reused once this
 454 *      function returns.
 455 *
 456 *      This call sleeps to guarantee that no CPU is looking at the packet
 457 *      type after return.
 458 */
 459void dev_remove_offload(struct packet_offload *po)
 460{
 461        __dev_remove_offload(po);
 462
 463        synchronize_net();
 464}
 465EXPORT_SYMBOL(dev_remove_offload);
 466
 467/******************************************************************************
 468 *
 469 *                    Device Boot-time Settings Routines
 470 *
 471 ******************************************************************************/
 472
 473/* Boot time configuration table */
 474static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 475
 476/**
 477 *      netdev_boot_setup_add   - add new setup entry
 478 *      @name: name of the device
 479 *      @map: configured settings for the device
 480 *
 481 *      Adds new setup entry to the dev_boot_setup list.  The function
 482 *      returns 0 on error and 1 on success.  This is a generic routine to
 483 *      all netdevices.
 484 */
 485static int netdev_boot_setup_add(char *name, struct ifmap *map)
 486{
 487        struct netdev_boot_setup *s;
 488        int i;
 489
 490        s = dev_boot_setup;
 491        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 492                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 493                        memset(s[i].name, 0, sizeof(s[i].name));
 494                        strlcpy(s[i].name, name, IFNAMSIZ);
 495                        memcpy(&s[i].map, map, sizeof(s[i].map));
 496                        break;
 497                }
 498        }
 499
 500        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 501}
 502
 503/**
 504 * netdev_boot_setup_check      - check boot time settings
 505 * @dev: the netdevice
 506 *
 507 * Check boot time settings for the device.
 508 * The found settings are set for the device to be used
 509 * later in the device probing.
 510 * Returns 0 if no settings found, 1 if they are.
 511 */
 512int netdev_boot_setup_check(struct net_device *dev)
 513{
 514        struct netdev_boot_setup *s = dev_boot_setup;
 515        int i;
 516
 517        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 518                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 519                    !strcmp(dev->name, s[i].name)) {
 520                        dev->irq = s[i].map.irq;
 521                        dev->base_addr = s[i].map.base_addr;
 522                        dev->mem_start = s[i].map.mem_start;
 523                        dev->mem_end = s[i].map.mem_end;
 524                        return 1;
 525                }
 526        }
 527        return 0;
 528}
 529EXPORT_SYMBOL(netdev_boot_setup_check);
 530
 531
 532/**
 533 * netdev_boot_base     - get address from boot time settings
 534 * @prefix: prefix for network device
 535 * @unit: id for network device
 536 *
 537 * Check boot time settings for the base address of device.
 538 * The found settings are set for the device to be used
 539 * later in the device probing.
 540 * Returns 0 if no settings found.
 541 */
 542unsigned long netdev_boot_base(const char *prefix, int unit)
 543{
 544        const struct netdev_boot_setup *s = dev_boot_setup;
 545        char name[IFNAMSIZ];
 546        int i;
 547
 548        sprintf(name, "%s%d", prefix, unit);
 549
 550        /*
 551         * If device already registered then return base of 1
 552         * to indicate not to probe for this interface
 553         */
 554        if (__dev_get_by_name(&init_net, name))
 555                return 1;
 556
 557        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 558                if (!strcmp(name, s[i].name))
 559                        return s[i].map.base_addr;
 560        return 0;
 561}
 562
 563/*
 564 * Saves at boot time configured settings for any netdevice.
 565 */
 566int __init netdev_boot_setup(char *str)
 567{
 568        int ints[5];
 569        struct ifmap map;
 570
 571        str = get_options(str, ARRAY_SIZE(ints), ints);
 572        if (!str || !*str)
 573                return 0;
 574
 575        /* Save settings */
 576        memset(&map, 0, sizeof(map));
 577        if (ints[0] > 0)
 578                map.irq = ints[1];
 579        if (ints[0] > 1)
 580                map.base_addr = ints[2];
 581        if (ints[0] > 2)
 582                map.mem_start = ints[3];
 583        if (ints[0] > 3)
 584                map.mem_end = ints[4];
 585
 586        /* Add new entry to the list */
 587        return netdev_boot_setup_add(str, &map);
 588}
 589
 590__setup("netdev=", netdev_boot_setup);
 591
 592/*******************************************************************************
 593 *
 594 *                          Device Interface Subroutines
 595 *
 596 *******************************************************************************/
 597
 598/**
 599 *      dev_get_iflink  - get 'iflink' value of a interface
 600 *      @dev: targeted interface
 601 *
 602 *      Indicates the ifindex the interface is linked to.
 603 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 604 */
 605
 606int dev_get_iflink(const struct net_device *dev)
 607{
 608        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 609                return dev->netdev_ops->ndo_get_iflink(dev);
 610
 611        return dev->ifindex;
 612}
 613EXPORT_SYMBOL(dev_get_iflink);
 614
 615/**
 616 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 617 *      @dev: targeted interface
 618 *      @skb: The packet.
 619 *
 620 *      For better visibility of tunnel traffic OVS needs to retrieve
 621 *      egress tunnel information for a packet. Following API allows
 622 *      user to get this info.
 623 */
 624int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 625{
 626        struct ip_tunnel_info *info;
 627
 628        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 629                return -EINVAL;
 630
 631        info = skb_tunnel_info_unclone(skb);
 632        if (!info)
 633                return -ENOMEM;
 634        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 635                return -EINVAL;
 636
 637        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 638}
 639EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 640
 641/**
 642 *      __dev_get_by_name       - find a device by its name
 643 *      @net: the applicable net namespace
 644 *      @name: name to find
 645 *
 646 *      Find an interface by name. Must be called under RTNL semaphore
 647 *      or @dev_base_lock. If the name is found a pointer to the device
 648 *      is returned. If the name is not found then %NULL is returned. The
 649 *      reference counters are not incremented so the caller must be
 650 *      careful with locks.
 651 */
 652
 653struct net_device *__dev_get_by_name(struct net *net, const char *name)
 654{
 655        struct net_device *dev;
 656        struct hlist_head *head = dev_name_hash(net, name);
 657
 658        hlist_for_each_entry(dev, head, name_hlist)
 659                if (!strncmp(dev->name, name, IFNAMSIZ))
 660                        return dev;
 661
 662        return NULL;
 663}
 664EXPORT_SYMBOL(__dev_get_by_name);
 665
 666/**
 667 * dev_get_by_name_rcu  - find a device by its name
 668 * @net: the applicable net namespace
 669 * @name: name to find
 670 *
 671 * Find an interface by name.
 672 * If the name is found a pointer to the device is returned.
 673 * If the name is not found then %NULL is returned.
 674 * The reference counters are not incremented so the caller must be
 675 * careful with locks. The caller must hold RCU lock.
 676 */
 677
 678struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 679{
 680        struct net_device *dev;
 681        struct hlist_head *head = dev_name_hash(net, name);
 682
 683        hlist_for_each_entry_rcu(dev, head, name_hlist)
 684                if (!strncmp(dev->name, name, IFNAMSIZ))
 685                        return dev;
 686
 687        return NULL;
 688}
 689EXPORT_SYMBOL(dev_get_by_name_rcu);
 690
 691/**
 692 *      dev_get_by_name         - find a device by its name
 693 *      @net: the applicable net namespace
 694 *      @name: name to find
 695 *
 696 *      Find an interface by name. This can be called from any
 697 *      context and does its own locking. The returned handle has
 698 *      the usage count incremented and the caller must use dev_put() to
 699 *      release it when it is no longer needed. %NULL is returned if no
 700 *      matching device is found.
 701 */
 702
 703struct net_device *dev_get_by_name(struct net *net, const char *name)
 704{
 705        struct net_device *dev;
 706
 707        rcu_read_lock();
 708        dev = dev_get_by_name_rcu(net, name);
 709        if (dev)
 710                dev_hold(dev);
 711        rcu_read_unlock();
 712        return dev;
 713}
 714EXPORT_SYMBOL(dev_get_by_name);
 715
 716/**
 717 *      __dev_get_by_index - find a device by its ifindex
 718 *      @net: the applicable net namespace
 719 *      @ifindex: index of device
 720 *
 721 *      Search for an interface by index. Returns %NULL if the device
 722 *      is not found or a pointer to the device. The device has not
 723 *      had its reference counter increased so the caller must be careful
 724 *      about locking. The caller must hold either the RTNL semaphore
 725 *      or @dev_base_lock.
 726 */
 727
 728struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 729{
 730        struct net_device *dev;
 731        struct hlist_head *head = dev_index_hash(net, ifindex);
 732
 733        hlist_for_each_entry(dev, head, index_hlist)
 734                if (dev->ifindex == ifindex)
 735                        return dev;
 736
 737        return NULL;
 738}
 739EXPORT_SYMBOL(__dev_get_by_index);
 740
 741/**
 742 *      dev_get_by_index_rcu - find a device by its ifindex
 743 *      @net: the applicable net namespace
 744 *      @ifindex: index of device
 745 *
 746 *      Search for an interface by index. Returns %NULL if the device
 747 *      is not found or a pointer to the device. The device has not
 748 *      had its reference counter increased so the caller must be careful
 749 *      about locking. The caller must hold RCU lock.
 750 */
 751
 752struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 753{
 754        struct net_device *dev;
 755        struct hlist_head *head = dev_index_hash(net, ifindex);
 756
 757        hlist_for_each_entry_rcu(dev, head, index_hlist)
 758                if (dev->ifindex == ifindex)
 759                        return dev;
 760
 761        return NULL;
 762}
 763EXPORT_SYMBOL(dev_get_by_index_rcu);
 764
 765
 766/**
 767 *      dev_get_by_index - find a device by its ifindex
 768 *      @net: the applicable net namespace
 769 *      @ifindex: index of device
 770 *
 771 *      Search for an interface by index. Returns NULL if the device
 772 *      is not found or a pointer to the device. The device returned has
 773 *      had a reference added and the pointer is safe until the user calls
 774 *      dev_put to indicate they have finished with it.
 775 */
 776
 777struct net_device *dev_get_by_index(struct net *net, int ifindex)
 778{
 779        struct net_device *dev;
 780
 781        rcu_read_lock();
 782        dev = dev_get_by_index_rcu(net, ifindex);
 783        if (dev)
 784                dev_hold(dev);
 785        rcu_read_unlock();
 786        return dev;
 787}
 788EXPORT_SYMBOL(dev_get_by_index);
 789
 790/**
 791 *      dev_get_by_napi_id - find a device by napi_id
 792 *      @napi_id: ID of the NAPI struct
 793 *
 794 *      Search for an interface by NAPI ID. Returns %NULL if the device
 795 *      is not found or a pointer to the device. The device has not had
 796 *      its reference counter increased so the caller must be careful
 797 *      about locking. The caller must hold RCU lock.
 798 */
 799
 800struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 801{
 802        struct napi_struct *napi;
 803
 804        WARN_ON_ONCE(!rcu_read_lock_held());
 805
 806        if (napi_id < MIN_NAPI_ID)
 807                return NULL;
 808
 809        napi = napi_by_id(napi_id);
 810
 811        return napi ? napi->dev : NULL;
 812}
 813EXPORT_SYMBOL(dev_get_by_napi_id);
 814
 815/**
 816 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 817 *      @net: network namespace
 818 *      @name: a pointer to the buffer where the name will be stored.
 819 *      @ifindex: the ifindex of the interface to get the name from.
 820 *
 821 *      The use of raw_seqcount_begin() and cond_resched() before
 822 *      retrying is required as we want to give the writers a chance
 823 *      to complete when CONFIG_PREEMPT is not set.
 824 */
 825int netdev_get_name(struct net *net, char *name, int ifindex)
 826{
 827        struct net_device *dev;
 828        unsigned int seq;
 829
 830retry:
 831        seq = raw_seqcount_begin(&devnet_rename_seq);
 832        rcu_read_lock();
 833        dev = dev_get_by_index_rcu(net, ifindex);
 834        if (!dev) {
 835                rcu_read_unlock();
 836                return -ENODEV;
 837        }
 838
 839        strcpy(name, dev->name);
 840        rcu_read_unlock();
 841        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 842                cond_resched();
 843                goto retry;
 844        }
 845
 846        return 0;
 847}
 848
 849/**
 850 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 851 *      @net: the applicable net namespace
 852 *      @type: media type of device
 853 *      @ha: hardware address
 854 *
 855 *      Search for an interface by MAC address. Returns NULL if the device
 856 *      is not found or a pointer to the device.
 857 *      The caller must hold RCU or RTNL.
 858 *      The returned device has not had its ref count increased
 859 *      and the caller must therefore be careful about locking
 860 *
 861 */
 862
 863struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 864                                       const char *ha)
 865{
 866        struct net_device *dev;
 867
 868        for_each_netdev_rcu(net, dev)
 869                if (dev->type == type &&
 870                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 871                        return dev;
 872
 873        return NULL;
 874}
 875EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 876
 877struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 878{
 879        struct net_device *dev;
 880
 881        ASSERT_RTNL();
 882        for_each_netdev(net, dev)
 883                if (dev->type == type)
 884                        return dev;
 885
 886        return NULL;
 887}
 888EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 889
 890struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 891{
 892        struct net_device *dev, *ret = NULL;
 893
 894        rcu_read_lock();
 895        for_each_netdev_rcu(net, dev)
 896                if (dev->type == type) {
 897                        dev_hold(dev);
 898                        ret = dev;
 899                        break;
 900                }
 901        rcu_read_unlock();
 902        return ret;
 903}
 904EXPORT_SYMBOL(dev_getfirstbyhwtype);
 905
 906/**
 907 *      __dev_get_by_flags - find any device with given flags
 908 *      @net: the applicable net namespace
 909 *      @if_flags: IFF_* values
 910 *      @mask: bitmask of bits in if_flags to check
 911 *
 912 *      Search for any interface with the given flags. Returns NULL if a device
 913 *      is not found or a pointer to the device. Must be called inside
 914 *      rtnl_lock(), and result refcount is unchanged.
 915 */
 916
 917struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 918                                      unsigned short mask)
 919{
 920        struct net_device *dev, *ret;
 921
 922        ASSERT_RTNL();
 923
 924        ret = NULL;
 925        for_each_netdev(net, dev) {
 926                if (((dev->flags ^ if_flags) & mask) == 0) {
 927                        ret = dev;
 928                        break;
 929                }
 930        }
 931        return ret;
 932}
 933EXPORT_SYMBOL(__dev_get_by_flags);
 934
 935/**
 936 *      dev_valid_name - check if name is okay for network device
 937 *      @name: name string
 938 *
 939 *      Network device names need to be valid file names to
 940 *      to allow sysfs to work.  We also disallow any kind of
 941 *      whitespace.
 942 */
 943bool dev_valid_name(const char *name)
 944{
 945        if (*name == '\0')
 946                return false;
 947        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 948                return false;
 949        if (!strcmp(name, ".") || !strcmp(name, ".."))
 950                return false;
 951
 952        while (*name) {
 953                if (*name == '/' || *name == ':' || isspace(*name))
 954                        return false;
 955                name++;
 956        }
 957        return true;
 958}
 959EXPORT_SYMBOL(dev_valid_name);
 960
 961/**
 962 *      __dev_alloc_name - allocate a name for a device
 963 *      @net: network namespace to allocate the device name in
 964 *      @name: name format string
 965 *      @buf:  scratch buffer and result name string
 966 *
 967 *      Passed a format string - eg "lt%d" it will try and find a suitable
 968 *      id. It scans list of devices to build up a free map, then chooses
 969 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 970 *      while allocating the name and adding the device in order to avoid
 971 *      duplicates.
 972 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 973 *      Returns the number of the unit assigned or a negative errno code.
 974 */
 975
 976static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 977{
 978        int i = 0;
 979        const char *p;
 980        const int max_netdevices = 8*PAGE_SIZE;
 981        unsigned long *inuse;
 982        struct net_device *d;
 983
 984        if (!dev_valid_name(name))
 985                return -EINVAL;
 986
 987        p = strchr(name, '%');
 988        if (p) {
 989                /*
 990                 * Verify the string as this thing may have come from
 991                 * the user.  There must be either one "%d" and no other "%"
 992                 * characters.
 993                 */
 994                if (p[1] != 'd' || strchr(p + 2, '%'))
 995                        return -EINVAL;
 996
 997                /* Use one page as a bit array of possible slots */
 998                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 999                if (!inuse)
1000                        return -ENOMEM;

1001
1002                for_each_netdev(net, d) {
1003                        if (!sscanf(d->name, name, &i))
1004                                continue;
1005                        if (i < 0 || i >= max_netdevices)
1006                                continue;
1007
1008                        /*  avoid cases where sscanf is not exact inverse of printf */
1009                        snprintf(buf, IFNAMSIZ, name, i);
1010                        if (!strncmp(buf, d->name, IFNAMSIZ))
1011                                set_bit(i, inuse);
1012                }
1013
1014                i = find_first_zero_bit(inuse, max_netdevices);
1015                free_page((unsigned long) inuse);
1016        }
1017
1018        snprintf(buf, IFNAMSIZ, name, i);
1019        if (!__dev_get_by_name(net, buf))
1020                return i;
1021
1022        /* It is possible to run out of possible slots
1023         * when the name is long and there isn't enough space left
1024         * for the digits, or if all bits are used.
1025         */
1026        return -ENFILE;
1027}
1028
1029static int dev_alloc_name_ns(struct net *net,
1030                             struct net_device *dev,
1031                             const char *name)
1032{
1033        char buf[IFNAMSIZ];
1034        int ret;
1035
1036        BUG_ON(!net);
1037        ret = __dev_alloc_name(net, name, buf);
1038        if (ret >= 0)
1039                strlcpy(dev->name, buf, IFNAMSIZ);
1040        return ret;
1041}
1042
1043/**
1044 *      dev_alloc_name - allocate a name for a device
1045 *      @dev: device
1046 *      @name: name format string
1047 *
1048 *      Passed a format string - eg "lt%d" it will try and find a suitable
1049 *      id. It scans list of devices to build up a free map, then chooses
1050 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1051 *      while allocating the name and adding the device in order to avoid
1052 *      duplicates.
1053 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1054 *      Returns the number of the unit assigned or a negative errno code.
1055 */
1056
1057int dev_alloc_name(struct net_device *dev, const char *name)
1058{
1059        return dev_alloc_name_ns(dev_net(dev), dev, name);
1060}
1061EXPORT_SYMBOL(dev_alloc_name);
1062
1063int dev_get_valid_name(struct net *net, struct net_device *dev,
1064                       const char *name)
1065{
1066        BUG_ON(!net);
1067
1068        if (!dev_valid_name(name))
1069                return -EINVAL;
1070
1071        if (strchr(name, '%'))
1072                return dev_alloc_name_ns(net, dev, name);
1073        else if (__dev_get_by_name(net, name))
1074                return -EEXIST;
1075        else if (dev->name != name)
1076                strlcpy(dev->name, name, IFNAMSIZ);
1077
1078        return 0;
1079}
1080EXPORT_SYMBOL(dev_get_valid_name);
1081
1082/**
1083 *      dev_change_name - change name of a device
1084 *      @dev: device
1085 *      @newname: name (or format string) must be at least IFNAMSIZ
1086 *
1087 *      Change name of a device, can pass format strings "eth%d".
1088 *      for wildcarding.
1089 */
1090int dev_change_name(struct net_device *dev, const char *newname)
1091{
1092        unsigned char old_assign_type;
1093        char oldname[IFNAMSIZ];
1094        int err = 0;
1095        int ret;
1096        struct net *net;
1097
1098        ASSERT_RTNL();
1099        BUG_ON(!dev_net(dev));
1100
1101        net = dev_net(dev);
1102
1103        /* Some auto-enslaved devices e.g. failover slaves are
1104         * special, as userspace might rename the device after
1105         * the interface had been brought up and running since
1106         * the point kernel initiated auto-enslavement. Allow
1107         * live name change even when these slave devices are
1108         * up and running.
1109         *
1110         * Typically, users of these auto-enslaving devices
1111         * don't actually care about slave name change, as
1112         * they are supposed to operate on master interface
1113         * directly.
1114         */
1115        if (dev->flags & IFF_UP &&
1116            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1117                return -EBUSY;
1118
1119        write_seqcount_begin(&devnet_rename_seq);
1120
1121        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1122                write_seqcount_end(&devnet_rename_seq);
1123                return 0;
1124        }
1125
1126        memcpy(oldname, dev->name, IFNAMSIZ);
1127
1128        err = dev_get_valid_name(net, dev, newname);
1129        if (err < 0) {
1130                write_seqcount_end(&devnet_rename_seq);
1131                return err;
1132        }
1133
1134        if (oldname[0] && !strchr(oldname, '%'))
1135                netdev_info(dev, "renamed from %s\n", oldname);
1136
1137        old_assign_type = dev->name_assign_type;
1138        dev->name_assign_type = NET_NAME_RENAMED;
1139
1140rollback:
1141        ret = device_rename(&dev->dev, dev->name);
1142        if (ret) {
1143                memcpy(dev->name, oldname, IFNAMSIZ);
1144                dev->name_assign_type = old_assign_type;
1145                write_seqcount_end(&devnet_rename_seq);
1146                return ret;
1147        }
1148
1149        write_seqcount_end(&devnet_rename_seq);
1150
1151        netdev_adjacent_rename_links(dev, oldname);
1152
1153        write_lock_bh(&dev_base_lock);
1154        hlist_del_rcu(&dev->name_hlist);
1155        write_unlock_bh(&dev_base_lock);
1156
1157        synchronize_rcu();
1158
1159        write_lock_bh(&dev_base_lock);
1160        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1161        write_unlock_bh(&dev_base_lock);
1162
1163        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1164        ret = notifier_to_errno(ret);
1165
1166        if (ret) {
1167                /* err >= 0 after dev_alloc_name() or stores the first errno */
1168                if (err >= 0) {
1169                        err = ret;
1170                        write_seqcount_begin(&devnet_rename_seq);
1171                        memcpy(dev->name, oldname, IFNAMSIZ);
1172                        memcpy(oldname, newname, IFNAMSIZ);
1173                        dev->name_assign_type = old_assign_type;
1174                        old_assign_type = NET_NAME_RENAMED;
1175                        goto rollback;
1176                } else {
1177                        pr_err("%s: name change rollback failed: %d\n",
1178                               dev->name, ret);
1179                }
1180        }
1181
1182        return err;
1183}
1184
1185/**
1186 *      dev_set_alias - change ifalias of a device
1187 *      @dev: device
1188 *      @alias: name up to IFALIASZ
1189 *      @len: limit of bytes to copy from info
1190 *
1191 *      Set ifalias for a device,
1192 */
1193int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1194{
1195        struct dev_ifalias *new_alias = NULL;
1196
1197        if (len >= IFALIASZ)
1198                return -EINVAL;
1199
1200        if (len) {
1201                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1202                if (!new_alias)
1203                        return -ENOMEM;
1204
1205                memcpy(new_alias->ifalias, alias, len);
1206                new_alias->ifalias[len] = 0;
1207        }
1208
1209        mutex_lock(&ifalias_mutex);
1210        rcu_swap_protected(dev->ifalias, new_alias,
1211                           mutex_is_locked(&ifalias_mutex));
1212        mutex_unlock(&ifalias_mutex);
1213
1214        if (new_alias)
1215                kfree_rcu(new_alias, rcuhead);
1216
1217        return len;
1218}
1219EXPORT_SYMBOL(dev_set_alias);
1220
1221/**
1222 *      dev_get_alias - get ifalias of a device
1223 *      @dev: device
1224 *      @name: buffer to store name of ifalias
1225 *      @len: size of buffer
1226 *
1227 *      get ifalias for a device.  Caller must make sure dev cannot go
1228 *      away,  e.g. rcu read lock or own a reference count to device.
1229 */
1230int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1231{
1232        const struct dev_ifalias *alias;
1233        int ret = 0;
1234
1235        rcu_read_lock();
1236        alias = rcu_dereference(dev->ifalias);
1237        if (alias)
1238                ret = snprintf(name, len, "%s", alias->ifalias);
1239        rcu_read_unlock();
1240
1241        return ret;
1242}
1243
1244/**
1245 *      netdev_features_change - device changes features
1246 *      @dev: device to cause notification
1247 *
1248 *      Called to indicate a device has changed features.
1249 */
1250void netdev_features_change(struct net_device *dev)
1251{
1252        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1253}
1254EXPORT_SYMBOL(netdev_features_change);
1255
1256/**
1257 *      netdev_state_change - device changes state
1258 *      @dev: device to cause notification
1259 *
1260 *      Called to indicate a device has changed state. This function calls
1261 *      the notifier chains for netdev_chain and sends a NEWLINK message
1262 *      to the routing socket.
1263 */
1264void netdev_state_change(struct net_device *dev)
1265{
1266        if (dev->flags & IFF_UP) {
1267                struct netdev_notifier_change_info change_info = {
1268                        .info.dev = dev,
1269                };
1270
1271                call_netdevice_notifiers_info(NETDEV_CHANGE,
1272                                              &change_info.info);
1273                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1274        }
1275}
1276EXPORT_SYMBOL(netdev_state_change);
1277
1278/**
1279 * netdev_notify_peers - notify network peers about existence of @dev
1280 * @dev: network device
1281 *
1282 * Generate traffic such that interested network peers are aware of
1283 * @dev, such as by generating a gratuitous ARP. This may be used when
1284 * a device wants to inform the rest of the network about some sort of
1285 * reconfiguration such as a failover event or virtual machine
1286 * migration.
1287 */
1288void netdev_notify_peers(struct net_device *dev)
1289{
1290        rtnl_lock();
1291        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1292        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1293        rtnl_unlock();
1294}
1295EXPORT_SYMBOL(netdev_notify_peers);
1296
1297static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1298{
1299        const struct net_device_ops *ops = dev->netdev_ops;
1300        int ret;
1301
1302        ASSERT_RTNL();
1303
1304        if (!netif_device_present(dev))
1305                return -ENODEV;
1306
1307        /* Block netpoll from trying to do any rx path servicing.
1308         * If we don't do this there is a chance ndo_poll_controller
1309         * or ndo_poll may be running while we open the device
1310         */
1311        netpoll_poll_disable(dev);
1312
1313        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1314        ret = notifier_to_errno(ret);
1315        if (ret)
1316                return ret;
1317
1318        set_bit(__LINK_STATE_START, &dev->state);
1319
1320        if (ops->ndo_validate_addr)
1321                ret = ops->ndo_validate_addr(dev);
1322
1323        if (!ret && ops->ndo_open)
1324                ret = ops->ndo_open(dev);
1325
1326        netpoll_poll_enable(dev);
1327
1328        if (ret)
1329                clear_bit(__LINK_STATE_START, &dev->state);
1330        else {
1331                dev->flags |= IFF_UP;
1332                dev_set_rx_mode(dev);
1333                dev_activate(dev);
1334                add_device_randomness(dev->dev_addr, dev->addr_len);
1335        }
1336
1337        return ret;
1338}
1339
1340/**
1341 *      dev_open        - prepare an interface for use.
1342 *      @dev: device to open
1343 *      @extack: netlink extended ack
1344 *
1345 *      Takes a device from down to up state. The device's private open
1346 *      function is invoked and then the multicast lists are loaded. Finally
1347 *      the device is moved into the up state and a %NETDEV_UP message is
1348 *      sent to the netdev notifier chain.
1349 *
1350 *      Calling this function on an active interface is a nop. On a failure
1351 *      a negative errno code is returned.
1352 */
1353int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1354{
1355        int ret;
1356
1357        if (dev->flags & IFF_UP)
1358                return 0;
1359
1360        ret = __dev_open(dev, extack);
1361        if (ret < 0)
1362                return ret;
1363
1364        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1365        call_netdevice_notifiers(NETDEV_UP, dev);
1366
1367        return ret;
1368}
1369EXPORT_SYMBOL(dev_open);
1370
1371static void __dev_close_many(struct list_head *head)
1372{
1373        struct net_device *dev;
1374
1375        ASSERT_RTNL();
1376        might_sleep();
1377
1378        list_for_each_entry(dev, head, close_list) {
1379                /* Temporarily disable netpoll until the interface is down */
1380                netpoll_poll_disable(dev);
1381
1382                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1383
1384                clear_bit(__LINK_STATE_START, &dev->state);
1385
1386                /* Synchronize to scheduled poll. We cannot touch poll list, it
1387                 * can be even on different cpu. So just clear netif_running().
1388                 *
1389                 * dev->stop() will invoke napi_disable() on all of it's
1390                 * napi_struct instances on this device.
1391                 */
1392                smp_mb__after_atomic(); /* Commit netif_running(). */
1393        }
1394
1395        dev_deactivate_many(head);
1396
1397        list_for_each_entry(dev, head, close_list) {
1398                const struct net_device_ops *ops = dev->netdev_ops;
1399
1400                /*
1401                 *      Call the device specific close. This cannot fail.
1402                 *      Only if device is UP
1403                 *
1404                 *      We allow it to be called even after a DETACH hot-plug
1405                 *      event.
1406                 */
1407                if (ops->ndo_stop)
1408                        ops->ndo_stop(dev);
1409
1410                dev->flags &= ~IFF_UP;
1411                netpoll_poll_enable(dev);
1412        }
1413}
1414
1415static void __dev_close(struct net_device *dev)
1416{
1417        LIST_HEAD(single);
1418
1419        list_add(&dev->close_list, &single);
1420        __dev_close_many(&single);
1421        list_del(&single);
1422}
1423
1424void dev_close_many(struct list_head *head, bool unlink)
1425{
1426        struct net_device *dev, *tmp;
1427
1428        /* Remove the devices that don't need to be closed */
1429        list_for_each_entry_safe(dev, tmp, head, close_list)
1430                if (!(dev->flags & IFF_UP))
1431                        list_del_init(&dev->close_list);
1432
1433        __dev_close_many(head);
1434
1435        list_for_each_entry_safe(dev, tmp, head, close_list) {
1436                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1437                call_netdevice_notifiers(NETDEV_DOWN, dev);
1438                if (unlink)
1439                        list_del_init(&dev->close_list);
1440        }
1441}
1442EXPORT_SYMBOL(dev_close_many);
1443
1444/**
1445 *      dev_close - shutdown an interface.
1446 *      @dev: device to shutdown
1447 *
1448 *      This function moves an active device into down state. A
1449 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1450 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1451 *      chain.
1452 */
1453void dev_close(struct net_device *dev)
1454{
1455        if (dev->flags & IFF_UP) {
1456                LIST_HEAD(single);
1457
1458                list_add(&dev->close_list, &single);
1459                dev_close_many(&single, true);
1460                list_del(&single);
1461        }
1462}
1463EXPORT_SYMBOL(dev_close);
1464
1465
1466/**
1467 *      dev_disable_lro - disable Large Receive Offload on a device
1468 *      @dev: device
1469 *
1470 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1471 *      called under RTNL.  This is needed if received packets may be
1472 *      forwarded to another interface.
1473 */
1474void dev_disable_lro(struct net_device *dev)
1475{
1476        struct net_device *lower_dev;
1477        struct list_head *iter;
1478
1479        dev->wanted_features &= ~NETIF_F_LRO;
1480        netdev_update_features(dev);
1481
1482        if (unlikely(dev->features & NETIF_F_LRO))
1483                netdev_WARN(dev, "failed to disable LRO!\n");
1484
1485        netdev_for_each_lower_dev(dev, lower_dev, iter)
1486                dev_disable_lro(lower_dev);
1487}
1488EXPORT_SYMBOL(dev_disable_lro);
1489
1490/**
1491 *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1492 *      @dev: device
1493 *
1494 *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1495 *      called under RTNL.  This is needed if Generic XDP is installed on
1496 *      the device.
1497 */
1498static void dev_disable_gro_hw(struct net_device *dev)
1499{
1500        dev->wanted_features &= ~NETIF_F_GRO_HW;
1501        netdev_update_features(dev);
1502
1503        if (unlikely(dev->features & NETIF_F_GRO_HW))
1504                netdev_WARN(dev, "failed to disable GRO_HW!\n");
1505}
1506
1507const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1508{
1509#define N(val)                                          \
1510        case NETDEV_##val:                              \
1511                return "NETDEV_" __stringify(val);
1512        switch (cmd) {
1513        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1514        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1515        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1516        N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1517        N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1518        N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1519        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1520        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1521        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1522        N(PRE_CHANGEADDR)
1523        }
1524#undef N
1525        return "UNKNOWN_NETDEV_EVENT";
1526}
1527EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1528
1529static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1530                                   struct net_device *dev)
1531{
1532        struct netdev_notifier_info info = {
1533                .dev = dev,
1534        };
1535
1536        return nb->notifier_call(nb, val, &info);
1537}
1538
1539static int dev_boot_phase = 1;
1540
1541/**
1542 * register_netdevice_notifier - register a network notifier block
1543 * @nb: notifier
1544 *
1545 * Register a notifier to be called when network device events occur.
1546 * The notifier passed is linked into the kernel structures and must
1547 * not be reused until it has been unregistered. A negative errno code
1548 * is returned on a failure.
1549 *
1550 * When registered all registration and up events are replayed
1551 * to the new notifier to allow device to have a race free
1552 * view of the network device list.
1553 */
1554
1555int register_netdevice_notifier(struct notifier_block *nb)
1556{
1557        struct net_device *dev;
1558        struct net_device *last;
1559        struct net *net;
1560        int err;
1561
1562        /* Close race with setup_net() and cleanup_net() */
1563        down_write(&pernet_ops_rwsem);
1564        rtnl_lock();
1565        err = raw_notifier_chain_register(&netdev_chain, nb);
1566        if (err)
1567                goto unlock;
1568        if (dev_boot_phase)
1569                goto unlock;
1570        for_each_net(net) {
1571                for_each_netdev(net, dev) {
1572                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1573                        err = notifier_to_errno(err);
1574                        if (err)
1575                                goto rollback;
1576
1577                        if (!(dev->flags & IFF_UP))
1578                                continue;
1579
1580                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1581                }
1582        }
1583
1584unlock:
1585        rtnl_unlock();
1586        up_write(&pernet_ops_rwsem);
1587        return err;
1588
1589rollback:
1590        last = dev;
1591        for_each_net(net) {
1592                for_each_netdev(net, dev) {
1593                        if (dev == last)
1594                                goto outroll;
1595
1596                        if (dev->flags & IFF_UP) {
1597                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1598                                                        dev);
1599                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1600                        }
1601                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1602                }
1603        }
1604
1605outroll:
1606        raw_notifier_chain_unregister(&netdev_chain, nb);
1607        goto unlock;
1608}
1609EXPORT_SYMBOL(register_netdevice_notifier);
1610
1611/**
1612 * unregister_netdevice_notifier - unregister a network notifier block
1613 * @nb: notifier
1614 *
1615 * Unregister a notifier previously registered by
1616 * register_netdevice_notifier(). The notifier is unlinked into the
1617 * kernel structures and may then be reused. A negative errno code
1618 * is returned on a failure.
1619 *
1620 * After unregistering unregister and down device events are synthesized
1621 * for all devices on the device list to the removed notifier to remove
1622 * the need for special case cleanup code.
1623 */
1624
1625int unregister_netdevice_notifier(struct notifier_block *nb)
1626{
1627        struct net_device *dev;
1628        struct net *net;
1629        int err;
1630
1631        /* Close race with setup_net() and cleanup_net() */
1632        down_write(&pernet_ops_rwsem);
1633        rtnl_lock();
1634        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1635        if (err)
1636                goto unlock;
1637
1638        for_each_net(net) {
1639                for_each_netdev(net, dev) {
1640                        if (dev->flags & IFF_UP) {
1641                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1642                                                        dev);
1643                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1644                        }
1645                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1646                }
1647        }
1648unlock:
1649        rtnl_unlock();
1650        up_write(&pernet_ops_rwsem);
1651        return err;
1652}
1653EXPORT_SYMBOL(unregister_netdevice_notifier);
1654
1655/**
1656 *      call_netdevice_notifiers_info - call all network notifier blocks
1657 *      @val: value passed unmodified to notifier function
1658 *      @info: notifier information data
1659 *
1660 *      Call all network notifier blocks.  Parameters and return value
1661 *      are as for raw_notifier_call_chain().
1662 */
1663
1664static int call_netdevice_notifiers_info(unsigned long val,
1665                                         struct netdev_notifier_info *info)
1666{
1667        ASSERT_RTNL();
1668        return raw_notifier_call_chain(&netdev_chain, val, info);
1669}
1670
1671static int call_netdevice_notifiers_extack(unsigned long val,
1672                                           struct net_device *dev,
1673                                           struct netlink_ext_ack *extack)
1674{
1675        struct netdev_notifier_info info = {
1676                .dev = dev,
1677                .extack = extack,
1678        };
1679
1680        return call_netdevice_notifiers_info(val, &info);
1681}
1682
1683/**
1684 *      call_netdevice_notifiers - call all network notifier blocks
1685 *      @val: value passed unmodified to notifier function
1686 *      @dev: net_device pointer passed unmodified to notifier function
1687 *
1688 *      Call all network notifier blocks.  Parameters and return value
1689 *      are as for raw_notifier_call_chain().
1690 */
1691
1692int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1693{
1694        return call_netdevice_notifiers_extack(val, dev, NULL);
1695}
1696EXPORT_SYMBOL(call_netdevice_notifiers);
1697
1698/**
1699 *      call_netdevice_notifiers_mtu - call all network notifier blocks
1700 *      @val: value passed unmodified to notifier function
1701 *      @dev: net_device pointer passed unmodified to notifier function
1702 *      @arg: additional u32 argument passed to the notifier function
1703 *
1704 *      Call all network notifier blocks.  Parameters and return value
1705 *      are as for raw_notifier_call_chain().
1706 */
1707static int call_netdevice_notifiers_mtu(unsigned long val,
1708                                        struct net_device *dev, u32 arg)
1709{
1710        struct netdev_notifier_info_ext info = {
1711                .info.dev = dev,
1712                .ext.mtu = arg,
1713        };
1714
1715        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1716
1717        return call_netdevice_notifiers_info(val, &info.info);
1718}
1719
1720#ifdef CONFIG_NET_INGRESS
1721static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1722
1723void net_inc_ingress_queue(void)
1724{
1725        static_branch_inc(&ingress_needed_key);
1726}
1727EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1728
1729void net_dec_ingress_queue(void)
1730{
1731        static_branch_dec(&ingress_needed_key);
1732}
1733EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1734#endif
1735
1736#ifdef CONFIG_NET_EGRESS
1737static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1738
1739void net_inc_egress_queue(void)
1740{
1741        static_branch_inc(&egress_needed_key);
1742}
1743EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1744
1745void net_dec_egress_queue(void)
1746{
1747        static_branch_dec(&egress_needed_key);
1748}
1749EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1750#endif
1751
1752static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1753#ifdef CONFIG_JUMP_LABEL
1754static atomic_t netstamp_needed_deferred;
1755static atomic_t netstamp_wanted;
1756static void netstamp_clear(struct work_struct *work)
1757{
1758        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1759        int wanted;
1760
1761        wanted = atomic_add_return(deferred, &netstamp_wanted);
1762        if (wanted > 0)
1763                static_branch_enable(&netstamp_needed_key);
1764        else
1765                static_branch_disable(&netstamp_needed_key);
1766}
1767static DECLARE_WORK(netstamp_work, netstamp_clear);
1768#endif
1769
1770void net_enable_timestamp(void)
1771{
1772#ifdef CONFIG_JUMP_LABEL
1773        int wanted;
1774
1775        while (1) {
1776                wanted = atomic_read(&netstamp_wanted);
1777                if (wanted <= 0)
1778                        break;
1779                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1780                        return;
1781        }
1782        atomic_inc(&netstamp_needed_deferred);
1783        schedule_work(&netstamp_work);
1784#else
1785        static_branch_inc(&netstamp_needed_key);
1786#endif
1787}
1788EXPORT_SYMBOL(net_enable_timestamp);
1789
1790void net_disable_timestamp(void)
1791{
1792#ifdef CONFIG_JUMP_LABEL
1793        int wanted;
1794
1795        while (1) {
1796                wanted = atomic_read(&netstamp_wanted);
1797                if (wanted <= 1)
1798                        break;
1799                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1800                        return;
1801        }
1802        atomic_dec(&netstamp_needed_deferred);
1803        schedule_work(&netstamp_work);
1804#else
1805        static_branch_dec(&netstamp_needed_key);
1806#endif
1807}
1808EXPORT_SYMBOL(net_disable_timestamp);
1809
1810static inline void net_timestamp_set(struct sk_buff *skb)
1811{
1812        skb->tstamp = 0;
1813        if (static_branch_unlikely(&netstamp_needed_key))
1814                __net_timestamp(skb);
1815}
1816
1817#define net_timestamp_check(COND, SKB)                          \
1818        if (static_branch_unlikely(&netstamp_needed_key)) {     \
1819                if ((COND) && !(SKB)->tstamp)                   \
1820                        __net_timestamp(SKB);                   \
1821        }                                                       \
1822
1823bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1824{
1825        unsigned int len;
1826
1827        if (!(dev->flags & IFF_UP))
1828                return false;
1829
1830        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1831        if (skb->len <= len)
1832                return true;
1833
1834        /* if TSO is enabled, we don't care about the length as the packet
1835         * could be forwarded without being segmented before
1836         */
1837        if (skb_is_gso(skb))
1838                return true;
1839
1840        return false;
1841}
1842EXPORT_SYMBOL_GPL(is_skb_forwardable);
1843
1844int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1845{
1846        int ret = ____dev_forward_skb(dev, skb);
1847
1848        if (likely(!ret)) {
1849                skb->protocol = eth_type_trans(skb, dev);
1850                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1851        }
1852
1853        return ret;
1854}
1855EXPORT_SYMBOL_GPL(__dev_forward_skb);
1856
1857/**
1858 * dev_forward_skb - loopback an skb to another netif
1859 *
1860 * @dev: destination network device
1861 * @skb: buffer to forward
1862 *
1863 * return values:
1864 *      NET_RX_SUCCESS  (no congestion)
1865 *      NET_RX_DROP     (packet was dropped, but freed)
1866 *
1867 * dev_forward_skb can be used for injecting an skb from the
1868 * start_xmit function of one device into the receive queue
1869 * of another device.
1870 *
1871 * The receiving device may be in another namespace, so
1872 * we have to clear all information in the skb that could
1873 * impact namespace isolation.
1874 */
1875int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1876{
1877        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1878}
1879EXPORT_SYMBOL_GPL(dev_forward_skb);
1880
1881static inline int deliver_skb(struct sk_buff *skb,
1882                              struct packet_type *pt_prev,
1883                              struct net_device *orig_dev)
1884{
1885        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1886                return -ENOMEM;
1887        refcount_inc(&skb->users);
1888        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1889}
1890
1891static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1892                                          struct packet_type **pt,
1893                                          struct net_device *orig_dev,
1894                                          __be16 type,
1895                                          struct list_head *ptype_list)
1896{
1897        struct packet_type *ptype, *pt_prev = *pt;
1898
1899        list_for_each_entry_rcu(ptype, ptype_list, list) {
1900                if (ptype->type != type)
1901                        continue;
1902                if (pt_prev)
1903                        deliver_skb(skb, pt_prev, orig_dev);
1904                pt_prev = ptype;
1905        }
1906        *pt = pt_prev;
1907}
1908
1909static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1910{
1911        if (!ptype->af_packet_priv || !skb->sk)
1912                return false;
1913
1914        if (ptype->id_match)
1915                return ptype->id_match(ptype, skb->sk);
1916        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1917                return true;
1918
1919        return false;
1920}
1921
1922/**
1923 * dev_nit_active - return true if any network interface taps are in use
1924 *
1925 * @dev: network device to check for the presence of taps
1926 */
1927bool dev_nit_active(struct net_device *dev)
1928{
1929        return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
1930}
1931EXPORT_SYMBOL_GPL(dev_nit_active);
1932
1933/*
1934 *      Support routine. Sends outgoing frames to any network
1935 *      taps currently in use.
1936 */
1937
1938void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1939{
1940        struct packet_type *ptype;
1941        struct sk_buff *skb2 = NULL;
1942        struct packet_type *pt_prev = NULL;
1943        struct list_head *ptype_list = &ptype_all;
1944
1945        rcu_read_lock();
1946again:
1947        list_for_each_entry_rcu(ptype, ptype_list, list) {
1948                if (ptype->ignore_outgoing)
1949                        continue;
1950
1951                /* Never send packets back to the socket
1952                 * they originated from - MvS (miquels@drinkel.ow.org)
1953                 */
1954                if (skb_loop_sk(ptype, skb))
1955                        continue;
1956
1957                if (pt_prev) {
1958                        deliver_skb(skb2, pt_prev, skb->dev);
1959                        pt_prev = ptype;
1960                        continue;
1961                }
1962
1963                /* need to clone skb, done only once */
1964                skb2 = skb_clone(skb, GFP_ATOMIC);
1965                if (!skb2)
1966                        goto out_unlock;
1967
1968                net_timestamp_set(skb2);
1969
1970                /* skb->nh should be correctly
1971                 * set by sender, so that the second statement is
1972                 * just protection against buggy protocols.
1973                 */
1974                skb_reset_mac_header(skb2);
1975
1976                if (skb_network_header(skb2) < skb2->data ||
1977                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1978                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1979                                             ntohs(skb2->protocol),
1980                                             dev->name);
1981                        skb_reset_network_header(skb2);
1982                }
1983
1984                skb2->transport_header = skb2->network_header;
1985                skb2->pkt_type = PACKET_OUTGOING;
1986                pt_prev = ptype;
1987        }
1988
1989        if (ptype_list == &ptype_all) {
1990                ptype_list = &dev->ptype_all;
1991                goto again;
1992        }
1993out_unlock:
1994        if (pt_prev) {
1995                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
1996                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1997                else
1998                        kfree_skb(skb2);
1999        }
2000        rcu_read_unlock();

2001}
2002EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2003
2004/**
2005 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2006 * @dev: Network device
2007 * @txq: number of queues available
2008 *
2009 * If real_num_tx_queues is changed the tc mappings may no longer be
2010 * valid. To resolve this verify the tc mapping remains valid and if
2011 * not NULL the mapping. With no priorities mapping to this
2012 * offset/count pair it will no longer be used. In the worst case TC0
2013 * is invalid nothing can be done so disable priority mappings. If is
2014 * expected that drivers will fix this mapping if they can before
2015 * calling netif_set_real_num_tx_queues.
2016 */
2017static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2018{
2019        int i;
2020        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2021
2022        /* If TC0 is invalidated disable TC mapping */
2023        if (tc->offset + tc->count > txq) {
2024                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2025                dev->num_tc = 0;
2026                return;
2027        }
2028
2029        /* Invalidated prio to tc mappings set to TC0 */
2030        for (i = 1; i < TC_BITMASK + 1; i++) {
2031                int q = netdev_get_prio_tc_map(dev, i);
2032
2033                tc = &dev->tc_to_txq[q];
2034                if (tc->offset + tc->count > txq) {
2035                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2036                                i, q);
2037                        netdev_set_prio_tc_map(dev, i, 0);
2038                }
2039        }
2040}
2041
2042int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2043{
2044        if (dev->num_tc) {
2045                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2046                int i;
2047
2048                /* walk through the TCs and see if it falls into any of them */
2049                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2050                        if ((txq - tc->offset) < tc->count)
2051                                return i;
2052                }
2053
2054                /* didn't find it, just return -1 to indicate no match */
2055                return -1;
2056        }
2057
2058        return 0;
2059}
2060EXPORT_SYMBOL(netdev_txq_to_tc);
2061
2062#ifdef CONFIG_XPS
2063struct static_key xps_needed __read_mostly;
2064EXPORT_SYMBOL(xps_needed);
2065struct static_key xps_rxqs_needed __read_mostly;
2066EXPORT_SYMBOL(xps_rxqs_needed);
2067static DEFINE_MUTEX(xps_map_mutex);
2068#define xmap_dereference(P)             \
2069        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2070
2071static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2072                             int tci, u16 index)
2073{
2074        struct xps_map *map = NULL;
2075        int pos;
2076
2077        if (dev_maps)
2078                map = xmap_dereference(dev_maps->attr_map[tci]);
2079        if (!map)
2080                return false;
2081
2082        for (pos = map->len; pos--;) {
2083                if (map->queues[pos] != index)
2084                        continue;
2085
2086                if (map->len > 1) {
2087                        map->queues[pos] = map->queues[--map->len];
2088                        break;
2089                }
2090
2091                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2092                kfree_rcu(map, rcu);
2093                return false;
2094        }
2095
2096        return true;
2097}
2098
2099static bool remove_xps_queue_cpu(struct net_device *dev,
2100                                 struct xps_dev_maps *dev_maps,
2101                                 int cpu, u16 offset, u16 count)
2102{
2103        int num_tc = dev->num_tc ? : 1;
2104        bool active = false;
2105        int tci;
2106
2107        for (tci = cpu * num_tc; num_tc--; tci++) {
2108                int i, j;
2109
2110                for (i = count, j = offset; i--; j++) {
2111                        if (!remove_xps_queue(dev_maps, tci, j))
2112                                break;
2113                }
2114
2115                active |= i < 0;
2116        }
2117
2118        return active;
2119}
2120
2121static void reset_xps_maps(struct net_device *dev,
2122                           struct xps_dev_maps *dev_maps,
2123                           bool is_rxqs_map)
2124{
2125        if (is_rxqs_map) {
2126                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2127                RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2128        } else {
2129                RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2130        }
2131        static_key_slow_dec_cpuslocked(&xps_needed);
2132        kfree_rcu(dev_maps, rcu);
2133}
2134
2135static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2136                           struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2137                           u16 offset, u16 count, bool is_rxqs_map)
2138{
2139        bool active = false;
2140        int i, j;
2141
2142        for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2143             j < nr_ids;)
2144                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2145                                               count);
2146        if (!active)
2147                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2148
2149        if (!is_rxqs_map) {
2150                for (i = offset + (count - 1); count--; i--) {
2151                        netdev_queue_numa_node_write(
2152                                netdev_get_tx_queue(dev, i),
2153                                NUMA_NO_NODE);
2154                }
2155        }
2156}
2157
2158static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2159                                   u16 count)
2160{
2161        const unsigned long *possible_mask = NULL;
2162        struct xps_dev_maps *dev_maps;
2163        unsigned int nr_ids;
2164
2165        if (!static_key_false(&xps_needed))
2166                return;
2167
2168        cpus_read_lock();
2169        mutex_lock(&xps_map_mutex);
2170
2171        if (static_key_false(&xps_rxqs_needed)) {
2172                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2173                if (dev_maps) {
2174                        nr_ids = dev->num_rx_queues;
2175                        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2176                                       offset, count, true);
2177                }
2178        }
2179
2180        dev_maps = xmap_dereference(dev->xps_cpus_map);
2181        if (!dev_maps)
2182                goto out_no_maps;
2183
2184        if (num_possible_cpus() > 1)
2185                possible_mask = cpumask_bits(cpu_possible_mask);
2186        nr_ids = nr_cpu_ids;
2187        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2188                       false);
2189
2190out_no_maps:
2191        mutex_unlock(&xps_map_mutex);
2192        cpus_read_unlock();
2193}
2194
2195static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2196{
2197        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2198}
2199
2200static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2201                                      u16 index, bool is_rxqs_map)
2202{
2203        struct xps_map *new_map;
2204        int alloc_len = XPS_MIN_MAP_ALLOC;
2205        int i, pos;
2206
2207        for (pos = 0; map && pos < map->len; pos++) {
2208                if (map->queues[pos] != index)
2209                        continue;
2210                return map;
2211        }
2212
2213        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2214        if (map) {
2215                if (pos < map->alloc_len)
2216                        return map;
2217
2218                alloc_len = map->alloc_len * 2;
2219        }
2220
2221        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2222         *  map
2223         */
2224        if (is_rxqs_map)
2225                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2226        else
2227                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2228                                       cpu_to_node(attr_index));
2229        if (!new_map)
2230                return NULL;
2231
2232        for (i = 0; i < pos; i++)
2233                new_map->queues[i] = map->queues[i];
2234        new_map->alloc_len = alloc_len;
2235        new_map->len = pos;
2236
2237        return new_map;
2238}
2239
2240/* Must be called under cpus_read_lock */
2241int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2242                          u16 index, bool is_rxqs_map)
2243{
2244        const unsigned long *online_mask = NULL, *possible_mask = NULL;
2245        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2246        int i, j, tci, numa_node_id = -2;
2247        int maps_sz, num_tc = 1, tc = 0;
2248        struct xps_map *map, *new_map;
2249        bool active = false;
2250        unsigned int nr_ids;
2251
2252        if (dev->num_tc) {
2253                /* Do not allow XPS on subordinate device directly */
2254                num_tc = dev->num_tc;
2255                if (num_tc < 0)
2256                        return -EINVAL;
2257
2258                /* If queue belongs to subordinate dev use its map */
2259                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2260
2261                tc = netdev_txq_to_tc(dev, index);
2262                if (tc < 0)
2263                        return -EINVAL;
2264        }
2265
2266        mutex_lock(&xps_map_mutex);
2267        if (is_rxqs_map) {
2268                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2269                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2270                nr_ids = dev->num_rx_queues;
2271        } else {
2272                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2273                if (num_possible_cpus() > 1) {
2274                        online_mask = cpumask_bits(cpu_online_mask);
2275                        possible_mask = cpumask_bits(cpu_possible_mask);
2276                }
2277                dev_maps = xmap_dereference(dev->xps_cpus_map);
2278                nr_ids = nr_cpu_ids;
2279        }
2280
2281        if (maps_sz < L1_CACHE_BYTES)
2282                maps_sz = L1_CACHE_BYTES;
2283
2284        /* allocate memory for queue storage */
2285        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2286             j < nr_ids;) {
2287                if (!new_dev_maps)
2288                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2289                if (!new_dev_maps) {
2290                        mutex_unlock(&xps_map_mutex);
2291                        return -ENOMEM;
2292                }
2293
2294                tci = j * num_tc + tc;
2295                map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2296                                 NULL;
2297
2298                map = expand_xps_map(map, j, index, is_rxqs_map);
2299                if (!map)
2300                        goto error;
2301
2302                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2303        }
2304
2305        if (!new_dev_maps)
2306                goto out_no_new_maps;
2307
2308        if (!dev_maps) {
2309                /* Increment static keys at most once per type */
2310                static_key_slow_inc_cpuslocked(&xps_needed);
2311                if (is_rxqs_map)
2312                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2313        }
2314
2315        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2316             j < nr_ids;) {
2317                /* copy maps belonging to foreign traffic classes */
2318                for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2319                        /* fill in the new device map from the old device map */
2320                        map = xmap_dereference(dev_maps->attr_map[tci]);
2321                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2322                }
2323
2324                /* We need to explicitly update tci as prevous loop
2325                 * could break out early if dev_maps is NULL.
2326                 */
2327                tci = j * num_tc + tc;
2328
2329                if (netif_attr_test_mask(j, mask, nr_ids) &&
2330                    netif_attr_test_online(j, online_mask, nr_ids)) {
2331                        /* add tx-queue to CPU/rx-queue maps */
2332                        int pos = 0;
2333
2334                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
2335                        while ((pos < map->len) && (map->queues[pos] != index))
2336                                pos++;
2337
2338                        if (pos == map->len)
2339                                map->queues[map->len++] = index;
2340#ifdef CONFIG_NUMA
2341                        if (!is_rxqs_map) {
2342                                if (numa_node_id == -2)
2343                                        numa_node_id = cpu_to_node(j);
2344                                else if (numa_node_id != cpu_to_node(j))
2345                                        numa_node_id = -1;
2346                        }
2347#endif
2348                } else if (dev_maps) {
2349                        /* fill in the new device map from the old device map */
2350                        map = xmap_dereference(dev_maps->attr_map[tci]);
2351                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2352                }
2353
2354                /* copy maps belonging to foreign traffic classes */
2355                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2356                        /* fill in the new device map from the old device map */
2357                        map = xmap_dereference(dev_maps->attr_map[tci]);
2358                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2359                }
2360        }
2361
2362        if (is_rxqs_map)
2363                rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2364        else
2365                rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2366
2367        /* Cleanup old maps */
2368        if (!dev_maps)
2369                goto out_no_old_maps;
2370
2371        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2372             j < nr_ids;) {
2373                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2374                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2375                        map = xmap_dereference(dev_maps->attr_map[tci]);
2376                        if (map && map != new_map)
2377                                kfree_rcu(map, rcu);
2378                }
2379        }
2380
2381        kfree_rcu(dev_maps, rcu);
2382
2383out_no_old_maps:
2384        dev_maps = new_dev_maps;
2385        active = true;
2386
2387out_no_new_maps:
2388        if (!is_rxqs_map) {
2389                /* update Tx queue numa node */
2390                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2391                                             (numa_node_id >= 0) ?
2392                                             numa_node_id : NUMA_NO_NODE);
2393        }
2394
2395        if (!dev_maps)
2396                goto out_no_maps;
2397
2398        /* removes tx-queue from unused CPUs/rx-queues */
2399        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2400             j < nr_ids;) {
2401                for (i = tc, tci = j * num_tc; i--; tci++)
2402                        active |= remove_xps_queue(dev_maps, tci, index);
2403                if (!netif_attr_test_mask(j, mask, nr_ids) ||
2404                    !netif_attr_test_online(j, online_mask, nr_ids))
2405                        active |= remove_xps_queue(dev_maps, tci, index);
2406                for (i = num_tc - tc, tci++; --i; tci++)
2407                        active |= remove_xps_queue(dev_maps, tci, index);
2408        }
2409
2410        /* free map if not active */
2411        if (!active)
2412                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2413
2414out_no_maps:
2415        mutex_unlock(&xps_map_mutex);
2416
2417        return 0;
2418error:
2419        /* remove any maps that we added */
2420        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2421             j < nr_ids;) {
2422                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2423                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2424                        map = dev_maps ?
2425                              xmap_dereference(dev_maps->attr_map[tci]) :
2426                              NULL;
2427                        if (new_map && new_map != map)
2428                                kfree(new_map);
2429                }
2430        }
2431
2432        mutex_unlock(&xps_map_mutex);
2433
2434        kfree(new_dev_maps);
2435        return -ENOMEM;
2436}
2437EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2438
2439int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2440                        u16 index)
2441{
2442        int ret;
2443
2444        cpus_read_lock();
2445        ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2446        cpus_read_unlock();
2447
2448        return ret;
2449}
2450EXPORT_SYMBOL(netif_set_xps_queue);
2451
2452#endif
2453static void netdev_unbind_all_sb_channels(struct net_device *dev)
2454{
2455        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2456
2457        /* Unbind any subordinate channels */
2458        while (txq-- != &dev->_tx[0]) {
2459                if (txq->sb_dev)
2460                        netdev_unbind_sb_channel(dev, txq->sb_dev);
2461        }
2462}
2463
2464void netdev_reset_tc(struct net_device *dev)
2465{
2466#ifdef CONFIG_XPS
2467        netif_reset_xps_queues_gt(dev, 0);
2468#endif
2469        netdev_unbind_all_sb_channels(dev);
2470
2471        /* Reset TC configuration of device */
2472        dev->num_tc = 0;
2473        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2474        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2475}
2476EXPORT_SYMBOL(netdev_reset_tc);
2477
2478int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2479{
2480        if (tc >= dev->num_tc)
2481                return -EINVAL;
2482
2483#ifdef CONFIG_XPS
2484        netif_reset_xps_queues(dev, offset, count);
2485#endif
2486        dev->tc_to_txq[tc].count = count;
2487        dev->tc_to_txq[tc].offset = offset;
2488        return 0;
2489}
2490EXPORT_SYMBOL(netdev_set_tc_queue);
2491
2492int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2493{
2494        if (num_tc > TC_MAX_QUEUE)
2495                return -EINVAL;
2496
2497#ifdef CONFIG_XPS
2498        netif_reset_xps_queues_gt(dev, 0);
2499#endif
2500        netdev_unbind_all_sb_channels(dev);
2501
2502        dev->num_tc = num_tc;
2503        return 0;
2504}
2505EXPORT_SYMBOL(netdev_set_num_tc);
2506
2507void netdev_unbind_sb_channel(struct net_device *dev,
2508                              struct net_device *sb_dev)
2509{
2510        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2511
2512#ifdef CONFIG_XPS
2513        netif_reset_xps_queues_gt(sb_dev, 0);
2514#endif
2515        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2516        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2517
2518        while (txq-- != &dev->_tx[0]) {
2519                if (txq->sb_dev == sb_dev)
2520                        txq->sb_dev = NULL;
2521        }
2522}
2523EXPORT_SYMBOL(netdev_unbind_sb_channel);
2524
2525int netdev_bind_sb_channel_queue(struct net_device *dev,
2526                                 struct net_device *sb_dev,
2527                                 u8 tc, u16 count, u16 offset)
2528{
2529        /* Make certain the sb_dev and dev are already configured */
2530        if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2531                return -EINVAL;
2532
2533        /* We cannot hand out queues we don't have */
2534        if ((offset + count) > dev->real_num_tx_queues)
2535                return -EINVAL;
2536
2537        /* Record the mapping */
2538        sb_dev->tc_to_txq[tc].count = count;
2539        sb_dev->tc_to_txq[tc].offset = offset;
2540
2541        /* Provide a way for Tx queue to find the tc_to_txq map or
2542         * XPS map for itself.
2543         */
2544        while (count--)
2545                netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2546
2547        return 0;
2548}
2549EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2550
2551int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2552{
2553        /* Do not use a multiqueue device to represent a subordinate channel */
2554        if (netif_is_multiqueue(dev))
2555                return -ENODEV;
2556
2557        /* We allow channels 1 - 32767 to be used for subordinate channels.
2558         * Channel 0 is meant to be "native" mode and used only to represent
2559         * the main root device. We allow writing 0 to reset the device back
2560         * to normal mode after being used as a subordinate channel.
2561         */
2562        if (channel > S16_MAX)
2563                return -EINVAL;
2564
2565        dev->num_tc = -channel;
2566
2567        return 0;
2568}
2569EXPORT_SYMBOL(netdev_set_sb_channel);
2570
2571/*
2572 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2573 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2574 */
2575int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2576{
2577        bool disabling;
2578        int rc;
2579
2580        disabling = txq < dev->real_num_tx_queues;
2581
2582        if (txq < 1 || txq > dev->num_tx_queues)
2583                return -EINVAL;
2584
2585        if (dev->reg_state == NETREG_REGISTERED ||
2586            dev->reg_state == NETREG_UNREGISTERING) {
2587                ASSERT_RTNL();
2588
2589                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2590                                                  txq);
2591                if (rc)
2592                        return rc;
2593
2594                if (dev->num_tc)
2595                        netif_setup_tc(dev, txq);
2596
2597                dev->real_num_tx_queues = txq;
2598
2599                if (disabling) {
2600                        synchronize_net();
2601                        qdisc_reset_all_tx_gt(dev, txq);
2602#ifdef CONFIG_XPS
2603                        netif_reset_xps_queues_gt(dev, txq);
2604#endif
2605                }
2606        } else {
2607                dev->real_num_tx_queues = txq;
2608        }
2609
2610        return 0;
2611}
2612EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2613
2614#ifdef CONFIG_SYSFS
2615/**
2616 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2617 *      @dev: Network device
2618 *      @rxq: Actual number of RX queues
2619 *
2620 *      This must be called either with the rtnl_lock held or before
2621 *      registration of the net device.  Returns 0 on success, or a
2622 *      negative error code.  If called before registration, it always
2623 *      succeeds.
2624 */
2625int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2626{
2627        int rc;
2628
2629        if (rxq < 1 || rxq > dev->num_rx_queues)
2630                return -EINVAL;
2631
2632        if (dev->reg_state == NETREG_REGISTERED) {
2633                ASSERT_RTNL();
2634
2635                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2636                                                  rxq);
2637                if (rc)
2638                        return rc;
2639        }
2640
2641        dev->real_num_rx_queues = rxq;
2642        return 0;
2643}
2644EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2645#endif
2646
2647/**
2648 * netif_get_num_default_rss_queues - default number of RSS queues
2649 *
2650 * This routine should set an upper limit on the number of RSS queues
2651 * used by default by multiqueue devices.
2652 */
2653int netif_get_num_default_rss_queues(void)
2654{
2655        return is_kdump_kernel() ?
2656                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2657}
2658EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2659
2660static void __netif_reschedule(struct Qdisc *q)
2661{
2662        struct softnet_data *sd;
2663        unsigned long flags;
2664
2665        local_irq_save(flags);
2666        sd = this_cpu_ptr(&softnet_data);
2667        q->next_sched = NULL;
2668        *sd->output_queue_tailp = q;
2669        sd->output_queue_tailp = &q->next_sched;
2670        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2671        local_irq_restore(flags);
2672}
2673
2674void __netif_schedule(struct Qdisc *q)
2675{
2676        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2677                __netif_reschedule(q);
2678}
2679EXPORT_SYMBOL(__netif_schedule);
2680
2681struct dev_kfree_skb_cb {
2682        enum skb_free_reason reason;
2683};
2684
2685static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2686{
2687        return (struct dev_kfree_skb_cb *)skb->cb;
2688}
2689
2690void netif_schedule_queue(struct netdev_queue *txq)
2691{
2692        rcu_read_lock();
2693        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2694                struct Qdisc *q = rcu_dereference(txq->qdisc);
2695
2696                __netif_schedule(q);
2697        }
2698        rcu_read_unlock();
2699}
2700EXPORT_SYMBOL(netif_schedule_queue);
2701
2702void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2703{
2704        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2705                struct Qdisc *q;
2706
2707                rcu_read_lock();
2708                q = rcu_dereference(dev_queue->qdisc);
2709                __netif_schedule(q);
2710                rcu_read_unlock();
2711        }
2712}
2713EXPORT_SYMBOL(netif_tx_wake_queue);
2714
2715void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2716{
2717        unsigned long flags;
2718
2719        if (unlikely(!skb))
2720                return;
2721
2722        if (likely(refcount_read(&skb->users) == 1)) {
2723                smp_rmb();
2724                refcount_set(&skb->users, 0);
2725        } else if (likely(!refcount_dec_and_test(&skb->users))) {
2726                return;
2727        }
2728        get_kfree_skb_cb(skb)->reason = reason;
2729        local_irq_save(flags);
2730        skb->next = __this_cpu_read(softnet_data.completion_queue);
2731        __this_cpu_write(softnet_data.completion_queue, skb);
2732        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2733        local_irq_restore(flags);
2734}
2735EXPORT_SYMBOL(__dev_kfree_skb_irq);
2736
2737void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2738{
2739        if (in_irq() || irqs_disabled())
2740                __dev_kfree_skb_irq(skb, reason);
2741        else
2742                dev_kfree_skb(skb);
2743}
2744EXPORT_SYMBOL(__dev_kfree_skb_any);
2745
2746
2747/**
2748 * netif_device_detach - mark device as removed
2749 * @dev: network device
2750 *
2751 * Mark device as removed from system and therefore no longer available.
2752 */
2753void netif_device_detach(struct net_device *dev)
2754{
2755        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2756            netif_running(dev)) {
2757                netif_tx_stop_all_queues(dev);
2758        }
2759}
2760EXPORT_SYMBOL(netif_device_detach);
2761
2762/**
2763 * netif_device_attach - mark device as attached
2764 * @dev: network device
2765 *
2766 * Mark device as attached from system and restart if needed.
2767 */
2768void netif_device_attach(struct net_device *dev)
2769{
2770        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2771            netif_running(dev)) {
2772                netif_tx_wake_all_queues(dev);
2773                __netdev_watchdog_up(dev);
2774        }
2775}
2776EXPORT_SYMBOL(netif_device_attach);
2777
2778/*
2779 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2780 * to be used as a distribution range.
2781 */
2782static u16 skb_tx_hash(const struct net_device *dev,
2783                       const struct net_device *sb_dev,
2784                       struct sk_buff *skb)
2785{
2786        u32 hash;
2787        u16 qoffset = 0;
2788        u16 qcount = dev->real_num_tx_queues;
2789
2790        if (dev->num_tc) {
2791                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2792
2793                qoffset = sb_dev->tc_to_txq[tc].offset;
2794                qcount = sb_dev->tc_to_txq[tc].count;
2795        }
2796
2797        if (skb_rx_queue_recorded(skb)) {
2798                hash = skb_get_rx_queue(skb);
2799                while (unlikely(hash >= qcount))
2800                        hash -= qcount;
2801                return hash + qoffset;
2802        }
2803
2804        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2805}
2806
2807static void skb_warn_bad_offload(const struct sk_buff *skb)
2808{
2809        static const netdev_features_t null_features;
2810        struct net_device *dev = skb->dev;
2811        const char *name = "";
2812
2813        if (!net_ratelimit())
2814                return;
2815
2816        if (dev) {
2817                if (dev->dev.parent)
2818                        name = dev_driver_string(dev->dev.parent);
2819                else
2820                        name = netdev_name(dev);
2821        }
2822        skb_dump(KERN_WARNING, skb, false);
2823        WARN(1, "%s: caps=(%pNF, %pNF)\n",
2824             name, dev ? &dev->features : &null_features,
2825             skb->sk ? &skb->sk->sk_route_caps : &null_features);
2826}
2827
2828/*
2829 * Invalidate hardware checksum when packet is to be mangled, and
2830 * complete checksum manually on outgoing path.
2831 */
2832int skb_checksum_help(struct sk_buff *skb)
2833{
2834        __wsum csum;
2835        int ret = 0, offset;
2836
2837        if (skb->ip_summed == CHECKSUM_COMPLETE)
2838                goto out_set_summed;
2839
2840        if (unlikely(skb_shinfo(skb)->gso_size)) {
2841                skb_warn_bad_offload(skb);
2842                return -EINVAL;
2843        }
2844
2845        /* Before computing a checksum, we should make sure no frag could
2846         * be modified by an external entity : checksum could be wrong.
2847         */
2848        if (skb_has_shared_frag(skb)) {
2849                ret = __skb_linearize(skb);
2850                if (ret)
2851                        goto out;
2852        }
2853
2854        offset = skb_checksum_start_offset(skb);
2855        BUG_ON(offset >= skb_headlen(skb));
2856        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2857
2858        offset += skb->csum_offset;
2859        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2860
2861        if (skb_cloned(skb) &&
2862            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2863                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2864                if (ret)
2865                        goto out;
2866        }
2867
2868        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2869out_set_summed:
2870        skb->ip_summed = CHECKSUM_NONE;
2871out:
2872        return ret;
2873}
2874EXPORT_SYMBOL(skb_checksum_help);
2875
2876int skb_crc32c_csum_help(struct sk_buff *skb)
2877{
2878        __le32 crc32c_csum;
2879        int ret = 0, offset, start;
2880
2881        if (skb->ip_summed != CHECKSUM_PARTIAL)
2882                goto out;
2883
2884        if (unlikely(skb_is_gso(skb)))
2885                goto out;
2886
2887        /* Before computing a checksum, we should make sure no frag could
2888         * be modified by an external entity : checksum could be wrong.
2889         */
2890        if (unlikely(skb_has_shared_frag(skb))) {
2891                ret = __skb_linearize(skb);
2892                if (ret)
2893                        goto out;
2894        }
2895        start = skb_checksum_start_offset(skb);
2896        offset = start + offsetof(struct sctphdr, checksum);
2897        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2898                ret = -EINVAL;
2899                goto out;
2900        }
2901        if (skb_cloned(skb) &&
2902            !skb_clone_writable(skb, offset + sizeof(__le32))) {
2903                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2904                if (ret)
2905                        goto out;
2906        }
2907        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2908                                                  skb->len - start, ~(__u32)0,
2909                                                  crc32c_csum_stub));
2910        *(__le32 *)(skb->data + offset) = crc32c_csum;
2911        skb->ip_summed = CHECKSUM_NONE;
2912        skb->csum_not_inet = 0;
2913out:
2914        return ret;
2915}
2916
2917__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2918{
2919        __be16 type = skb->protocol;
2920
2921        /* Tunnel gso handlers can set protocol to ethernet. */
2922        if (type == htons(ETH_P_TEB)) {
2923                struct ethhdr *eth;
2924
2925                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2926                        return 0;
2927
2928                eth = (struct ethhdr *)skb->data;
2929                type = eth->h_proto;
2930        }
2931
2932        return __vlan_get_protocol(skb, type, depth);
2933}
2934
2935/**
2936 *      skb_mac_gso_segment - mac layer segmentation handler.
2937 *      @skb: buffer to segment
2938 *      @features: features for the output path (see dev->features)
2939 */
2940struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2941                                    netdev_features_t features)
2942{
2943        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2944        struct packet_offload *ptype;
2945        int vlan_depth = skb->mac_len;
2946        __be16 type = skb_network_protocol(skb, &vlan_depth);
2947
2948        if (unlikely(!type))
2949                return ERR_PTR(-EINVAL);
2950
2951        __skb_pull(skb, vlan_depth);
2952
2953        rcu_read_lock();
2954        list_for_each_entry_rcu(ptype, &offload_base, list) {
2955                if (ptype->type == type && ptype->callbacks.gso_segment) {
2956                        segs = ptype->callbacks.gso_segment(skb, features);
2957                        break;
2958                }
2959        }
2960        rcu_read_unlock();
2961
2962        __skb_push(skb, skb->data - skb_mac_header(skb));
2963
2964        return segs;
2965}
2966EXPORT_SYMBOL(skb_mac_gso_segment);
2967
2968
2969/* openvswitch calls this on rx path, so we need a different check.
2970 */
2971static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2972{
2973        if (tx_path)
2974                return skb->ip_summed != CHECKSUM_PARTIAL &&
2975                       skb->ip_summed != CHECKSUM_UNNECESSARY;
2976
2977        return skb->ip_summed == CHECKSUM_NONE;
2978}
2979
2980/**
2981 *      __skb_gso_segment - Perform segmentation on skb.
2982 *      @skb: buffer to segment
2983 *      @features: features for the output path (see dev->features)
2984 *      @tx_path: whether it is called in TX path
2985 *
2986 *      This function segments the given skb and returns a list of segments.
2987 *
2988 *      It may return NULL if the skb requires no segmentation.  This is
2989 *      only possible when GSO is used for verifying header integrity.
2990 *
2991 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2992 */
2993struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2994                                  netdev_features_t features, bool tx_path)
2995{
2996        struct sk_buff *segs;
2997
2998        if (unlikely(skb_needs_check(skb, tx_path))) {
2999                int err;
3000

3001                /* We're going to init ->check field in TCP or UDP header */
3002                err = skb_cow_head(skb, 0);
3003                if (err < 0)
3004                        return ERR_PTR(err);
3005        }
3006
3007        /* Only report GSO partial support if it will enable us to
3008         * support segmentation on this frame without needing additional
3009         * work.
3010         */
3011        if (features & NETIF_F_GSO_PARTIAL) {
3012                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3013                struct net_device *dev = skb->dev;
3014
3015                partial_features |= dev->features & dev->gso_partial_features;
3016                if (!skb_gso_ok(skb, features | partial_features))
3017                        features &= ~NETIF_F_GSO_PARTIAL;
3018        }
3019
3020        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3021                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3022
3023        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3024        SKB_GSO_CB(skb)->encap_level = 0;
3025
3026        skb_reset_mac_header(skb);
3027        skb_reset_mac_len(skb);
3028
3029        segs = skb_mac_gso_segment(skb, features);
3030
3031        if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3032                skb_warn_bad_offload(skb);
3033
3034        return segs;
3035}
3036EXPORT_SYMBOL(__skb_gso_segment);
3037
3038/* Take action when hardware reception checksum errors are detected. */
3039#ifdef CONFIG_BUG
3040void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3041{
3042        if (net_ratelimit()) {
3043                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3044                skb_dump(KERN_ERR, skb, true);
3045                dump_stack();
3046        }
3047}
3048EXPORT_SYMBOL(netdev_rx_csum_fault);
3049#endif
3050
3051/* XXX: check that highmem exists at all on the given machine. */
3052static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3053{
3054#ifdef CONFIG_HIGHMEM
3055        int i;
3056
3057        if (!(dev->features & NETIF_F_HIGHDMA)) {
3058                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3059                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3060
3061                        if (PageHighMem(skb_frag_page(frag)))
3062                                return 1;
3063                }
3064        }
3065#endif
3066        return 0;
3067}
3068
3069/* If MPLS offload request, verify we are testing hardware MPLS features
3070 * instead of standard features for the netdev.
3071 */
3072#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3073static netdev_features_t net_mpls_features(struct sk_buff *skb,
3074                                           netdev_features_t features,
3075                                           __be16 type)
3076{
3077        if (eth_p_mpls(type))
3078                features &= skb->dev->mpls_features;
3079
3080        return features;
3081}
3082#else
3083static netdev_features_t net_mpls_features(struct sk_buff *skb,
3084                                           netdev_features_t features,
3085                                           __be16 type)
3086{
3087        return features;
3088}
3089#endif
3090
3091static netdev_features_t harmonize_features(struct sk_buff *skb,
3092        netdev_features_t features)
3093{
3094        int tmp;
3095        __be16 type;
3096
3097        type = skb_network_protocol(skb, &tmp);
3098        features = net_mpls_features(skb, features, type);
3099
3100        if (skb->ip_summed != CHECKSUM_NONE &&
3101            !can_checksum_protocol(features, type)) {
3102                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3103        }
3104        if (illegal_highdma(skb->dev, skb))
3105                features &= ~NETIF_F_SG;
3106
3107        return features;
3108}
3109
3110netdev_features_t passthru_features_check(struct sk_buff *skb,
3111                                          struct net_device *dev,
3112                                          netdev_features_t features)
3113{
3114        return features;
3115}
3116EXPORT_SYMBOL(passthru_features_check);
3117
3118static netdev_features_t dflt_features_check(struct sk_buff *skb,
3119                                             struct net_device *dev,
3120                                             netdev_features_t features)
3121{
3122        return vlan_features_check(skb, features);
3123}
3124
3125static netdev_features_t gso_features_check(const struct sk_buff *skb,
3126                                            struct net_device *dev,
3127                                            netdev_features_t features)
3128{
3129        u16 gso_segs = skb_shinfo(skb)->gso_segs;
3130
3131        if (gso_segs > dev->gso_max_segs)
3132                return features & ~NETIF_F_GSO_MASK;
3133
3134        /* Support for GSO partial features requires software
3135         * intervention before we can actually process the packets
3136         * so we need to strip support for any partial features now
3137         * and we can pull them back in after we have partially
3138         * segmented the frame.
3139         */
3140        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3141                features &= ~dev->gso_partial_features;
3142
3143        /* Make sure to clear the IPv4 ID mangling feature if the
3144         * IPv4 header has the potential to be fragmented.
3145         */
3146        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3147                struct iphdr *iph = skb->encapsulation ?
3148                                    inner_ip_hdr(skb) : ip_hdr(skb);
3149
3150                if (!(iph->frag_off & htons(IP_DF)))
3151                        features &= ~NETIF_F_TSO_MANGLEID;
3152        }
3153
3154        return features;
3155}
3156
3157netdev_features_t netif_skb_features(struct sk_buff *skb)
3158{
3159        struct net_device *dev = skb->dev;
3160        netdev_features_t features = dev->features;
3161
3162        if (skb_is_gso(skb))
3163                features = gso_features_check(skb, dev, features);
3164
3165        /* If encapsulation offload request, verify we are testing
3166         * hardware encapsulation features instead of standard
3167         * features for the netdev
3168         */
3169        if (skb->encapsulation)
3170                features &= dev->hw_enc_features;
3171
3172        if (skb_vlan_tagged(skb))
3173                features = netdev_intersect_features(features,
3174                                                     dev->vlan_features |
3175                                                     NETIF_F_HW_VLAN_CTAG_TX |
3176                                                     NETIF_F_HW_VLAN_STAG_TX);
3177
3178        if (dev->netdev_ops->ndo_features_check)
3179                features &= dev->netdev_ops->ndo_features_check(skb, dev,
3180                                                                features);
3181        else
3182                features &= dflt_features_check(skb, dev, features);
3183
3184        return harmonize_features(skb, features);
3185}
3186EXPORT_SYMBOL(netif_skb_features);
3187
3188static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3189                    struct netdev_queue *txq, bool more)
3190{
3191        unsigned int len;
3192        int rc;
3193
3194        if (dev_nit_active(dev))
3195                dev_queue_xmit_nit(skb, dev);
3196
3197        len = skb->len;
3198        trace_net_dev_start_xmit(skb, dev);
3199        rc = netdev_start_xmit(skb, dev, txq, more);
3200        trace_net_dev_xmit(skb, rc, dev, len);
3201
3202        return rc;
3203}
3204
3205struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3206                                    struct netdev_queue *txq, int *ret)
3207{
3208        struct sk_buff *skb = first;
3209        int rc = NETDEV_TX_OK;
3210
3211        while (skb) {
3212                struct sk_buff *next = skb->next;
3213
3214                skb_mark_not_on_list(skb);
3215                rc = xmit_one(skb, dev, txq, next != NULL);
3216                if (unlikely(!dev_xmit_complete(rc))) {
3217                        skb->next = next;
3218                        goto out;
3219                }
3220
3221                skb = next;
3222                if (netif_tx_queue_stopped(txq) && skb) {
3223                        rc = NETDEV_TX_BUSY;
3224                        break;
3225                }
3226        }
3227
3228out:
3229        *ret = rc;
3230        return skb;
3231}
3232
3233static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3234                                          netdev_features_t features)
3235{
3236        if (skb_vlan_tag_present(skb) &&
3237            !vlan_hw_offload_capable(features, skb->vlan_proto))
3238                skb = __vlan_hwaccel_push_inside(skb);
3239        return skb;
3240}
3241
3242int skb_csum_hwoffload_help(struct sk_buff *skb,
3243                            const netdev_features_t features)
3244{
3245        if (unlikely(skb->csum_not_inet))
3246                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3247                        skb_crc32c_csum_help(skb);
3248
3249        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3250}
3251EXPORT_SYMBOL(skb_csum_hwoffload_help);
3252
3253static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3254{
3255        netdev_features_t features;
3256
3257        features = netif_skb_features(skb);
3258        skb = validate_xmit_vlan(skb, features);
3259        if (unlikely(!skb))
3260                goto out_null;
3261
3262        skb = sk_validate_xmit_skb(skb, dev);
3263        if (unlikely(!skb))
3264                goto out_null;
3265
3266        if (netif_needs_gso(skb, features)) {
3267                struct sk_buff *segs;
3268
3269                segs = skb_gso_segment(skb, features);
3270                if (IS_ERR(segs)) {
3271                        goto out_kfree_skb;
3272                } else if (segs) {
3273                        consume_skb(skb);
3274                        skb = segs;
3275                }
3276        } else {
3277                if (skb_needs_linearize(skb, features) &&
3278                    __skb_linearize(skb))
3279                        goto out_kfree_skb;
3280
3281                /* If packet is not checksummed and device does not
3282                 * support checksumming for this protocol, complete
3283                 * checksumming here.
3284                 */
3285                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3286                        if (skb->encapsulation)
3287                                skb_set_inner_transport_header(skb,
3288                                                               skb_checksum_start_offset(skb));
3289                        else
3290                                skb_set_transport_header(skb,
3291                                                         skb_checksum_start_offset(skb));
3292                        if (skb_csum_hwoffload_help(skb, features))
3293                                goto out_kfree_skb;
3294                }
3295        }
3296
3297        skb = validate_xmit_xfrm(skb, features, again);
3298
3299        return skb;
3300
3301out_kfree_skb:
3302        kfree_skb(skb);
3303out_null:
3304        atomic_long_inc(&dev->tx_dropped);
3305        return NULL;
3306}
3307
3308struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3309{
3310        struct sk_buff *next, *head = NULL, *tail;
3311
3312        for (; skb != NULL; skb = next) {
3313                next = skb->next;
3314                skb_mark_not_on_list(skb);
3315
3316                /* in case skb wont be segmented, point to itself */
3317                skb->prev = skb;
3318
3319                skb = validate_xmit_skb(skb, dev, again);
3320                if (!skb)
3321                        continue;
3322
3323                if (!head)
3324                        head = skb;
3325                else
3326                        tail->next = skb;
3327                /* If skb was segmented, skb->prev points to
3328                 * the last segment. If not, it still contains skb.
3329                 */
3330                tail = skb->prev;
3331        }
3332        return head;
3333}
3334EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3335
3336static void qdisc_pkt_len_init(struct sk_buff *skb)
3337{
3338        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3339
3340        qdisc_skb_cb(skb)->pkt_len = skb->len;
3341
3342        /* To get more precise estimation of bytes sent on wire,
3343         * we add to pkt_len the headers size of all segments
3344         */
3345        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3346                unsigned int hdr_len;
3347                u16 gso_segs = shinfo->gso_segs;
3348
3349                /* mac layer + network layer */
3350                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3351
3352                /* + transport layer */
3353                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3354                        const struct tcphdr *th;
3355                        struct tcphdr _tcphdr;
3356
3357                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3358                                                sizeof(_tcphdr), &_tcphdr);
3359                        if (likely(th))
3360                                hdr_len += __tcp_hdrlen(th);
3361                } else {
3362                        struct udphdr _udphdr;
3363
3364                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3365                                               sizeof(_udphdr), &_udphdr))
3366                                hdr_len += sizeof(struct udphdr);
3367                }
3368
3369                if (shinfo->gso_type & SKB_GSO_DODGY)
3370                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3371                                                shinfo->gso_size);
3372
3373                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3374        }
3375}
3376
3377static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3378                                 struct net_device *dev,
3379                                 struct netdev_queue *txq)
3380{
3381        spinlock_t *root_lock = qdisc_lock(q);
3382        struct sk_buff *to_free = NULL;
3383        bool contended;
3384        int rc;
3385
3386        qdisc_calculate_pkt_len(skb, q);
3387
3388        if (q->flags & TCQ_F_NOLOCK) {
3389                if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
3390                    qdisc_run_begin(q)) {
3391                        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
3392                                              &q->state))) {
3393                                __qdisc_drop(skb, &to_free);
3394                                rc = NET_XMIT_DROP;
3395                                goto end_run;
3396                        }
3397                        qdisc_bstats_cpu_update(q, skb);
3398
3399                        rc = NET_XMIT_SUCCESS;
3400                        if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
3401                                __qdisc_run(q);
3402
3403end_run:
3404                        qdisc_run_end(q);
3405                } else {
3406                        rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3407                        qdisc_run(q);
3408                }
3409
3410                if (unlikely(to_free))
3411                        kfree_skb_list(to_free);
3412                return rc;
3413        }
3414
3415        /*
3416         * Heuristic to force contended enqueues to serialize on a
3417         * separate lock before trying to get qdisc main lock.
3418         * This permits qdisc->running owner to get the lock more
3419         * often and dequeue packets faster.
3420         */
3421        contended = qdisc_is_running(q);
3422        if (unlikely(contended))
3423                spin_lock(&q->busylock);
3424
3425        spin_lock(root_lock);
3426        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3427                __qdisc_drop(skb, &to_free);
3428                rc = NET_XMIT_DROP;
3429        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3430                   qdisc_run_begin(q)) {
3431                /*
3432                 * This is a work-conserving queue; there are no old skbs
3433                 * waiting to be sent out; and the qdisc is not running -
3434                 * xmit the skb directly.
3435                 */
3436
3437                qdisc_bstats_update(q, skb);
3438
3439                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3440                        if (unlikely(contended)) {
3441                                spin_unlock(&q->busylock);
3442                                contended = false;
3443                        }
3444                        __qdisc_run(q);
3445                }
3446
3447                qdisc_run_end(q);
3448                rc = NET_XMIT_SUCCESS;
3449        } else {
3450                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3451                if (qdisc_run_begin(q)) {
3452                        if (unlikely(contended)) {
3453                                spin_unlock(&q->busylock);
3454                                contended = false;
3455                        }
3456                        __qdisc_run(q);
3457                        qdisc_run_end(q);
3458                }
3459        }
3460        spin_unlock(root_lock);
3461        if (unlikely(to_free))
3462                kfree_skb_list(to_free);
3463        if (unlikely(contended))
3464                spin_unlock(&q->busylock);
3465        return rc;
3466}
3467
3468#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3469static void skb_update_prio(struct sk_buff *skb)
3470{
3471        const struct netprio_map *map;
3472        const struct sock *sk;
3473        unsigned int prioidx;
3474
3475        if (skb->priority)
3476                return;
3477        map = rcu_dereference_bh(skb->dev->priomap);
3478        if (!map)
3479                return;
3480        sk = skb_to_full_sk(skb);
3481        if (!sk)
3482                return;
3483
3484        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3485
3486        if (prioidx < map->priomap_len)
3487                skb->priority = map->priomap[prioidx];
3488}
3489#else
3490#define skb_update_prio(skb)
3491#endif
3492
3493/**
3494 *      dev_loopback_xmit - loop back @skb
3495 *      @net: network namespace this loopback is happening in
3496 *      @sk:  sk needed to be a netfilter okfn
3497 *      @skb: buffer to transmit
3498 */
3499int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3500{
3501        skb_reset_mac_header(skb);
3502        __skb_pull(skb, skb_network_offset(skb));
3503        skb->pkt_type = PACKET_LOOPBACK;
3504        skb->ip_summed = CHECKSUM_UNNECESSARY;
3505        WARN_ON(!skb_dst(skb));
3506        skb_dst_force(skb);
3507        netif_rx_ni(skb);
3508        return 0;
3509}
3510EXPORT_SYMBOL(dev_loopback_xmit);
3511
3512#ifdef CONFIG_NET_EGRESS
3513static struct sk_buff *
3514sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3515{
3516        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3517        struct tcf_result cl_res;
3518
3519        if (!miniq)
3520                return skb;
3521
3522        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3523        mini_qdisc_bstats_cpu_update(miniq, skb);
3524
3525        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3526        case TC_ACT_OK:
3527        case TC_ACT_RECLASSIFY:
3528                skb->tc_index = TC_H_MIN(cl_res.classid);
3529                break;
3530        case TC_ACT_SHOT:
3531                mini_qdisc_qstats_cpu_drop(miniq);
3532                *ret = NET_XMIT_DROP;
3533                kfree_skb(skb);
3534                return NULL;
3535        case TC_ACT_STOLEN:
3536        case TC_ACT_QUEUED:
3537        case TC_ACT_TRAP:
3538                *ret = NET_XMIT_SUCCESS;
3539                consume_skb(skb);
3540                return NULL;
3541        case TC_ACT_REDIRECT:
3542                /* No need to push/pop skb's mac_header here on egress! */
3543                skb_do_redirect(skb);
3544                *ret = NET_XMIT_SUCCESS;
3545                return NULL;
3546        default:
3547                break;
3548        }
3549
3550        return skb;
3551}
3552#endif /* CONFIG_NET_EGRESS */
3553
3554#ifdef CONFIG_XPS
3555static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3556                               struct xps_dev_maps *dev_maps, unsigned int tci)
3557{
3558        struct xps_map *map;
3559        int queue_index = -1;
3560
3561        if (dev->num_tc) {
3562                tci *= dev->num_tc;
3563                tci += netdev_get_prio_tc_map(dev, skb->priority);
3564        }
3565
3566        map = rcu_dereference(dev_maps->attr_map[tci]);
3567        if (map) {
3568                if (map->len == 1)
3569                        queue_index = map->queues[0];
3570                else
3571                        queue_index = map->queues[reciprocal_scale(
3572                                                skb_get_hash(skb), map->len)];
3573                if (unlikely(queue_index >= dev->real_num_tx_queues))
3574                        queue_index = -1;
3575        }
3576        return queue_index;
3577}
3578#endif
3579
3580static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3581                         struct sk_buff *skb)
3582{
3583#ifdef CONFIG_XPS
3584        struct xps_dev_maps *dev_maps;
3585        struct sock *sk = skb->sk;
3586        int queue_index = -1;
3587
3588        if (!static_key_false(&xps_needed))
3589                return -1;
3590
3591        rcu_read_lock();
3592        if (!static_key_false(&xps_rxqs_needed))
3593                goto get_cpus_map;
3594
3595        dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3596        if (dev_maps) {
3597                int tci = sk_rx_queue_get(sk);
3598
3599                if (tci >= 0 && tci < dev->num_rx_queues)
3600                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3601                                                          tci);
3602        }
3603
3604get_cpus_map:
3605        if (queue_index < 0) {
3606                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3607                if (dev_maps) {
3608                        unsigned int tci = skb->sender_cpu - 1;
3609
3610                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3611                                                          tci);
3612                }
3613        }
3614        rcu_read_unlock();
3615
3616        return queue_index;
3617#else
3618        return -1;
3619#endif
3620}
3621
3622u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3623                     struct net_device *sb_dev)
3624{
3625        return 0;
3626}
3627EXPORT_SYMBOL(dev_pick_tx_zero);
3628
3629u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3630                       struct net_device *sb_dev)
3631{
3632        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3633}
3634EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3635
3636u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3637                     struct net_device *sb_dev)
3638{
3639        struct sock *sk = skb->sk;
3640        int queue_index = sk_tx_queue_get(sk);
3641
3642        sb_dev = sb_dev ? : dev;
3643
3644        if (queue_index < 0 || skb->ooo_okay ||
3645            queue_index >= dev->real_num_tx_queues) {
3646                int new_index = get_xps_queue(dev, sb_dev, skb);
3647
3648                if (new_index < 0)
3649                        new_index = skb_tx_hash(dev, sb_dev, skb);
3650
3651                if (queue_index != new_index && sk &&
3652                    sk_fullsock(sk) &&
3653                    rcu_access_pointer(sk->sk_dst_cache))
3654                        sk_tx_queue_set(sk, new_index);
3655
3656                queue_index = new_index;
3657        }
3658
3659        return queue_index;
3660}
3661EXPORT_SYMBOL(netdev_pick_tx);
3662
3663struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
3664                                         struct sk_buff *skb,
3665                                         struct net_device *sb_dev)
3666{
3667        int queue_index = 0;
3668
3669#ifdef CONFIG_XPS
3670        u32 sender_cpu = skb->sender_cpu - 1;
3671
3672        if (sender_cpu >= (u32)NR_CPUS)
3673                skb->sender_cpu = raw_smp_processor_id() + 1;
3674#endif
3675
3676        if (dev->real_num_tx_queues != 1) {
3677                const struct net_device_ops *ops = dev->netdev_ops;
3678
3679                if (ops->ndo_select_queue)
3680                        queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
3681                else
3682                        queue_index = netdev_pick_tx(dev, skb, sb_dev);
3683
3684                queue_index = netdev_cap_txqueue(dev, queue_index);
3685        }
3686
3687        skb_set_queue_mapping(skb, queue_index);
3688        return netdev_get_tx_queue(dev, queue_index);
3689}
3690
3691/**
3692 *      __dev_queue_xmit - transmit a buffer
3693 *      @skb: buffer to transmit
3694 *      @sb_dev: suboordinate device used for L2 forwarding offload
3695 *
3696 *      Queue a buffer for transmission to a network device. The caller must
3697 *      have set the device and priority and built the buffer before calling
3698 *      this function. The function can be called from an interrupt.
3699 *
3700 *      A negative errno code is returned on a failure. A success does not
3701 *      guarantee the frame will be transmitted as it may be dropped due
3702 *      to congestion or traffic shaping.
3703 *
3704 * -----------------------------------------------------------------------------------
3705 *      I notice this method can also return errors from the queue disciplines,
3706 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3707 *      be positive.
3708 *
3709 *      Regardless of the return value, the skb is consumed, so it is currently
3710 *      difficult to retry a send to this method.  (You can bump the ref count
3711 *      before sending to hold a reference for retry if you are careful.)
3712 *
3713 *      When calling this method, interrupts MUST be enabled.  This is because
3714 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3715 *          --BLG
3716 */
3717static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3718{
3719        struct net_device *dev = skb->dev;
3720        struct netdev_queue *txq;
3721        struct Qdisc *q;
3722        int rc = -ENOMEM;
3723        bool again = false;
3724
3725        skb_reset_mac_header(skb);
3726
3727        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3728                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3729
3730        /* Disable soft irqs for various locks below. Also
3731         * stops preemption for RCU.
3732         */
3733        rcu_read_lock_bh();
3734
3735        skb_update_prio(skb);
3736
3737        qdisc_pkt_len_init(skb);
3738#ifdef CONFIG_NET_CLS_ACT
3739        skb->tc_at_ingress = 0;
3740# ifdef CONFIG_NET_EGRESS
3741        if (static_branch_unlikely(&egress_needed_key)) {
3742                skb = sch_handle_egress(skb, &rc, dev);
3743                if (!skb)
3744                        goto out;
3745        }
3746# endif
3747#endif
3748        /* If device/qdisc don't need skb->dst, release it right now while
3749         * its hot in this cpu cache.
3750         */
3751        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3752                skb_dst_drop(skb);
3753        else
3754                skb_dst_force(skb);
3755
3756        txq = netdev_core_pick_tx(dev, skb, sb_dev);
3757        q = rcu_dereference_bh(txq->qdisc);
3758
3759        trace_net_dev_queue(skb);
3760        if (q->enqueue) {
3761                rc = __dev_xmit_skb(skb, q, dev, txq);
3762                goto out;
3763        }
3764
3765        /* The device has no queue. Common case for software devices:
3766         * loopback, all the sorts of tunnels...
3767
3768         * Really, it is unlikely that netif_tx_lock protection is necessary
3769         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3770         * counters.)
3771         * However, it is possible, that they rely on protection
3772         * made by us here.
3773
3774         * Check this and shot the lock. It is not prone from deadlocks.
3775         *Either shot noqueue qdisc, it is even simpler 8)
3776         */
3777        if (dev->flags & IFF_UP) {
3778                int cpu = smp_processor_id(); /* ok because BHs are off */
3779
3780                if (txq->xmit_lock_owner != cpu) {
3781                        if (dev_xmit_recursion())
3782                                goto recursion_alert;
3783
3784                        skb = validate_xmit_skb(skb, dev, &again);
3785                        if (!skb)
3786                                goto out;
3787
3788                        HARD_TX_LOCK(dev, txq, cpu);
3789
3790                        if (!netif_xmit_stopped(txq)) {
3791                                dev_xmit_recursion_inc();
3792                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3793                                dev_xmit_recursion_dec();
3794                                if (dev_xmit_complete(rc)) {
3795                                        HARD_TX_UNLOCK(dev, txq);
3796                                        goto out;
3797                                }
3798                        }
3799                        HARD_TX_UNLOCK(dev, txq);
3800                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3801                                             dev->name);
3802                } else {
3803                        /* Recursion is detected! It is possible,
3804                         * unfortunately
3805                         */
3806recursion_alert:
3807                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3808                                             dev->name);
3809                }
3810        }
3811
3812        rc = -ENETDOWN;
3813        rcu_read_unlock_bh();
3814
3815        atomic_long_inc(&dev->tx_dropped);
3816        kfree_skb_list(skb);
3817        return rc;
3818out:
3819        rcu_read_unlock_bh();
3820        return rc;
3821}
3822
3823int dev_queue_xmit(struct sk_buff *skb)
3824{
3825        return __dev_queue_xmit(skb, NULL);
3826}
3827EXPORT_SYMBOL(dev_queue_xmit);
3828
3829int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
3830{
3831        return __dev_queue_xmit(skb, sb_dev);
3832}
3833EXPORT_SYMBOL(dev_queue_xmit_accel);
3834
3835int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
3836{
3837        struct net_device *dev = skb->dev;
3838        struct sk_buff *orig_skb = skb;
3839        struct netdev_queue *txq;
3840        int ret = NETDEV_TX_BUSY;
3841        bool again = false;
3842
3843        if (unlikely(!netif_running(dev) ||
3844                     !netif_carrier_ok(dev)))
3845                goto drop;
3846
3847        skb = validate_xmit_skb_list(skb, dev, &again);
3848        if (skb != orig_skb)
3849                goto drop;
3850
3851        skb_set_queue_mapping(skb, queue_id);
3852        txq = skb_get_tx_queue(dev, skb);
3853
3854        local_bh_disable();
3855
3856        HARD_TX_LOCK(dev, txq, smp_processor_id());
3857        if (!netif_xmit_frozen_or_drv_stopped(txq))
3858                ret = netdev_start_xmit(skb, dev, txq, false);
3859        HARD_TX_UNLOCK(dev, txq);
3860
3861        local_bh_enable();
3862
3863        if (!dev_xmit_complete(ret))
3864                kfree_skb(skb);
3865
3866        return ret;
3867drop:
3868        atomic_long_inc(&dev->tx_dropped);
3869        kfree_skb_list(skb);
3870        return NET_XMIT_DROP;
3871}
3872EXPORT_SYMBOL(dev_direct_xmit);
3873
3874/*************************************************************************
3875 *                      Receiver routines
3876 *************************************************************************/
3877
3878int netdev_max_backlog __read_mostly = 1000;
3879EXPORT_SYMBOL(netdev_max_backlog);
3880
3881int netdev_tstamp_prequeue __read_mostly = 1;
3882int netdev_budget __read_mostly = 300;
3883unsigned int __read_mostly netdev_budget_usecs = 2000;
3884int weight_p __read_mostly = 64;           /* old backlog weight */
3885int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3886int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3887int dev_rx_weight __read_mostly = 64;
3888int dev_tx_weight __read_mostly = 64;
3889/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
3890int gro_normal_batch __read_mostly = 8;
3891
3892/* Called with irq disabled */
3893static inline void ____napi_schedule(struct softnet_data *sd,
3894                                     struct napi_struct *napi)
3895{
3896        list_add_tail(&napi->poll_list, &sd->poll_list);
3897        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3898}
3899
3900#ifdef CONFIG_RPS
3901
3902/* One global table that all flow-based protocols share. */
3903struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3904EXPORT_SYMBOL(rps_sock_flow_table);
3905u32 rps_cpu_mask __read_mostly;
3906EXPORT_SYMBOL(rps_cpu_mask);
3907
3908struct static_key_false rps_needed __read_mostly;
3909EXPORT_SYMBOL(rps_needed);
3910struct static_key_false rfs_needed __read_mostly;
3911EXPORT_SYMBOL(rfs_needed);
3912
3913static struct rps_dev_flow *
3914set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3915            struct rps_dev_flow *rflow, u16 next_cpu)
3916{
3917        if (next_cpu < nr_cpu_ids) {
3918#ifdef CONFIG_RFS_ACCEL
3919                struct netdev_rx_queue *rxqueue;
3920                struct rps_dev_flow_table *flow_table;
3921                struct rps_dev_flow *old_rflow;
3922                u32 flow_id;
3923                u16 rxq_index;
3924                int rc;
3925
3926                /* Should we steer this flow to a different hardware queue? */
3927                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3928                    !(dev->features & NETIF_F_NTUPLE))
3929                        goto out;
3930                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3931                if (rxq_index == skb_get_rx_queue(skb))
3932                        goto out;
3933
3934                rxqueue = dev->_rx + rxq_index;
3935                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3936                if (!flow_table)
3937                        goto out;
3938                flow_id = skb_get_hash(skb) & flow_table->mask;
3939                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3940                                                        rxq_index, flow_id);
3941                if (rc < 0)
3942                        goto out;
3943                old_rflow = rflow;
3944                rflow = &flow_table->flows[flow_id];
3945                rflow->filter = rc;
3946                if (old_rflow->filter == rflow->filter)
3947                        old_rflow->filter = RPS_NO_FILTER;
3948        out:
3949#endif
3950                rflow->last_qtail =
3951                        per_cpu(softnet_data, next_cpu).input_queue_head;
3952        }
3953
3954        rflow->cpu = next_cpu;
3955        return rflow;
3956}
3957
3958/*
3959 * get_rps_cpu is called from netif_receive_skb and returns the target
3960 * CPU from the RPS map of the receiving queue for a given skb.
3961 * rcu_read_lock must be held on entry.
3962 */
3963static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3964                       struct rps_dev_flow **rflowp)
3965{
3966        const struct rps_sock_flow_table *sock_flow_table;
3967        struct netdev_rx_queue *rxqueue = dev->_rx;
3968        struct rps_dev_flow_table *flow_table;
3969        struct rps_map *map;
3970        int cpu = -1;
3971        u32 tcpu;
3972        u32 hash;
3973
3974        if (skb_rx_queue_recorded(skb)) {
3975                u16 index = skb_get_rx_queue(skb);
3976
3977                if (unlikely(index >= dev->real_num_rx_queues)) {
3978                        WARN_ONCE(dev->real_num_rx_queues > 1,
3979                                  "%s received packet on queue %u, but number "
3980                                  "of RX queues is %u\n",
3981                                  dev->name, index, dev->real_num_rx_queues);
3982                        goto done;
3983                }
3984                rxqueue += index;
3985        }
3986
3987        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3988
3989        flow_table = rcu_dereference(rxqueue->rps_flow_table);
3990        map = rcu_dereference(rxqueue->rps_map);
3991        if (!flow_table && !map)
3992                goto done;
3993
3994        skb_reset_network_header(skb);
3995        hash = skb_get_hash(skb);
3996        if (!hash)
3997                goto done;
3998
3999        sock_flow_table = rcu_dereference(rps_sock_flow_table);
4000        if (flow_table && sock_flow_table) {

4001                struct rps_dev_flow *rflow;
4002                u32 next_cpu;
4003                u32 ident;
4004
4005                /* First check into global flow table if there is a match */
4006                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4007                if ((ident ^ hash) & ~rps_cpu_mask)
4008                        goto try_rps;
4009
4010                next_cpu = ident & rps_cpu_mask;
4011
4012                /* OK, now we know there is a match,
4013                 * we can look at the local (per receive queue) flow table
4014                 */
4015                rflow = &flow_table->flows[hash & flow_table->mask];
4016                tcpu = rflow->cpu;
4017
4018                /*
4019                 * If the desired CPU (where last recvmsg was done) is
4020                 * different from current CPU (one in the rx-queue flow
4021                 * table entry), switch if one of the following holds:
4022                 *   - Current CPU is unset (>= nr_cpu_ids).
4023                 *   - Current CPU is offline.
4024                 *   - The current CPU's queue tail has advanced beyond the
4025                 *     last packet that was enqueued using this table entry.
4026                 *     This guarantees that all previous packets for the flow
4027                 *     have been dequeued, thus preserving in order delivery.
4028                 */
4029                if (unlikely(tcpu != next_cpu) &&
4030                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4031                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4032                      rflow->last_qtail)) >= 0)) {
4033                        tcpu = next_cpu;
4034                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4035                }
4036
4037                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4038                        *rflowp = rflow;
4039                        cpu = tcpu;
4040                        goto done;
4041                }
4042        }
4043
4044try_rps:
4045
4046        if (map) {
4047                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4048                if (cpu_online(tcpu)) {
4049                        cpu = tcpu;
4050                        goto done;
4051                }
4052        }
4053
4054done:
4055        return cpu;
4056}
4057
4058#ifdef CONFIG_RFS_ACCEL
4059
4060/**
4061 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4062 * @dev: Device on which the filter was set
4063 * @rxq_index: RX queue index
4064 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4065 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4066 *
4067 * Drivers that implement ndo_rx_flow_steer() should periodically call
4068 * this function for each installed filter and remove the filters for
4069 * which it returns %true.
4070 */
4071bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4072                         u32 flow_id, u16 filter_id)
4073{
4074        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4075        struct rps_dev_flow_table *flow_table;
4076        struct rps_dev_flow *rflow;
4077        bool expire = true;
4078        unsigned int cpu;
4079
4080        rcu_read_lock();
4081        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4082        if (flow_table && flow_id <= flow_table->mask) {
4083                rflow = &flow_table->flows[flow_id];
4084                cpu = READ_ONCE(rflow->cpu);
4085                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4086                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4087                           rflow->last_qtail) <
4088                     (int)(10 * flow_table->mask)))
4089                        expire = false;
4090        }
4091        rcu_read_unlock();
4092        return expire;
4093}
4094EXPORT_SYMBOL(rps_may_expire_flow);
4095
4096#endif /* CONFIG_RFS_ACCEL */
4097
4098/* Called from hardirq (IPI) context */
4099static void rps_trigger_softirq(void *data)
4100{
4101        struct softnet_data *sd = data;
4102
4103        ____napi_schedule(sd, &sd->backlog);
4104        sd->received_rps++;
4105}
4106
4107#endif /* CONFIG_RPS */
4108
4109/*
4110 * Check if this softnet_data structure is another cpu one
4111 * If yes, queue it to our IPI list and return 1
4112 * If no, return 0
4113 */
4114static int rps_ipi_queued(struct softnet_data *sd)
4115{
4116#ifdef CONFIG_RPS
4117        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4118
4119        if (sd != mysd) {
4120                sd->rps_ipi_next = mysd->rps_ipi_list;
4121                mysd->rps_ipi_list = sd;
4122
4123                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4124                return 1;
4125        }
4126#endif /* CONFIG_RPS */
4127        return 0;
4128}
4129
4130#ifdef CONFIG_NET_FLOW_LIMIT
4131int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4132#endif
4133
4134static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4135{
4136#ifdef CONFIG_NET_FLOW_LIMIT
4137        struct sd_flow_limit *fl;
4138        struct softnet_data *sd;
4139        unsigned int old_flow, new_flow;
4140
4141        if (qlen < (netdev_max_backlog >> 1))
4142                return false;
4143
4144        sd = this_cpu_ptr(&softnet_data);
4145
4146        rcu_read_lock();
4147        fl = rcu_dereference(sd->flow_limit);
4148        if (fl) {
4149                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4150                old_flow = fl->history[fl->history_head];
4151                fl->history[fl->history_head] = new_flow;
4152
4153                fl->history_head++;
4154                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4155
4156                if (likely(fl->buckets[old_flow]))
4157                        fl->buckets[old_flow]--;
4158
4159                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4160                        fl->count++;
4161                        rcu_read_unlock();
4162                        return true;
4163                }
4164        }
4165        rcu_read_unlock();
4166#endif
4167        return false;
4168}
4169
4170/*
4171 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4172 * queue (may be a remote CPU queue).
4173 */
4174static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4175                              unsigned int *qtail)
4176{
4177        struct softnet_data *sd;
4178        unsigned long flags;
4179        unsigned int qlen;
4180
4181        sd = &per_cpu(softnet_data, cpu);
4182
4183        local_irq_save(flags);
4184
4185        rps_lock(sd);
4186        if (!netif_running(skb->dev))
4187                goto drop;
4188        qlen = skb_queue_len(&sd->input_pkt_queue);
4189        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4190                if (qlen) {
4191enqueue:
4192                        __skb_queue_tail(&sd->input_pkt_queue, skb);
4193                        input_queue_tail_incr_save(sd, qtail);
4194                        rps_unlock(sd);
4195                        local_irq_restore(flags);
4196                        return NET_RX_SUCCESS;
4197                }
4198
4199                /* Schedule NAPI for backlog device
4200                 * We can use non atomic operation since we own the queue lock
4201                 */
4202                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4203                        if (!rps_ipi_queued(sd))
4204                                ____napi_schedule(sd, &sd->backlog);
4205                }
4206                goto enqueue;
4207        }
4208
4209drop:
4210        sd->dropped++;
4211        rps_unlock(sd);
4212
4213        local_irq_restore(flags);
4214
4215        atomic_long_inc(&skb->dev->rx_dropped);
4216        kfree_skb(skb);
4217        return NET_RX_DROP;
4218}
4219
4220static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4221{
4222        struct net_device *dev = skb->dev;
4223        struct netdev_rx_queue *rxqueue;
4224
4225        rxqueue = dev->_rx;
4226
4227        if (skb_rx_queue_recorded(skb)) {
4228                u16 index = skb_get_rx_queue(skb);
4229
4230                if (unlikely(index >= dev->real_num_rx_queues)) {
4231                        WARN_ONCE(dev->real_num_rx_queues > 1,
4232                                  "%s received packet on queue %u, but number "
4233                                  "of RX queues is %u\n",
4234                                  dev->name, index, dev->real_num_rx_queues);
4235
4236                        return rxqueue; /* Return first rxqueue */
4237                }
4238                rxqueue += index;
4239        }
4240        return rxqueue;
4241}
4242
4243static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4244                                     struct xdp_buff *xdp,
4245                                     struct bpf_prog *xdp_prog)
4246{
4247        struct netdev_rx_queue *rxqueue;
4248        void *orig_data, *orig_data_end;
4249        u32 metalen, act = XDP_DROP;
4250        __be16 orig_eth_type;
4251        struct ethhdr *eth;
4252        bool orig_bcast;
4253        int hlen, off;
4254        u32 mac_len;
4255
4256        /* Reinjected packets coming from act_mirred or similar should
4257         * not get XDP generic processing.
4258         */
4259        if (skb_cloned(skb) || skb_is_tc_redirected(skb))
4260                return XDP_PASS;
4261
4262        /* XDP packets must be linear and must have sufficient headroom
4263         * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4264         * native XDP provides, thus we need to do it here as well.
4265         */
4266        if (skb_is_nonlinear(skb) ||
4267            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4268                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4269                int troom = skb->tail + skb->data_len - skb->end;
4270
4271                /* In case we have to go down the path and also linearize,
4272                 * then lets do the pskb_expand_head() work just once here.
4273                 */
4274                if (pskb_expand_head(skb,
4275                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4276                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4277                        goto do_drop;
4278                if (skb_linearize(skb))
4279                        goto do_drop;
4280        }
4281
4282        /* The XDP program wants to see the packet starting at the MAC
4283         * header.
4284         */
4285        mac_len = skb->data - skb_mac_header(skb);
4286        hlen = skb_headlen(skb) + mac_len;
4287        xdp->data = skb->data - mac_len;
4288        xdp->data_meta = xdp->data;
4289        xdp->data_end = xdp->data + hlen;
4290        xdp->data_hard_start = skb->data - skb_headroom(skb);
4291        orig_data_end = xdp->data_end;
4292        orig_data = xdp->data;
4293        eth = (struct ethhdr *)xdp->data;
4294        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4295        orig_eth_type = eth->h_proto;
4296
4297        rxqueue = netif_get_rxqueue(skb);
4298        xdp->rxq = &rxqueue->xdp_rxq;
4299
4300        act = bpf_prog_run_xdp(xdp_prog, xdp);
4301
4302        /* check if bpf_xdp_adjust_head was used */
4303        off = xdp->data - orig_data;
4304        if (off) {
4305                if (off > 0)
4306                        __skb_pull(skb, off);
4307                else if (off < 0)
4308                        __skb_push(skb, -off);
4309
4310                skb->mac_header += off;
4311                skb_reset_network_header(skb);
4312        }
4313
4314        /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4315         * pckt.
4316         */
4317        off = orig_data_end - xdp->data_end;
4318        if (off != 0) {
4319                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4320                skb->len -= off;
4321
4322        }
4323
4324        /* check if XDP changed eth hdr such SKB needs update */
4325        eth = (struct ethhdr *)xdp->data;
4326        if ((orig_eth_type != eth->h_proto) ||
4327            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4328                __skb_push(skb, ETH_HLEN);
4329                skb->protocol = eth_type_trans(skb, skb->dev);
4330        }
4331
4332        switch (act) {
4333        case XDP_REDIRECT:
4334        case XDP_TX:
4335                __skb_push(skb, mac_len);
4336                break;
4337        case XDP_PASS:
4338                metalen = xdp->data - xdp->data_meta;
4339                if (metalen)
4340                        skb_metadata_set(skb, metalen);
4341                break;
4342        default:
4343                bpf_warn_invalid_xdp_action(act);
4344                /* fall through */
4345        case XDP_ABORTED:
4346                trace_xdp_exception(skb->dev, xdp_prog, act);
4347                /* fall through */
4348        case XDP_DROP:
4349        do_drop:
4350                kfree_skb(skb);
4351                break;
4352        }
4353
4354        return act;
4355}
4356
4357/* When doing generic XDP we have to bypass the qdisc layer and the
4358 * network taps in order to match in-driver-XDP behavior.
4359 */
4360void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4361{
4362        struct net_device *dev = skb->dev;
4363        struct netdev_queue *txq;
4364        bool free_skb = true;
4365        int cpu, rc;
4366
4367        txq = netdev_core_pick_tx(dev, skb, NULL);
4368        cpu = smp_processor_id();
4369        HARD_TX_LOCK(dev, txq, cpu);
4370        if (!netif_xmit_stopped(txq)) {
4371                rc = netdev_start_xmit(skb, dev, txq, 0);
4372                if (dev_xmit_complete(rc))
4373                        free_skb = false;
4374        }
4375        HARD_TX_UNLOCK(dev, txq);
4376        if (free_skb) {
4377                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4378                kfree_skb(skb);
4379        }
4380}
4381EXPORT_SYMBOL_GPL(generic_xdp_tx);
4382
4383static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4384
4385int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4386{
4387        if (xdp_prog) {
4388                struct xdp_buff xdp;
4389                u32 act;
4390                int err;
4391
4392                act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4393                if (act != XDP_PASS) {
4394                        switch (act) {
4395                        case XDP_REDIRECT:
4396                                err = xdp_do_generic_redirect(skb->dev, skb,
4397                                                              &xdp, xdp_prog);
4398                                if (err)
4399                                        goto out_redir;
4400                                break;
4401                        case XDP_TX:
4402                                generic_xdp_tx(skb, xdp_prog);
4403                                break;
4404                        }
4405                        return XDP_DROP;
4406                }
4407        }
4408        return XDP_PASS;
4409out_redir:
4410        kfree_skb(skb);
4411        return XDP_DROP;
4412}
4413EXPORT_SYMBOL_GPL(do_xdp_generic);
4414
4415static int netif_rx_internal(struct sk_buff *skb)
4416{
4417        int ret;
4418
4419        net_timestamp_check(netdev_tstamp_prequeue, skb);
4420
4421        trace_netif_rx(skb);
4422
4423#ifdef CONFIG_RPS
4424        if (static_branch_unlikely(&rps_needed)) {
4425                struct rps_dev_flow voidflow, *rflow = &voidflow;
4426                int cpu;
4427
4428                preempt_disable();
4429                rcu_read_lock();
4430
4431                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4432                if (cpu < 0)
4433                        cpu = smp_processor_id();
4434
4435                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4436
4437                rcu_read_unlock();
4438                preempt_enable();
4439        } else
4440#endif
4441        {
4442                unsigned int qtail;
4443
4444                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4445                put_cpu();
4446        }
4447        return ret;
4448}
4449
4450/**
4451 *      netif_rx        -       post buffer to the network code
4452 *      @skb: buffer to post
4453 *
4454 *      This function receives a packet from a device driver and queues it for
4455 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4456 *      may be dropped during processing for congestion control or by the
4457 *      protocol layers.
4458 *
4459 *      return values:
4460 *      NET_RX_SUCCESS  (no congestion)
4461 *      NET_RX_DROP     (packet was dropped)
4462 *
4463 */
4464
4465int netif_rx(struct sk_buff *skb)
4466{
4467        int ret;
4468
4469        trace_netif_rx_entry(skb);
4470
4471        ret = netif_rx_internal(skb);
4472        trace_netif_rx_exit(ret);
4473
4474        return ret;
4475}
4476EXPORT_SYMBOL(netif_rx);
4477
4478int netif_rx_ni(struct sk_buff *skb)
4479{
4480        int err;
4481
4482        trace_netif_rx_ni_entry(skb);
4483
4484        preempt_disable();
4485        err = netif_rx_internal(skb);
4486        if (local_softirq_pending())
4487                do_softirq();
4488        preempt_enable();
4489        trace_netif_rx_ni_exit(err);
4490
4491        return err;
4492}
4493EXPORT_SYMBOL(netif_rx_ni);
4494
4495static __latent_entropy void net_tx_action(struct softirq_action *h)
4496{
4497        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4498
4499        if (sd->completion_queue) {
4500                struct sk_buff *clist;
4501
4502                local_irq_disable();
4503                clist = sd->completion_queue;
4504                sd->completion_queue = NULL;
4505                local_irq_enable();
4506
4507                while (clist) {
4508                        struct sk_buff *skb = clist;
4509
4510                        clist = clist->next;
4511
4512                        WARN_ON(refcount_read(&skb->users));
4513                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4514                                trace_consume_skb(skb);
4515                        else
4516                                trace_kfree_skb(skb, net_tx_action);
4517
4518                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4519                                __kfree_skb(skb);
4520                        else
4521                                __kfree_skb_defer(skb);
4522                }
4523
4524                __kfree_skb_flush();
4525        }
4526
4527        if (sd->output_queue) {
4528                struct Qdisc *head;
4529
4530                local_irq_disable();
4531                head = sd->output_queue;
4532                sd->output_queue = NULL;
4533                sd->output_queue_tailp = &sd->output_queue;
4534                local_irq_enable();
4535
4536                while (head) {
4537                        struct Qdisc *q = head;
4538                        spinlock_t *root_lock = NULL;
4539
4540                        head = head->next_sched;
4541
4542                        if (!(q->flags & TCQ_F_NOLOCK)) {
4543                                root_lock = qdisc_lock(q);
4544                                spin_lock(root_lock);
4545                        }
4546                        /* We need to make sure head->next_sched is read
4547                         * before clearing __QDISC_STATE_SCHED
4548                         */
4549                        smp_mb__before_atomic();
4550                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4551                        qdisc_run(q);
4552                        if (root_lock)
4553                                spin_unlock(root_lock);
4554                }
4555        }
4556
4557        xfrm_dev_backlog(sd);
4558}
4559
4560#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4561/* This hook is defined here for ATM LANE */
4562int (*br_fdb_test_addr_hook)(struct net_device *dev,
4563                             unsigned char *addr) __read_mostly;
4564EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4565#endif
4566
4567static inline struct sk_buff *
4568sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4569                   struct net_device *orig_dev)
4570{
4571#ifdef CONFIG_NET_CLS_ACT
4572        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4573        struct tcf_result cl_res;
4574
4575        /* If there's at least one ingress present somewhere (so
4576         * we get here via enabled static key), remaining devices
4577         * that are not configured with an ingress qdisc will bail
4578         * out here.
4579         */
4580        if (!miniq)
4581                return skb;
4582
4583        if (*pt_prev) {
4584                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4585                *pt_prev = NULL;
4586        }
4587
4588        qdisc_skb_cb(skb)->pkt_len = skb->len;
4589        skb->tc_at_ingress = 1;
4590        mini_qdisc_bstats_cpu_update(miniq, skb);
4591
4592        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4593        case TC_ACT_OK:
4594        case TC_ACT_RECLASSIFY:
4595                skb->tc_index = TC_H_MIN(cl_res.classid);
4596                break;
4597        case TC_ACT_SHOT:
4598                mini_qdisc_qstats_cpu_drop(miniq);
4599                kfree_skb(skb);
4600                return NULL;
4601        case TC_ACT_STOLEN:
4602        case TC_ACT_QUEUED:
4603        case TC_ACT_TRAP:
4604                consume_skb(skb);
4605                return NULL;
4606        case TC_ACT_REDIRECT:
4607                /* skb_mac_header check was done by cls/act_bpf, so
4608                 * we can safely push the L2 header back before
4609                 * redirecting to another netdev
4610                 */
4611                __skb_push(skb, skb->mac_len);
4612                skb_do_redirect(skb);
4613                return NULL;
4614        case TC_ACT_CONSUMED:
4615                return NULL;
4616        default:
4617                break;
4618        }
4619#endif /* CONFIG_NET_CLS_ACT */
4620        return skb;
4621}
4622
4623/**
4624 *      netdev_is_rx_handler_busy - check if receive handler is registered
4625 *      @dev: device to check
4626 *
4627 *      Check if a receive handler is already registered for a given device.
4628 *      Return true if there one.
4629 *
4630 *      The caller must hold the rtnl_mutex.
4631 */
4632bool netdev_is_rx_handler_busy(struct net_device *dev)
4633{
4634        ASSERT_RTNL();
4635        return dev && rtnl_dereference(dev->rx_handler);
4636}
4637EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4638
4639/**
4640 *      netdev_rx_handler_register - register receive handler
4641 *      @dev: device to register a handler for
4642 *      @rx_handler: receive handler to register
4643 *      @rx_handler_data: data pointer that is used by rx handler
4644 *
4645 *      Register a receive handler for a device. This handler will then be
4646 *      called from __netif_receive_skb. A negative errno code is returned
4647 *      on a failure.
4648 *
4649 *      The caller must hold the rtnl_mutex.
4650 *
4651 *      For a general description of rx_handler, see enum rx_handler_result.
4652 */
4653int netdev_rx_handler_register(struct net_device *dev,
4654                               rx_handler_func_t *rx_handler,
4655                               void *rx_handler_data)
4656{
4657        if (netdev_is_rx_handler_busy(dev))
4658                return -EBUSY;
4659
4660        if (dev->priv_flags & IFF_NO_RX_HANDLER)
4661                return -EINVAL;
4662
4663        /* Note: rx_handler_data must be set before rx_handler */
4664        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4665        rcu_assign_pointer(dev->rx_handler, rx_handler);
4666
4667        return 0;
4668}
4669EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4670
4671/**
4672 *      netdev_rx_handler_unregister - unregister receive handler
4673 *      @dev: device to unregister a handler from
4674 *
4675 *      Unregister a receive handler from a device.
4676 *
4677 *      The caller must hold the rtnl_mutex.
4678 */
4679void netdev_rx_handler_unregister(struct net_device *dev)
4680{
4681
4682        ASSERT_RTNL();
4683        RCU_INIT_POINTER(dev->rx_handler, NULL);
4684        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4685         * section has a guarantee to see a non NULL rx_handler_data
4686         * as well.
4687         */
4688        synchronize_net();
4689        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4690}
4691EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4692
4693/*
4694 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4695 * the special handling of PFMEMALLOC skbs.
4696 */
4697static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4698{
4699        switch (skb->protocol) {
4700        case htons(ETH_P_ARP):
4701        case htons(ETH_P_IP):
4702        case htons(ETH_P_IPV6):
4703        case htons(ETH_P_8021Q):
4704        case htons(ETH_P_8021AD):
4705                return true;
4706        default:
4707                return false;
4708        }
4709}
4710
4711static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4712                             int *ret, struct net_device *orig_dev)
4713{
4714#ifdef CONFIG_NETFILTER_INGRESS
4715        if (nf_hook_ingress_active(skb)) {
4716                int ingress_retval;
4717
4718                if (*pt_prev) {
4719                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4720                        *pt_prev = NULL;
4721                }
4722
4723                rcu_read_lock();
4724                ingress_retval = nf_hook_ingress(skb);
4725                rcu_read_unlock();
4726                return ingress_retval;
4727        }
4728#endif /* CONFIG_NETFILTER_INGRESS */
4729        return 0;
4730}
4731
4732static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
4733                                    struct packet_type **ppt_prev)
4734{
4735        struct packet_type *ptype, *pt_prev;
4736        rx_handler_func_t *rx_handler;
4737        struct net_device *orig_dev;
4738        bool deliver_exact = false;
4739        int ret = NET_RX_DROP;
4740        __be16 type;
4741
4742        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4743
4744        trace_netif_receive_skb(skb);
4745
4746        orig_dev = skb->dev;
4747
4748        skb_reset_network_header(skb);
4749        if (!skb_transport_header_was_set(skb))
4750                skb_reset_transport_header(skb);
4751        skb_reset_mac_len(skb);
4752
4753        pt_prev = NULL;
4754
4755another_round:
4756        skb->skb_iif = skb->dev->ifindex;
4757
4758        __this_cpu_inc(softnet_data.processed);
4759
4760        if (static_branch_unlikely(&generic_xdp_needed_key)) {
4761                int ret2;
4762
4763                preempt_disable();
4764                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4765                preempt_enable();
4766
4767                if (ret2 != XDP_PASS)
4768                        return NET_RX_DROP;
4769                skb_reset_mac_len(skb);
4770        }
4771
4772        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4773            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4774                skb = skb_vlan_untag(skb);
4775                if (unlikely(!skb))
4776                        goto out;
4777        }
4778
4779        if (skb_skip_tc_classify(skb))
4780                goto skip_classify;
4781
4782        if (pfmemalloc)
4783                goto skip_taps;
4784
4785        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4786                if (pt_prev)
4787                        ret = deliver_skb(skb, pt_prev, orig_dev);
4788                pt_prev = ptype;
4789        }
4790
4791        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4792                if (pt_prev)
4793                        ret = deliver_skb(skb, pt_prev, orig_dev);
4794                pt_prev = ptype;
4795        }
4796
4797skip_taps:
4798#ifdef CONFIG_NET_INGRESS
4799        if (static_branch_unlikely(&ingress_needed_key)) {
4800                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4801                if (!skb)
4802                        goto out;
4803
4804                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4805                        goto out;
4806        }
4807#endif
4808        skb_reset_tc(skb);
4809skip_classify:
4810        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4811                goto drop;
4812
4813        if (skb_vlan_tag_present(skb)) {
4814                if (pt_prev) {
4815                        ret = deliver_skb(skb, pt_prev, orig_dev);
4816                        pt_prev = NULL;
4817                }
4818                if (vlan_do_receive(&skb))
4819                        goto another_round;
4820                else if (unlikely(!skb))
4821                        goto out;
4822        }
4823
4824        rx_handler = rcu_dereference(skb->dev->rx_handler);
4825        if (rx_handler) {
4826                if (pt_prev) {
4827                        ret = deliver_skb(skb, pt_prev, orig_dev);
4828                        pt_prev = NULL;
4829                }
4830                switch (rx_handler(&skb)) {
4831                case RX_HANDLER_CONSUMED:
4832                        ret = NET_RX_SUCCESS;
4833                        goto out;
4834                case RX_HANDLER_ANOTHER:
4835                        goto another_round;
4836                case RX_HANDLER_EXACT:
4837                        deliver_exact = true;
4838                case RX_HANDLER_PASS:
4839                        break;
4840                default:
4841                        BUG();
4842                }
4843        }
4844
4845        if (unlikely(skb_vlan_tag_present(skb))) {
4846check_vlan_id:
4847                if (skb_vlan_tag_get_id(skb)) {
4848                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
4849                         * find vlan device.
4850                         */
4851                        skb->pkt_type = PACKET_OTHERHOST;
4852                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4853                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4854                        /* Outer header is 802.1P with vlan 0, inner header is
4855                         * 802.1Q or 802.1AD and vlan_do_receive() above could
4856                         * not find vlan dev for vlan id 0.
4857                         */
4858                        __vlan_hwaccel_clear_tag(skb);
4859                        skb = skb_vlan_untag(skb);
4860                        if (unlikely(!skb))
4861                                goto out;
4862                        if (vlan_do_receive(&skb))
4863                                /* After stripping off 802.1P header with vlan 0
4864                                 * vlan dev is found for inner header.
4865                                 */
4866                                goto another_round;
4867                        else if (unlikely(!skb))
4868                                goto out;
4869                        else
4870                                /* We have stripped outer 802.1P vlan 0 header.
4871                                 * But could not find vlan dev.
4872                                 * check again for vlan id to set OTHERHOST.
4873                                 */
4874                                goto check_vlan_id;
4875                }
4876                /* Note: we might in the future use prio bits
4877                 * and set skb->priority like in vlan_do_receive()
4878                 * For the time being, just ignore Priority Code Point
4879                 */
4880                __vlan_hwaccel_clear_tag(skb);
4881        }
4882
4883        type = skb->protocol;
4884
4885        /* deliver only exact match when indicated */
4886        if (likely(!deliver_exact)) {
4887                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4888                                       &ptype_base[ntohs(type) &
4889                                                   PTYPE_HASH_MASK]);
4890        }
4891
4892        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4893                               &orig_dev->ptype_specific);
4894
4895        if (unlikely(skb->dev != orig_dev)) {
4896                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4897                                       &skb->dev->ptype_specific);
4898        }
4899
4900        if (pt_prev) {
4901                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4902                        goto drop;
4903                *ppt_prev = pt_prev;
4904        } else {
4905drop:
4906                if (!deliver_exact)
4907                        atomic_long_inc(&skb->dev->rx_dropped);
4908                else
4909                        atomic_long_inc(&skb->dev->rx_nohandler);
4910                kfree_skb(skb);
4911                /* Jamal, now you will not able to escape explaining
4912                 * me how you were going to use this. :-)
4913                 */
4914                ret = NET_RX_DROP;
4915        }
4916
4917out:
4918        return ret;
4919}
4920
4921static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
4922{
4923        struct net_device *orig_dev = skb->dev;
4924        struct packet_type *pt_prev = NULL;
4925        int ret;
4926
4927        ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4928        if (pt_prev)
4929                ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
4930                                         skb->dev, pt_prev, orig_dev);
4931        return ret;
4932}
4933
4934/**
4935 *      netif_receive_skb_core - special purpose version of netif_receive_skb
4936 *      @skb: buffer to process
4937 *
4938 *      More direct receive version of netif_receive_skb().  It should
4939 *      only be used by callers that have a need to skip RPS and Generic XDP.
4940 *      Caller must also take care of handling if (page_is_)pfmemalloc.
4941 *
4942 *      This function may only be called from softirq context and interrupts
4943 *      should be enabled.
4944 *
4945 *      Return values (usually ignored):
4946 *      NET_RX_SUCCESS: no congestion
4947 *      NET_RX_DROP: packet was dropped
4948 */
4949int netif_receive_skb_core(struct sk_buff *skb)
4950{
4951        int ret;
4952
4953        rcu_read_lock();
4954        ret = __netif_receive_skb_one_core(skb, false);
4955        rcu_read_unlock();
4956
4957        return ret;
4958}
4959EXPORT_SYMBOL(netif_receive_skb_core);
4960
4961static inline void __netif_receive_skb_list_ptype(struct list_head *head,
4962                                                  struct packet_type *pt_prev,
4963                                                  struct net_device *orig_dev)
4964{
4965        struct sk_buff *skb, *next;
4966
4967        if (!pt_prev)
4968                return;
4969        if (list_empty(head))
4970                return;
4971        if (pt_prev->list_func != NULL)
4972                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
4973                                   ip_list_rcv, head, pt_prev, orig_dev);
4974        else
4975                list_for_each_entry_safe(skb, next, head, list) {
4976                        skb_list_del_init(skb);
4977                        pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4978                }
4979}
4980
4981static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
4982{
4983        /* Fast-path assumptions:
4984         * - There is no RX handler.
4985         * - Only one packet_type matches.
4986         * If either of these fails, we will end up doing some per-packet
4987         * processing in-line, then handling the 'last ptype' for the whole
4988         * sublist.  This can't cause out-of-order delivery to any single ptype,
4989         * because the 'last ptype' must be constant across the sublist, and all
4990         * other ptypes are handled per-packet.
4991         */
4992        /* Current (common) ptype of sublist */
4993        struct packet_type *pt_curr = NULL;
4994        /* Current (common) orig_dev of sublist */
4995        struct net_device *od_curr = NULL;
4996        struct list_head sublist;
4997        struct sk_buff *skb, *next;
4998
4999        INIT_LIST_HEAD(&sublist);
5000        list_for_each_entry_safe(skb, next, head, list) {

5001                struct net_device *orig_dev = skb->dev;
5002                struct packet_type *pt_prev = NULL;
5003
5004                skb_list_del_init(skb);
5005                __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5006                if (!pt_prev)
5007                        continue;
5008                if (pt_curr != pt_prev || od_curr != orig_dev) {
5009                        /* dispatch old sublist */
5010                        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5011                        /* start new sublist */
5012                        INIT_LIST_HEAD(&sublist);
5013                        pt_curr = pt_prev;
5014                        od_curr = orig_dev;
5015                }
5016                list_add_tail(&skb->list, &sublist);
5017        }
5018
5019        /* dispatch final sublist */
5020        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5021}
5022
5023static int __netif_receive_skb(struct sk_buff *skb)
5024{
5025        int ret;
5026
5027        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5028                unsigned int noreclaim_flag;
5029
5030                /*
5031                 * PFMEMALLOC skbs are special, they should
5032                 * - be delivered to SOCK_MEMALLOC sockets only
5033                 * - stay away from userspace
5034                 * - have bounded memory usage
5035                 *
5036                 * Use PF_MEMALLOC as this saves us from propagating the allocation
5037                 * context down to all allocation sites.
5038                 */
5039                noreclaim_flag = memalloc_noreclaim_save();
5040                ret = __netif_receive_skb_one_core(skb, true);
5041                memalloc_noreclaim_restore(noreclaim_flag);
5042        } else
5043                ret = __netif_receive_skb_one_core(skb, false);
5044
5045        return ret;
5046}
5047
5048static void __netif_receive_skb_list(struct list_head *head)
5049{
5050        unsigned long noreclaim_flag = 0;
5051        struct sk_buff *skb, *next;
5052        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5053
5054        list_for_each_entry_safe(skb, next, head, list) {
5055                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5056                        struct list_head sublist;
5057
5058                        /* Handle the previous sublist */
5059                        list_cut_before(&sublist, head, &skb->list);
5060                        if (!list_empty(&sublist))
5061                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
5062                        pfmemalloc = !pfmemalloc;
5063                        /* See comments in __netif_receive_skb */
5064                        if (pfmemalloc)
5065                                noreclaim_flag = memalloc_noreclaim_save();
5066                        else
5067                                memalloc_noreclaim_restore(noreclaim_flag);
5068                }
5069        }
5070        /* Handle the remaining sublist */
5071        if (!list_empty(head))
5072                __netif_receive_skb_list_core(head, pfmemalloc);
5073        /* Restore pflags */
5074        if (pfmemalloc)
5075                memalloc_noreclaim_restore(noreclaim_flag);
5076}
5077
5078static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5079{
5080        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5081        struct bpf_prog *new = xdp->prog;
5082        int ret = 0;
5083
5084        switch (xdp->command) {
5085        case XDP_SETUP_PROG:
5086                rcu_assign_pointer(dev->xdp_prog, new);
5087                if (old)
5088                        bpf_prog_put(old);
5089
5090                if (old && !new) {
5091                        static_branch_dec(&generic_xdp_needed_key);
5092                } else if (new && !old) {
5093                        static_branch_inc(&generic_xdp_needed_key);
5094                        dev_disable_lro(dev);
5095                        dev_disable_gro_hw(dev);
5096                }
5097                break;
5098
5099        case XDP_QUERY_PROG:
5100                xdp->prog_id = old ? old->aux->id : 0;
5101                break;
5102
5103        default:
5104                ret = -EINVAL;
5105                break;
5106        }
5107
5108        return ret;
5109}
5110
5111static int netif_receive_skb_internal(struct sk_buff *skb)
5112{
5113        int ret;
5114
5115        net_timestamp_check(netdev_tstamp_prequeue, skb);
5116
5117        if (skb_defer_rx_timestamp(skb))
5118                return NET_RX_SUCCESS;
5119
5120        rcu_read_lock();
5121#ifdef CONFIG_RPS
5122        if (static_branch_unlikely(&rps_needed)) {
5123                struct rps_dev_flow voidflow, *rflow = &voidflow;
5124                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5125
5126                if (cpu >= 0) {
5127                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5128                        rcu_read_unlock();
5129                        return ret;
5130                }
5131        }
5132#endif
5133        ret = __netif_receive_skb(skb);
5134        rcu_read_unlock();
5135        return ret;
5136}
5137
5138static void netif_receive_skb_list_internal(struct list_head *head)
5139{
5140        struct sk_buff *skb, *next;
5141        struct list_head sublist;
5142
5143        INIT_LIST_HEAD(&sublist);
5144        list_for_each_entry_safe(skb, next, head, list) {
5145                net_timestamp_check(netdev_tstamp_prequeue, skb);
5146                skb_list_del_init(skb);
5147                if (!skb_defer_rx_timestamp(skb))
5148                        list_add_tail(&skb->list, &sublist);
5149        }
5150        list_splice_init(&sublist, head);
5151
5152        rcu_read_lock();
5153#ifdef CONFIG_RPS
5154        if (static_branch_unlikely(&rps_needed)) {
5155                list_for_each_entry_safe(skb, next, head, list) {
5156                        struct rps_dev_flow voidflow, *rflow = &voidflow;
5157                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5158
5159                        if (cpu >= 0) {
5160                                /* Will be handled, remove from list */
5161                                skb_list_del_init(skb);
5162                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5163                        }
5164                }
5165        }
5166#endif
5167        __netif_receive_skb_list(head);
5168        rcu_read_unlock();
5169}
5170
5171/**
5172 *      netif_receive_skb - process receive buffer from network
5173 *      @skb: buffer to process
5174 *
5175 *      netif_receive_skb() is the main receive data processing function.
5176 *      It always succeeds. The buffer may be dropped during processing
5177 *      for congestion control or by the protocol layers.
5178 *
5179 *      This function may only be called from softirq context and interrupts
5180 *      should be enabled.
5181 *
5182 *      Return values (usually ignored):
5183 *      NET_RX_SUCCESS: no congestion
5184 *      NET_RX_DROP: packet was dropped
5185 */
5186int netif_receive_skb(struct sk_buff *skb)
5187{
5188        int ret;
5189
5190        trace_netif_receive_skb_entry(skb);
5191
5192        ret = netif_receive_skb_internal(skb);
5193        trace_netif_receive_skb_exit(ret);
5194
5195        return ret;
5196}
5197EXPORT_SYMBOL(netif_receive_skb);
5198
5199/**
5200 *      netif_receive_skb_list - process many receive buffers from network
5201 *      @head: list of skbs to process.
5202 *
5203 *      Since return value of netif_receive_skb() is normally ignored, and
5204 *      wouldn't be meaningful for a list, this function returns void.
5205 *
5206 *      This function may only be called from softirq context and interrupts
5207 *      should be enabled.
5208 */
5209void netif_receive_skb_list(struct list_head *head)
5210{
5211        struct sk_buff *skb;
5212
5213        if (list_empty(head))
5214                return;
5215        if (trace_netif_receive_skb_list_entry_enabled()) {
5216                list_for_each_entry(skb, head, list)
5217                        trace_netif_receive_skb_list_entry(skb);
5218        }
5219        netif_receive_skb_list_internal(head);
5220        trace_netif_receive_skb_list_exit(0);
5221}
5222EXPORT_SYMBOL(netif_receive_skb_list);
5223
5224DEFINE_PER_CPU(struct work_struct, flush_works);
5225
5226/* Network device is going away, flush any packets still pending */
5227static void flush_backlog(struct work_struct *work)
5228{
5229        struct sk_buff *skb, *tmp;
5230        struct softnet_data *sd;
5231
5232        local_bh_disable();
5233        sd = this_cpu_ptr(&softnet_data);
5234
5235        local_irq_disable();
5236        rps_lock(sd);
5237        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5238                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5239                        __skb_unlink(skb, &sd->input_pkt_queue);
5240                        kfree_skb(skb);
5241                        input_queue_head_incr(sd);
5242                }
5243        }
5244        rps_unlock(sd);
5245        local_irq_enable();
5246
5247        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5248                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5249                        __skb_unlink(skb, &sd->process_queue);
5250                        kfree_skb(skb);
5251                        input_queue_head_incr(sd);
5252                }
5253        }
5254        local_bh_enable();
5255}
5256
5257static void flush_all_backlogs(void)
5258{
5259        unsigned int cpu;
5260
5261        get_online_cpus();
5262
5263        for_each_online_cpu(cpu)
5264                queue_work_on(cpu, system_highpri_wq,
5265                              per_cpu_ptr(&flush_works, cpu));
5266
5267        for_each_online_cpu(cpu)
5268                flush_work(per_cpu_ptr(&flush_works, cpu));
5269
5270        put_online_cpus();
5271}
5272
5273INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5274INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5275static int napi_gro_complete(struct sk_buff *skb)
5276{
5277        struct packet_offload *ptype;
5278        __be16 type = skb->protocol;
5279        struct list_head *head = &offload_base;
5280        int err = -ENOENT;
5281
5282        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5283
5284        if (NAPI_GRO_CB(skb)->count == 1) {
5285                skb_shinfo(skb)->gso_size = 0;
5286                goto out;
5287        }
5288
5289        rcu_read_lock();
5290        list_for_each_entry_rcu(ptype, head, list) {
5291                if (ptype->type != type || !ptype->callbacks.gro_complete)
5292                        continue;
5293
5294                err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5295                                         ipv6_gro_complete, inet_gro_complete,
5296                                         skb, 0);
5297                break;
5298        }
5299        rcu_read_unlock();
5300
5301        if (err) {
5302                WARN_ON(&ptype->list == head);
5303                kfree_skb(skb);
5304                return NET_RX_SUCCESS;
5305        }
5306
5307out:
5308        return netif_receive_skb_internal(skb);
5309}
5310
5311static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5312                                   bool flush_old)
5313{
5314        struct list_head *head = &napi->gro_hash[index].list;
5315        struct sk_buff *skb, *p;
5316
5317        list_for_each_entry_safe_reverse(skb, p, head, list) {
5318                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5319                        return;
5320                skb_list_del_init(skb);
5321                napi_gro_complete(skb);
5322                napi->gro_hash[index].count--;
5323        }
5324
5325        if (!napi->gro_hash[index].count)
5326                __clear_bit(index, &napi->gro_bitmask);
5327}
5328
5329/* napi->gro_hash[].list contains packets ordered by age.
5330 * youngest packets at the head of it.
5331 * Complete skbs in reverse order to reduce latencies.
5332 */
5333void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5334{
5335        unsigned long bitmask = napi->gro_bitmask;
5336        unsigned int i, base = ~0U;
5337
5338        while ((i = ffs(bitmask)) != 0) {
5339                bitmask >>= i;
5340                base += i;
5341                __napi_gro_flush_chain(napi, base, flush_old);
5342        }
5343}
5344EXPORT_SYMBOL(napi_gro_flush);
5345
5346static struct list_head *gro_list_prepare(struct napi_struct *napi,
5347                                          struct sk_buff *skb)
5348{
5349        unsigned int maclen = skb->dev->hard_header_len;
5350        u32 hash = skb_get_hash_raw(skb);
5351        struct list_head *head;
5352        struct sk_buff *p;
5353
5354        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5355        list_for_each_entry(p, head, list) {
5356                unsigned long diffs;
5357
5358                NAPI_GRO_CB(p)->flush = 0;
5359
5360                if (hash != skb_get_hash_raw(p)) {
5361                        NAPI_GRO_CB(p)->same_flow = 0;
5362                        continue;
5363                }
5364
5365                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5366                diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5367                if (skb_vlan_tag_present(p))
5368                        diffs |= p->vlan_tci ^ skb->vlan_tci;
5369                diffs |= skb_metadata_dst_cmp(p, skb);
5370                diffs |= skb_metadata_differs(p, skb);
5371                if (maclen == ETH_HLEN)
5372                        diffs |= compare_ether_header(skb_mac_header(p),
5373                                                      skb_mac_header(skb));
5374                else if (!diffs)
5375                        diffs = memcmp(skb_mac_header(p),
5376                                       skb_mac_header(skb),
5377                                       maclen);
5378                NAPI_GRO_CB(p)->same_flow = !diffs;
5379        }
5380
5381        return head;
5382}
5383
5384static void skb_gro_reset_offset(struct sk_buff *skb)
5385{
5386        const struct skb_shared_info *pinfo = skb_shinfo(skb);
5387        const skb_frag_t *frag0 = &pinfo->frags[0];
5388
5389        NAPI_GRO_CB(skb)->data_offset = 0;
5390        NAPI_GRO_CB(skb)->frag0 = NULL;
5391        NAPI_GRO_CB(skb)->frag0_len = 0;
5392
5393        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5394            pinfo->nr_frags &&
5395            !PageHighMem(skb_frag_page(frag0))) {
5396                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5397                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5398                                                    skb_frag_size(frag0),
5399                                                    skb->end - skb->tail);
5400        }
5401}
5402
5403static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5404{
5405        struct skb_shared_info *pinfo = skb_shinfo(skb);
5406
5407        BUG_ON(skb->end - skb->tail < grow);
5408
5409        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5410
5411        skb->data_len -= grow;
5412        skb->tail += grow;
5413
5414        skb_frag_off_add(&pinfo->frags[0], grow);
5415        skb_frag_size_sub(&pinfo->frags[0], grow);
5416
5417        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5418                skb_frag_unref(skb, 0);
5419                memmove(pinfo->frags, pinfo->frags + 1,
5420                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5421        }
5422}
5423
5424static void gro_flush_oldest(struct list_head *head)
5425{
5426        struct sk_buff *oldest;
5427
5428        oldest = list_last_entry(head, struct sk_buff, list);
5429
5430        /* We are called with head length >= MAX_GRO_SKBS, so this is
5431         * impossible.
5432         */
5433        if (WARN_ON_ONCE(!oldest))
5434                return;
5435
5436        /* Do not adjust napi->gro_hash[].count, caller is adding a new
5437         * SKB to the chain.
5438         */
5439        skb_list_del_init(oldest);
5440        napi_gro_complete(oldest);
5441}
5442
5443INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5444                                                           struct sk_buff *));
5445INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5446                                                           struct sk_buff *));
5447static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5448{
5449        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5450        struct list_head *head = &offload_base;
5451        struct packet_offload *ptype;
5452        __be16 type = skb->protocol;
5453        struct list_head *gro_head;
5454        struct sk_buff *pp = NULL;
5455        enum gro_result ret;
5456        int same_flow;
5457        int grow;
5458
5459        if (netif_elide_gro(skb->dev))
5460                goto normal;
5461
5462        gro_head = gro_list_prepare(napi, skb);
5463
5464        rcu_read_lock();
5465        list_for_each_entry_rcu(ptype, head, list) {
5466                if (ptype->type != type || !ptype->callbacks.gro_receive)
5467                        continue;
5468
5469                skb_set_network_header(skb, skb_gro_offset(skb));
5470                skb_reset_mac_len(skb);
5471                NAPI_GRO_CB(skb)->same_flow = 0;
5472                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5473                NAPI_GRO_CB(skb)->free = 0;
5474                NAPI_GRO_CB(skb)->encap_mark = 0;
5475                NAPI_GRO_CB(skb)->recursion_counter = 0;
5476                NAPI_GRO_CB(skb)->is_fou = 0;
5477                NAPI_GRO_CB(skb)->is_atomic = 1;
5478                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5479
5480                /* Setup for GRO checksum validation */
5481                switch (skb->ip_summed) {
5482                case CHECKSUM_COMPLETE:
5483                        NAPI_GRO_CB(skb)->csum = skb->csum;
5484                        NAPI_GRO_CB(skb)->csum_valid = 1;
5485                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5486                        break;
5487                case CHECKSUM_UNNECESSARY:
5488                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5489                        NAPI_GRO_CB(skb)->csum_valid = 0;
5490                        break;
5491                default:
5492                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5493                        NAPI_GRO_CB(skb)->csum_valid = 0;
5494                }
5495
5496                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5497                                        ipv6_gro_receive, inet_gro_receive,
5498                                        gro_head, skb);
5499                break;
5500        }
5501        rcu_read_unlock();
5502
5503        if (&ptype->list == head)
5504                goto normal;
5505
5506        if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
5507                ret = GRO_CONSUMED;
5508                goto ok;
5509        }
5510
5511        same_flow = NAPI_GRO_CB(skb)->same_flow;
5512        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5513
5514        if (pp) {
5515                skb_list_del_init(pp);
5516                napi_gro_complete(pp);
5517                napi->gro_hash[hash].count--;
5518        }
5519
5520        if (same_flow)
5521                goto ok;
5522
5523        if (NAPI_GRO_CB(skb)->flush)
5524                goto normal;
5525
5526        if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5527                gro_flush_oldest(gro_head);
5528        } else {
5529                napi->gro_hash[hash].count++;
5530        }
5531        NAPI_GRO_CB(skb)->count = 1;
5532        NAPI_GRO_CB(skb)->age = jiffies;
5533        NAPI_GRO_CB(skb)->last = skb;
5534        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5535        list_add(&skb->list, gro_head);
5536        ret = GRO_HELD;
5537
5538pull:
5539        grow = skb_gro_offset(skb) - skb_headlen(skb);
5540        if (grow > 0)
5541                gro_pull_from_frag0(skb, grow);
5542ok:
5543        if (napi->gro_hash[hash].count) {
5544                if (!test_bit(hash, &napi->gro_bitmask))
5545                        __set_bit(hash, &napi->gro_bitmask);
5546        } else if (test_bit(hash, &napi->gro_bitmask)) {
5547                __clear_bit(hash, &napi->gro_bitmask);
5548        }
5549
5550        return ret;
5551
5552normal:
5553        ret = GRO_NORMAL;
5554        goto pull;
5555}
5556
5557struct packet_offload *gro_find_receive_by_type(__be16 type)
5558{
5559        struct list_head *offload_head = &offload_base;
5560        struct packet_offload *ptype;
5561
5562        list_for_each_entry_rcu(ptype, offload_head, list) {
5563                if (ptype->type != type || !ptype->callbacks.gro_receive)
5564                        continue;
5565                return ptype;
5566        }
5567        return NULL;
5568}
5569EXPORT_SYMBOL(gro_find_receive_by_type);
5570
5571struct packet_offload *gro_find_complete_by_type(__be16 type)
5572{
5573        struct list_head *offload_head = &offload_base;
5574        struct packet_offload *ptype;
5575
5576        list_for_each_entry_rcu(ptype, offload_head, list) {
5577                if (ptype->type != type || !ptype->callbacks.gro_complete)
5578                        continue;
5579                return ptype;
5580        }
5581        return NULL;
5582}
5583EXPORT_SYMBOL(gro_find_complete_by_type);
5584
5585static void napi_skb_free_stolen_head(struct sk_buff *skb)
5586{
5587        skb_dst_drop(skb);
5588        skb_ext_put(skb);
5589        kmem_cache_free(skbuff_head_cache, skb);
5590}
5591
5592static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5593{
5594        switch (ret) {
5595        case GRO_NORMAL:
5596                if (netif_receive_skb_internal(skb))
5597                        ret = GRO_DROP;
5598                break;
5599
5600        case GRO_DROP:
5601                kfree_skb(skb);
5602                break;
5603
5604        case GRO_MERGED_FREE:
5605                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5606                        napi_skb_free_stolen_head(skb);
5607                else
5608                        __kfree_skb(skb);
5609                break;
5610
5611        case GRO_HELD:
5612        case GRO_MERGED:
5613        case GRO_CONSUMED:
5614                break;
5615        }
5616
5617        return ret;
5618}
5619
5620gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5621{
5622        gro_result_t ret;
5623
5624        skb_mark_napi_id(skb, napi);
5625        trace_napi_gro_receive_entry(skb);
5626
5627        skb_gro_reset_offset(skb);
5628
5629        ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
5630        trace_napi_gro_receive_exit(ret);
5631
5632        return ret;
5633}
5634EXPORT_SYMBOL(napi_gro_receive);
5635
5636static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5637{
5638        if (unlikely(skb->pfmemalloc)) {
5639                consume_skb(skb);
5640                return;
5641        }
5642        __skb_pull(skb, skb_headlen(skb));
5643        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5644        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5645        __vlan_hwaccel_clear_tag(skb);
5646        skb->dev = napi->dev;
5647        skb->skb_iif = 0;
5648
5649        /* eth_type_trans() assumes pkt_type is PACKET_HOST */
5650        skb->pkt_type = PACKET_HOST;
5651
5652        skb->encapsulation = 0;
5653        skb_shinfo(skb)->gso_type = 0;
5654        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5655        skb_ext_reset(skb);
5656
5657        napi->skb = skb;
5658}
5659
5660struct sk_buff *napi_get_frags(struct napi_struct *napi)
5661{
5662        struct sk_buff *skb = napi->skb;
5663
5664        if (!skb) {
5665                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5666                if (skb) {
5667                        napi->skb = skb;
5668                        skb_mark_napi_id(skb, napi);
5669                }
5670        }
5671        return skb;
5672}
5673EXPORT_SYMBOL(napi_get_frags);
5674
5675/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5676static void gro_normal_list(struct napi_struct *napi)
5677{
5678        if (!napi->rx_count)
5679                return;
5680        netif_receive_skb_list_internal(&napi->rx_list);
5681        INIT_LIST_HEAD(&napi->rx_list);
5682        napi->rx_count = 0;
5683}
5684
5685/* Queue one GRO_NORMAL SKB up for list processing.  If batch size exceeded,
5686 * pass the whole batch up to the stack.
5687 */
5688static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5689{
5690        list_add_tail(&skb->list, &napi->rx_list);
5691        if (++napi->rx_count >= gro_normal_batch)
5692                gro_normal_list(napi);
5693}
5694
5695static gro_result_t napi_frags_finish(struct napi_struct *napi,
5696                                      struct sk_buff *skb,
5697                                      gro_result_t ret)
5698{
5699        switch (ret) {
5700        case GRO_NORMAL:
5701        case GRO_HELD:
5702                __skb_push(skb, ETH_HLEN);
5703                skb->protocol = eth_type_trans(skb, skb->dev);
5704                if (ret == GRO_NORMAL)
5705                        gro_normal_one(napi, skb);
5706                break;
5707
5708        case GRO_DROP:
5709                napi_reuse_skb(napi, skb);
5710                break;
5711
5712        case GRO_MERGED_FREE:
5713                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5714                        napi_skb_free_stolen_head(skb);
5715                else
5716                        napi_reuse_skb(napi, skb);
5717                break;
5718
5719        case GRO_MERGED:
5720        case GRO_CONSUMED:
5721                break;
5722        }
5723
5724        return ret;
5725}
5726
5727/* Upper GRO stack assumes network header starts at gro_offset=0
5728 * Drivers could call both napi_gro_frags() and napi_gro_receive()
5729 * We copy ethernet header into skb->data to have a common layout.
5730 */
5731static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5732{
5733        struct sk_buff *skb = napi->skb;
5734        const struct ethhdr *eth;
5735        unsigned int hlen = sizeof(*eth);
5736
5737        napi->skb = NULL;
5738
5739        skb_reset_mac_header(skb);
5740        skb_gro_reset_offset(skb);
5741
5742        if (unlikely(skb_gro_header_hard(skb, hlen))) {
5743                eth = skb_gro_header_slow(skb, hlen, 0);
5744                if (unlikely(!eth)) {
5745                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5746                                             __func__, napi->dev->name);
5747                        napi_reuse_skb(napi, skb);
5748                        return NULL;
5749                }
5750        } else {
5751                eth = (const struct ethhdr *)skb->data;
5752                gro_pull_from_frag0(skb, hlen);
5753                NAPI_GRO_CB(skb)->frag0 += hlen;
5754                NAPI_GRO_CB(skb)->frag0_len -= hlen;
5755        }
5756        __skb_pull(skb, hlen);
5757
5758        /*
5759         * This works because the only protocols we care about don't require
5760         * special handling.
5761         * We'll fix it up properly in napi_frags_finish()
5762         */
5763        skb->protocol = eth->h_proto;
5764
5765        return skb;
5766}
5767
5768gro_result_t napi_gro_frags(struct napi_struct *napi)
5769{
5770        gro_result_t ret;
5771        struct sk_buff *skb = napi_frags_skb(napi);
5772
5773        if (!skb)
5774                return GRO_DROP;
5775
5776        trace_napi_gro_frags_entry(skb);
5777
5778        ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5779        trace_napi_gro_frags_exit(ret);
5780
5781        return ret;
5782}
5783EXPORT_SYMBOL(napi_gro_frags);
5784
5785/* Compute the checksum from gro_offset and return the folded value
5786 * after adding in any pseudo checksum.
5787 */
5788__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5789{
5790        __wsum wsum;
5791        __sum16 sum;
5792
5793        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5794
5795        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5796        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5797        /* See comments in __skb_checksum_complete(). */
5798        if (likely(!sum)) {
5799                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5800                    !skb->csum_complete_sw)
5801                        netdev_rx_csum_fault(skb->dev, skb);
5802        }
5803
5804        NAPI_GRO_CB(skb)->csum = wsum;
5805        NAPI_GRO_CB(skb)->csum_valid = 1;
5806
5807        return sum;
5808}
5809EXPORT_SYMBOL(__skb_gro_checksum_complete);
5810
5811static void net_rps_send_ipi(struct softnet_data *remsd)
5812{
5813#ifdef CONFIG_RPS
5814        while (remsd) {
5815                struct softnet_data *next = remsd->rps_ipi_next;
5816
5817                if (cpu_online(remsd->cpu))
5818                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
5819                remsd = next;
5820        }
5821#endif
5822}
5823
5824/*
5825 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
5826 * Note: called with local irq disabled, but exits with local irq enabled.
5827 */
5828static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5829{
5830#ifdef CONFIG_RPS
5831        struct softnet_data *remsd = sd->rps_ipi_list;
5832
5833        if (remsd) {
5834                sd->rps_ipi_list = NULL;
5835
5836                local_irq_enable();
5837
5838                /* Send pending IPI's to kick RPS processing on remote cpus. */
5839                net_rps_send_ipi(remsd);
5840        } else
5841#endif
5842                local_irq_enable();
5843}
5844
5845static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5846{
5847#ifdef CONFIG_RPS
5848        return sd->rps_ipi_list != NULL;
5849#else
5850        return false;
5851#endif
5852}
5853
5854static int process_backlog(struct napi_struct *napi, int quota)
5855{
5856        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5857        bool again = true;
5858        int work = 0;
5859
5860        /* Check if we have pending ipi, its better to send them now,
5861         * not waiting net_rx_action() end.
5862         */
5863        if (sd_has_rps_ipi_waiting(sd)) {
5864                local_irq_disable();
5865                net_rps_action_and_irq_enable(sd);
5866        }
5867
5868        napi->weight = dev_rx_weight;
5869        while (again) {
5870                struct sk_buff *skb;
5871
5872                while ((skb = __skb_dequeue(&sd->process_queue))) {
5873                        rcu_read_lock();
5874                        __netif_receive_skb(skb);
5875                        rcu_read_unlock();
5876                        input_queue_head_incr(sd);
5877                        if (++work >= quota)
5878                                return work;
5879
5880                }
5881
5882                local_irq_disable();
5883                rps_lock(sd);
5884                if (skb_queue_empty(&sd->input_pkt_queue)) {
5885                        /*
5886                         * Inline a custom version of __napi_complete().
5887                         * only current cpu owns and manipulates this napi,
5888                         * and NAPI_STATE_SCHED is the only possible flag set
5889                         * on backlog.
5890                         * We can use a plain write instead of clear_bit(),
5891                         * and we dont need an smp_mb() memory barrier.
5892                         */
5893                        napi->state = 0;
5894                        again = false;
5895                } else {
5896                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
5897                                                   &sd->process_queue);
5898                }
5899                rps_unlock(sd);
5900                local_irq_enable();
5901        }
5902
5903        return work;
5904}
5905
5906/**
5907 * __napi_schedule - schedule for receive
5908 * @n: entry to schedule
5909 *
5910 * The entry's receive function will be scheduled to run.
5911 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5912 */
5913void __napi_schedule(struct napi_struct *n)
5914{
5915        unsigned long flags;
5916
5917        local_irq_save(flags);
5918        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5919        local_irq_restore(flags);
5920}
5921EXPORT_SYMBOL(__napi_schedule);
5922
5923/**
5924 *      napi_schedule_prep - check if napi can be scheduled
5925 *      @n: napi context
5926 *
5927 * Test if NAPI routine is already running, and if not mark
5928 * it as running.  This is used as a condition variable
5929 * insure only one NAPI poll instance runs.  We also make
5930 * sure there is no pending NAPI disable.
5931 */
5932bool napi_schedule_prep(struct napi_struct *n)
5933{
5934        unsigned long val, new;
5935
5936        do {
5937                val = READ_ONCE(n->state);
5938                if (unlikely(val & NAPIF_STATE_DISABLE))
5939                        return false;
5940                new = val | NAPIF_STATE_SCHED;
5941
5942                /* Sets STATE_MISSED bit if STATE_SCHED was already set
5943                 * This was suggested by Alexander Duyck, as compiler
5944                 * emits better code than :
5945                 * if (val & NAPIF_STATE_SCHED)
5946                 *     new |= NAPIF_STATE_MISSED;
5947                 */
5948                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5949                                                   NAPIF_STATE_MISSED;
5950        } while (cmpxchg(&n->state, val, new) != val);
5951
5952        return !(val & NAPIF_STATE_SCHED);
5953}
5954EXPORT_SYMBOL(napi_schedule_prep);
5955
5956/**
5957 * __napi_schedule_irqoff - schedule for receive
5958 * @n: entry to schedule
5959 *
5960 * Variant of __napi_schedule() assuming hard irqs are masked
5961 */
5962void __napi_schedule_irqoff(struct napi_struct *n)
5963{
5964        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5965}
5966EXPORT_SYMBOL(__napi_schedule_irqoff);
5967
5968bool napi_complete_done(struct napi_struct *n, int work_done)
5969{
5970        unsigned long flags, val, new;
5971
5972        /*
5973         * 1) Don't let napi dequeue from the cpu poll list
5974         *    just in case its running on a different cpu.
5975         * 2) If we are busy polling, do nothing here, we have
5976         *    the guarantee we will be called later.
5977         */
5978        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5979                                 NAPIF_STATE_IN_BUSY_POLL)))
5980                return false;
5981
5982        gro_normal_list(n);
5983
5984        if (n->gro_bitmask) {
5985                unsigned long timeout = 0;
5986
5987                if (work_done)
5988                        timeout = n->dev->gro_flush_timeout;
5989
5990                /* When the NAPI instance uses a timeout and keeps postponing
5991                 * it, we need to bound somehow the time packets are kept in
5992                 * the GRO layer
5993                 */
5994                napi_gro_flush(n, !!timeout);
5995                if (timeout)
5996                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
5997                                      HRTIMER_MODE_REL_PINNED);
5998        }
5999        if (unlikely(!list_empty(&n->poll_list))) {
6000                /* If n->poll_list is not empty, we need to mask irqs */

6001                local_irq_save(flags);
6002                list_del_init(&n->poll_list);
6003                local_irq_restore(flags);
6004        }
6005
6006        do {
6007                val = READ_ONCE(n->state);
6008
6009                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6010
6011                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6012
6013                /* If STATE_MISSED was set, leave STATE_SCHED set,
6014                 * because we will call napi->poll() one more time.
6015                 * This C code was suggested by Alexander Duyck to help gcc.
6016                 */
6017                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6018                                                    NAPIF_STATE_SCHED;
6019        } while (cmpxchg(&n->state, val, new) != val);
6020
6021        if (unlikely(val & NAPIF_STATE_MISSED)) {
6022                __napi_schedule(n);
6023                return false;
6024        }
6025
6026        return true;
6027}
6028EXPORT_SYMBOL(napi_complete_done);
6029
6030/* must be called under rcu_read_lock(), as we dont take a reference */
6031static struct napi_struct *napi_by_id(unsigned int napi_id)
6032{
6033        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6034        struct napi_struct *napi;
6035
6036        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6037                if (napi->napi_id == napi_id)
6038                        return napi;
6039
6040        return NULL;
6041}
6042
6043#if defined(CONFIG_NET_RX_BUSY_POLL)
6044
6045#define BUSY_POLL_BUDGET 8
6046
6047static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6048{
6049        int rc;
6050
6051        /* Busy polling means there is a high chance device driver hard irq
6052         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6053         * set in napi_schedule_prep().
6054         * Since we are about to call napi->poll() once more, we can safely
6055         * clear NAPI_STATE_MISSED.
6056         *
6057         * Note: x86 could use a single "lock and ..." instruction
6058         * to perform these two clear_bit()
6059         */
6060        clear_bit(NAPI_STATE_MISSED, &napi->state);
6061        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6062
6063        local_bh_disable();
6064
6065        /* All we really want here is to re-enable device interrupts.
6066         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6067         */
6068        rc = napi->poll(napi, BUSY_POLL_BUDGET);
6069        /* We can't gro_normal_list() here, because napi->poll() might have
6070         * rearmed the napi (napi_complete_done()) in which case it could
6071         * already be running on another CPU.
6072         */
6073        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6074        netpoll_poll_unlock(have_poll_lock);
6075        if (rc == BUSY_POLL_BUDGET) {
6076                /* As the whole budget was spent, we still own the napi so can
6077                 * safely handle the rx_list.
6078                 */
6079                gro_normal_list(napi);
6080                __napi_schedule(napi);
6081        }
6082        local_bh_enable();
6083}
6084
6085void napi_busy_loop(unsigned int napi_id,
6086                    bool (*loop_end)(void *, unsigned long),
6087                    void *loop_end_arg)
6088{
6089        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6090        int (*napi_poll)(struct napi_struct *napi, int budget);
6091        void *have_poll_lock = NULL;
6092        struct napi_struct *napi;
6093
6094restart:
6095        napi_poll = NULL;
6096
6097        rcu_read_lock();
6098
6099        napi = napi_by_id(napi_id);
6100        if (!napi)
6101                goto out;
6102
6103        preempt_disable();
6104        for (;;) {
6105                int work = 0;
6106
6107                local_bh_disable();
6108                if (!napi_poll) {
6109                        unsigned long val = READ_ONCE(napi->state);
6110
6111                        /* If multiple threads are competing for this napi,
6112                         * we avoid dirtying napi->state as much as we can.
6113                         */
6114                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6115                                   NAPIF_STATE_IN_BUSY_POLL))
6116                                goto count;
6117                        if (cmpxchg(&napi->state, val,
6118                                    val | NAPIF_STATE_IN_BUSY_POLL |
6119                                          NAPIF_STATE_SCHED) != val)
6120                                goto count;
6121                        have_poll_lock = netpoll_poll_lock(napi);
6122                        napi_poll = napi->poll;
6123                }
6124                work = napi_poll(napi, BUSY_POLL_BUDGET);
6125                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6126                gro_normal_list(napi);
6127count:
6128                if (work > 0)
6129                        __NET_ADD_STATS(dev_net(napi->dev),
6130                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
6131                local_bh_enable();
6132
6133                if (!loop_end || loop_end(loop_end_arg, start_time))
6134                        break;
6135
6136                if (unlikely(need_resched())) {
6137                        if (napi_poll)
6138                                busy_poll_stop(napi, have_poll_lock);
6139                        preempt_enable();
6140                        rcu_read_unlock();
6141                        cond_resched();
6142                        if (loop_end(loop_end_arg, start_time))
6143                                return;
6144                        goto restart;
6145                }
6146                cpu_relax();
6147        }
6148        if (napi_poll)
6149                busy_poll_stop(napi, have_poll_lock);
6150        preempt_enable();
6151out:
6152        rcu_read_unlock();
6153}
6154EXPORT_SYMBOL(napi_busy_loop);
6155
6156#endif /* CONFIG_NET_RX_BUSY_POLL */
6157
6158static void napi_hash_add(struct napi_struct *napi)
6159{
6160        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6161            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6162                return;
6163
6164        spin_lock(&napi_hash_lock);
6165
6166        /* 0..NR_CPUS range is reserved for sender_cpu use */
6167        do {
6168                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6169                        napi_gen_id = MIN_NAPI_ID;
6170        } while (napi_by_id(napi_gen_id));
6171        napi->napi_id = napi_gen_id;
6172
6173        hlist_add_head_rcu(&napi->napi_hash_node,
6174                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6175
6176        spin_unlock(&napi_hash_lock);
6177}
6178
6179/* Warning : caller is responsible to make sure rcu grace period
6180 * is respected before freeing memory containing @napi
6181 */
6182bool napi_hash_del(struct napi_struct *napi)
6183{
6184        bool rcu_sync_needed = false;
6185
6186        spin_lock(&napi_hash_lock);
6187
6188        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6189                rcu_sync_needed = true;
6190                hlist_del_rcu(&napi->napi_hash_node);
6191        }
6192        spin_unlock(&napi_hash_lock);
6193        return rcu_sync_needed;
6194}
6195EXPORT_SYMBOL_GPL(napi_hash_del);
6196
6197static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6198{
6199        struct napi_struct *napi;
6200
6201        napi = container_of(timer, struct napi_struct, timer);
6202
6203        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6204         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6205         */
6206        if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6207            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6208                __napi_schedule_irqoff(napi);
6209
6210        return HRTIMER_NORESTART;
6211}
6212
6213static void init_gro_hash(struct napi_struct *napi)
6214{
6215        int i;
6216
6217        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6218                INIT_LIST_HEAD(&napi->gro_hash[i].list);
6219                napi->gro_hash[i].count = 0;
6220        }
6221        napi->gro_bitmask = 0;
6222}
6223
6224void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6225                    int (*poll)(struct napi_struct *, int), int weight)
6226{
6227        INIT_LIST_HEAD(&napi->poll_list);
6228        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6229        napi->timer.function = napi_watchdog;
6230        init_gro_hash(napi);
6231        napi->skb = NULL;
6232        INIT_LIST_HEAD(&napi->rx_list);
6233        napi->rx_count = 0;
6234        napi->poll = poll;
6235        if (weight > NAPI_POLL_WEIGHT)
6236                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6237                                weight);
6238        napi->weight = weight;
6239        list_add(&napi->dev_list, &dev->napi_list);
6240        napi->dev = dev;
6241#ifdef CONFIG_NETPOLL
6242        napi->poll_owner = -1;
6243#endif
6244        set_bit(NAPI_STATE_SCHED, &napi->state);
6245        napi_hash_add(napi);
6246}
6247EXPORT_SYMBOL(netif_napi_add);
6248
6249void napi_disable(struct napi_struct *n)
6250{
6251        might_sleep();
6252        set_bit(NAPI_STATE_DISABLE, &n->state);
6253
6254        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6255                msleep(1);
6256        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6257                msleep(1);
6258
6259        hrtimer_cancel(&n->timer);
6260
6261        clear_bit(NAPI_STATE_DISABLE, &n->state);
6262}
6263EXPORT_SYMBOL(napi_disable);
6264
6265static void flush_gro_hash(struct napi_struct *napi)
6266{
6267        int i;
6268
6269        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6270                struct sk_buff *skb, *n;
6271
6272                list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6273                        kfree_skb(skb);
6274                napi->gro_hash[i].count = 0;
6275        }
6276}
6277
6278/* Must be called in process context */
6279void netif_napi_del(struct napi_struct *napi)
6280{
6281        might_sleep();
6282        if (napi_hash_del(napi))
6283                synchronize_net();
6284        list_del_init(&napi->dev_list);
6285        napi_free_frags(napi);
6286
6287        flush_gro_hash(napi);
6288        napi->gro_bitmask = 0;
6289}
6290EXPORT_SYMBOL(netif_napi_del);
6291
6292static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6293{
6294        void *have;
6295        int work, weight;
6296
6297        list_del_init(&n->poll_list);
6298
6299        have = netpoll_poll_lock(n);
6300
6301        weight = n->weight;
6302
6303        /* This NAPI_STATE_SCHED test is for avoiding a race
6304         * with netpoll's poll_napi().  Only the entity which
6305         * obtains the lock and sees NAPI_STATE_SCHED set will
6306         * actually make the ->poll() call.  Therefore we avoid
6307         * accidentally calling ->poll() when NAPI is not scheduled.
6308         */
6309        work = 0;
6310        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6311                work = n->poll(n, weight);
6312                trace_napi_poll(n, work, weight);
6313        }
6314
6315        WARN_ON_ONCE(work > weight);
6316
6317        if (likely(work < weight))
6318                goto out_unlock;
6319
6320        /* Drivers must not modify the NAPI state if they
6321         * consume the entire weight.  In such cases this code
6322         * still "owns" the NAPI instance and therefore can
6323         * move the instance around on the list at-will.
6324         */
6325        if (unlikely(napi_disable_pending(n))) {
6326                napi_complete(n);
6327                goto out_unlock;
6328        }
6329
6330        gro_normal_list(n);
6331
6332        if (n->gro_bitmask) {
6333                /* flush too old packets
6334                 * If HZ < 1000, flush all packets.
6335                 */
6336                napi_gro_flush(n, HZ >= 1000);
6337        }
6338
6339        /* Some drivers may have called napi_schedule
6340         * prior to exhausting their budget.
6341         */
6342        if (unlikely(!list_empty(&n->poll_list))) {
6343                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6344                             n->dev ? n->dev->name : "backlog");
6345                goto out_unlock;
6346        }
6347
6348        list_add_tail(&n->poll_list, repoll);
6349
6350out_unlock:
6351        netpoll_poll_unlock(have);
6352
6353        return work;
6354}
6355
6356static __latent_entropy void net_rx_action(struct softirq_action *h)
6357{
6358        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6359        unsigned long time_limit = jiffies +
6360                usecs_to_jiffies(netdev_budget_usecs);
6361        int budget = netdev_budget;
6362        LIST_HEAD(list);
6363        LIST_HEAD(repoll);
6364
6365        local_irq_disable();
6366        list_splice_init(&sd->poll_list, &list);
6367        local_irq_enable();
6368
6369        for (;;) {
6370                struct napi_struct *n;
6371
6372                if (list_empty(&list)) {
6373                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6374                                goto out;
6375                        break;
6376                }
6377
6378                n = list_first_entry(&list, struct napi_struct, poll_list);
6379                budget -= napi_poll(n, &repoll);
6380
6381                /* If softirq window is exhausted then punt.
6382                 * Allow this to run for 2 jiffies since which will allow
6383                 * an average latency of 1.5/HZ.
6384                 */
6385                if (unlikely(budget <= 0 ||
6386                             time_after_eq(jiffies, time_limit))) {
6387                        sd->time_squeeze++;
6388                        break;
6389                }
6390        }
6391
6392        local_irq_disable();
6393
6394        list_splice_tail_init(&sd->poll_list, &list);
6395        list_splice_tail(&repoll, &list);
6396        list_splice(&list, &sd->poll_list);
6397        if (!list_empty(&sd->poll_list))
6398                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6399
6400        net_rps_action_and_irq_enable(sd);
6401out:
6402        __kfree_skb_flush();
6403}
6404
6405struct netdev_adjacent {
6406        struct net_device *dev;
6407
6408        /* upper master flag, there can only be one master device per list */
6409        bool master;
6410
6411        /* lookup ignore flag */
6412        bool ignore;
6413
6414        /* counter for the number of times this device was added to us */
6415        u16 ref_nr;
6416
6417        /* private field for the users */
6418        void *private;
6419
6420        struct list_head list;
6421        struct rcu_head rcu;
6422};
6423
6424static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6425                                                 struct list_head *adj_list)
6426{
6427        struct netdev_adjacent *adj;
6428
6429        list_for_each_entry(adj, adj_list, list) {
6430                if (adj->dev == adj_dev)
6431                        return adj;
6432        }
6433        return NULL;
6434}
6435
6436static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6437{
6438        struct net_device *dev = data;
6439
6440        return upper_dev == dev;
6441}
6442
6443/**
6444 * netdev_has_upper_dev - Check if device is linked to an upper device
6445 * @dev: device
6446 * @upper_dev: upper device to check
6447 *
6448 * Find out if a device is linked to specified upper device and return true
6449 * in case it is. Note that this checks only immediate upper device,
6450 * not through a complete stack of devices. The caller must hold the RTNL lock.
6451 */
6452bool netdev_has_upper_dev(struct net_device *dev,
6453                          struct net_device *upper_dev)
6454{
6455        ASSERT_RTNL();
6456
6457        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6458                                             upper_dev);
6459}
6460EXPORT_SYMBOL(netdev_has_upper_dev);
6461
6462/**
6463 * netdev_has_upper_dev_all - Check if device is linked to an upper device
6464 * @dev: device
6465 * @upper_dev: upper device to check
6466 *
6467 * Find out if a device is linked to specified upper device and return true
6468 * in case it is. Note that this checks the entire upper device chain.
6469 * The caller must hold rcu lock.
6470 */
6471
6472bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6473                                  struct net_device *upper_dev)
6474{
6475        return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6476                                               upper_dev);
6477}
6478EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6479
6480/**
6481 * netdev_has_any_upper_dev - Check if device is linked to some device
6482 * @dev: device
6483 *
6484 * Find out if a device is linked to an upper device and return true in case
6485 * it is. The caller must hold the RTNL lock.
6486 */
6487bool netdev_has_any_upper_dev(struct net_device *dev)
6488{
6489        ASSERT_RTNL();
6490
6491        return !list_empty(&dev->adj_list.upper);
6492}
6493EXPORT_SYMBOL(netdev_has_any_upper_dev);
6494
6495/**
6496 * netdev_master_upper_dev_get - Get master upper device
6497 * @dev: device
6498 *
6499 * Find a master upper device and return pointer to it or NULL in case
6500 * it's not there. The caller must hold the RTNL lock.
6501 */
6502struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6503{
6504        struct netdev_adjacent *upper;
6505
6506        ASSERT_RTNL();
6507
6508        if (list_empty(&dev->adj_list.upper))
6509                return NULL;
6510
6511        upper = list_first_entry(&dev->adj_list.upper,
6512                                 struct netdev_adjacent, list);
6513        if (likely(upper->master))
6514                return upper->dev;
6515        return NULL;
6516}
6517EXPORT_SYMBOL(netdev_master_upper_dev_get);
6518
6519static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6520{
6521        struct netdev_adjacent *upper;
6522
6523        ASSERT_RTNL();
6524
6525        if (list_empty(&dev->adj_list.upper))
6526                return NULL;
6527
6528        upper = list_first_entry(&dev->adj_list.upper,
6529                                 struct netdev_adjacent, list);
6530        if (likely(upper->master) && !upper->ignore)
6531                return upper->dev;
6532        return NULL;
6533}
6534
6535/**
6536 * netdev_has_any_lower_dev - Check if device is linked to some device
6537 * @dev: device
6538 *
6539 * Find out if a device is linked to a lower device and return true in case
6540 * it is. The caller must hold the RTNL lock.
6541 */
6542static bool netdev_has_any_lower_dev(struct net_device *dev)
6543{
6544        ASSERT_RTNL();
6545
6546        return !list_empty(&dev->adj_list.lower);
6547}
6548
6549void *netdev_adjacent_get_private(struct list_head *adj_list)
6550{
6551        struct netdev_adjacent *adj;
6552
6553        adj = list_entry(adj_list, struct netdev_adjacent, list);
6554
6555        return adj->private;
6556}
6557EXPORT_SYMBOL(netdev_adjacent_get_private);
6558
6559/**
6560 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6561 * @dev: device
6562 * @iter: list_head ** of the current position
6563 *
6564 * Gets the next device from the dev's upper list, starting from iter
6565 * position. The caller must hold RCU read lock.
6566 */
6567struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6568                                                 struct list_head **iter)
6569{
6570        struct netdev_adjacent *upper;
6571
6572        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6573
6574        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6575
6576        if (&upper->list == &dev->adj_list.upper)
6577                return NULL;
6578
6579        *iter = &upper->list;
6580
6581        return upper->dev;
6582}
6583EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6584
6585static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6586                                                  struct list_head **iter,
6587                                                  bool *ignore)
6588{
6589        struct netdev_adjacent *upper;
6590
6591        upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6592
6593        if (&upper->list == &dev->adj_list.upper)
6594                return NULL;
6595
6596        *iter = &upper->list;
6597        *ignore = upper->ignore;
6598
6599        return upper->dev;
6600}
6601
6602static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6603                                                    struct list_head **iter)
6604{
6605        struct netdev_adjacent *upper;
6606
6607        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6608
6609        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6610
6611        if (&upper->list == &dev->adj_list.upper)
6612                return NULL;
6613
6614        *iter = &upper->list;
6615
6616        return upper->dev;
6617}
6618
6619static int __netdev_walk_all_upper_dev(struct net_device *dev,
6620                                       int (*fn)(struct net_device *dev,
6621                                                 void *data),
6622                                       void *data)
6623{
6624        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6625        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6626        int ret, cur = 0;
6627        bool ignore;
6628
6629        now = dev;
6630        iter = &dev->adj_list.upper;
6631
6632        while (1) {
6633                if (now != dev) {
6634                        ret = fn(now, data);
6635                        if (ret)
6636                                return ret;
6637                }
6638
6639                next = NULL;
6640                while (1) {
6641                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
6642                        if (!udev)
6643                                break;
6644                        if (ignore)
6645                                continue;
6646
6647                        next = udev;
6648                        niter = &udev->adj_list.upper;
6649                        dev_stack[cur] = now;
6650                        iter_stack[cur++] = iter;
6651                        break;
6652                }
6653
6654                if (!next) {
6655                        if (!cur)
6656                                return 0;
6657                        next = dev_stack[--cur];
6658                        niter = iter_stack[cur];
6659                }
6660
6661                now = next;
6662                iter = niter;
6663        }
6664
6665        return 0;
6666}
6667
6668int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6669                                  int (*fn)(struct net_device *dev,
6670                                            void *data),
6671                                  void *data)
6672{
6673        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6674        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6675        int ret, cur = 0;
6676
6677        now = dev;
6678        iter = &dev->adj_list.upper;
6679
6680        while (1) {
6681                if (now != dev) {
6682                        ret = fn(now, data);
6683                        if (ret)
6684                                return ret;
6685                }
6686
6687                next = NULL;
6688                while (1) {
6689                        udev = netdev_next_upper_dev_rcu(now, &iter);
6690                        if (!udev)
6691                                break;
6692
6693                        next = udev;
6694                        niter = &udev->adj_list.upper;
6695                        dev_stack[cur] = now;
6696                        iter_stack[cur++] = iter;
6697                        break;
6698                }
6699
6700                if (!next) {
6701                        if (!cur)
6702                                return 0;
6703                        next = dev_stack[--cur];
6704                        niter = iter_stack[cur];
6705                }
6706
6707                now = next;
6708                iter = niter;
6709        }
6710
6711        return 0;
6712}
6713EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6714
6715static bool __netdev_has_upper_dev(struct net_device *dev,
6716                                   struct net_device *upper_dev)
6717{
6718        ASSERT_RTNL();
6719
6720        return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
6721                                           upper_dev);
6722}
6723
6724/**
6725 * netdev_lower_get_next_private - Get the next ->private from the
6726 *                                 lower neighbour list
6727 * @dev: device
6728 * @iter: list_head ** of the current position
6729 *
6730 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6731 * list, starting from iter position. The caller must hold either hold the
6732 * RTNL lock or its own locking that guarantees that the neighbour lower
6733 * list will remain unchanged.
6734 */
6735void *netdev_lower_get_next_private(struct net_device *dev,
6736                                    struct list_head **iter)
6737{
6738        struct netdev_adjacent *lower;
6739
6740        lower = list_entry(*iter, struct netdev_adjacent, list);
6741
6742        if (&lower->list == &dev->adj_list.lower)
6743                return NULL;
6744
6745        *iter = lower->list.next;
6746
6747        return lower->private;
6748}
6749EXPORT_SYMBOL(netdev_lower_get_next_private);
6750
6751/**
6752 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6753 *                                     lower neighbour list, RCU
6754 *                                     variant
6755 * @dev: device
6756 * @iter: list_head ** of the current position
6757 *
6758 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6759 * list, starting from iter position. The caller must hold RCU read lock.
6760 */
6761void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6762                                        struct list_head **iter)
6763{
6764        struct netdev_adjacent *lower;
6765
6766        WARN_ON_ONCE(!rcu_read_lock_held());
6767
6768        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6769
6770        if (&lower->list == &dev->adj_list.lower)
6771                return NULL;
6772
6773        *iter = &lower->list;
6774
6775        return lower->private;
6776}
6777EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6778
6779/**
6780 * netdev_lower_get_next - Get the next device from the lower neighbour
6781 *                         list
6782 * @dev: device
6783 * @iter: list_head ** of the current position
6784 *
6785 * Gets the next netdev_adjacent from the dev's lower neighbour
6786 * list, starting from iter position. The caller must hold RTNL lock or
6787 * its own locking that guarantees that the neighbour lower
6788 * list will remain unchanged.
6789 */
6790void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6791{
6792        struct netdev_adjacent *lower;
6793
6794        lower = list_entry(*iter, struct netdev_adjacent, list);
6795
6796        if (&lower->list == &dev->adj_list.lower)
6797                return NULL;
6798
6799        *iter = lower->list.next;
6800
6801        return lower->dev;
6802}
6803EXPORT_SYMBOL(netdev_lower_get_next);
6804
6805static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6806                                                struct list_head **iter)
6807{
6808        struct netdev_adjacent *lower;
6809
6810        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6811
6812        if (&lower->list == &dev->adj_list.lower)
6813                return NULL;
6814
6815        *iter = &lower->list;
6816
6817        return lower->dev;
6818}
6819
6820static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
6821                                                  struct list_head **iter,
6822                                                  bool *ignore)
6823{
6824        struct netdev_adjacent *lower;
6825
6826        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6827
6828        if (&lower->list == &dev->adj_list.lower)
6829                return NULL;
6830
6831        *iter = &lower->list;
6832        *ignore = lower->ignore;
6833
6834        return lower->dev;
6835}
6836
6837int netdev_walk_all_lower_dev(struct net_device *dev,
6838                              int (*fn)(struct net_device *dev,
6839                                        void *data),
6840                              void *data)
6841{
6842        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6843        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6844        int ret, cur = 0;
6845
6846        now = dev;
6847        iter = &dev->adj_list.lower;
6848
6849        while (1) {
6850                if (now != dev) {
6851                        ret = fn(now, data);
6852                        if (ret)
6853                                return ret;
6854                }
6855
6856                next = NULL;
6857                while (1) {
6858                        ldev = netdev_next_lower_dev(now, &iter);
6859                        if (!ldev)
6860                                break;
6861
6862                        next = ldev;
6863                        niter = &ldev->adj_list.lower;
6864                        dev_stack[cur] = now;
6865                        iter_stack[cur++] = iter;
6866                        break;
6867                }
6868
6869                if (!next) {
6870                        if (!cur)
6871                                return 0;
6872                        next = dev_stack[--cur];
6873                        niter = iter_stack[cur];
6874                }
6875
6876                now = next;
6877                iter = niter;
6878        }
6879
6880        return 0;
6881}
6882EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6883
6884static int __netdev_walk_all_lower_dev(struct net_device *dev,
6885                                       int (*fn)(struct net_device *dev,
6886                                                 void *data),
6887                                       void *data)
6888{
6889        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6890        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6891        int ret, cur = 0;
6892        bool ignore;
6893
6894        now = dev;
6895        iter = &dev->adj_list.lower;
6896
6897        while (1) {
6898                if (now != dev) {
6899                        ret = fn(now, data);
6900                        if (ret)
6901                                return ret;
6902                }
6903
6904                next = NULL;
6905                while (1) {
6906                        ldev = __netdev_next_lower_dev(now, &iter, &ignore);
6907                        if (!ldev)
6908                                break;
6909                        if (ignore)
6910                                continue;
6911
6912                        next = ldev;
6913                        niter = &ldev->adj_list.lower;
6914                        dev_stack[cur] = now;
6915                        iter_stack[cur++] = iter;
6916                        break;
6917                }
6918
6919                if (!next) {
6920                        if (!cur)
6921                                return 0;
6922                        next = dev_stack[--cur];
6923                        niter = iter_stack[cur];
6924                }
6925
6926                now = next;
6927                iter = niter;
6928        }
6929
6930        return 0;
6931}
6932
6933static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6934                                                    struct list_head **iter)
6935{
6936        struct netdev_adjacent *lower;
6937
6938        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6939        if (&lower->list == &dev->adj_list.lower)
6940                return NULL;
6941
6942        *iter = &lower->list;
6943
6944        return lower->dev;
6945}
6946
6947static u8 __netdev_upper_depth(struct net_device *dev)
6948{
6949        struct net_device *udev;
6950        struct list_head *iter;
6951        u8 max_depth = 0;
6952        bool ignore;
6953
6954        for (iter = &dev->adj_list.upper,
6955             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
6956             udev;
6957             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
6958                if (ignore)
6959                        continue;
6960                if (max_depth < udev->upper_level)
6961                        max_depth = udev->upper_level;
6962        }
6963
6964        return max_depth;
6965}
6966
6967static u8 __netdev_lower_depth(struct net_device *dev)
6968{
6969        struct net_device *ldev;
6970        struct list_head *iter;
6971        u8 max_depth = 0;
6972        bool ignore;
6973
6974        for (iter = &dev->adj_list.lower,
6975             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
6976             ldev;
6977             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
6978                if (ignore)
6979                        continue;
6980                if (max_depth < ldev->lower_level)
6981                        max_depth = ldev->lower_level;
6982        }
6983
6984        return max_depth;
6985}
6986
6987static int __netdev_update_upper_level(struct net_device *dev, void *data)
6988{
6989        dev->upper_level = __netdev_upper_depth(dev) + 1;
6990        return 0;
6991}
6992
6993static int __netdev_update_lower_level(struct net_device *dev, void *data)
6994{
6995        dev->lower_level = __netdev_lower_depth(dev) + 1;
6996        return 0;
6997}
6998
6999int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7000                                  int (*fn)(struct net_device *dev,

7001                                            void *data),
7002                                  void *data)
7003{
7004        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7005        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7006        int ret, cur = 0;
7007
7008        now = dev;
7009        iter = &dev->adj_list.lower;
7010
7011        while (1) {
7012                if (now != dev) {
7013                        ret = fn(now, data);
7014                        if (ret)
7015                                return ret;
7016                }
7017
7018                next = NULL;
7019                while (1) {
7020                        ldev = netdev_next_lower_dev_rcu(now, &iter);
7021                        if (!ldev)
7022                                break;
7023
7024                        next = ldev;
7025                        niter = &ldev->adj_list.lower;
7026                        dev_stack[cur] = now;
7027                        iter_stack[cur++] = iter;
7028                        break;
7029                }
7030
7031                if (!next) {
7032                        if (!cur)
7033                                return 0;
7034                        next = dev_stack[--cur];
7035                        niter = iter_stack[cur];
7036                }
7037
7038                now = next;
7039                iter = niter;
7040        }
7041
7042        return 0;
7043}
7044EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7045
7046/**
7047 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7048 *                                     lower neighbour list, RCU
7049 *                                     variant
7050 * @dev: device
7051 *
7052 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7053 * list. The caller must hold RCU read lock.
7054 */
7055void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7056{
7057        struct netdev_adjacent *lower;
7058
7059        lower = list_first_or_null_rcu(&dev->adj_list.lower,
7060                        struct netdev_adjacent, list);
7061        if (lower)
7062                return lower->private;
7063        return NULL;
7064}
7065EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7066
7067/**
7068 * netdev_master_upper_dev_get_rcu - Get master upper device
7069 * @dev: device
7070 *
7071 * Find a master upper device and return pointer to it or NULL in case
7072 * it's not there. The caller must hold the RCU read lock.
7073 */
7074struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7075{
7076        struct netdev_adjacent *upper;
7077
7078        upper = list_first_or_null_rcu(&dev->adj_list.upper,
7079                                       struct netdev_adjacent, list);
7080        if (upper && likely(upper->master))
7081                return upper->dev;
7082        return NULL;
7083}
7084EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7085
7086static int netdev_adjacent_sysfs_add(struct net_device *dev,
7087                              struct net_device *adj_dev,
7088                              struct list_head *dev_list)
7089{
7090        char linkname[IFNAMSIZ+7];
7091
7092        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7093                "upper_%s" : "lower_%s", adj_dev->name);
7094        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7095                                 linkname);
7096}
7097static void netdev_adjacent_sysfs_del(struct net_device *dev,
7098                               char *name,
7099                               struct list_head *dev_list)
7100{
7101        char linkname[IFNAMSIZ+7];
7102
7103        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7104                "upper_%s" : "lower_%s", name);
7105        sysfs_remove_link(&(dev->dev.kobj), linkname);
7106}
7107
7108static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7109                                                 struct net_device *adj_dev,
7110                                                 struct list_head *dev_list)
7111{
7112        return (dev_list == &dev->adj_list.upper ||
7113                dev_list == &dev->adj_list.lower) &&
7114                net_eq(dev_net(dev), dev_net(adj_dev));
7115}
7116
7117static int __netdev_adjacent_dev_insert(struct net_device *dev,
7118                                        struct net_device *adj_dev,
7119                                        struct list_head *dev_list,
7120                                        void *private, bool master)
7121{
7122        struct netdev_adjacent *adj;
7123        int ret;
7124
7125        adj = __netdev_find_adj(adj_dev, dev_list);
7126
7127        if (adj) {
7128                adj->ref_nr += 1;
7129                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7130                         dev->name, adj_dev->name, adj->ref_nr);
7131
7132                return 0;
7133        }
7134
7135        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7136        if (!adj)
7137                return -ENOMEM;
7138
7139        adj->dev = adj_dev;
7140        adj->master = master;
7141        adj->ref_nr = 1;
7142        adj->private = private;
7143        adj->ignore = false;
7144        dev_hold(adj_dev);
7145
7146        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7147                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7148
7149        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7150                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7151                if (ret)
7152                        goto free_adj;
7153        }
7154
7155        /* Ensure that master link is always the first item in list. */
7156        if (master) {
7157                ret = sysfs_create_link(&(dev->dev.kobj),
7158                                        &(adj_dev->dev.kobj), "master");
7159                if (ret)
7160                        goto remove_symlinks;
7161
7162                list_add_rcu(&adj->list, dev_list);
7163        } else {
7164                list_add_tail_rcu(&adj->list, dev_list);
7165        }
7166
7167        return 0;
7168
7169remove_symlinks:
7170        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7171                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7172free_adj:
7173        kfree(adj);
7174        dev_put(adj_dev);
7175
7176        return ret;
7177}
7178
7179static void __netdev_adjacent_dev_remove(struct net_device *dev,
7180                                         struct net_device *adj_dev,
7181                                         u16 ref_nr,
7182                                         struct list_head *dev_list)
7183{
7184        struct netdev_adjacent *adj;
7185
7186        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7187                 dev->name, adj_dev->name, ref_nr);
7188
7189        adj = __netdev_find_adj(adj_dev, dev_list);
7190
7191        if (!adj) {
7192                pr_err("Adjacency does not exist for device %s from %s\n",
7193                       dev->name, adj_dev->name);
7194                WARN_ON(1);
7195                return;
7196        }
7197
7198        if (adj->ref_nr > ref_nr) {
7199                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7200                         dev->name, adj_dev->name, ref_nr,
7201                         adj->ref_nr - ref_nr);
7202                adj->ref_nr -= ref_nr;
7203                return;
7204        }
7205
7206        if (adj->master)
7207                sysfs_remove_link(&(dev->dev.kobj), "master");
7208
7209        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7210                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7211
7212        list_del_rcu(&adj->list);
7213        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7214                 adj_dev->name, dev->name, adj_dev->name);
7215        dev_put(adj_dev);
7216        kfree_rcu(adj, rcu);
7217}
7218
7219static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7220                                            struct net_device *upper_dev,
7221                                            struct list_head *up_list,
7222                                            struct list_head *down_list,
7223                                            void *private, bool master)
7224{
7225        int ret;
7226
7227        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7228                                           private, master);
7229        if (ret)
7230                return ret;
7231
7232        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7233                                           private, false);
7234        if (ret) {
7235                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7236                return ret;
7237        }
7238
7239        return 0;
7240}
7241
7242static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7243                                               struct net_device *upper_dev,
7244                                               u16 ref_nr,
7245                                               struct list_head *up_list,
7246                                               struct list_head *down_list)
7247{
7248        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7249        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7250}
7251
7252static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7253                                                struct net_device *upper_dev,
7254                                                void *private, bool master)
7255{
7256        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7257                                                &dev->adj_list.upper,
7258                                                &upper_dev->adj_list.lower,
7259                                                private, master);
7260}
7261
7262static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7263                                                   struct net_device *upper_dev)
7264{
7265        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7266                                           &dev->adj_list.upper,
7267                                           &upper_dev->adj_list.lower);
7268}
7269
7270static int __netdev_upper_dev_link(struct net_device *dev,
7271                                   struct net_device *upper_dev, bool master,
7272                                   void *upper_priv, void *upper_info,
7273                                   struct netlink_ext_ack *extack)
7274{
7275        struct netdev_notifier_changeupper_info changeupper_info = {
7276                .info = {
7277                        .dev = dev,
7278                        .extack = extack,
7279                },
7280                .upper_dev = upper_dev,
7281                .master = master,
7282                .linking = true,
7283                .upper_info = upper_info,
7284        };
7285        struct net_device *master_dev;
7286        int ret = 0;
7287
7288        ASSERT_RTNL();
7289
7290        if (dev == upper_dev)
7291                return -EBUSY;
7292
7293        /* To prevent loops, check if dev is not upper device to upper_dev. */
7294        if (__netdev_has_upper_dev(upper_dev, dev))
7295                return -EBUSY;
7296
7297        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7298                return -EMLINK;
7299
7300        if (!master) {
7301                if (__netdev_has_upper_dev(dev, upper_dev))
7302                        return -EEXIST;
7303        } else {
7304                master_dev = __netdev_master_upper_dev_get(dev);
7305                if (master_dev)
7306                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
7307        }
7308
7309        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7310                                            &changeupper_info.info);
7311        ret = notifier_to_errno(ret);
7312        if (ret)
7313                return ret;
7314
7315        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7316                                                   master);
7317        if (ret)
7318                return ret;
7319
7320        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7321                                            &changeupper_info.info);
7322        ret = notifier_to_errno(ret);
7323        if (ret)
7324                goto rollback;
7325
7326        __netdev_update_upper_level(dev, NULL);
7327        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7328
7329        __netdev_update_lower_level(upper_dev, NULL);
7330        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7331                                    NULL);
7332
7333        return 0;
7334
7335rollback:
7336        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7337
7338        return ret;
7339}
7340
7341/**
7342 * netdev_upper_dev_link - Add a link to the upper device
7343 * @dev: device
7344 * @upper_dev: new upper device
7345 * @extack: netlink extended ack
7346 *
7347 * Adds a link to device which is upper to this one. The caller must hold
7348 * the RTNL lock. On a failure a negative errno code is returned.
7349 * On success the reference counts are adjusted and the function
7350 * returns zero.
7351 */
7352int netdev_upper_dev_link(struct net_device *dev,
7353                          struct net_device *upper_dev,
7354                          struct netlink_ext_ack *extack)
7355{
7356        return __netdev_upper_dev_link(dev, upper_dev, false,
7357                                       NULL, NULL, extack);
7358}
7359EXPORT_SYMBOL(netdev_upper_dev_link);
7360
7361/**
7362 * netdev_master_upper_dev_link - Add a master link to the upper device
7363 * @dev: device
7364 * @upper_dev: new upper device
7365 * @upper_priv: upper device private
7366 * @upper_info: upper info to be passed down via notifier
7367 * @extack: netlink extended ack
7368 *
7369 * Adds a link to device which is upper to this one. In this case, only
7370 * one master upper device can be linked, although other non-master devices
7371 * might be linked as well. The caller must hold the RTNL lock.
7372 * On a failure a negative errno code is returned. On success the reference
7373 * counts are adjusted and the function returns zero.
7374 */
7375int netdev_master_upper_dev_link(struct net_device *dev,
7376                                 struct net_device *upper_dev,
7377                                 void *upper_priv, void *upper_info,
7378                                 struct netlink_ext_ack *extack)
7379{
7380        return __netdev_upper_dev_link(dev, upper_dev, true,
7381                                       upper_priv, upper_info, extack);
7382}
7383EXPORT_SYMBOL(netdev_master_upper_dev_link);
7384
7385/**
7386 * netdev_upper_dev_unlink - Removes a link to upper device
7387 * @dev: device
7388 * @upper_dev: new upper device
7389 *
7390 * Removes a link to device which is upper to this one. The caller must hold
7391 * the RTNL lock.
7392 */
7393void netdev_upper_dev_unlink(struct net_device *dev,
7394                             struct net_device *upper_dev)
7395{
7396        struct netdev_notifier_changeupper_info changeupper_info = {
7397                .info = {
7398                        .dev = dev,
7399                },
7400                .upper_dev = upper_dev,
7401                .linking = false,
7402        };
7403
7404        ASSERT_RTNL();
7405
7406        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7407
7408        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7409                                      &changeupper_info.info);
7410
7411        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7412
7413        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7414                                      &changeupper_info.info);
7415
7416        __netdev_update_upper_level(dev, NULL);
7417        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7418
7419        __netdev_update_lower_level(upper_dev, NULL);
7420        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7421                                    NULL);
7422}
7423EXPORT_SYMBOL(netdev_upper_dev_unlink);
7424
7425static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7426                                      struct net_device *lower_dev,
7427                                      bool val)
7428{
7429        struct netdev_adjacent *adj;
7430
7431        adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7432        if (adj)
7433                adj->ignore = val;
7434
7435        adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7436        if (adj)
7437                adj->ignore = val;
7438}
7439
7440static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7441                                        struct net_device *lower_dev)
7442{
7443        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7444}
7445
7446static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7447                                       struct net_device *lower_dev)
7448{
7449        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7450}
7451
7452int netdev_adjacent_change_prepare(struct net_device *old_dev,
7453                                   struct net_device *new_dev,
7454                                   struct net_device *dev,
7455                                   struct netlink_ext_ack *extack)
7456{
7457        int err;
7458
7459        if (!new_dev)
7460                return 0;
7461
7462        if (old_dev && new_dev != old_dev)
7463                netdev_adjacent_dev_disable(dev, old_dev);
7464
7465        err = netdev_upper_dev_link(new_dev, dev, extack);
7466        if (err) {
7467                if (old_dev && new_dev != old_dev)
7468                        netdev_adjacent_dev_enable(dev, old_dev);
7469                return err;
7470        }
7471
7472        return 0;
7473}
7474EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7475
7476void netdev_adjacent_change_commit(struct net_device *old_dev,
7477                                   struct net_device *new_dev,
7478                                   struct net_device *dev)
7479{
7480        if (!new_dev || !old_dev)
7481                return;
7482
7483        if (new_dev == old_dev)
7484                return;
7485
7486        netdev_adjacent_dev_enable(dev, old_dev);
7487        netdev_upper_dev_unlink(old_dev, dev);
7488}
7489EXPORT_SYMBOL(netdev_adjacent_change_commit);
7490
7491void netdev_adjacent_change_abort(struct net_device *old_dev,
7492                                  struct net_device *new_dev,
7493                                  struct net_device *dev)
7494{
7495        if (!new_dev)
7496                return;
7497
7498        if (old_dev && new_dev != old_dev)
7499                netdev_adjacent_dev_enable(dev, old_dev);
7500
7501        netdev_upper_dev_unlink(new_dev, dev);
7502}
7503EXPORT_SYMBOL(netdev_adjacent_change_abort);
7504
7505/**
7506 * netdev_bonding_info_change - Dispatch event about slave change
7507 * @dev: device
7508 * @bonding_info: info to dispatch
7509 *
7510 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7511 * The caller must hold the RTNL lock.
7512 */
7513void netdev_bonding_info_change(struct net_device *dev,
7514                                struct netdev_bonding_info *bonding_info)
7515{
7516        struct netdev_notifier_bonding_info info = {
7517                .info.dev = dev,
7518        };
7519
7520        memcpy(&info.bonding_info, bonding_info,
7521               sizeof(struct netdev_bonding_info));
7522        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7523                                      &info.info);
7524}
7525EXPORT_SYMBOL(netdev_bonding_info_change);
7526
7527static void netdev_adjacent_add_links(struct net_device *dev)
7528{
7529        struct netdev_adjacent *iter;
7530
7531        struct net *net = dev_net(dev);
7532
7533        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7534                if (!net_eq(net, dev_net(iter->dev)))
7535                        continue;
7536                netdev_adjacent_sysfs_add(iter->dev, dev,
7537                                          &iter->dev->adj_list.lower);
7538                netdev_adjacent_sysfs_add(dev, iter->dev,
7539                                          &dev->adj_list.upper);
7540        }
7541
7542        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7543                if (!net_eq(net, dev_net(iter->dev)))
7544                        continue;
7545                netdev_adjacent_sysfs_add(iter->dev, dev,
7546                                          &iter->dev->adj_list.upper);
7547                netdev_adjacent_sysfs_add(dev, iter->dev,
7548                                          &dev->adj_list.lower);
7549        }
7550}
7551
7552static void netdev_adjacent_del_links(struct net_device *dev)
7553{
7554        struct netdev_adjacent *iter;
7555
7556        struct net *net = dev_net(dev);
7557
7558        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7559                if (!net_eq(net, dev_net(iter->dev)))
7560                        continue;
7561                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7562                                          &iter->dev->adj_list.lower);
7563                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7564                                          &dev->adj_list.upper);
7565        }
7566
7567        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7568                if (!net_eq(net, dev_net(iter->dev)))
7569                        continue;
7570                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7571                                          &iter->dev->adj_list.upper);
7572                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7573                                          &dev->adj_list.lower);
7574        }
7575}
7576
7577void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7578{
7579        struct netdev_adjacent *iter;
7580
7581        struct net *net = dev_net(dev);
7582
7583        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7584                if (!net_eq(net, dev_net(iter->dev)))
7585                        continue;
7586                netdev_adjacent_sysfs_del(iter->dev, oldname,
7587                                          &iter->dev->adj_list.lower);
7588                netdev_adjacent_sysfs_add(iter->dev, dev,
7589                                          &iter->dev->adj_list.lower);
7590        }
7591
7592        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7593                if (!net_eq(net, dev_net(iter->dev)))
7594                        continue;
7595                netdev_adjacent_sysfs_del(iter->dev, oldname,
7596                                          &iter->dev->adj_list.upper);
7597                netdev_adjacent_sysfs_add(iter->dev, dev,
7598                                          &iter->dev->adj_list.upper);
7599        }
7600}
7601
7602void *netdev_lower_dev_get_private(struct net_device *dev,
7603                                   struct net_device *lower_dev)
7604{
7605        struct netdev_adjacent *lower;
7606
7607        if (!lower_dev)
7608                return NULL;
7609        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
7610        if (!lower)
7611                return NULL;
7612
7613        return lower->private;
7614}
7615EXPORT_SYMBOL(netdev_lower_dev_get_private);
7616
7617
7618/**
7619 * netdev_lower_change - Dispatch event about lower device state change
7620 * @lower_dev: device
7621 * @lower_state_info: state to dispatch
7622 *
7623 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
7624 * The caller must hold the RTNL lock.
7625 */
7626void netdev_lower_state_changed(struct net_device *lower_dev,
7627                                void *lower_state_info)
7628{
7629        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
7630                .info.dev = lower_dev,
7631        };
7632
7633        ASSERT_RTNL();
7634        changelowerstate_info.lower_state_info = lower_state_info;
7635        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
7636                                      &changelowerstate_info.info);
7637}
7638EXPORT_SYMBOL(netdev_lower_state_changed);
7639
7640static void dev_change_rx_flags(struct net_device *dev, int flags)
7641{
7642        const struct net_device_ops *ops = dev->netdev_ops;
7643
7644        if (ops->ndo_change_rx_flags)
7645                ops->ndo_change_rx_flags(dev, flags);
7646}
7647
7648static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
7649{
7650        unsigned int old_flags = dev->flags;
7651        kuid_t uid;
7652        kgid_t gid;
7653
7654        ASSERT_RTNL();
7655
7656        dev->flags |= IFF_PROMISC;
7657        dev->promiscuity += inc;
7658        if (dev->promiscuity == 0) {
7659                /*
7660                 * Avoid overflow.
7661                 * If inc causes overflow, untouch promisc and return error.
7662                 */
7663                if (inc < 0)
7664                        dev->flags &= ~IFF_PROMISC;
7665                else {
7666                        dev->promiscuity -= inc;
7667                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
7668                                dev->name);
7669                        return -EOVERFLOW;
7670                }
7671        }
7672        if (dev->flags != old_flags) {
7673                pr_info("device %s %s promiscuous mode\n",
7674                        dev->name,
7675                        dev->flags & IFF_PROMISC ? "entered" : "left");
7676                if (audit_enabled) {
7677                        current_uid_gid(&uid, &gid);
7678                        audit_log(audit_context(), GFP_ATOMIC,
7679                                  AUDIT_ANOM_PROMISCUOUS,
7680                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
7681                                  dev->name, (dev->flags & IFF_PROMISC),
7682                                  (old_flags & IFF_PROMISC),
7683                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
7684                                  from_kuid(&init_user_ns, uid),
7685                                  from_kgid(&init_user_ns, gid),
7686                                  audit_get_sessionid(current));
7687                }
7688
7689                dev_change_rx_flags(dev, IFF_PROMISC);
7690        }
7691        if (notify)
7692                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
7693        return 0;
7694}
7695
7696/**
7697 *      dev_set_promiscuity     - update promiscuity count on a device
7698 *      @dev: device
7699 *      @inc: modifier
7700 *
7701 *      Add or remove promiscuity from a device. While the count in the device
7702 *      remains above zero the interface remains promiscuous. Once it hits zero
7703 *      the device reverts back to normal filtering operation. A negative inc
7704 *      value is used to drop promiscuity on the device.
7705 *      Return 0 if successful or a negative errno code on error.
7706 */
7707int dev_set_promiscuity(struct net_device *dev, int inc)
7708{
7709        unsigned int old_flags = dev->flags;
7710        int err;
7711
7712        err = __dev_set_promiscuity(dev, inc, true);
7713        if (err < 0)
7714                return err;
7715        if (dev->flags != old_flags)
7716                dev_set_rx_mode(dev);
7717        return err;
7718}
7719EXPORT_SYMBOL(dev_set_promiscuity);
7720
7721static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
7722{
7723        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
7724
7725        ASSERT_RTNL();
7726
7727        dev->flags |= IFF_ALLMULTI;
7728        dev->allmulti += inc;
7729        if (dev->allmulti == 0) {
7730                /*
7731                 * Avoid overflow.
7732                 * If inc causes overflow, untouch allmulti and return error.
7733                 */
7734                if (inc < 0)
7735                        dev->flags &= ~IFF_ALLMULTI;
7736                else {
7737                        dev->allmulti -= inc;
7738                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
7739                                dev->name);
7740                        return -EOVERFLOW;
7741                }
7742        }
7743        if (dev->flags ^ old_flags) {
7744                dev_change_rx_flags(dev, IFF_ALLMULTI);
7745                dev_set_rx_mode(dev);
7746                if (notify)
7747                        __dev_notify_flags(dev, old_flags,
7748                                           dev->gflags ^ old_gflags);
7749        }
7750        return 0;
7751}
7752
7753/**
7754 *      dev_set_allmulti        - update allmulti count on a device
7755 *      @dev: device
7756 *      @inc: modifier
7757 *
7758 *      Add or remove reception of all multicast frames to a device. While the
7759 *      count in the device remains above zero the interface remains listening
7760 *      to all interfaces. Once it hits zero the device reverts back to normal
7761 *      filtering operation. A negative @inc value is used to drop the counter
7762 *      when releasing a resource needing all multicasts.
7763 *      Return 0 if successful or a negative errno code on error.
7764 */
7765
7766int dev_set_allmulti(struct net_device *dev, int inc)
7767{
7768        return __dev_set_allmulti(dev, inc, true);
7769}
7770EXPORT_SYMBOL(dev_set_allmulti);
7771
7772/*
7773 *      Upload unicast and multicast address lists to device and
7774 *      configure RX filtering. When the device doesn't support unicast
7775 *      filtering it is put in promiscuous mode while unicast addresses
7776 *      are present.
7777 */
7778void __dev_set_rx_mode(struct net_device *dev)
7779{
7780        const struct net_device_ops *ops = dev->netdev_ops;
7781
7782        /* dev_open will call this function so the list will stay sane. */
7783        if (!(dev->flags&IFF_UP))
7784                return;
7785
7786        if (!netif_device_present(dev))
7787                return;
7788
7789        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
7790                /* Unicast addresses changes may only happen under the rtnl,
7791                 * therefore calling __dev_set_promiscuity here is safe.
7792                 */
7793                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
7794                        __dev_set_promiscuity(dev, 1, false);
7795                        dev->uc_promisc = true;
7796                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
7797                        __dev_set_promiscuity(dev, -1, false);
7798                        dev->uc_promisc = false;
7799                }
7800        }
7801
7802        if (ops->ndo_set_rx_mode)
7803                ops->ndo_set_rx_mode(dev);
7804}
7805
7806void dev_set_rx_mode(struct net_device *dev)
7807{
7808        netif_addr_lock_bh(dev);
7809        __dev_set_rx_mode(dev);
7810        netif_addr_unlock_bh(dev);
7811}
7812
7813/**
7814 *      dev_get_flags - get flags reported to userspace
7815 *      @dev: device
7816 *
7817 *      Get the combination of flag bits exported through APIs to userspace.
7818 */
7819unsigned int dev_get_flags(const struct net_device *dev)
7820{
7821        unsigned int flags;
7822
7823        flags = (dev->flags & ~(IFF_PROMISC |
7824                                IFF_ALLMULTI |
7825                                IFF_RUNNING |
7826                                IFF_LOWER_UP |
7827                                IFF_DORMANT)) |
7828                (dev->gflags & (IFF_PROMISC |
7829                                IFF_ALLMULTI));
7830
7831        if (netif_running(dev)) {
7832                if (netif_oper_up(dev))
7833                        flags |= IFF_RUNNING;
7834                if (netif_carrier_ok(dev))
7835                        flags |= IFF_LOWER_UP;
7836                if (netif_dormant(dev))
7837                        flags |= IFF_DORMANT;
7838        }
7839
7840        return flags;
7841}
7842EXPORT_SYMBOL(dev_get_flags);
7843
7844int __dev_change_flags(struct net_device *dev, unsigned int flags,
7845                       struct netlink_ext_ack *extack)
7846{
7847        unsigned int old_flags = dev->flags;
7848        int ret;
7849
7850        ASSERT_RTNL();
7851
7852        /*
7853         *      Set the flags on our device.
7854         */
7855
7856        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
7857                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
7858                               IFF_AUTOMEDIA)) |
7859                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
7860                                    IFF_ALLMULTI));
7861
7862        /*
7863         *      Load in the correct multicast list now the flags have changed.
7864         */
7865
7866        if ((old_flags ^ flags) & IFF_MULTICAST)
7867                dev_change_rx_flags(dev, IFF_MULTICAST);
7868
7869        dev_set_rx_mode(dev);
7870
7871        /*
7872         *      Have we downed the interface. We handle IFF_UP ourselves
7873         *      according to user attempts to set it, rather than blindly
7874         *      setting it.
7875         */
7876
7877        ret = 0;
7878        if ((old_flags ^ flags) & IFF_UP) {
7879                if (old_flags & IFF_UP)
7880                        __dev_close(dev);
7881                else
7882                        ret = __dev_open(dev, extack);
7883        }
7884
7885        if ((flags ^ dev->gflags) & IFF_PROMISC) {
7886                int inc = (flags & IFF_PROMISC) ? 1 : -1;
7887                unsigned int old_flags = dev->flags;
7888
7889                dev->gflags ^= IFF_PROMISC;
7890
7891                if (__dev_set_promiscuity(dev, inc, false) >= 0)
7892                        if (dev->flags != old_flags)
7893                                dev_set_rx_mode(dev);
7894        }
7895
7896        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
7897         * is important. Some (broken) drivers set IFF_PROMISC, when
7898         * IFF_ALLMULTI is requested not asking us and not reporting.
7899         */
7900        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
7901                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
7902
7903                dev->gflags ^= IFF_ALLMULTI;
7904                __dev_set_allmulti(dev, inc, false);
7905        }
7906
7907        return ret;
7908}
7909
7910void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
7911                        unsigned int gchanges)
7912{
7913        unsigned int changes = dev->flags ^ old_flags;
7914
7915        if (gchanges)
7916                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
7917
7918        if (changes & IFF_UP) {
7919                if (dev->flags & IFF_UP)
7920                        call_netdevice_notifiers(NETDEV_UP, dev);
7921                else
7922                        call_netdevice_notifiers(NETDEV_DOWN, dev);
7923        }
7924
7925        if (dev->flags & IFF_UP &&
7926            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7927                struct netdev_notifier_change_info change_info = {
7928                        .info = {
7929                                .dev = dev,
7930                        },
7931                        .flags_changed = changes,
7932                };
7933
7934                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7935        }
7936}
7937
7938/**
7939 *      dev_change_flags - change device settings
7940 *      @dev: device
7941 *      @flags: device state flags
7942 *      @extack: netlink extended ack
7943 *
7944 *      Change settings on device based state flags. The flags are
7945 *      in the userspace exported format.
7946 */
7947int dev_change_flags(struct net_device *dev, unsigned int flags,
7948                     struct netlink_ext_ack *extack)
7949{
7950        int ret;
7951        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7952
7953        ret = __dev_change_flags(dev, flags, extack);
7954        if (ret < 0)
7955                return ret;
7956
7957        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7958        __dev_notify_flags(dev, old_flags, changes);
7959        return ret;
7960}
7961EXPORT_SYMBOL(dev_change_flags);
7962
7963int __dev_set_mtu(struct net_device *dev, int new_mtu)
7964{
7965        const struct net_device_ops *ops = dev->netdev_ops;
7966
7967        if (ops->ndo_change_mtu)
7968                return ops->ndo_change_mtu(dev, new_mtu);
7969
7970        dev->mtu = new_mtu;
7971        return 0;
7972}
7973EXPORT_SYMBOL(__dev_set_mtu);
7974
7975/**
7976 *      dev_set_mtu_ext - Change maximum transfer unit
7977 *      @dev: device
7978 *      @new_mtu: new transfer unit
7979 *      @extack: netlink extended ack
7980 *
7981 *      Change the maximum transfer size of the network device.
7982 */
7983int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
7984                    struct netlink_ext_ack *extack)
7985{
7986        int err, orig_mtu;
7987
7988        if (new_mtu == dev->mtu)
7989                return 0;
7990
7991        /* MTU must be positive, and in range */
7992        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7993                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
7994                return -EINVAL;
7995        }
7996
7997        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7998                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
7999                return -EINVAL;
8000        }

8001
8002        if (!netif_device_present(dev))
8003                return -ENODEV;
8004
8005        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8006        err = notifier_to_errno(err);
8007        if (err)
8008                return err;
8009
8010        orig_mtu = dev->mtu;
8011        err = __dev_set_mtu(dev, new_mtu);
8012
8013        if (!err) {
8014                err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8015                                                   orig_mtu);
8016                err = notifier_to_errno(err);
8017                if (err) {
8018                        /* setting mtu back and notifying everyone again,
8019                         * so that they have a chance to revert changes.
8020                         */
8021                        __dev_set_mtu(dev, orig_mtu);
8022                        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8023                                                     new_mtu);
8024                }
8025        }
8026        return err;
8027}
8028
8029int dev_set_mtu(struct net_device *dev, int new_mtu)
8030{
8031        struct netlink_ext_ack extack;
8032        int err;
8033
8034        memset(&extack, 0, sizeof(extack));
8035        err = dev_set_mtu_ext(dev, new_mtu, &extack);
8036        if (err && extack._msg)
8037                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8038        return err;
8039}
8040EXPORT_SYMBOL(dev_set_mtu);
8041
8042/**
8043 *      dev_change_tx_queue_len - Change TX queue length of a netdevice
8044 *      @dev: device
8045 *      @new_len: new tx queue length
8046 */
8047int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8048{
8049        unsigned int orig_len = dev->tx_queue_len;
8050        int res;
8051
8052        if (new_len != (unsigned int)new_len)
8053                return -ERANGE;
8054
8055        if (new_len != orig_len) {
8056                dev->tx_queue_len = new_len;
8057                res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8058                res = notifier_to_errno(res);
8059                if (res)
8060                        goto err_rollback;
8061                res = dev_qdisc_change_tx_queue_len(dev);
8062                if (res)
8063                        goto err_rollback;
8064        }
8065
8066        return 0;
8067
8068err_rollback:
8069        netdev_err(dev, "refused to change device tx_queue_len\n");
8070        dev->tx_queue_len = orig_len;
8071        return res;
8072}
8073
8074/**
8075 *      dev_set_group - Change group this device belongs to
8076 *      @dev: device
8077 *      @new_group: group this device should belong to
8078 */
8079void dev_set_group(struct net_device *dev, int new_group)
8080{
8081        dev->group = new_group;
8082}
8083EXPORT_SYMBOL(dev_set_group);
8084
8085/**
8086 *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8087 *      @dev: device
8088 *      @addr: new address
8089 *      @extack: netlink extended ack
8090 */
8091int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8092                              struct netlink_ext_ack *extack)
8093{
8094        struct netdev_notifier_pre_changeaddr_info info = {
8095                .info.dev = dev,
8096                .info.extack = extack,
8097                .dev_addr = addr,
8098        };
8099        int rc;
8100
8101        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8102        return notifier_to_errno(rc);
8103}
8104EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8105
8106/**
8107 *      dev_set_mac_address - Change Media Access Control Address
8108 *      @dev: device
8109 *      @sa: new address
8110 *      @extack: netlink extended ack
8111 *
8112 *      Change the hardware (MAC) address of the device
8113 */
8114int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8115                        struct netlink_ext_ack *extack)
8116{
8117        const struct net_device_ops *ops = dev->netdev_ops;
8118        int err;
8119
8120        if (!ops->ndo_set_mac_address)
8121                return -EOPNOTSUPP;
8122        if (sa->sa_family != dev->type)
8123                return -EINVAL;
8124        if (!netif_device_present(dev))
8125                return -ENODEV;
8126        err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8127        if (err)
8128                return err;
8129        err = ops->ndo_set_mac_address(dev, sa);
8130        if (err)
8131                return err;
8132        dev->addr_assign_type = NET_ADDR_SET;
8133        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8134        add_device_randomness(dev->dev_addr, dev->addr_len);
8135        return 0;
8136}
8137EXPORT_SYMBOL(dev_set_mac_address);
8138
8139/**
8140 *      dev_change_carrier - Change device carrier
8141 *      @dev: device
8142 *      @new_carrier: new value
8143 *
8144 *      Change device carrier
8145 */
8146int dev_change_carrier(struct net_device *dev, bool new_carrier)
8147{
8148        const struct net_device_ops *ops = dev->netdev_ops;
8149
8150        if (!ops->ndo_change_carrier)
8151                return -EOPNOTSUPP;
8152        if (!netif_device_present(dev))
8153                return -ENODEV;
8154        return ops->ndo_change_carrier(dev, new_carrier);
8155}
8156EXPORT_SYMBOL(dev_change_carrier);
8157
8158/**
8159 *      dev_get_phys_port_id - Get device physical port ID
8160 *      @dev: device
8161 *      @ppid: port ID
8162 *
8163 *      Get device physical port ID
8164 */
8165int dev_get_phys_port_id(struct net_device *dev,
8166                         struct netdev_phys_item_id *ppid)
8167{
8168        const struct net_device_ops *ops = dev->netdev_ops;
8169
8170        if (!ops->ndo_get_phys_port_id)
8171                return -EOPNOTSUPP;
8172        return ops->ndo_get_phys_port_id(dev, ppid);
8173}
8174EXPORT_SYMBOL(dev_get_phys_port_id);
8175
8176/**
8177 *      dev_get_phys_port_name - Get device physical port name
8178 *      @dev: device
8179 *      @name: port name
8180 *      @len: limit of bytes to copy to name
8181 *
8182 *      Get device physical port name
8183 */
8184int dev_get_phys_port_name(struct net_device *dev,
8185                           char *name, size_t len)
8186{
8187        const struct net_device_ops *ops = dev->netdev_ops;
8188        int err;
8189
8190        if (ops->ndo_get_phys_port_name) {
8191                err = ops->ndo_get_phys_port_name(dev, name, len);
8192                if (err != -EOPNOTSUPP)
8193                        return err;
8194        }
8195        return devlink_compat_phys_port_name_get(dev, name, len);
8196}
8197EXPORT_SYMBOL(dev_get_phys_port_name);
8198
8199/**
8200 *      dev_get_port_parent_id - Get the device's port parent identifier
8201 *      @dev: network device
8202 *      @ppid: pointer to a storage for the port's parent identifier
8203 *      @recurse: allow/disallow recursion to lower devices
8204 *
8205 *      Get the devices's port parent identifier
8206 */
8207int dev_get_port_parent_id(struct net_device *dev,
8208                           struct netdev_phys_item_id *ppid,
8209                           bool recurse)
8210{
8211        const struct net_device_ops *ops = dev->netdev_ops;
8212        struct netdev_phys_item_id first = { };
8213        struct net_device *lower_dev;
8214        struct list_head *iter;
8215        int err;
8216
8217        if (ops->ndo_get_port_parent_id) {
8218                err = ops->ndo_get_port_parent_id(dev, ppid);
8219                if (err != -EOPNOTSUPP)
8220                        return err;
8221        }
8222
8223        err = devlink_compat_switch_id_get(dev, ppid);
8224        if (!err || err != -EOPNOTSUPP)
8225                return err;
8226
8227        if (!recurse)
8228                return -EOPNOTSUPP;
8229
8230        netdev_for_each_lower_dev(dev, lower_dev, iter) {
8231                err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8232                if (err)
8233                        break;
8234                if (!first.id_len)
8235                        first = *ppid;
8236                else if (memcmp(&first, ppid, sizeof(*ppid)))
8237                        return -ENODATA;
8238        }
8239
8240        return err;
8241}
8242EXPORT_SYMBOL(dev_get_port_parent_id);
8243
8244/**
8245 *      netdev_port_same_parent_id - Indicate if two network devices have
8246 *      the same port parent identifier
8247 *      @a: first network device
8248 *      @b: second network device
8249 */
8250bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8251{
8252        struct netdev_phys_item_id a_id = { };
8253        struct netdev_phys_item_id b_id = { };
8254
8255        if (dev_get_port_parent_id(a, &a_id, true) ||
8256            dev_get_port_parent_id(b, &b_id, true))
8257                return false;
8258
8259        return netdev_phys_item_id_same(&a_id, &b_id);
8260}
8261EXPORT_SYMBOL(netdev_port_same_parent_id);
8262
8263/**
8264 *      dev_change_proto_down - update protocol port state information
8265 *      @dev: device
8266 *      @proto_down: new value
8267 *
8268 *      This info can be used by switch drivers to set the phys state of the
8269 *      port.
8270 */
8271int dev_change_proto_down(struct net_device *dev, bool proto_down)
8272{
8273        const struct net_device_ops *ops = dev->netdev_ops;
8274
8275        if (!ops->ndo_change_proto_down)
8276                return -EOPNOTSUPP;
8277        if (!netif_device_present(dev))
8278                return -ENODEV;
8279        return ops->ndo_change_proto_down(dev, proto_down);
8280}
8281EXPORT_SYMBOL(dev_change_proto_down);
8282
8283/**
8284 *      dev_change_proto_down_generic - generic implementation for
8285 *      ndo_change_proto_down that sets carrier according to
8286 *      proto_down.
8287 *
8288 *      @dev: device
8289 *      @proto_down: new value
8290 */
8291int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8292{
8293        if (proto_down)
8294                netif_carrier_off(dev);
8295        else
8296                netif_carrier_on(dev);
8297        dev->proto_down = proto_down;
8298        return 0;
8299}
8300EXPORT_SYMBOL(dev_change_proto_down_generic);
8301
8302u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8303                    enum bpf_netdev_command cmd)
8304{
8305        struct netdev_bpf xdp;
8306
8307        if (!bpf_op)
8308                return 0;
8309
8310        memset(&xdp, 0, sizeof(xdp));
8311        xdp.command = cmd;
8312
8313        /* Query must always succeed. */
8314        WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
8315
8316        return xdp.prog_id;
8317}
8318
8319static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8320                           struct netlink_ext_ack *extack, u32 flags,
8321                           struct bpf_prog *prog)
8322{
8323        struct netdev_bpf xdp;
8324
8325        memset(&xdp, 0, sizeof(xdp));
8326        if (flags & XDP_FLAGS_HW_MODE)
8327                xdp.command = XDP_SETUP_PROG_HW;
8328        else
8329                xdp.command = XDP_SETUP_PROG;
8330        xdp.extack = extack;
8331        xdp.flags = flags;
8332        xdp.prog = prog;
8333
8334        return bpf_op(dev, &xdp);
8335}
8336
8337static void dev_xdp_uninstall(struct net_device *dev)
8338{
8339        struct netdev_bpf xdp;
8340        bpf_op_t ndo_bpf;
8341
8342        /* Remove generic XDP */
8343        WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
8344
8345        /* Remove from the driver */
8346        ndo_bpf = dev->netdev_ops->ndo_bpf;
8347        if (!ndo_bpf)
8348                return;
8349
8350        memset(&xdp, 0, sizeof(xdp));
8351        xdp.command = XDP_QUERY_PROG;
8352        WARN_ON(ndo_bpf(dev, &xdp));
8353        if (xdp.prog_id)
8354                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8355                                        NULL));
8356
8357        /* Remove HW offload */
8358        memset(&xdp, 0, sizeof(xdp));
8359        xdp.command = XDP_QUERY_PROG_HW;
8360        if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8361                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8362                                        NULL));
8363}
8364
8365/**
8366 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
8367 *      @dev: device
8368 *      @extack: netlink extended ack
8369 *      @fd: new program fd or negative value to clear
8370 *      @flags: xdp-related flags
8371 *
8372 *      Set or clear a bpf program for a device
8373 */
8374int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8375                      int fd, u32 flags)
8376{
8377        const struct net_device_ops *ops = dev->netdev_ops;
8378        enum bpf_netdev_command query;
8379        struct bpf_prog *prog = NULL;
8380        bpf_op_t bpf_op, bpf_chk;
8381        bool offload;
8382        int err;
8383
8384        ASSERT_RTNL();
8385
8386        offload = flags & XDP_FLAGS_HW_MODE;
8387        query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8388
8389        bpf_op = bpf_chk = ops->ndo_bpf;
8390        if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
8391                NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
8392                return -EOPNOTSUPP;
8393        }
8394        if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8395                bpf_op = generic_xdp_install;
8396        if (bpf_op == bpf_chk)
8397                bpf_chk = generic_xdp_install;
8398
8399        if (fd >= 0) {
8400                u32 prog_id;
8401
8402                if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
8403                        NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
8404                        return -EEXIST;
8405                }
8406
8407                prog_id = __dev_xdp_query(dev, bpf_op, query);
8408                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
8409                        NL_SET_ERR_MSG(extack, "XDP program already attached");
8410                        return -EBUSY;
8411                }
8412
8413                prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8414                                             bpf_op == ops->ndo_bpf);
8415                if (IS_ERR(prog))
8416                        return PTR_ERR(prog);
8417
8418                if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
8419                        NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8420                        bpf_prog_put(prog);
8421                        return -EINVAL;
8422                }
8423
8424                /* prog->aux->id may be 0 for orphaned device-bound progs */
8425                if (prog->aux->id && prog->aux->id == prog_id) {
8426                        bpf_prog_put(prog);
8427                        return 0;
8428                }
8429        } else {
8430                if (!__dev_xdp_query(dev, bpf_op, query))
8431                        return 0;
8432        }
8433
8434        err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8435        if (err < 0 && prog)
8436                bpf_prog_put(prog);
8437
8438        return err;
8439}
8440
8441/**
8442 *      dev_new_index   -       allocate an ifindex
8443 *      @net: the applicable net namespace
8444 *
8445 *      Returns a suitable unique value for a new device interface
8446 *      number.  The caller must hold the rtnl semaphore or the
8447 *      dev_base_lock to be sure it remains unique.
8448 */
8449static int dev_new_index(struct net *net)
8450{
8451        int ifindex = net->ifindex;
8452
8453        for (;;) {
8454                if (++ifindex <= 0)
8455                        ifindex = 1;
8456                if (!__dev_get_by_index(net, ifindex))
8457                        return net->ifindex = ifindex;
8458        }
8459}
8460
8461/* Delayed registration/unregisteration */
8462static LIST_HEAD(net_todo_list);
8463DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
8464
8465static void net_set_todo(struct net_device *dev)
8466{
8467        list_add_tail(&dev->todo_list, &net_todo_list);
8468        dev_net(dev)->dev_unreg_count++;
8469}
8470
8471static void rollback_registered_many(struct list_head *head)
8472{
8473        struct net_device *dev, *tmp;
8474        LIST_HEAD(close_head);
8475
8476        BUG_ON(dev_boot_phase);
8477        ASSERT_RTNL();
8478
8479        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8480                /* Some devices call without registering
8481                 * for initialization unwind. Remove those
8482                 * devices and proceed with the remaining.
8483                 */
8484                if (dev->reg_state == NETREG_UNINITIALIZED) {
8485                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8486                                 dev->name, dev);
8487
8488                        WARN_ON(1);
8489                        list_del(&dev->unreg_list);
8490                        continue;
8491                }
8492                dev->dismantle = true;
8493                BUG_ON(dev->reg_state != NETREG_REGISTERED);
8494        }
8495
8496        /* If device is running, close it first. */
8497        list_for_each_entry(dev, head, unreg_list)
8498                list_add_tail(&dev->close_list, &close_head);
8499        dev_close_many(&close_head, true);
8500
8501        list_for_each_entry(dev, head, unreg_list) {
8502                /* And unlink it from device chain. */
8503                unlist_netdevice(dev);
8504
8505                dev->reg_state = NETREG_UNREGISTERING;
8506        }
8507        flush_all_backlogs();
8508
8509        synchronize_net();
8510
8511        list_for_each_entry(dev, head, unreg_list) {
8512                struct sk_buff *skb = NULL;
8513
8514                /* Shutdown queueing discipline. */
8515                dev_shutdown(dev);
8516
8517                dev_xdp_uninstall(dev);
8518
8519                /* Notify protocols, that we are about to destroy
8520                 * this device. They should clean all the things.
8521                 */
8522                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8523
8524                if (!dev->rtnl_link_ops ||
8525                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8526                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8527                                                     GFP_KERNEL, NULL, 0);
8528
8529                /*
8530                 *      Flush the unicast and multicast chains
8531                 */
8532                dev_uc_flush(dev);
8533                dev_mc_flush(dev);
8534
8535                if (dev->netdev_ops->ndo_uninit)
8536                        dev->netdev_ops->ndo_uninit(dev);
8537
8538                if (skb)
8539                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8540
8541                /* Notifier chain MUST detach us all upper devices. */
8542                WARN_ON(netdev_has_any_upper_dev(dev));
8543                WARN_ON(netdev_has_any_lower_dev(dev));
8544
8545                /* Remove entries from kobject tree */
8546                netdev_unregister_kobject(dev);
8547#ifdef CONFIG_XPS
8548                /* Remove XPS queueing entries */
8549                netif_reset_xps_queues_gt(dev, 0);
8550#endif
8551        }
8552
8553        synchronize_net();
8554
8555        list_for_each_entry(dev, head, unreg_list)
8556                dev_put(dev);
8557}
8558
8559static void rollback_registered(struct net_device *dev)
8560{
8561        LIST_HEAD(single);
8562
8563        list_add(&dev->unreg_list, &single);
8564        rollback_registered_many(&single);
8565        list_del(&single);
8566}
8567
8568static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
8569        struct net_device *upper, netdev_features_t features)
8570{
8571        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8572        netdev_features_t feature;
8573        int feature_bit;
8574
8575        for_each_netdev_feature(upper_disables, feature_bit) {
8576                feature = __NETIF_F_BIT(feature_bit);
8577                if (!(upper->wanted_features & feature)
8578                    && (features & feature)) {
8579                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
8580                                   &feature, upper->name);
8581                        features &= ~feature;
8582                }
8583        }
8584
8585        return features;
8586}
8587
8588static void netdev_sync_lower_features(struct net_device *upper,
8589        struct net_device *lower, netdev_features_t features)
8590{
8591        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8592        netdev_features_t feature;
8593        int feature_bit;
8594
8595        for_each_netdev_feature(upper_disables, feature_bit) {
8596                feature = __NETIF_F_BIT(feature_bit);
8597                if (!(features & feature) && (lower->features & feature)) {
8598                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
8599                                   &feature, lower->name);
8600                        lower->wanted_features &= ~feature;
8601                        netdev_update_features(lower);
8602
8603                        if (unlikely(lower->features & feature))
8604                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
8605                                            &feature, lower->name);
8606                }
8607        }
8608}
8609
8610static netdev_features_t netdev_fix_features(struct net_device *dev,
8611        netdev_features_t features)
8612{
8613        /* Fix illegal checksum combinations */
8614        if ((features & NETIF_F_HW_CSUM) &&
8615            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
8616                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
8617                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
8618        }
8619
8620        /* TSO requires that SG is present as well. */
8621        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
8622                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
8623                features &= ~NETIF_F_ALL_TSO;
8624        }
8625
8626        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
8627                                        !(features & NETIF_F_IP_CSUM)) {
8628                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
8629                features &= ~NETIF_F_TSO;
8630                features &= ~NETIF_F_TSO_ECN;
8631        }
8632
8633        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
8634                                         !(features & NETIF_F_IPV6_CSUM)) {
8635                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
8636                features &= ~NETIF_F_TSO6;
8637        }
8638
8639        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
8640        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
8641                features &= ~NETIF_F_TSO_MANGLEID;
8642
8643        /* TSO ECN requires that TSO is present as well. */
8644        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
8645                features &= ~NETIF_F_TSO_ECN;
8646
8647        /* Software GSO depends on SG. */
8648        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
8649                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
8650                features &= ~NETIF_F_GSO;
8651        }
8652
8653        /* GSO partial features require GSO partial be set */
8654        if ((features & dev->gso_partial_features) &&
8655            !(features & NETIF_F_GSO_PARTIAL)) {
8656                netdev_dbg(dev,
8657                           "Dropping partially supported GSO features since no GSO partial.\n");
8658                features &= ~dev->gso_partial_features;
8659        }
8660
8661        if (!(features & NETIF_F_RXCSUM)) {
8662                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
8663                 * successfully merged by hardware must also have the
8664                 * checksum verified by hardware.  If the user does not
8665                 * want to enable RXCSUM, logically, we should disable GRO_HW.
8666                 */
8667                if (features & NETIF_F_GRO_HW) {
8668                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
8669                        features &= ~NETIF_F_GRO_HW;
8670                }
8671        }
8672
8673        /* LRO/HW-GRO features cannot be combined with RX-FCS */
8674        if (features & NETIF_F_RXFCS) {
8675                if (features & NETIF_F_LRO) {
8676                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
8677                        features &= ~NETIF_F_LRO;
8678                }
8679
8680                if (features & NETIF_F_GRO_HW) {
8681                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
8682                        features &= ~NETIF_F_GRO_HW;
8683                }
8684        }
8685
8686        return features;
8687}
8688
8689int __netdev_update_features(struct net_device *dev)
8690{
8691        struct net_device *upper, *lower;
8692        netdev_features_t features;
8693        struct list_head *iter;
8694        int err = -1;
8695
8696        ASSERT_RTNL();
8697
8698        features = netdev_get_wanted_features(dev);
8699
8700        if (dev->netdev_ops->ndo_fix_features)
8701                features = dev->netdev_ops->ndo_fix_features(dev, features);
8702
8703        /* driver might be less strict about feature dependencies */
8704        features = netdev_fix_features(dev, features);
8705
8706        /* some features can't be enabled if they're off an an upper device */
8707        netdev_for_each_upper_dev_rcu(dev, upper, iter)
8708                features = netdev_sync_upper_features(dev, upper, features);
8709
8710        if (dev->features == features)
8711                goto sync_lower;
8712
8713        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
8714                &dev->features, &features);
8715
8716        if (dev->netdev_ops->ndo_set_features)
8717                err = dev->netdev_ops->ndo_set_features(dev, features);
8718        else
8719                err = 0;
8720
8721        if (unlikely(err < 0)) {
8722                netdev_err(dev,
8723                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
8724                        err, &features, &dev->features);
8725                /* return non-0 since some features might have changed and
8726                 * it's better to fire a spurious notification than miss it
8727                 */
8728                return -1;
8729        }
8730
8731sync_lower:
8732        /* some features must be disabled on lower devices when disabled
8733         * on an upper device (think: bonding master or bridge)
8734         */
8735        netdev_for_each_lower_dev(dev, lower, iter)
8736                netdev_sync_lower_features(dev, lower, features);
8737
8738        if (!err) {
8739                netdev_features_t diff = features ^ dev->features;
8740
8741                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
8742                        /* udp_tunnel_{get,drop}_rx_info both need
8743                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
8744                         * device, or they won't do anything.
8745                         * Thus we need to update dev->features
8746                         * *before* calling udp_tunnel_get_rx_info,
8747                         * but *after* calling udp_tunnel_drop_rx_info.
8748                         */
8749                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
8750                                dev->features = features;
8751                                udp_tunnel_get_rx_info(dev);
8752                        } else {
8753                                udp_tunnel_drop_rx_info(dev);
8754                        }
8755                }
8756
8757                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
8758                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
8759                                dev->features = features;
8760                                err |= vlan_get_rx_ctag_filter_info(dev);
8761                        } else {
8762                                vlan_drop_rx_ctag_filter_info(dev);
8763                        }
8764                }
8765
8766                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
8767                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
8768                                dev->features = features;
8769                                err |= vlan_get_rx_stag_filter_info(dev);
8770                        } else {
8771                                vlan_drop_rx_stag_filter_info(dev);
8772                        }
8773                }
8774
8775                dev->features = features;
8776        }
8777
8778        return err < 0 ? 0 : 1;
8779}
8780
8781/**
8782 *      netdev_update_features - recalculate device features
8783 *      @dev: the device to check
8784 *
8785 *      Recalculate dev->features set and send notifications if it
8786 *      has changed. Should be called after driver or hardware dependent
8787 *      conditions might have changed that influence the features.
8788 */
8789void netdev_update_features(struct net_device *dev)
8790{
8791        if (__netdev_update_features(dev))
8792                netdev_features_change(dev);
8793}
8794EXPORT_SYMBOL(netdev_update_features);
8795
8796/**
8797 *      netdev_change_features - recalculate device features
8798 *      @dev: the device to check
8799 *
8800 *      Recalculate dev->features set and send notifications even
8801 *      if they have not changed. Should be called instead of
8802 *      netdev_update_features() if also dev->vlan_features might
8803 *      have changed to allow the changes to be propagated to stacked
8804 *      VLAN devices.
8805 */
8806void netdev_change_features(struct net_device *dev)
8807{
8808        __netdev_update_features(dev);
8809        netdev_features_change(dev);
8810}
8811EXPORT_SYMBOL(netdev_change_features);
8812
8813/**
8814 *      netif_stacked_transfer_operstate -      transfer operstate
8815 *      @rootdev: the root or lower level device to transfer state from
8816 *      @dev: the device to transfer operstate to
8817 *
8818 *      Transfer operational state from root to device. This is normally
8819 *      called when a stacking relationship exists between the root
8820 *      device and the device(a leaf device).
8821 */
8822void netif_stacked_transfer_operstate(const struct net_device *rootdev,
8823                                        struct net_device *dev)
8824{
8825        if (rootdev->operstate == IF_OPER_DORMANT)
8826                netif_dormant_on(dev);
8827        else
8828                netif_dormant_off(dev);
8829
8830        if (netif_carrier_ok(rootdev))
8831                netif_carrier_on(dev);
8832        else
8833                netif_carrier_off(dev);
8834}
8835EXPORT_SYMBOL(netif_stacked_transfer_operstate);
8836
8837static int netif_alloc_rx_queues(struct net_device *dev)
8838{
8839        unsigned int i, count = dev->num_rx_queues;
8840        struct netdev_rx_queue *rx;
8841        size_t sz = count * sizeof(*rx);
8842        int err = 0;
8843
8844        BUG_ON(count < 1);
8845
8846        rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8847        if (!rx)
8848                return -ENOMEM;
8849
8850        dev->_rx = rx;
8851
8852        for (i = 0; i < count; i++) {
8853                rx[i].dev = dev;
8854
8855                /* XDP RX-queue setup */
8856                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
8857                if (err < 0)
8858                        goto err_rxq_info;
8859        }
8860        return 0;
8861
8862err_rxq_info:
8863        /* Rollback successful reg's and free other resources */
8864        while (i--)
8865                xdp_rxq_info_unreg(&rx[i].xdp_rxq);
8866        kvfree(dev->_rx);
8867        dev->_rx = NULL;
8868        return err;
8869}
8870
8871static void netif_free_rx_queues(struct net_device *dev)
8872{
8873        unsigned int i, count = dev->num_rx_queues;
8874
8875        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
8876        if (!dev->_rx)
8877                return;
8878
8879        for (i = 0; i < count; i++)
8880                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
8881
8882        kvfree(dev->_rx);
8883}
8884
8885static void netdev_init_one_queue(struct net_device *dev,
8886                                  struct netdev_queue *queue, void *_unused)
8887{
8888        /* Initialize queue lock */
8889        spin_lock_init(&queue->_xmit_lock);
8890        lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
8891        queue->xmit_lock_owner = -1;
8892        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
8893        queue->dev = dev;
8894#ifdef CONFIG_BQL
8895        dql_init(&queue->dql, HZ);
8896#endif
8897}
8898
8899static void netif_free_tx_queues(struct net_device *dev)
8900{
8901        kvfree(dev->_tx);
8902}
8903
8904static int netif_alloc_netdev_queues(struct net_device *dev)
8905{
8906        unsigned int count = dev->num_tx_queues;
8907        struct netdev_queue *tx;
8908        size_t sz = count * sizeof(*tx);
8909
8910        if (count < 1 || count > 0xffff)
8911                return -EINVAL;
8912
8913        tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8914        if (!tx)
8915                return -ENOMEM;
8916
8917        dev->_tx = tx;
8918
8919        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
8920        spin_lock_init(&dev->tx_global_lock);
8921
8922        return 0;
8923}
8924
8925void netif_tx_stop_all_queues(struct net_device *dev)
8926{
8927        unsigned int i;
8928
8929        for (i = 0; i < dev->num_tx_queues; i++) {
8930                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
8931
8932                netif_tx_stop_queue(txq);
8933        }
8934}
8935EXPORT_SYMBOL(netif_tx_stop_all_queues);
8936
8937static void netdev_register_lockdep_key(struct net_device *dev)
8938{
8939        lockdep_register_key(&dev->qdisc_tx_busylock_key);
8940        lockdep_register_key(&dev->qdisc_running_key);
8941        lockdep_register_key(&dev->qdisc_xmit_lock_key);
8942        lockdep_register_key(&dev->addr_list_lock_key);
8943}
8944
8945static void netdev_unregister_lockdep_key(struct net_device *dev)
8946{
8947        lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
8948        lockdep_unregister_key(&dev->qdisc_running_key);
8949        lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
8950        lockdep_unregister_key(&dev->addr_list_lock_key);
8951}
8952
8953void netdev_update_lockdep_key(struct net_device *dev)
8954{
8955        struct netdev_queue *queue;
8956        int i;
8957
8958        lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
8959        lockdep_unregister_key(&dev->addr_list_lock_key);
8960
8961        lockdep_register_key(&dev->qdisc_xmit_lock_key);
8962        lockdep_register_key(&dev->addr_list_lock_key);
8963
8964        lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
8965        for (i = 0; i < dev->num_tx_queues; i++) {
8966                queue = netdev_get_tx_queue(dev, i);
8967
8968                lockdep_set_class(&queue->_xmit_lock,
8969                                  &dev->qdisc_xmit_lock_key);
8970        }
8971}
8972EXPORT_SYMBOL(netdev_update_lockdep_key);
8973
8974/**
8975 *      register_netdevice      - register a network device
8976 *      @dev: device to register
8977 *
8978 *      Take a completed network device structure and add it to the kernel
8979 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8980 *      chain. 0 is returned on success. A negative errno code is returned
8981 *      on a failure to set up the device, or if the name is a duplicate.
8982 *
8983 *      Callers must hold the rtnl semaphore. You may want
8984 *      register_netdev() instead of this.
8985 *
8986 *      BUGS:
8987 *      The locking appears insufficient to guarantee two parallel registers
8988 *      will not get the same name.
8989 */
8990
8991int register_netdevice(struct net_device *dev)
8992{
8993        int ret;
8994        struct net *net = dev_net(dev);
8995
8996        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
8997                     NETDEV_FEATURE_COUNT);
8998        BUG_ON(dev_boot_phase);
8999        ASSERT_RTNL();
9000

9001        might_sleep();
9002
9003        /* When net_device's are persistent, this will be fatal. */
9004        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9005        BUG_ON(!net);
9006
9007        spin_lock_init(&dev->addr_list_lock);
9008        lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
9009
9010        ret = dev_get_valid_name(net, dev, dev->name);
9011        if (ret < 0)
9012                goto out;
9013
9014        /* Init, if this function is available */
9015        if (dev->netdev_ops->ndo_init) {
9016                ret = dev->netdev_ops->ndo_init(dev);
9017                if (ret) {
9018                        if (ret > 0)
9019                                ret = -EIO;
9020                        goto out;
9021                }
9022        }
9023
9024        if (((dev->hw_features | dev->features) &
9025             NETIF_F_HW_VLAN_CTAG_FILTER) &&
9026            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9027             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9028                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9029                ret = -EINVAL;
9030                goto err_uninit;
9031        }
9032
9033        ret = -EBUSY;
9034        if (!dev->ifindex)
9035                dev->ifindex = dev_new_index(net);
9036        else if (__dev_get_by_index(net, dev->ifindex))
9037                goto err_uninit;
9038
9039        /* Transfer changeable features to wanted_features and enable
9040         * software offloads (GSO and GRO).
9041         */
9042        dev->hw_features |= NETIF_F_SOFT_FEATURES;
9043        dev->features |= NETIF_F_SOFT_FEATURES;
9044
9045        if (dev->netdev_ops->ndo_udp_tunnel_add) {
9046                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9047                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9048        }
9049
9050        dev->wanted_features = dev->features & dev->hw_features;
9051
9052        if (!(dev->flags & IFF_LOOPBACK))
9053                dev->hw_features |= NETIF_F_NOCACHE_COPY;
9054
9055        /* If IPv4 TCP segmentation offload is supported we should also
9056         * allow the device to enable segmenting the frame with the option
9057         * of ignoring a static IP ID value.  This doesn't enable the
9058         * feature itself but allows the user to enable it later.
9059         */
9060        if (dev->hw_features & NETIF_F_TSO)
9061                dev->hw_features |= NETIF_F_TSO_MANGLEID;
9062        if (dev->vlan_features & NETIF_F_TSO)
9063                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
9064        if (dev->mpls_features & NETIF_F_TSO)
9065                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
9066        if (dev->hw_enc_features & NETIF_F_TSO)
9067                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9068
9069        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9070         */
9071        dev->vlan_features |= NETIF_F_HIGHDMA;
9072
9073        /* Make NETIF_F_SG inheritable to tunnel devices.
9074         */
9075        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9076
9077        /* Make NETIF_F_SG inheritable to MPLS.
9078         */
9079        dev->mpls_features |= NETIF_F_SG;
9080
9081        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
9082        ret = notifier_to_errno(ret);
9083        if (ret)
9084                goto err_uninit;
9085
9086        ret = netdev_register_kobject(dev);
9087        if (ret)
9088                goto err_uninit;
9089        dev->reg_state = NETREG_REGISTERED;
9090
9091        __netdev_update_features(dev);
9092
9093        /*
9094         *      Default initial state at registry is that the
9095         *      device is present.
9096         */
9097
9098        set_bit(__LINK_STATE_PRESENT, &dev->state);
9099
9100        linkwatch_init_dev(dev);
9101
9102        dev_init_scheduler(dev);
9103        dev_hold(dev);
9104        list_netdevice(dev);
9105        add_device_randomness(dev->dev_addr, dev->addr_len);
9106
9107        /* If the device has permanent device address, driver should
9108         * set dev_addr and also addr_assign_type should be set to
9109         * NET_ADDR_PERM (default value).
9110         */
9111        if (dev->addr_assign_type == NET_ADDR_PERM)
9112                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
9113
9114        /* Notify protocols, that a new device appeared. */
9115        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9116        ret = notifier_to_errno(ret);
9117        if (ret) {
9118                rollback_registered(dev);
9119                rcu_barrier();
9120
9121                dev->reg_state = NETREG_UNREGISTERED;
9122        }
9123        /*
9124         *      Prevent userspace races by waiting until the network
9125         *      device is fully setup before sending notifications.
9126         */
9127        if (!dev->rtnl_link_ops ||
9128            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9129                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9130
9131out:
9132        return ret;
9133
9134err_uninit:
9135        if (dev->netdev_ops->ndo_uninit)
9136                dev->netdev_ops->ndo_uninit(dev);
9137        if (dev->priv_destructor)
9138                dev->priv_destructor(dev);
9139        goto out;
9140}
9141EXPORT_SYMBOL(register_netdevice);
9142
9143/**
9144 *      init_dummy_netdev       - init a dummy network device for NAPI
9145 *      @dev: device to init
9146 *
9147 *      This takes a network device structure and initialize the minimum
9148 *      amount of fields so it can be used to schedule NAPI polls without
9149 *      registering a full blown interface. This is to be used by drivers
9150 *      that need to tie several hardware interfaces to a single NAPI
9151 *      poll scheduler due to HW limitations.
9152 */
9153int init_dummy_netdev(struct net_device *dev)
9154{
9155        /* Clear everything. Note we don't initialize spinlocks
9156         * are they aren't supposed to be taken by any of the
9157         * NAPI code and this dummy netdev is supposed to be
9158         * only ever used for NAPI polls
9159         */
9160        memset(dev, 0, sizeof(struct net_device));
9161
9162        /* make sure we BUG if trying to hit standard
9163         * register/unregister code path
9164         */
9165        dev->reg_state = NETREG_DUMMY;
9166
9167        /* NAPI wants this */
9168        INIT_LIST_HEAD(&dev->napi_list);
9169
9170        /* a dummy interface is started by default */
9171        set_bit(__LINK_STATE_PRESENT, &dev->state);
9172        set_bit(__LINK_STATE_START, &dev->state);
9173
9174        /* napi_busy_loop stats accounting wants this */
9175        dev_net_set(dev, &init_net);
9176
9177        /* Note : We dont allocate pcpu_refcnt for dummy devices,
9178         * because users of this 'device' dont need to change
9179         * its refcount.
9180         */
9181
9182        return 0;
9183}
9184EXPORT_SYMBOL_GPL(init_dummy_netdev);
9185
9186
9187/**
9188 *      register_netdev - register a network device
9189 *      @dev: device to register
9190 *
9191 *      Take a completed network device structure and add it to the kernel
9192 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9193 *      chain. 0 is returned on success. A negative errno code is returned
9194 *      on a failure to set up the device, or if the name is a duplicate.
9195 *
9196 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
9197 *      and expands the device name if you passed a format string to
9198 *      alloc_netdev.
9199 */
9200int register_netdev(struct net_device *dev)
9201{
9202        int err;
9203
9204        if (rtnl_lock_killable())
9205                return -EINTR;
9206        err = register_netdevice(dev);
9207        rtnl_unlock();
9208        return err;
9209}
9210EXPORT_SYMBOL(register_netdev);
9211
9212int netdev_refcnt_read(const struct net_device *dev)
9213{
9214        int i, refcnt = 0;
9215
9216        for_each_possible_cpu(i)
9217                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
9218        return refcnt;
9219}
9220EXPORT_SYMBOL(netdev_refcnt_read);
9221
9222/**
9223 * netdev_wait_allrefs - wait until all references are gone.
9224 * @dev: target net_device
9225 *
9226 * This is called when unregistering network devices.
9227 *
9228 * Any protocol or device that holds a reference should register
9229 * for netdevice notification, and cleanup and put back the
9230 * reference if they receive an UNREGISTER event.
9231 * We can get stuck here if buggy protocols don't correctly
9232 * call dev_put.
9233 */
9234static void netdev_wait_allrefs(struct net_device *dev)
9235{
9236        unsigned long rebroadcast_time, warning_time;
9237        int refcnt;
9238
9239        linkwatch_forget_dev(dev);
9240
9241        rebroadcast_time = warning_time = jiffies;
9242        refcnt = netdev_refcnt_read(dev);
9243
9244        while (refcnt != 0) {
9245                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
9246                        rtnl_lock();
9247
9248                        /* Rebroadcast unregister notification */
9249                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9250
9251                        __rtnl_unlock();
9252                        rcu_barrier();
9253                        rtnl_lock();
9254
9255                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
9256                                     &dev->state)) {
9257                                /* We must not have linkwatch events
9258                                 * pending on unregister. If this
9259                                 * happens, we simply run the queue
9260                                 * unscheduled, resulting in a noop
9261                                 * for this device.
9262                                 */
9263                                linkwatch_run_queue();
9264                        }
9265
9266                        __rtnl_unlock();
9267
9268                        rebroadcast_time = jiffies;
9269                }
9270
9271                msleep(250);
9272
9273                refcnt = netdev_refcnt_read(dev);
9274
9275                if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
9276                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
9277                                 dev->name, refcnt);
9278                        warning_time = jiffies;
9279                }
9280        }
9281}
9282
9283/* The sequence is:
9284 *
9285 *      rtnl_lock();
9286 *      ...
9287 *      register_netdevice(x1);
9288 *      register_netdevice(x2);
9289 *      ...
9290 *      unregister_netdevice(y1);
9291 *      unregister_netdevice(y2);
9292 *      ...
9293 *      rtnl_unlock();
9294 *      free_netdev(y1);
9295 *      free_netdev(y2);
9296 *
9297 * We are invoked by rtnl_unlock().
9298 * This allows us to deal with problems:
9299 * 1) We can delete sysfs objects which invoke hotplug
9300 *    without deadlocking with linkwatch via keventd.
9301 * 2) Since we run with the RTNL semaphore not held, we can sleep
9302 *    safely in order to wait for the netdev refcnt to drop to zero.
9303 *
9304 * We must not return until all unregister events added during
9305 * the interval the lock was held have been completed.
9306 */
9307void netdev_run_todo(void)
9308{
9309        struct list_head list;
9310
9311        /* Snapshot list, allow later requests */
9312        list_replace_init(&net_todo_list, &list);
9313
9314        __rtnl_unlock();
9315
9316
9317        /* Wait for rcu callbacks to finish before next phase */
9318        if (!list_empty(&list))
9319                rcu_barrier();
9320
9321        while (!list_empty(&list)) {
9322                struct net_device *dev
9323                        = list_first_entry(&list, struct net_device, todo_list);
9324                list_del(&dev->todo_list);
9325
9326                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
9327                        pr_err("network todo '%s' but state %d\n",
9328                               dev->name, dev->reg_state);
9329                        dump_stack();
9330                        continue;
9331                }
9332
9333                dev->reg_state = NETREG_UNREGISTERED;
9334
9335                netdev_wait_allrefs(dev);
9336
9337                /* paranoia */
9338                BUG_ON(netdev_refcnt_read(dev));
9339                BUG_ON(!list_empty(&dev->ptype_all));
9340                BUG_ON(!list_empty(&dev->ptype_specific));
9341                WARN_ON(rcu_access_pointer(dev->ip_ptr));
9342                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
9343#if IS_ENABLED(CONFIG_DECNET)
9344                WARN_ON(dev->dn_ptr);
9345#endif
9346                if (dev->priv_destructor)
9347                        dev->priv_destructor(dev);
9348                if (dev->needs_free_netdev)
9349                        free_netdev(dev);
9350
9351                /* Report a network device has been unregistered */
9352                rtnl_lock();
9353                dev_net(dev)->dev_unreg_count--;
9354                __rtnl_unlock();
9355                wake_up(&netdev_unregistering_wq);
9356
9357                /* Free network device */
9358                kobject_put(&dev->dev.kobj);
9359        }
9360}
9361
9362/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
9363 * all the same fields in the same order as net_device_stats, with only
9364 * the type differing, but rtnl_link_stats64 may have additional fields
9365 * at the end for newer counters.
9366 */
9367void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
9368                             const struct net_device_stats *netdev_stats)
9369{
9370#if BITS_PER_LONG == 64
9371        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
9372        memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
9373        /* zero out counters that only exist in rtnl_link_stats64 */
9374        memset((char *)stats64 + sizeof(*netdev_stats), 0,
9375               sizeof(*stats64) - sizeof(*netdev_stats));
9376#else
9377        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
9378        const unsigned long *src = (const unsigned long *)netdev_stats;
9379        u64 *dst = (u64 *)stats64;
9380
9381        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
9382        for (i = 0; i < n; i++)
9383                dst[i] = src[i];
9384        /* zero out counters that only exist in rtnl_link_stats64 */
9385        memset((char *)stats64 + n * sizeof(u64), 0,
9386               sizeof(*stats64) - n * sizeof(u64));
9387#endif
9388}
9389EXPORT_SYMBOL(netdev_stats_to_stats64);
9390
9391/**
9392 *      dev_get_stats   - get network device statistics
9393 *      @dev: device to get statistics from
9394 *      @storage: place to store stats
9395 *
9396 *      Get network statistics from device. Return @storage.
9397 *      The device driver may provide its own method by setting
9398 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
9399 *      otherwise the internal statistics structure is used.
9400 */
9401struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
9402                                        struct rtnl_link_stats64 *storage)
9403{
9404        const struct net_device_ops *ops = dev->netdev_ops;
9405
9406        if (ops->ndo_get_stats64) {
9407                memset(storage, 0, sizeof(*storage));
9408                ops->ndo_get_stats64(dev, storage);
9409        } else if (ops->ndo_get_stats) {
9410                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
9411        } else {
9412                netdev_stats_to_stats64(storage, &dev->stats);
9413        }
9414        storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
9415        storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
9416        storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
9417        return storage;
9418}
9419EXPORT_SYMBOL(dev_get_stats);
9420
9421struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
9422{
9423        struct netdev_queue *queue = dev_ingress_queue(dev);
9424
9425#ifdef CONFIG_NET_CLS_ACT
9426        if (queue)
9427                return queue;
9428        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
9429        if (!queue)
9430                return NULL;
9431        netdev_init_one_queue(dev, queue, NULL);
9432        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
9433        queue->qdisc_sleeping = &noop_qdisc;
9434        rcu_assign_pointer(dev->ingress_queue, queue);
9435#endif
9436        return queue;
9437}
9438
9439static const struct ethtool_ops default_ethtool_ops;
9440
9441void netdev_set_default_ethtool_ops(struct net_device *dev,
9442                                    const struct ethtool_ops *ops)
9443{
9444        if (dev->ethtool_ops == &default_ethtool_ops)
9445                dev->ethtool_ops = ops;
9446}
9447EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
9448
9449void netdev_freemem(struct net_device *dev)
9450{
9451        char *addr = (char *)dev - dev->padded;
9452
9453        kvfree(addr);
9454}
9455
9456/**
9457 * alloc_netdev_mqs - allocate network device
9458 * @sizeof_priv: size of private data to allocate space for
9459 * @name: device name format string
9460 * @name_assign_type: origin of device name
9461 * @setup: callback to initialize device
9462 * @txqs: the number of TX subqueues to allocate
9463 * @rxqs: the number of RX subqueues to allocate
9464 *
9465 * Allocates a struct net_device with private data area for driver use
9466 * and performs basic initialization.  Also allocates subqueue structs
9467 * for each queue on the device.
9468 */
9469struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
9470                unsigned char name_assign_type,
9471                void (*setup)(struct net_device *),
9472                unsigned int txqs, unsigned int rxqs)
9473{
9474        struct net_device *dev;
9475        unsigned int alloc_size;
9476        struct net_device *p;
9477
9478        BUG_ON(strlen(name) >= sizeof(dev->name));
9479
9480        if (txqs < 1) {
9481                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
9482                return NULL;
9483        }
9484
9485        if (rxqs < 1) {
9486                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
9487                return NULL;
9488        }
9489
9490        alloc_size = sizeof(struct net_device);
9491        if (sizeof_priv) {
9492                /* ensure 32-byte alignment of private area */
9493                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
9494                alloc_size += sizeof_priv;
9495        }
9496        /* ensure 32-byte alignment of whole construct */
9497        alloc_size += NETDEV_ALIGN - 1;
9498
9499        p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9500        if (!p)
9501                return NULL;
9502
9503        dev = PTR_ALIGN(p, NETDEV_ALIGN);
9504        dev->padded = (char *)dev - (char *)p;
9505
9506        dev->pcpu_refcnt = alloc_percpu(int);
9507        if (!dev->pcpu_refcnt)
9508                goto free_dev;
9509
9510        if (dev_addr_init(dev))
9511                goto free_pcpu;
9512
9513        dev_mc_init(dev);
9514        dev_uc_init(dev);
9515
9516        dev_net_set(dev, &init_net);
9517
9518        netdev_register_lockdep_key(dev);
9519
9520        dev->gso_max_size = GSO_MAX_SIZE;
9521        dev->gso_max_segs = GSO_MAX_SEGS;
9522        dev->upper_level = 1;
9523        dev->lower_level = 1;
9524
9525        INIT_LIST_HEAD(&dev->napi_list);
9526        INIT_LIST_HEAD(&dev->unreg_list);
9527        INIT_LIST_HEAD(&dev->close_list);
9528        INIT_LIST_HEAD(&dev->link_watch_list);
9529        INIT_LIST_HEAD(&dev->adj_list.upper);
9530        INIT_LIST_HEAD(&dev->adj_list.lower);
9531        INIT_LIST_HEAD(&dev->ptype_all);
9532        INIT_LIST_HEAD(&dev->ptype_specific);
9533#ifdef CONFIG_NET_SCHED
9534        hash_init(dev->qdisc_hash);
9535#endif
9536        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
9537        setup(dev);
9538
9539        if (!dev->tx_queue_len) {
9540                dev->priv_flags |= IFF_NO_QUEUE;
9541                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
9542        }
9543
9544        dev->num_tx_queues = txqs;
9545        dev->real_num_tx_queues = txqs;
9546        if (netif_alloc_netdev_queues(dev))
9547                goto free_all;
9548
9549        dev->num_rx_queues = rxqs;
9550        dev->real_num_rx_queues = rxqs;
9551        if (netif_alloc_rx_queues(dev))
9552                goto free_all;
9553
9554        strcpy(dev->name, name);
9555        dev->name_assign_type = name_assign_type;
9556        dev->group = INIT_NETDEV_GROUP;
9557        if (!dev->ethtool_ops)
9558                dev->ethtool_ops = &default_ethtool_ops;
9559
9560        nf_hook_ingress_init(dev);
9561
9562        return dev;
9563
9564free_all:
9565        free_netdev(dev);
9566        return NULL;
9567
9568free_pcpu:
9569        free_percpu(dev->pcpu_refcnt);
9570free_dev:
9571        netdev_freemem(dev);
9572        return NULL;
9573}
9574EXPORT_SYMBOL(alloc_netdev_mqs);
9575
9576/**
9577 * free_netdev - free network device
9578 * @dev: device
9579 *
9580 * This function does the last stage of destroying an allocated device
9581 * interface. The reference to the device object is released. If this
9582 * is the last reference then it will be freed.Must be called in process
9583 * context.
9584 */
9585void free_netdev(struct net_device *dev)
9586{
9587        struct napi_struct *p, *n;
9588
9589        might_sleep();
9590        netif_free_tx_queues(dev);
9591        netif_free_rx_queues(dev);
9592
9593        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
9594
9595        /* Flush device addresses */
9596        dev_addr_flush(dev);
9597
9598        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
9599                netif_napi_del(p);
9600
9601        free_percpu(dev->pcpu_refcnt);
9602        dev->pcpu_refcnt = NULL;
9603
9604        netdev_unregister_lockdep_key(dev);
9605
9606        /*  Compatibility with error handling in drivers */
9607        if (dev->reg_state == NETREG_UNINITIALIZED) {
9608                netdev_freemem(dev);
9609                return;
9610        }
9611
9612        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
9613        dev->reg_state = NETREG_RELEASED;
9614
9615        /* will free via device release */
9616        put_device(&dev->dev);
9617}
9618EXPORT_SYMBOL(free_netdev);
9619
9620/**
9621 *      synchronize_net -  Synchronize with packet receive processing
9622 *
9623 *      Wait for packets currently being received to be done.
9624 *      Does not block later packets from starting.
9625 */
9626void synchronize_net(void)
9627{
9628        might_sleep();
9629        if (rtnl_is_locked())
9630                synchronize_rcu_expedited();
9631        else
9632                synchronize_rcu();
9633}
9634EXPORT_SYMBOL(synchronize_net);
9635
9636/**
9637 *      unregister_netdevice_queue - remove device from the kernel
9638 *      @dev: device
9639 *      @head: list
9640 *
9641 *      This function shuts down a device interface and removes it
9642 *      from the kernel tables.
9643 *      If head not NULL, device is queued to be unregistered later.
9644 *
9645 *      Callers must hold the rtnl semaphore.  You may want
9646 *      unregister_netdev() instead of this.
9647 */
9648
9649void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
9650{
9651        ASSERT_RTNL();
9652
9653        if (head) {
9654                list_move_tail(&dev->unreg_list, head);
9655        } else {
9656                rollback_registered(dev);
9657                /* Finish processing unregister after unlock */
9658                net_set_todo(dev);
9659        }
9660}
9661EXPORT_SYMBOL(unregister_netdevice_queue);
9662
9663/**
9664 *      unregister_netdevice_many - unregister many devices
9665 *      @head: list of devices
9666 *
9667 *  Note: As most callers use a stack allocated list_head,
9668 *  we force a list_del() to make sure stack wont be corrupted later.
9669 */
9670void unregister_netdevice_many(struct list_head *head)
9671{
9672        struct net_device *dev;
9673
9674        if (!list_empty(head)) {
9675                rollback_registered_many(head);
9676                list_for_each_entry(dev, head, unreg_list)
9677                        net_set_todo(dev);
9678                list_del(head);
9679        }
9680}
9681EXPORT_SYMBOL(unregister_netdevice_many);
9682
9683/**
9684 *      unregister_netdev - remove device from the kernel
9685 *      @dev: device
9686 *
9687 *      This function shuts down a device interface and removes it
9688 *      from the kernel tables.
9689 *
9690 *      This is just a wrapper for unregister_netdevice that takes
9691 *      the rtnl semaphore.  In general you want to use this and not
9692 *      unregister_netdevice.
9693 */
9694void unregister_netdev(struct net_device *dev)
9695{
9696        rtnl_lock();
9697        unregister_netdevice(dev);
9698        rtnl_unlock();
9699}
9700EXPORT_SYMBOL(unregister_netdev);
9701
9702/**
9703 *      dev_change_net_namespace - move device to different nethost namespace
9704 *      @dev: device
9705 *      @net: network namespace
9706 *      @pat: If not NULL name pattern to try if the current device name
9707 *            is already taken in the destination network namespace.
9708 *
9709 *      This function shuts down a device interface and moves it
9710 *      to a new network namespace. On success 0 is returned, on
9711 *      a failure a netagive errno code is returned.
9712 *
9713 *      Callers must hold the rtnl semaphore.
9714 */
9715
9716int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
9717{
9718        int err, new_nsid, new_ifindex;
9719
9720        ASSERT_RTNL();
9721
9722        /* Don't allow namespace local devices to be moved. */
9723        err = -EINVAL;
9724        if (dev->features & NETIF_F_NETNS_LOCAL)
9725                goto out;
9726
9727        /* Ensure the device has been registrered */
9728        if (dev->reg_state != NETREG_REGISTERED)
9729                goto out;
9730
9731        /* Get out if there is nothing todo */
9732        err = 0;
9733        if (net_eq(dev_net(dev), net))
9734                goto out;
9735
9736        /* Pick the destination device name, and ensure
9737         * we can use it in the destination network namespace.
9738         */
9739        err = -EEXIST;
9740        if (__dev_get_by_name(net, dev->name)) {
9741                /* We get here if we can't use the current device name */
9742                if (!pat)
9743                        goto out;
9744                err = dev_get_valid_name(net, dev, pat);
9745                if (err < 0)
9746                        goto out;
9747        }
9748
9749        /*
9750         * And now a mini version of register_netdevice unregister_netdevice.
9751         */
9752
9753        /* If device is running close it first. */
9754        dev_close(dev);
9755
9756        /* And unlink it from device chain */
9757        unlist_netdevice(dev);
9758
9759        synchronize_net();
9760
9761        /* Shutdown queueing discipline. */
9762        dev_shutdown(dev);
9763
9764        /* Notify protocols, that we are about to destroy
9765         * this device. They should clean all the things.
9766         *
9767         * Note that dev->reg_state stays at NETREG_REGISTERED.
9768         * This is wanted because this way 8021q and macvlan know
9769         * the device is just moving and can keep their slaves up.
9770         */
9771        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9772        rcu_barrier();
9773
9774        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
9775        /* If there is an ifindex conflict assign a new one */
9776        if (__dev_get_by_index(net, dev->ifindex))
9777                new_ifindex = dev_new_index(net);
9778        else
9779                new_ifindex = dev->ifindex;
9780
9781        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
9782                            new_ifindex);
9783
9784        /*
9785         *      Flush the unicast and multicast chains
9786         */
9787        dev_uc_flush(dev);
9788        dev_mc_flush(dev);
9789
9790        /* Send a netdev-removed uevent to the old namespace */
9791        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
9792        netdev_adjacent_del_links(dev);
9793
9794        /* Actually switch the network namespace */
9795        dev_net_set(dev, net);
9796        dev->ifindex = new_ifindex;
9797
9798        /* Send a netdev-add uevent to the new namespace */
9799        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
9800        netdev_adjacent_add_links(dev);
9801
9802        /* Fixup kobjects */
9803        err = device_rename(&dev->dev, dev->name);
9804        WARN_ON(err);
9805
9806        /* Add the device back in the hashes */
9807        list_netdevice(dev);
9808
9809        /* Notify protocols, that a new device appeared. */
9810        call_netdevice_notifiers(NETDEV_REGISTER, dev);
9811
9812        /*
9813         *      Prevent userspace races by waiting until the network
9814         *      device is fully setup before sending notifications.
9815         */
9816        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9817
9818        synchronize_net();
9819        err = 0;
9820out:
9821        return err;
9822}
9823EXPORT_SYMBOL_GPL(dev_change_net_namespace);
9824
9825static int dev_cpu_dead(unsigned int oldcpu)
9826{
9827        struct sk_buff **list_skb;
9828        struct sk_buff *skb;
9829        unsigned int cpu;
9830        struct softnet_data *sd, *oldsd, *remsd = NULL;
9831
9832        local_irq_disable();
9833        cpu = smp_processor_id();
9834        sd = &per_cpu(softnet_data, cpu);
9835        oldsd = &per_cpu(softnet_data, oldcpu);
9836
9837        /* Find end of our completion_queue. */
9838        list_skb = &sd->completion_queue;
9839        while (*list_skb)
9840                list_skb = &(*list_skb)->next;
9841        /* Append completion queue from offline CPU. */
9842        *list_skb = oldsd->completion_queue;
9843        oldsd->completion_queue = NULL;
9844
9845        /* Append output queue from offline CPU. */
9846        if (oldsd->output_queue) {
9847                *sd->output_queue_tailp = oldsd->output_queue;
9848                sd->output_queue_tailp = oldsd->output_queue_tailp;
9849                oldsd->output_queue = NULL;
9850                oldsd->output_queue_tailp = &oldsd->output_queue;
9851        }
9852        /* Append NAPI poll list from offline CPU, with one exception :
9853         * process_backlog() must be called by cpu owning percpu backlog.
9854         * We properly handle process_queue & input_pkt_queue later.
9855         */
9856        while (!list_empty(&oldsd->poll_list)) {
9857                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
9858                                                            struct napi_struct,
9859                                                            poll_list);
9860
9861                list_del_init(&napi->poll_list);
9862                if (napi->poll == process_backlog)
9863                        napi->state = 0;
9864                else
9865                        ____napi_schedule(sd, napi);
9866        }
9867
9868        raise_softirq_irqoff(NET_TX_SOFTIRQ);
9869        local_irq_enable();
9870
9871#ifdef CONFIG_RPS
9872        remsd = oldsd->rps_ipi_list;
9873        oldsd->rps_ipi_list = NULL;
9874#endif
9875        /* send out pending IPI's on offline CPU */
9876        net_rps_send_ipi(remsd);
9877
9878        /* Process offline CPU's input_pkt_queue */
9879        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
9880                netif_rx_ni(skb);
9881                input_queue_head_incr(oldsd);
9882        }
9883        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
9884                netif_rx_ni(skb);
9885                input_queue_head_incr(oldsd);
9886        }
9887
9888        return 0;
9889}
9890
9891/**
9892 *      netdev_increment_features - increment feature set by one
9893 *      @all: current feature set
9894 *      @one: new feature set
9895 *      @mask: mask feature set
9896 *
9897 *      Computes a new feature set after adding a device with feature set
9898 *      @one to the master device with current feature set @all.  Will not
9899 *      enable anything that is off in @mask. Returns the new feature set.
9900 */
9901netdev_features_t netdev_increment_features(netdev_features_t all,
9902        netdev_features_t one, netdev_features_t mask)
9903{
9904        if (mask & NETIF_F_HW_CSUM)
9905                mask |= NETIF_F_CSUM_MASK;
9906        mask |= NETIF_F_VLAN_CHALLENGED;
9907
9908        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
9909        all &= one | ~NETIF_F_ALL_FOR_ALL;
9910
9911        /* If one device supports hw checksumming, set for all. */
9912        if (all & NETIF_F_HW_CSUM)
9913                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
9914
9915        return all;
9916}
9917EXPORT_SYMBOL(netdev_increment_features);
9918
9919static struct hlist_head * __net_init netdev_create_hash(void)
9920{
9921        int i;
9922        struct hlist_head *hash;
9923
9924        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
9925        if (hash != NULL)
9926                for (i = 0; i < NETDEV_HASHENTRIES; i++)
9927                        INIT_HLIST_HEAD(&hash[i]);
9928
9929        return hash;
9930}
9931
9932/* Initialize per network namespace state */
9933static int __net_init netdev_init(struct net *net)
9934{
9935        BUILD_BUG_ON(GRO_HASH_BUCKETS >
9936                     8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
9937
9938        if (net != &init_net)
9939                INIT_LIST_HEAD(&net->dev_base_head);
9940
9941        net->dev_name_head = netdev_create_hash();
9942        if (net->dev_name_head == NULL)
9943                goto err_name;
9944
9945        net->dev_index_head = netdev_create_hash();
9946        if (net->dev_index_head == NULL)
9947                goto err_idx;
9948
9949        return 0;
9950
9951err_idx:
9952        kfree(net->dev_name_head);
9953err_name:
9954        return -ENOMEM;
9955}
9956
9957/**
9958 *      netdev_drivername - network driver for the device
9959 *      @dev: network device
9960 *
9961 *      Determine network driver for device.
9962 */
9963const char *netdev_drivername(const struct net_device *dev)
9964{
9965        const struct device_driver *driver;
9966        const struct device *parent;
9967        const char *empty = "";
9968
9969        parent = dev->dev.parent;
9970        if (!parent)
9971                return empty;
9972
9973        driver = parent->driver;
9974        if (driver && driver->name)
9975                return driver->name;
9976        return empty;
9977}
9978
9979static void __netdev_printk(const char *level, const struct net_device *dev,
9980                            struct va_format *vaf)
9981{
9982        if (dev && dev->dev.parent) {
9983                dev_printk_emit(level[1] - '0',
9984                                dev->dev.parent,
9985                                "%s %s %s%s: %pV",
9986                                dev_driver_string(dev->dev.parent),
9987                                dev_name(dev->dev.parent),
9988                                netdev_name(dev), netdev_reg_state(dev),
9989                                vaf);
9990        } else if (dev) {
9991                printk("%s%s%s: %pV",
9992                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
9993        } else {
9994                printk("%s(NULL net_device): %pV", level, vaf);
9995        }
9996}
9997
9998void netdev_printk(const char *level, const struct net_device *dev,
9999                   const char *format, ...)
10000{

10001        struct va_format vaf;
10002        va_list args;
10003
10004        va_start(args, format);
10005
10006        vaf.fmt = format;
10007        vaf.va = &args;
10008
10009        __netdev_printk(level, dev, &vaf);
10010
10011        va_end(args);
10012}
10013EXPORT_SYMBOL(netdev_printk);
10014
10015#define define_netdev_printk_level(func, level)                 \
10016void func(const struct net_device *dev, const char *fmt, ...)   \
10017{                                                               \
10018        struct va_format vaf;                                   \
10019        va_list args;                                           \
10020                                                                \
10021        va_start(args, fmt);                                    \
10022                                                                \
10023        vaf.fmt = fmt;                                          \
10024        vaf.va = &args;                                         \
10025                                                                \
10026        __netdev_printk(level, dev, &vaf);                      \
10027                                                                \
10028        va_end(args);                                           \
10029}                                                               \
10030EXPORT_SYMBOL(func);
10031
10032define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10033define_netdev_printk_level(netdev_alert, KERN_ALERT);
10034define_netdev_printk_level(netdev_crit, KERN_CRIT);
10035define_netdev_printk_level(netdev_err, KERN_ERR);
10036define_netdev_printk_level(netdev_warn, KERN_WARNING);
10037define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10038define_netdev_printk_level(netdev_info, KERN_INFO);
10039
10040static void __net_exit netdev_exit(struct net *net)
10041{
10042        kfree(net->dev_name_head);
10043        kfree(net->dev_index_head);
10044        if (net != &init_net)
10045                WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10046}
10047
10048static struct pernet_operations __net_initdata netdev_net_ops = {
10049        .init = netdev_init,
10050        .exit = netdev_exit,
10051};
10052
10053static void __net_exit default_device_exit(struct net *net)
10054{
10055        struct net_device *dev, *aux;
10056        /*
10057         * Push all migratable network devices back to the
10058         * initial network namespace
10059         */
10060        rtnl_lock();
10061        for_each_netdev_safe(net, dev, aux) {
10062                int err;
10063                char fb_name[IFNAMSIZ];
10064
10065                /* Ignore unmoveable devices (i.e. loopback) */
10066                if (dev->features & NETIF_F_NETNS_LOCAL)
10067                        continue;
10068
10069                /* Leave virtual devices for the generic cleanup */
10070                if (dev->rtnl_link_ops)
10071                        continue;
10072
10073                /* Push remaining network devices to init_net */
10074                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10075                if (__dev_get_by_name(&init_net, fb_name))
10076                        snprintf(fb_name, IFNAMSIZ, "dev%%d");
10077                err = dev_change_net_namespace(dev, &init_net, fb_name);
10078                if (err) {
10079                        pr_emerg("%s: failed to move %s to init_net: %d\n",
10080                                 __func__, dev->name, err);
10081                        BUG();
10082                }
10083        }
10084        rtnl_unlock();
10085}
10086
10087static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10088{
10089        /* Return with the rtnl_lock held when there are no network
10090         * devices unregistering in any network namespace in net_list.
10091         */
10092        struct net *net;
10093        bool unregistering;
10094        DEFINE_WAIT_FUNC(wait, woken_wake_function);
10095
10096        add_wait_queue(&netdev_unregistering_wq, &wait);
10097        for (;;) {
10098                unregistering = false;
10099                rtnl_lock();
10100                list_for_each_entry(net, net_list, exit_list) {
10101                        if (net->dev_unreg_count > 0) {
10102                                unregistering = true;
10103                                break;
10104                        }
10105                }
10106                if (!unregistering)
10107                        break;
10108                __rtnl_unlock();
10109
10110                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10111        }
10112        remove_wait_queue(&netdev_unregistering_wq, &wait);
10113}
10114
10115static void __net_exit default_device_exit_batch(struct list_head *net_list)
10116{
10117        /* At exit all network devices most be removed from a network
10118         * namespace.  Do this in the reverse order of registration.
10119         * Do this across as many network namespaces as possible to
10120         * improve batching efficiency.
10121         */
10122        struct net_device *dev;
10123        struct net *net;
10124        LIST_HEAD(dev_kill_list);
10125
10126        /* To prevent network device cleanup code from dereferencing
10127         * loopback devices or network devices that have been freed
10128         * wait here for all pending unregistrations to complete,
10129         * before unregistring the loopback device and allowing the
10130         * network namespace be freed.
10131         *
10132         * The netdev todo list containing all network devices
10133         * unregistrations that happen in default_device_exit_batch
10134         * will run in the rtnl_unlock() at the end of
10135         * default_device_exit_batch.
10136         */
10137        rtnl_lock_unregistering(net_list);
10138        list_for_each_entry(net, net_list, exit_list) {
10139                for_each_netdev_reverse(net, dev) {
10140                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10141                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10142                        else
10143                                unregister_netdevice_queue(dev, &dev_kill_list);
10144                }
10145        }
10146        unregister_netdevice_many(&dev_kill_list);
10147        rtnl_unlock();
10148}
10149
10150static struct pernet_operations __net_initdata default_device_ops = {
10151        .exit = default_device_exit,
10152        .exit_batch = default_device_exit_batch,
10153};
10154
10155/*
10156 *      Initialize the DEV module. At boot time this walks the device list and
10157 *      unhooks any devices that fail to initialise (normally hardware not
10158 *      present) and leaves us with a valid list of present and active devices.
10159 *
10160 */
10161
10162/*
10163 *       This is called single threaded during boot, so no need
10164 *       to take the rtnl semaphore.
10165 */
10166static int __init net_dev_init(void)
10167{
10168        int i, rc = -ENOMEM;
10169
10170        BUG_ON(!dev_boot_phase);
10171
10172        if (dev_proc_init())
10173                goto out;
10174
10175        if (netdev_kobject_init())
10176                goto out;
10177
10178        INIT_LIST_HEAD(&ptype_all);
10179        for (i = 0; i < PTYPE_HASH_SIZE; i++)
10180                INIT_LIST_HEAD(&ptype_base[i]);
10181
10182        INIT_LIST_HEAD(&offload_base);
10183
10184        if (register_pernet_subsys(&netdev_net_ops))
10185                goto out;
10186
10187        /*
10188         *      Initialise the packet receive queues.
10189         */
10190
10191        for_each_possible_cpu(i) {
10192                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10193                struct softnet_data *sd = &per_cpu(softnet_data, i);
10194
10195                INIT_WORK(flush, flush_backlog);
10196
10197                skb_queue_head_init(&sd->input_pkt_queue);
10198                skb_queue_head_init(&sd->process_queue);
10199#ifdef CONFIG_XFRM_OFFLOAD
10200                skb_queue_head_init(&sd->xfrm_backlog);
10201#endif
10202                INIT_LIST_HEAD(&sd->poll_list);
10203                sd->output_queue_tailp = &sd->output_queue;
10204#ifdef CONFIG_RPS
10205                sd->csd.func = rps_trigger_softirq;
10206                sd->csd.info = sd;
10207                sd->cpu = i;
10208#endif
10209
10210                init_gro_hash(&sd->backlog);
10211                sd->backlog.poll = process_backlog;
10212                sd->backlog.weight = weight_p;
10213        }
10214
10215        dev_boot_phase = 0;
10216
10217        /* The loopback device is special if any other network devices
10218         * is present in a network namespace the loopback device must
10219         * be present. Since we now dynamically allocate and free the
10220         * loopback device ensure this invariant is maintained by
10221         * keeping the loopback device as the first device on the
10222         * list of network devices.  Ensuring the loopback devices
10223         * is the first device that appears and the last network device
10224         * that disappears.
10225         */
10226        if (register_pernet_device(&loopback_net_ops))
10227                goto out;
10228
10229        if (register_pernet_device(&default_device_ops))
10230                goto out;
10231
10232        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10233        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10234
10235        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10236                                       NULL, dev_cpu_dead);
10237        WARN_ON(rc < 0);
10238        rc = 0;
10239out:
10240        return rc;
10241}
10242
10243subsys_initcall(net_dev_init);
10244